diff --git a/etc/example.conf b/etc/example.conf index b118ab9c..356944ef 100644 --- a/etc/example.conf +++ b/etc/example.conf @@ -103,6 +103,13 @@ archive_only_mydomains=0 ; minimum word length in mail body to index min_word_len=1 +; whether to enable support for CJK (Chinese, Japanese, and Korean) characters. +; Text written in CJK languages may contain extremely long +; sequences without any whitespace. To prevent the parser from dropping +; these very long sequences, enable (1) this feature. By default it is +; disabled (0). +enable_cjk=0 + ; if piler detects this line in the mail header, then it will assume ; the message is a spam. You should include your own antispam solution's ; specific line. diff --git a/src/cfg.c b/src/cfg.c index 04fe0608..02878e13 100644 --- a/src/cfg.c +++ b/src/cfg.c @@ -68,6 +68,7 @@ struct _parse_rule config_parse_rules[] = { "clamd_socket", "string", (void*) string_parser, offsetof(struct __config, clamd_socket), CLAMD_SOCKET, MAXVAL-1}, { "debug", "integer", (void*) int_parser, offsetof(struct __config, debug), "0", sizeof(int)}, { "default_retention_days", "integer", (void*) int_parser, offsetof(struct __config, default_retention_days), "2557", sizeof(int)}, + { "enable_cjk", "integer", (void*) int_parser, offsetof(struct __config, enable_cjk), "0", sizeof(int)}, { "encrypt_messages", "integer", (void*) int_parser, offsetof(struct __config, encrypt_messages), "1", sizeof(int)}, { "extra_to_field", "string", (void*) string_parser, offsetof(struct __config, extra_to_field), "", MAXVAL-1}, { "hostid", "string", (void*) string_parser, offsetof(struct __config, hostid), HOSTID, MAXVAL-1}, diff --git a/src/cfg.h b/src/cfg.h index 2a989e16..537f744e 100644 --- a/src/cfg.h +++ b/src/cfg.h @@ -84,6 +84,8 @@ struct __config { int tweak_sent_time_offset; + int enable_cjk; + int debug; }; diff --git a/src/config.h b/src/config.h index 31b1009c..65ea457e 100644 --- a/src/config.h +++ b/src/config.h @@ -14,7 +14,7 @@ #define VERSION 
"0.1.25-master-branch" -#define BUILD 851 +#define BUILD 852 #define HOSTID "mailarchiver" diff --git a/src/parser.c b/src/parser.c index c7529cbf..aad91b1e 100644 --- a/src/parser.c +++ b/src/parser.c @@ -604,7 +604,7 @@ int parse_line(char *buf, struct _state *state, struct session_data *sdata, int if(strncasecmp(puf, "http://", 7) == 0 || strncasecmp(puf, "https://", 8) == 0) fixURL(puf); - if(state->is_header == 0 && strncmp(puf, "__URL__", 7) && (puf[0] == ' ' || strlen(puf) > MAX_WORD_LEN || isHexNumber(puf)) ) continue; + if(state->is_header == 0 && strncmp(puf, "__URL__", 7) && (puf[0] == ' ' || (strlen(puf) > MAX_WORD_LEN && cfg->enable_cjk == 0) || isHexNumber(puf)) ) continue; len = strlen(puf);