tweaked the parser to support cjk languages

This commit is contained in:
SJ 2013-11-26 11:43:21 +01:00
parent c0301ceaca
commit 1e44042b82
5 changed files with 12 additions and 2 deletions

View File

@ -103,6 +103,13 @@ archive_only_mydomains=0
; minimum word length in mail body to index ; minimum word length in mail body to index
min_word_len=1 min_word_len=1
; whether to enable CJK (=Chinese, Japanese, and Korean) "characters".
; the text piler can see with CJK languages may have extremely long
; sequences without any whitespace. To prevent the parser to drop
; these very long sequences, enable (1) this feature. By default it's
; disabled (0).
enable_cjk=0
; if piler detects this line in the mail header, then it will assume ; if piler detects this line in the mail header, then it will assume
; the message is a spam. You should include your own antispam solution's ; the message is a spam. You should include your own antispam solution's
; specific line. ; specific line.

View File

@ -68,6 +68,7 @@ struct _parse_rule config_parse_rules[] =
{ "clamd_socket", "string", (void*) string_parser, offsetof(struct __config, clamd_socket), CLAMD_SOCKET, MAXVAL-1}, { "clamd_socket", "string", (void*) string_parser, offsetof(struct __config, clamd_socket), CLAMD_SOCKET, MAXVAL-1},
{ "debug", "integer", (void*) int_parser, offsetof(struct __config, debug), "0", sizeof(int)}, { "debug", "integer", (void*) int_parser, offsetof(struct __config, debug), "0", sizeof(int)},
{ "default_retention_days", "integer", (void*) int_parser, offsetof(struct __config, default_retention_days), "2557", sizeof(int)}, { "default_retention_days", "integer", (void*) int_parser, offsetof(struct __config, default_retention_days), "2557", sizeof(int)},
{ "enable_cjk", "integer", (void*) int_parser, offsetof(struct __config, enable_cjk), "0", sizeof(int)},
{ "encrypt_messages", "integer", (void*) int_parser, offsetof(struct __config, encrypt_messages), "1", sizeof(int)}, { "encrypt_messages", "integer", (void*) int_parser, offsetof(struct __config, encrypt_messages), "1", sizeof(int)},
{ "extra_to_field", "string", (void*) string_parser, offsetof(struct __config, extra_to_field), "", MAXVAL-1}, { "extra_to_field", "string", (void*) string_parser, offsetof(struct __config, extra_to_field), "", MAXVAL-1},
{ "hostid", "string", (void*) string_parser, offsetof(struct __config, hostid), HOSTID, MAXVAL-1}, { "hostid", "string", (void*) string_parser, offsetof(struct __config, hostid), HOSTID, MAXVAL-1},

View File

@ -84,6 +84,8 @@ struct __config {
int tweak_sent_time_offset; int tweak_sent_time_offset;
int enable_cjk;
int debug; int debug;
}; };

View File

@ -14,7 +14,7 @@
#define VERSION "0.1.25-master-branch" #define VERSION "0.1.25-master-branch"
#define BUILD 851 #define BUILD 852
#define HOSTID "mailarchiver" #define HOSTID "mailarchiver"

View File

@ -604,7 +604,7 @@ int parse_line(char *buf, struct _state *state, struct session_data *sdata, int
if(strncasecmp(puf, "http://", 7) == 0 || strncasecmp(puf, "https://", 8) == 0) fixURL(puf); if(strncasecmp(puf, "http://", 7) == 0 || strncasecmp(puf, "https://", 8) == 0) fixURL(puf);
if(state->is_header == 0 && strncmp(puf, "__URL__", 7) && (puf[0] == ' ' || strlen(puf) > MAX_WORD_LEN || isHexNumber(puf)) ) continue; if(state->is_header == 0 && strncmp(puf, "__URL__", 7) && (puf[0] == ' ' || (strlen(puf) > MAX_WORD_LEN && cfg->enable_cjk == 0) || isHexNumber(puf)) ) continue;
len = strlen(puf); len = strlen(puf);