From 1192fc3218681177114eaeeb5cf667967ad75df4 Mon Sep 17 00:00:00 2001 From: Janos SUTO Date: Wed, 8 Nov 2017 11:50:28 +0100 Subject: [PATCH] src: decoder and parser fix Signed-off-by: Janos SUTO --- src/decoder.c | 2 + src/parser_utils.c | 136 +++++++++++++++++++------------- unit_tests/check_parser_utils.c | 67 +++++++--------- unit_tests/test.h | 24 ++++++ 4 files changed, 134 insertions(+), 95 deletions(-) create mode 100644 unit_tests/test.h diff --git a/src/decoder.c b/src/decoder.c index 581655c0..5317808d 100644 --- a/src/decoder.c +++ b/src/decoder.c @@ -78,6 +78,8 @@ inline void utf8_encode_char(unsigned char c, unsigned char *buf, int buflen, in * U+0000..U+007F 00..7F * U+0080..U+07FF C2..DF 80..BF * U+0800..U+0FFF E0 A0..BF 80..BF + * + * FIXME: See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#G7404 for valid sequences */ if(c <= 0x7F){ diff --git a/src/parser_utils.c b/src/parser_utils.c index 51476691..b09d6500 100644 --- a/src/parser_utils.c +++ b/src/parser_utils.c @@ -196,6 +196,7 @@ time_t parse_date_header(char *datestr){ else if(strncasecmp(s, "Sat", 3) == 0) tm.tm_wday = 6; else if(strncasecmp(s, "Sun", 3) == 0) tm.tm_wday = 0; + if(len <= 2 && tm.tm_mday == 0){ tm.tm_mday = atoi(s); continue; } if(len <= 2 && tm.tm_mon == -1){ tm.tm_mon = atoi(s) - 1; continue; } @@ -313,17 +314,19 @@ int extract_boundary(char *p, struct parser_state *state){ void fixupEncodedHeaderLine(char *buf, int buflen){ - char *sb, *sq, *p, *q, *r, *s, *e, *start, *end; + char *p, *q, *r, *s, *e, *end; /* * I thought SMALLBUFSIZE would be enough for v, encoding and tmpbuf(2*), * but then I saw a 6-7000 byte long subject line, so I've switched to MAXBUFSIZE */ - char v[MAXBUFSIZE], puf[MAXBUFSIZE], encoding[MAXBUFSIZE], tmpbuf[2*MAXBUFSIZE]; + char v[MAXBUFSIZE], u[MAXBUFSIZE], puf[MAXBUFSIZE], encoding[MAXBUFSIZE], tmpbuf[2*MAXBUFSIZE]; int need_encoding, ret; if(buflen < 5) return; memset(puf, 0, sizeof(puf)); + memset(encoding, 0, sizeof(encoding)); + q = 
buf; @@ -332,69 +335,89 @@ void fixupEncodedHeaderLine(char *buf, int buflen){ p = v; - memset(encoding, 0, sizeof(encoding)); - do { - start = strstr(p, "=?"); - if(start){ - *start = '\0'; - if(strlen(p) > 0){ - strncat(puf, p, sizeof(puf)-strlen(puf)-1); + memset(u, 0, sizeof(u)); + + /* + * We can't use split_str(p, "=?", ...) because it will fail with the following pattern + * =?UTF-8?B?SG9neWFuIMOtcmp1bmsgcGFuYXN6bGV2ZWxldD8=?= + * + * Also the below pattern requires special care: + * =?gb2312?B??==?gb2312?Q??= + */ + + r = strstr(p, "=?"); + if(r){ + p = r + 2; + end = strstr(p, "?="); + if(end){ + *end = '\0'; } - start++; + snprintf(u, sizeof(u)-1, "%s", p); - e = strchr(start+2, '?'); - if(e){ - *e = '\0'; - snprintf(encoding, sizeof(encoding)-1, "%s", start+1); - *e = '?'; - } - - s = NULL; - sb = strcasestr(start, "?B?"); if(sb) s = sb; - sq = strcasestr(start, "?Q?"); if(sq) s = sq; - - if(s){ - end = strstr(s+3, "?="); - if(end){ - *end = '\0'; - - if(sb){ decodeBase64(s+3); } - if(sq){ decodeQP(s+3); r = s + 3; for(; *r; r++){ if(*r == '_') *r = ' '; } } - - /* encode everything if it's not utf-8 encoded */ - - need_encoding = 0; - ret = ERR; - - if(strlen(encoding) > 2 && strcasecmp(encoding, "utf-8")){ - need_encoding = 1; - ret = utf8_encode(s+3, strlen(s+3), &tmpbuf[0], sizeof(tmpbuf), encoding); - } - - if(need_encoding == 1 && ret == OK) - strncat(puf, tmpbuf, sizeof(puf)-strlen(puf)-1); - else - strncat(puf, s+3, sizeof(puf)-strlen(puf)-1); - - p = end + 2; - } - } - else { - strncat(puf, start, sizeof(puf)-strlen(puf)-1); - - break; + if(end) { + p = end + 2; } } else { - strncat(puf, p, sizeof(puf)-strlen(puf)-1); - break; + snprintf(u, sizeof(u)-1, "%s", p); + p = NULL; + } + + if(u[0] == 0) continue; + + memset(encoding, 0, sizeof(encoding)); + + // Check if it's either ?B? or ?Q? encoding ... 
+ s = strcasestr(u, "?B?"); + if(s){ + decodeBase64(s+3); + } + else { + s = strcasestr(u, "?Q?"); + if(s){ + decodeQP(s+3); + r = s + 3; + for(; *r; r++){ + if(*r == '_') *r = ' '; + } + } + } + + // ... if it is, then get the encoding + if(s){ + e = strchr(u, '?'); + if(e){ + *e = '\0'; + snprintf(encoding, sizeof(encoding)-1, "%s", u); + *e = '?'; + + need_encoding = 0; + ret = ERR; + + if(encoding[0] && strcasecmp(encoding, "utf-8")){ + need_encoding = 1; + ret = utf8_encode(s+3, strlen(s+3), &tmpbuf[0], sizeof(tmpbuf), encoding); + } + + if(need_encoding == 1 && ret == OK) + strncat(puf, tmpbuf, sizeof(puf)-strlen(puf)-1); + else + strncat(puf, s+3, sizeof(puf)-strlen(puf)-1); + } + else { + memset(encoding, 0, sizeof(encoding)); + strncat(puf, u, sizeof(puf)-strlen(puf)-1); + } + } + else { + strncat(puf, u, sizeof(puf)-strlen(puf)-1); } } while(p); - if(q) strncat(puf, " ", sizeof(puf)-strlen(puf)-1); + if(q && encoding[0] == 0) strncat(puf, " ", sizeof(puf)-strlen(puf)-1); } while(q); @@ -599,6 +622,7 @@ void translateLine(unsigned char *p, struct parser_state *state){ prev = *p; } + if(state->message_state == MSG_SUBJECT && (*p == '%' || *p == '_' || *p == '&') ){ continue; } if(state->message_state == MSG_CONTENT_TYPE && *p == '_' ){ continue; } @@ -658,8 +682,7 @@ int does_it_seem_like_an_email_address(char *email){ */ void reassembleToken(char *p){ - unsigned int i; - int k=0; + unsigned int i, k=0; for(i=0; i -#include -#include -#include -#include -#include "../src/piler.h" +#include "test.h" struct date_test { @@ -29,9 +24,9 @@ struct str_pair { static void test_parse_date_header(){ unsigned int i; - int dst_fix = 0; - time_t t = time(NULL); - struct tm lt = {0}; + //time_t t = time(NULL); + //int dst_fix = 0; + //struct tm lt = {0}; struct config cfg; struct date_test date_test[] = { {"Date: Mon, 02 Nov 2015 09:39:31 -0000", 1446457171}, @@ -53,22 +48,22 @@ static void test_parse_date_header(){ setlocale(LC_MESSAGES, cfg.locale); setlocale(LC_CTYPE, 
cfg.locale); - localtime_r(&t, <); + /*localtime_r(&t, <); if(lt.tm_isdst == 1){ printf("DST is on\n"); dst_fix = 3600; } else { printf("DST is off\n"); - } + }*/ + + TEST_HEADER(); for(i=0; i=20www.xxxxx.com=20new=20virtual=20?=", " www.xxxxx.com new virtual "}, {"Re: FW: =?ISO-8859-2?Q?Sopron-Gy=F5r_optikai_sz=E1l_probl=E9?=", "Re: FW: Sopron-Győr optikai szál problé"}, - {"=?UTF-8?Q?Megh=C3=ADv=C3=B3=20a=20Pulzus=20felm=C3=A9r=C3=A9sre=20/=20Inv?= =?UTF-8?Q?itation=20to=20the=20Pulse=20Survey?=", "Meghívó a Pulzus felmérésre / Inv itation to the Pulse Survey"}, + {"=?UTF-8?Q?Megh=C3=ADv=C3=B3=20a=20Pulzus=20felm=C3=A9r=C3=A9sre=20/=20Inv?= =?UTF-8?Q?itation=20to=20the=20Pulse=20Survey?=", "Meghívó a Pulzus felmérésre / Invitation to the Pulse Survey"}, {"=?iso-8859-2?Q?vhost_l=E9trehoz=E1sa?=", "vhost létrehozása"}, {"Re: MAIL =?UTF-8?B?U1pPTEfDgUxUQVTDgVMgSElCQSAgIEdUUzogOTE1NDUyMQ==?=", "Re: MAIL SZOLGÁLTATÁS HIBA GTS: 9154521"}, {"[spam???] Better Sex. Better Body. Better Life.", "[spam???] Better Sex. Better Body. 
Better Life."}, @@ -157,20 +153,20 @@ static void test_fixupEncodedHeaderLine(){ {"Subject: =?UTF-8?Q?Experience=20a=20Crazy=20Reward=20Delivered=20to=20you?=", "Subject: Experience a Crazy Reward Delivered to you"}, {"Subject: =?windows-1251?B?ze7i7uPu5O3o5SDv7uTg8OroIOTr/yDC4Pjo?=", "Subject: Новогодние подарки для Ваши"}, {"Subject: =?utf-8?Q?Divatos,_=C3=BCde_sz=C3=ADneinek_k=C3=B6sz=C3=B6nhet=C5=91en_el?=", "Subject: Divatos, üde színeinek köszönhetően el"}, + {"=?gb2312?B?yc/Gz76pIC0gw7/fTMir0bKy6YjzuOYgKDIwMTcxMDMwLTMxKSBHQlcgUG9k?==?gb2312?Q?ium_&_Basement.docx?=", "上葡京 - 每週全巡查報告 (20171030-31) GBW Podium & Basement.docx"}, }; + TEST_HEADER(); for(i=0; i +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../src/piler.h" + + +#define ASSERT(expr, value) if (!(expr)) { printf("assert failed: '%s'\n", value); abort(); } else { printf("."); } +#define TEST_HEADER() printf("%s() ", __func__); +#define TEST_FOOTER() printf(" OK\n"); +