parser fix to support some emojis in subject line

Signed-off-by: Janos SUTO <sj@acts.hu>
This commit is contained in:
Janos SUTO 2018-01-11 09:28:45 +01:00
parent c846b6aa21
commit 1c8dc1cc68
4 changed files with 72 additions and 48 deletions

View File

@ -206,9 +206,13 @@ void decodeQP(char *p){
i += 2; i += 2;
} }
else if(p[i] == '_'){
c = ' ';
}
p[k] = c; p[k] = c;
k++; k++;
} }
p[k] = '\0'; p[k] = '\0';

View File

@ -320,13 +320,12 @@ void fixupEncodedHeaderLine(char *buf, int buflen){
* but then I saw a 6-7000 byte long subject line, so I've switched to MAXBUFSIZE * but then I saw a 6-7000 byte long subject line, so I've switched to MAXBUFSIZE
*/ */
char v[MAXBUFSIZE], u[MAXBUFSIZE], puf[MAXBUFSIZE], encoding[MAXBUFSIZE], tmpbuf[2*MAXBUFSIZE]; char v[MAXBUFSIZE], u[MAXBUFSIZE], puf[MAXBUFSIZE], encoding[MAXBUFSIZE], tmpbuf[2*MAXBUFSIZE];
int need_encoding, ret; int need_encoding, ret, prev_encoded=0, n_tokens=0;
int b64=0, qp=0;
if(buflen < 5) return; if(buflen < 5) return;
memset(puf, 0, sizeof(puf)); memset(puf, 0, sizeof(puf));
memset(encoding, 0, sizeof(encoding));
q = buf; q = buf;
@ -342,18 +341,46 @@ void fixupEncodedHeaderLine(char *buf, int buflen){
* We can't use split_str(p, "=?", ...) it will fail with the following pattern * We can't use split_str(p, "=?", ...) it will fail with the following pattern
* =?UTF-8?B?SG9neWFuIMOtcmp1bmsgcGFuYXN6bGV2ZWxldD8=?= * =?UTF-8?B?SG9neWFuIMOtcmp1bmsgcGFuYXN6bGV2ZWxldD8=?=
* *
* Also the below patter requires special care: * Also the below pattern requires special care:
* =?gb2312?B?<something>?==?gb2312?Q?<something else>?= * =?gb2312?B?<something>?==?gb2312?Q?<something else>?=
*
* And we have to check the following cases as well:
* Happy New Year! =?utf-8?q?=F0=9F=8E=86?=
*/ */
b64 = qp = 0;
memset(encoding, 0, sizeof(encoding));
r = strstr(p, "=?"); r = strstr(p, "=?");
if(r){ if(r){
p = r + 2; p = r + 2;
e = strchr(p, '?');
if(e){
*e = '\0';
snprintf(encoding, sizeof(encoding)-1, "%s", p);
*e = '?';
s = strcasestr(e, "?B?");
if(s){
b64 = 1;
p = s + 3;
}
else {
s = strcasestr(e, "?Q?");
if(s){
qp = 1;
p = s + 3;
}
}
}
end = strstr(p, "?="); end = strstr(p, "?=");
if(end){ if(end){
*end = '\0'; *end = '\0';
} }
snprintf(u, sizeof(u)-1, "%s", p); snprintf(u, sizeof(u)-1, "%s", p);
if(end) { if(end) {
@ -367,47 +394,38 @@ void fixupEncodedHeaderLine(char *buf, int buflen){
if(u[0] == 0) continue; if(u[0] == 0) continue;
memset(encoding, 0, sizeof(encoding)); n_tokens++;
// Check if it's either ?B? or ?Q? encoding ... if(b64 == 1) decodeBase64(u);
s = strcasestr(u, "?B?"); else if(qp == 1) decodeQP(u);
if(s){
decodeBase64(s+3);
} /*
else { * https://www.ietf.org/rfc/rfc2047.txt says that
s = strcasestr(u, "?Q?"); *
if(s){ * "When displaying a particular header field that contains multiple
decodeQP(s+3); * 'encoded-word's, any 'linear-white-space' that separates a pair of
r = s + 3; * adjacent 'encoded-word's is ignored." (6.2)
for(; *r; r++){ */
if(*r == '_') *r = ' '; if(prev_encoded == 1 && (b64 == 1 || qp == 1)) {}
} else if(n_tokens > 1){
} strncat(puf, " ", sizeof(puf)-strlen(puf)-1);
} }
// ... if it is, then get the encoding if(b64 == 1 || qp == 1){
if(s){ prev_encoded = 1;
e = strchr(u, '?');
if(e){
*e = '\0';
snprintf(encoding, sizeof(encoding)-1, "%s", u);
*e = '?';
need_encoding = 0; need_encoding = 0;
ret = ERR; ret = ERR;
if(encoding[0] && strcasecmp(encoding, "utf-8")){ if(encoding[0] && strcasecmp(encoding, "utf-8")){
need_encoding = 1; need_encoding = 1;
ret = utf8_encode(s+3, strlen(s+3), &tmpbuf[0], sizeof(tmpbuf), encoding); ret = utf8_encode(u, strlen(u), &tmpbuf[0], sizeof(tmpbuf), encoding);
} }
if(need_encoding == 1 && ret == OK) if(need_encoding == 1 && ret == OK){
strncat(puf, tmpbuf, sizeof(puf)-strlen(puf)-1); strncat(puf, tmpbuf, sizeof(puf)-strlen(puf)-1);
else
strncat(puf, s+3, sizeof(puf)-strlen(puf)-1);
} }
else { else {
memset(encoding, 0, sizeof(encoding));
strncat(puf, u, sizeof(puf)-strlen(puf)-1); strncat(puf, u, sizeof(puf)-strlen(puf)-1);
} }
} }
@ -417,8 +435,6 @@ void fixupEncodedHeaderLine(char *buf, int buflen){
} while(p); } while(p);
if(q && encoding[0] == 0) strncat(puf, " ", sizeof(puf)-strlen(puf)-1);
} while(q); } while(q);
snprintf(buf, buflen-1, "%s", puf); snprintf(buf, buflen-1, "%s", puf);

View File

@ -154,6 +154,8 @@ static void test_fixupEncodedHeaderLine(){
{"Subject: =?windows-1251?B?ze7i7uPu5O3o5SDv7uTg8OroIOTr/yDC4Pjo?=", "Subject: Новогодние подарки для Ваши"}, {"Subject: =?windows-1251?B?ze7i7uPu5O3o5SDv7uTg8OroIOTr/yDC4Pjo?=", "Subject: Новогодние подарки для Ваши"},
{"Subject: =?utf-8?Q?Divatos,_=C3=BCde_sz=C3=ADneinek_k=C3=B6sz=C3=B6nhet=C5=91en_el?=", "Subject: Divatos, üde színeinek köszönhetően el"}, {"Subject: =?utf-8?Q?Divatos,_=C3=BCde_sz=C3=ADneinek_k=C3=B6sz=C3=B6nhet=C5=91en_el?=", "Subject: Divatos, üde színeinek köszönhetően el"},
{"=?gb2312?B?yc/Gz76pIC0gw7/fTMir0bKy6YjzuOYgKDIwMTcxMDMwLTMxKSBHQlcgUG9k?==?gb2312?Q?ium_&_Basement.docx?=", "上葡京 - 每週全巡查報告 (20171030-31) GBW Podium & Basement.docx"}, {"=?gb2312?B?yc/Gz76pIC0gw7/fTMir0bKy6YjzuOYgKDIwMTcxMDMwLTMxKSBHQlcgUG9k?==?gb2312?Q?ium_&_Basement.docx?=", "上葡京 - 每週全巡查報告 (20171030-31) GBW Podium & Basement.docx"},
{"Subject: =?UTF-8?Q?=E2=98=85_JubiDu!Versandkost?= =?UTF-8?Q?enfrei-Verl=C3=A4ngerung!=E2=98=85?=", "Subject: ★ JubiDu!Versandkostenfrei-Verlängerung!★"},
{"Happy New Year! =?utf-8?q?=F0=9F=8E=86?=", "Happy New Year! 🎆"},
}; };
TEST_HEADER(); TEST_HEADER();

View File

@ -5,8 +5,10 @@ set -o pipefail
set -o nounset set -o nounset
set -x set -x
LD_LIBRARY_PATH=../src ./check_parser_utils export LD_LIBRARY_PATH=../src
LD_LIBRARY_PATH=../src ./check_parser
LD_LIBRARY_PATH=../src ./check_rules ./check_parser_utils
LD_LIBRARY_PATH=../src ./check_digest ./check_parser
LD_LIBRARY_PATH=../src ./check_mydomains ./check_rules
./check_digest
./check_mydomains