From 1192fc3218681177114eaeeb5cf667967ad75df4 Mon Sep 17 00:00:00 2001
From: Janos SUTO <sj@acts.hu>
Date: Wed, 8 Nov 2017 11:50:28 +0100
Subject: [PATCH] src: decoder and parser fix

Signed-off-by: Janos SUTO <sj@acts.hu>
---
 src/decoder.c                   |   2 +
 src/parser_utils.c              | 136 +++++++++++++++++++-------------
 unit_tests/check_parser_utils.c |  67 +++++++---------
 unit_tests/test.h               |  24 ++++++
 4 files changed, 134 insertions(+), 95 deletions(-)
 create mode 100644 unit_tests/test.h
diff --git a/src/decoder.c b/src/decoder.c
index 581655c0..5317808d 100644
--- a/src/decoder.c
+++ b/src/decoder.c
@@ -78,6 +78,8 @@ inline void utf8_encode_char(unsigned char c, unsigned char *buf, int buflen, in
        * U+0000..U+007F      00..7F
        * U+0080..U+07FF      C2..DF      80..BF
        * U+0800..U+0FFF      E0          A0..BF      80..BF
+       *
+       * FIXME: See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf#G7404 for valid sequences
        */
 
       if(c <= 0x7F){
diff --git a/src/parser_utils.c b/src/parser_utils.c
index 51476691..b09d6500 100644
--- a/src/parser_utils.c
+++ b/src/parser_utils.c
@@ -196,6 +196,7 @@ time_t parse_date_header(char *datestr){
          else if(strncasecmp(s, "Sat", 3) == 0) tm.tm_wday = 6;
          else if(strncasecmp(s, "Sun", 3) == 0) tm.tm_wday = 0;
 
+
          if(len <= 2 && tm.tm_mday == 0){ tm.tm_mday = atoi(s); continue; }
 
          if(len <= 2 && tm.tm_mon == -1){ tm.tm_mon = atoi(s) - 1; continue; }
@@ -313,17 +314,19 @@ int extract_boundary(char *p, struct parser_state *state){
 
 
 void fixupEncodedHeaderLine(char *buf, int buflen){
-   char *sb, *sq, *p, *q, *r, *s, *e, *start, *end;
+   char *p, *q, *r, *s, *e, *end;
    /*
     * I thought SMALLBUFSIZE would be enough for v, encoding and tmpbuf(2*),
     * but then I saw a 6-7000 byte long subject line, so I've switched to MAXBUFSIZE
     */
-   char v[MAXBUFSIZE], puf[MAXBUFSIZE], encoding[MAXBUFSIZE], tmpbuf[2*MAXBUFSIZE];
+   char v[MAXBUFSIZE], u[MAXBUFSIZE], puf[MAXBUFSIZE], encoding[MAXBUFSIZE], tmpbuf[2*MAXBUFSIZE];
    int need_encoding, ret;
 
    if(buflen < 5) return;
 
    memset(puf, 0, sizeof(puf));
+   memset(encoding, 0, sizeof(encoding));
+
 
    q = buf;
 
@@ -332,69 +335,89 @@ void fixupEncodedHeaderLine(char *buf, int buflen){
 
       p = v;
 
-      memset(encoding, 0, sizeof(encoding));
-
       do {
-         start = strstr(p, "=?");
-         if(start){
-            *start = '\0';
-            if(strlen(p) > 0){
-               strncat(puf, p, sizeof(puf)-strlen(puf)-1);
+         memset(u, 0, sizeof(u));
+
+         /*
+          * We can't use split_str(p, "=?", ...) because it will fail with the following pattern:
+          *    =?UTF-8?B?SG9neWFuIMOtcmp1bmsgcGFuYXN6bGV2ZWxldD8=?=
+          *
+          * Also, the pattern below requires special care:
+          *    =?gb2312?B?<something>?==?gb2312?Q?<something else>?=
+          */
+
+         r = strstr(p, "=?");
+         if(r){
+            p = r + 2;
+            end = strstr(p, "?=");
+            if(end){
+               *end = '\0';
             }
 
-            start++;
+            snprintf(u, sizeof(u)-1, "%s", p);
 
-            e = strchr(start+2, '?');
-            if(e){
-               *e = '\0';
-               snprintf(encoding, sizeof(encoding)-1, "%s", start+1);
-               *e = '?';
-            }
-
-            s = NULL;
-            sb = strcasestr(start, "?B?"); if(sb) s = sb;
-            sq = strcasestr(start, "?Q?"); if(sq) s = sq;
-
-            if(s){
-               end = strstr(s+3, "?=");
-               if(end){
-                  *end = '\0';
-
-                  if(sb){ decodeBase64(s+3); }
-                  if(sq){ decodeQP(s+3); r = s + 3; for(; *r; r++){ if(*r == '_') *r = ' '; } }
-
-                  /* encode everything if it's not utf-8 encoded */
-
-                  need_encoding = 0;
-                  ret = ERR;
-
-                  if(strlen(encoding) > 2 && strcasecmp(encoding, "utf-8")){
-                     need_encoding = 1;
-                     ret = utf8_encode(s+3, strlen(s+3), &tmpbuf[0], sizeof(tmpbuf), encoding);
-                  }
-
-                  if(need_encoding == 1 && ret == OK)
-                     strncat(puf, tmpbuf, sizeof(puf)-strlen(puf)-1);
-                  else 
-                     strncat(puf, s+3, sizeof(puf)-strlen(puf)-1);
-
-                  p = end + 2;
-               }
-            }
-            else {
-               strncat(puf, start, sizeof(puf)-strlen(puf)-1);
-
-               break;
+            if(end) {
+               p = end + 2;
             }
          }
          else {
-            strncat(puf, p, sizeof(puf)-strlen(puf)-1);
-            break;
+            snprintf(u, sizeof(u)-1, "%s", p);
+            p = NULL;
+         }
+
+         if(u[0] == 0) continue;
+
+         memset(encoding, 0, sizeof(encoding));
+
+         // Check if it's either ?B? or ?Q? encoding ...
+         s = strcasestr(u, "?B?");
+         if(s){
+            decodeBase64(s+3);
+         }
+         else {
+            s = strcasestr(u, "?Q?");
+            if(s){
+               decodeQP(s+3);
+               r = s + 3;
+               for(; *r; r++){
+                  if(*r == '_') *r = ' ';
+               }
+            }
+         }
+
+         // ... if it is, then get the encoding
+         if(s){
+            e = strchr(u, '?');
+            if(e){
+               *e = '\0';
+               snprintf(encoding, sizeof(encoding)-1, "%s", u);
+               *e = '?';
+
+               need_encoding = 0;
+               ret = ERR;
+
+               if(encoding[0] && strcasecmp(encoding, "utf-8")){
+                  need_encoding = 1;
+                  ret = utf8_encode(s+3, strlen(s+3), &tmpbuf[0], sizeof(tmpbuf), encoding);
+               }
+
+               if(need_encoding == 1 && ret == OK)
+                  strncat(puf, tmpbuf, sizeof(puf)-strlen(puf)-1);
+               else
+                  strncat(puf, s+3, sizeof(puf)-strlen(puf)-1);
+            }
+            else {
+               memset(encoding, 0, sizeof(encoding));
+               strncat(puf, u, sizeof(puf)-strlen(puf)-1);
+            }
+         }
+         else {
+            strncat(puf, u, sizeof(puf)-strlen(puf)-1);
          }
 
       } while(p);
 
-      if(q) strncat(puf, " ", sizeof(puf)-strlen(puf)-1);
+      if(q && encoding[0] == 0) strncat(puf, " ", sizeof(puf)-strlen(puf)-1);
 
    } while(q);
 
@@ -599,6 +622,7 @@ void translateLine(unsigned char *p, struct parser_state *state){
          prev = *p;
       }
 
+
       if(state->message_state == MSG_SUBJECT && (*p == '%' || *p == '_' || *p == '&') ){ continue; }
 
       if(state->message_state == MSG_CONTENT_TYPE && *p == '_' ){ continue; }
@@ -658,8 +682,7 @@ int does_it_seem_like_an_email_address(char *email){
  */
 
 void reassembleToken(char *p){
-   unsigned int i;
-   int k=0;
+   unsigned int i, k=0;
 
    for(i=0; i<strlen(p); i++){
 
@@ -959,3 +982,4 @@ void fix_plus_sign_in_email_address(char *puf, char **at_sign, unsigned int *len
       *at_sign = r;
    }
 }
+
diff --git a/unit_tests/check_parser_utils.c b/unit_tests/check_parser_utils.c
index 25b80fb2..d87a8371 100644
--- a/unit_tests/check_parser_utils.c
+++ b/unit_tests/check_parser_utils.c
@@ -2,12 +2,7 @@
  * check_parser_utils.c, SJ
  */
 
-#include <stdio.h>
-#include <string.h>
-#include <locale.h>
-#include <stdbool.h>
-#include <assert.h>
-#include "../src/piler.h"
+#include "test.h"
 
 
 struct date_test {
@@ -29,9 +24,9 @@ struct str_pair {
 
 static void test_parse_date_header(){
    unsigned int i;
-   int dst_fix = 0;
-   time_t t = time(NULL);
-   struct tm lt = {0};
+   //time_t t = time(NULL);
+   //int dst_fix = 0;
+   //struct tm lt = {0};
    struct config cfg;
    struct date_test date_test[] = {
       {"Date: Mon, 02 Nov 2015 09:39:31 -0000", 1446457171},
@@ -53,22 +48,22 @@ static void test_parse_date_header(){
    setlocale(LC_MESSAGES, cfg.locale);
    setlocale(LC_CTYPE, cfg.locale);
 
-   localtime_r(&t, &lt);
+   /*localtime_r(&t, &lt);
    if(lt.tm_isdst == 1){
       printf("DST is on\n");
       dst_fix = 3600;
    }
    else {
       printf("DST is off\n");
-   }
+   }*/
+
+   TEST_HEADER();
 
    for(i=0; i<sizeof(date_test)/sizeof(struct date_test); i++){
-      printf("%s parsed=%ld, control=%ld\n", date_test[i].date_str, parse_date_header(date_test[i].date_str), date_test[i].timestamp);
-
-      assert(parse_date_header(date_test[i].date_str)-dst_fix == date_test[i].timestamp && "test_parse_date_header()");
+      ASSERT(parse_date_header(date_test[i].date_str) == date_test[i].timestamp, date_test[i].date_str);
    }
 
-   printf("test_parse_date_header() OK\n");
+   TEST_FOOTER();
 }
 
 
@@ -108,13 +103,14 @@ static void test_extractNameFromHeaderLine(){
       {"foo: bar; title*=UTF-8''%c2%a3%20and%20%e2%82%ac%20rates", "title", "£ and € rates"}
    };
 
+   TEST_HEADER();
 
    for(i=0; i<sizeof(name_from_header_test)/sizeof(struct name_from_header_test); i++){
       extractNameFromHeaderLine(name_from_header_test[i].line, name_from_header_test[i].token, resultbuf);
-      assert(strcmp(resultbuf, name_from_header_test[i].expected_result) == 0 && "test_extractNameFromHeaderLine");
+      ASSERT(strcmp(resultbuf, name_from_header_test[i].expected_result) == 0, name_from_header_test[i].expected_result);
    }
 
-   printf("test_extractNameFromHeaderLine() OK\n");
+   TEST_FOOTER();
 }
 
 
@@ -123,7 +119,7 @@ static void test_fixupEncodedHeaderLine(){
    char buf[SMALLBUFSIZE];
    struct str_pair pair[] = {
 
-      {"=?utf-8?Q?Tanjoubi,_azaz_sz=C3=BClet=C3=A9snap!_10_=C3=A9ves_az_I_Love_Su?=  =?utf-8?Q?shi!?=", "Tanjoubi, azaz születésnap! 10 éves az I Love Su  shi!"},
+      {"=?utf-8?Q?Tanjoubi,_azaz_sz=C3=BClet=C3=A9snap!_10_=C3=A9ves_az_I_Love_Su?=  =?utf-8?Q?shi!?=", "Tanjoubi, azaz születésnap! 10 éves az I Love Sushi!"},
       {"=?UTF-8?Q?IAM:_N2YPF_-_#1_Request_new_privilege?=", "IAM: N2YPF - #1 Request new privilege"},
       {"=?UTF-8?B?SG9neWFuIMOtcmp1bmsgcGFuYXN6bGV2ZWxldD8=?=", "Hogyan írjunk panaszlevelet?"},
       {"Re: [Bitbucket] Issue #627: ldap user can't login (jsuto/piler)", "Re: [Bitbucket] Issue #627: ldap user can't login (jsuto/piler)"},
@@ -140,7 +136,7 @@ static void test_fixupEncodedHeaderLine(){
       {"Re: cccc@aaa.fu - e-mail =?UTF-8?B?a8OpcmTDqXM=?=", "Re: cccc@aaa.fu - e-mail kérdés"},
       {"=?WINDOWS-1250?Q?<AZ-17226/1-2015>=20www.xxxxx.com=20new=20virtual=20?=", "<AZ-17226/1-2015> www.xxxxx.com new virtual "},
       {"Re: FW: =?ISO-8859-2?Q?Sopron-Gy=F5r_optikai_sz=E1l_probl=E9?=", "Re: FW: Sopron-Győr optikai szál problé"},
-      {"=?UTF-8?Q?Megh=C3=ADv=C3=B3=20a=20Pulzus=20felm=C3=A9r=C3=A9sre=20/=20Inv?=  =?UTF-8?Q?itation=20to=20the=20Pulse=20Survey?=", "Meghívó a Pulzus felmérésre / Inv  itation to the Pulse Survey"},
+      {"=?UTF-8?Q?Megh=C3=ADv=C3=B3=20a=20Pulzus=20felm=C3=A9r=C3=A9sre=20/=20Inv?=  =?UTF-8?Q?itation=20to=20the=20Pulse=20Survey?=", "Meghívó a Pulzus felmérésre / Invitation to the Pulse Survey"},
       {"=?iso-8859-2?Q?vhost_l=E9trehoz=E1sa?=", "vhost létrehozása"},
       {"Re: MAIL =?UTF-8?B?U1pPTEfDgUxUQVTDgVMgSElCQSAgIEdUUzogOTE1NDUyMQ==?=", "Re: MAIL SZOLGÁLTATÁS HIBA   GTS: 9154521"},
       {"[spam???]  Better Sex. Better Body. Better Life.", "[spam???]  Better Sex. Better Body. Better Life."},
@@ -157,20 +153,20 @@ static void test_fixupEncodedHeaderLine(){
       {"Subject: =?UTF-8?Q?Experience=20a=20Crazy=20Reward=20Delivered=20to=20you?=", "Subject: Experience a Crazy Reward Delivered to you"},
       {"Subject: =?windows-1251?B?ze7i7uPu5O3o5SDv7uTg8OroIOTr/yDC4Pjo?=", "Subject: Новогодние подарки для Ваши"},
       {"Subject: =?utf-8?Q?Divatos,_=C3=BCde_sz=C3=ADneinek_k=C3=B6sz=C3=B6nhet=C5=91en_el?=", "Subject: Divatos, üde színeinek köszönhetően el"},
+      {"=?gb2312?B?yc/Gz76pIC0gw7/fTMir0bKy6YjzuOYgKDIwMTcxMDMwLTMxKSBHQlcgUG9k?==?gb2312?Q?ium_&_Basement.docx?=", "上葡京 - 每週全巡查報告 (20171030-31) GBW Podium & Basement.docx"},
    };
 
+   TEST_HEADER();
 
    for(i=0; i<sizeof(pair)/sizeof(struct str_pair); i++){
       snprintf(buf, sizeof(buf)-1, "%s", pair[i].line);
 
       fixupEncodedHeaderLine(buf, sizeof(buf)-1);
 
-      assert(strcmp(buf, pair[i].expected_result) == 0 && "test_fixupEncodedHeaderLine");
-
-      //printf("      {\"%s\", \"%s\"},\n", pair[i].line, buf);
+      ASSERT(strcmp(buf, pair[i].expected_result) == 0, pair[i].expected_result);
    }
 
-   printf("test_fixupEncodedHeaderLine() OK\n");
+   TEST_FOOTER();
 }
 
 
@@ -191,6 +187,7 @@ static void test_translateLine(){
        */
    };
 
+   TEST_HEADER();
 
    for(i=0; i<sizeof(pair)/sizeof(struct str_pair); i++){
 
@@ -202,12 +199,10 @@ static void test_translateLine(){
 
       translateLine((unsigned char*)buf, &state);
 
-      //printf("      {\"%s\", \"%s\"},\n", pair[i].line, buf);
-
-      assert(strcmp(buf, pair[i].expected_result) == 0 && "test_translateLine");
+      ASSERT(strcmp(buf, pair[i].expected_result) == 0, pair[i].expected_result);
    }
 
-   printf("test_translateLine() OK\n");
+   TEST_FOOTER();
 }
 
 
@@ -223,6 +218,7 @@ static void test_fixURL(){
       {"https://www.aaa.fu/", "__URL__wwwXaaaXfu "}
    };
 
+   TEST_HEADER();
 
    for(i=0; i<sizeof(pair)/sizeof(struct str_pair); i++){
 
@@ -230,14 +226,10 @@ static void test_fixURL(){
 
       fixURL(buf, sizeof(buf)-1);
 
-      //printf("      {\"%s\", \"%s\"},\n", pair[i].line, buf);
-
-      assert(strcmp(buf, pair[i].expected_result) == 0 && "test_fixURL");
-
+      ASSERT(strcmp(buf, pair[i].expected_result) == 0, pair[i].expected_result);
    }
 
-   printf("test_fixURL() OK\n");
-
+   TEST_FOOTER();
 }
 
 
@@ -257,6 +249,7 @@ static void test_degenerateToken(){
       {"Hello...", "Hello"}
    };
 
+   TEST_HEADER();
 
    for(i=0; i<sizeof(pair)/sizeof(struct str_pair); i++){
 
@@ -264,14 +257,10 @@ static void test_degenerateToken(){
 
       degenerateToken((unsigned char*)buf);
 
-      //printf("      {\"%s\", \"%s\"},\n", pair[i].line, buf);
-
-      assert(strcmp(buf, pair[i].expected_result) == 0 && "test_degenerateToken");
-
+      ASSERT(strcmp(buf, pair[i].expected_result) == 0, pair[i].expected_result);
    }
 
-   printf("test_degenerateToken() OK\n");
-
+   TEST_FOOTER();
 }
 
 
diff --git a/unit_tests/test.h b/unit_tests/test.h
new file mode 100644
index 00000000..6d7b638d
--- /dev/null
+++ b/unit_tests/test.h
@@ -0,0 +1,24 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <netdb.h>
+#include <arpa/inet.h>
+#include <netinet/in.h>
+#include <fcntl.h>
+#include <locale.h>
+#include <getopt.h>
+#include <stdbool.h>
+#include <assert.h>
+#include <openssl/ssl.h>
+#include <openssl/err.h>
+#include "../src/piler.h"
+
+
+/* ASSERT(expr, value): print '.' if expr holds, else print the C string value and abort(); use it as one statement followed by ';'. */
+#define ASSERT(expr, value) do { if (!(expr)) { printf("assert failed: '%s'\n", (value)); fflush(stdout); abort(); } else { printf("."); } } while (0) /* flush first: abort() need not flush stdio */
+#define TEST_HEADER() printf("%s() ", __func__); /* prints the name of the enclosing function */
+#define TEST_FOOTER() printf(" OK\n"); /* prints " OK" and ends the output line */