From fd4184daa947a345ea4f676fe02f5503bdc70b64 Mon Sep 17 00:00:00 2001 From: SJ Date: Mon, 3 Feb 2014 15:38:10 +0100 Subject: [PATCH] fixed the date parsing --- src/config.h | 2 +- src/parser_utils.c | 102 ++++++++++++++++++++++++++++++++------------- test/parser.c | 35 ++++++++++++++-- 3 files changed, 106 insertions(+), 33 deletions(-) diff --git a/src/config.h b/src/config.h index 9d3a0bf8..10bf9f51 100644 --- a/src/config.h +++ b/src/config.h @@ -14,7 +14,7 @@ #define VERSION "0.1.25-master-branch" -#define BUILD 858 +#define BUILD 859 #define HOSTID "mailarchiver" diff --git a/src/parser_utils.c b/src/parser_utils.c index 7c4eee51..330322cf 100644 --- a/src/parser_utils.c +++ b/src/parser_utils.c @@ -111,7 +111,7 @@ long get_local_timezone_offset(){ unsigned long parse_date_header(char *datestr, struct __config *cfg){ - int n=0; + int n=0, len; long offset=0; unsigned long ts=0; char *p, *q, *r, s[SMALLBUFSIZE]; @@ -120,9 +120,17 @@ unsigned long parse_date_header(char *datestr, struct __config *cfg){ datestr += 5; p = datestr; + tm.tm_year = 0; + tm.tm_mon = 0; + tm.tm_mday = 0; + tm.tm_wday = 0; + tm.tm_hour = 0; + tm.tm_min = 0; + tm.tm_sec = 0; + tm.tm_isdst = -1; for(; *datestr; datestr++){ - if(isspace(*datestr)) *datestr = ' '; + if(isspace(*datestr) || *datestr == '.' || *datestr == ',') *datestr = ' '; } @@ -132,14 +140,26 @@ unsigned long parse_date_header(char *datestr, struct __config *cfg){ p = split_str(p, " ", s, sizeof(s)-1); if(strlen(s) > 0){ n++; + len = strlen(s); - q = strchr(s, ','); if(q) *q='\0'; + /* + * A proper Date: header should look like this: + * + * Date: Mon, 3 Feb 2014 13:21:07 +0100 + * + * + * However some email applications provide crap, eg. + * + * Sat, 4 Aug 2007 13:36:52 GMT-0700 + * Sat, 4 Aug 07 13:36:52 GMT-0700 + * 16 Dec 07 20:45:52 + * 03 Jun 06 05:59:00 +0100 + * 30.06.2005 17:47:42 + * + * [wday] mday mon year h:m:s offset + */ - if(strlen(s) <= 2){ tm.tm_mday = atoi(s); continue; } - - if(strlen(s) == 4){ tm.tm_year = atoi(s) - 1900; continue; } - - if(strlen(s) == 3){ + if(n == 1 && len == 3){ if(strcmp(s, "Mon") == 0) tm.tm_wday = 1; else if(strcmp(s, "Tue") == 0) tm.tm_wday = 2; else if(strcmp(s, "Wed") == 0) tm.tm_wday = 3; @@ -147,41 +167,65 @@ unsigned long parse_date_header(char *datestr, struct __config *cfg){ else if(strcmp(s, "Fri") == 0) tm.tm_wday = 5; else if(strcmp(s, "Sat") == 0) tm.tm_wday = 6; else if(strcmp(s, "Sun") == 0) tm.tm_wday = 0; - - - if(strcmp(s, "Jan") == 0) tm.tm_mon = 0; - else if(strcmp(s, "Feb") == 0) tm.tm_mon = 1; - else if(strcmp(s, "Mar") == 0) tm.tm_mon = 2; - else if(strcmp(s, "Apr") == 0) tm.tm_mon = 3; - else if(strcmp(s, "May") == 0) tm.tm_mon = 4; - else if(strcmp(s, "Jun") == 0) tm.tm_mon = 5; - else if(strcmp(s, "Jul") == 0) tm.tm_mon = 6; - else if(strcmp(s, "Aug") == 0) tm.tm_mon = 7; - else if(strcmp(s, "Sep") == 0) tm.tm_mon = 8; - else if(strcmp(s, "Oct") == 0) tm.tm_mon = 9; - else if(strcmp(s, "Nov") == 0) tm.tm_mon = 10; - else if(strcmp(s, "Dec") == 0) tm.tm_mon = 11; - - continue; } - if(strlen(s) == 8){ + if(n == 1 && len <= 2){ + n++; + } + + if(n == 2 && len <= 2){ tm.tm_mday = atoi(s); continue; } + + if(n == 3){ + if(len == 3){ + if(strcmp(s, "Jan") == 0) tm.tm_mon = 0; + else if(strcmp(s, "Feb") == 0) tm.tm_mon = 1; + else if(strcmp(s, "Mar") == 0) tm.tm_mon = 2; + else if(strcmp(s, "Apr") == 0) tm.tm_mon = 3; + else if(strcmp(s, "May") == 0) tm.tm_mon = 4; + else if(strcmp(s, "Jun") == 0) tm.tm_mon = 5; + else if(strcmp(s, "Jul") == 0) tm.tm_mon = 6; + else if(strcmp(s, "Aug") == 0) tm.tm_mon = 7; + else if(strcmp(s, "Sep") == 0) tm.tm_mon = 8; + else if(strcmp(s, "Oct") == 0) tm.tm_mon = 9; + else if(strcmp(s, "Nov") == 0) tm.tm_mon = 10; + else if(strcmp(s, "Dec") == 0) tm.tm_mon = 11; + + continue; + } + + if(len == 2){ + tm.tm_mon = atoi(s); + continue; + } + } + + + if(n == 4){ + if(len == 4){ tm.tm_year = atoi(s) - 1900; continue; } + if(len == 2){ tm.tm_year = atoi(s); if(tm.tm_year < 70) tm.tm_year += 100; continue; } + } + + + if(n == 5 && len >= 5){ r = &s[0]; q = strchr(r, ':'); if(!q) break; *q = '\0'; tm.tm_hour = atoi(r); r = q+1; - q = strchr(r, ':'); if(!q) break; - *q = '\0'; tm.tm_min = atoi(r); r = q+1; + q = strchr(r, ':'); if(q) *q = '\0'; + tm.tm_min = atoi(r); + + if(len == 8){ + r = q+1; + tm.tm_sec = atoi(r); + } - tm.tm_sec = atoi(r); break; } } } while(p); - tm.tm_isdst = -1; ts = mktime(&tm); if(p && (*p == '+' || *p == '-')){ diff --git a/test/parser.c b/test/parser.c index ec211b87..6ca6ed37 100644 --- a/test/parser.c +++ b/test/parser.c @@ -138,12 +138,38 @@ int test_htmls(){ return count; } +int test_dates(){ + int count=0; + unsigned long ts; + char datestr[SMALLBUFSIZE]; + struct __config cfg; + + cfg.tweak_sent_time_offset = 0; + + snprintf(datestr, sizeof(datestr)-2, "Date: Mon, 3 Feb 2014 13:16:09 +0100"); + ts = parse_date_header(datestr, &cfg); printf("%s => %ld\n", datestr, ts); + + snprintf(datestr, sizeof(datestr)-2, "Date: Sat, 4 Aug 07 13:36:52 GMT-0700"); + ts = parse_date_header(datestr, &cfg); printf("%s => %ld\n", datestr, ts); + + snprintf(datestr, sizeof(datestr)-2, "Date: 23 Sep 09 07:03 -0800"); + ts = parse_date_header(datestr, &cfg); printf("%s => %ld\n", datestr, ts); + + snprintf(datestr, sizeof(datestr)-2, "Date: 16 Dec 07 20:45:52"); + ts = parse_date_header(datestr, &cfg); printf("%s => %ld\n", datestr, ts); + + snprintf(datestr, sizeof(datestr)-2, "Date: 30.06.2005 17:47:42"); + ts = parse_date_header(datestr, &cfg); printf("%s => %ld\n", datestr, ts); + + snprintf(datestr, sizeof(datestr)-2, "Date: 03 Jun 06 05:59:00 +0100"); + ts = parse_date_header(datestr, &cfg); printf("%s => %ld\n", datestr, ts); + + return count; +} + int main(int argc, char **argv){ int n; - //struct __config cfg; - - //cfg = read_config(CONFIG_FILE); n = test_urls(); printf("testing fixURL(), errors: %d\n", n); @@ -156,5 +182,8 @@ int main(int argc, char **argv){ n = test_htmls(); printf("testing markHTML(), errors: %d\n", n); + n = test_dates(); + printf("testing parse_date_header(), errors: %d\n", n); + return 0; }