From 5551df3f9dc1fac3b795c47342913867c6426b17 Mon Sep 17 00:00:00 2001 From: SJ Date: Sat, 30 Aug 2014 21:10:29 +0200 Subject: [PATCH] decoding fixes --- src/config.h | 4 ++-- src/decoder.c | 52 +++++++++++++++++++++------------------------- src/decoder.h | 4 ++-- src/defs.h | 1 + src/html.h | 38 --------------------------------- src/misc.c | 4 +++- src/parser.c | 13 ++++++++---- src/parser_utils.c | 28 +++++-------------------- 8 files changed, 46 insertions(+), 98 deletions(-) delete mode 100644 src/html.h diff --git a/src/config.h b/src/config.h index 15fabc79..dec29f83 100644 --- a/src/config.h +++ b/src/config.h @@ -12,9 +12,9 @@ #define PROGNAME "piler" #define PILERGETD_PROGNAME "pilergetd" -#define VERSION "1.1.0" +#define VERSION "1.1.1" -#define BUILD 884 +#define BUILD 885 #define HOSTID "mailarchiver" diff --git a/src/decoder.c b/src/decoder.c index ed1bf433..5436ad1d 100644 --- a/src/decoder.c +++ b/src/decoder.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "decoder.h" #include "htmlentities.h" #include "config.h" @@ -182,7 +183,7 @@ void decodeQP(char *p){ } -void decodeHTML(char *p){ +void decodeHTML(char *p, int utf8){ unsigned char buf[MAXBUFSIZE], __u[8]; char *s, *q; int count=0, len, c; @@ -212,9 +213,16 @@ void decodeHTML(char *p){ res = bsearch(&key, htmlentities, NUM_OF_HTML_ENTITIES, sizeof(struct mi), compmi); if(res && res->val <= 255){ - utf8_encode_char(res->val, &__u[0], sizeof(__u), &len); - memcpy(&buf[count], &__u[0], len); - count += len; + + if(utf8 == 1){ + utf8_encode_char(res->val, &__u[0], sizeof(__u), &len); + memcpy(&buf[count], &__u[0], len); + count += len; + } + else { + buf[count] = res->val; + count++; + } } else { buf[count] = 'q'; @@ -316,37 +324,25 @@ inline void utf8_encode_char(unsigned char c, unsigned char *buf, int buflen, in } -void utf8_encode(unsigned char *p){ - int count=0, len; - unsigned char *u, *s, utf8[MAXBUFSIZE], __u[8]; +int utf8_encode(char *inbuf, int inbuflen, char *outbuf, int outbuflen, char *encoding){ + iconv_t cd; + size_t size, inbytesleft, outbytesleft; - if(p == NULL || strlen((char *)p) == 0) return; + memset(outbuf, 0, outbuflen); - memset(utf8, 0, MAXBUFSIZE); - u = &utf8[0]; - s = p; + cd = iconv_open("utf-8", encoding); - for(; *s; s++){ + if(cd != (iconv_t)-1){ + inbytesleft = inbuflen; + outbytesleft = outbuflen-1; - utf8_encode_char(*s, &__u[0], sizeof(__u), &len); + size = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); - /* - * this condition should never happen, as according to the RFCs: - * - * "Each line of characters MUST be no more than 998 characters, and - * SHOULD be no more than 78 characters, excluding the CRLF." - * - */ + iconv_close(cd); - if(count+len > sizeof(utf8)-1) break; - - //printf("%s", __u); - memcpy(u+count, &__u[0], len); - - count += len; + if(size >= 0) return OK; } - *(u+count) = '\0'; count++; - memcpy(p, u, count); + return ERR; } diff --git a/src/decoder.h b/src/decoder.h index c582cd90..85558486 100644 --- a/src/decoder.h +++ b/src/decoder.h @@ -11,9 +11,9 @@ void sanitiseBase64(char *s); int decodeBase64(char *p); int decode_base64_to_buffer(char *p, int plen, unsigned char *b, int blen); void decodeQP(char *p); -void decodeHTML(char *p); +void decodeHTML(char *p, int utf8); void decodeURL(char *p); inline void utf8_encode_char(unsigned char c, unsigned char *buf, int buflen, int *len); -void utf8_encode(unsigned char *p); +int utf8_encode(char *inbuf, int inbuflen, char *outbuf, int outbuflen, char *encoding); #endif /* _DECODER_H */ diff --git a/src/defs.h b/src/defs.h index 859b4f06..e5118bba 100644 --- a/src/defs.h +++ b/src/defs.h @@ -165,6 +165,7 @@ struct _state { char filename[TINYBUFSIZE]; char type[TINYBUFSIZE]; + char charset[TINYBUFSIZE]; char attachment_name_buf[SMALLBUFSIZE]; int anamepos; diff --git a/src/html.h b/src/html.h deleted file mode 100644 index 37424e91..00000000 --- a/src/html.h +++ /dev/null @@ -1,38 +0,0 @@ - -struct html_tag { - unsigned char length; - char *entity; -}; - -#define NUM_OF_SKIP_TAGS2 10 - -struct html_tag skip_html_tags2[] = { - { 4, "html" }, - { 5, "/html" }, - { 5, "/body" }, - { 4, "meta" }, - { 4, "head" }, - { 5, "/head" }, - { 5, "style" }, - { 6, "/style" }, - { 3, "div" }, - { 4, "/div" } -}; - - -#define NUM_OF_SKIP_TAGS 11 - -struct html_tag skip_html_tags[] = { - { 5, "style" }, - { 4, "dir=" }, - { 8, "content=" }, - { 5, "name=" }, - { 3, "id=" }, - { 2, "v:" }, - { 6, "class=" }, - { 5, "xmlns" }, - { 10, "http-equiv" }, - { 7, "spidmax" }, - { 5, "data=" } -}; - diff --git a/src/misc.c b/src/misc.c index 09993521..b6a1f392 100644 --- a/src/misc.c +++ b/src/misc.c @@ -548,7 +548,9 @@ int read_from_stdin(struct session_data *sdata){ void strtolower(char *s){ - for(; *s; s++) *s = tolower(*s); + for(; *s; s++){ + if(*s >= 65 && *s <= 90) *s = tolower(*s); + } } diff --git a/src/parser.c b/src/parser.c index 9b416ad6..3eb70a43 100644 --- a/src/parser.c +++ b/src/parser.c @@ -173,6 +173,7 @@ void storno_attachment(struct _state *state){ int parse_line(char *buf, struct _state *state, struct session_data *sdata, int take_into_pieces, char *writebuffer, int writebuffersize, char *abuffer, int abuffersize, struct __data *data, struct __config *cfg){ char *p, *q, puf[SMALLBUFSIZE]; unsigned char b64buffer[MAXBUFSIZE]; + char tmpbuf[MAXBUFSIZE]; int n64, len, writelen, boundary_line=0, result; if(cfg->debug == 1) printf("line: %s", buf); @@ -501,7 +502,8 @@ int parse_line(char *buf, struct _state *state, struct session_data *sdata, int } - if(strcasestr(buf, "charset") && strcasestr(buf, "UTF-8")) state->utf8 = 1; + if(strcasestr(buf, "charset")) extractNameFromHeaderLine(buf, "charset", state->charset); + if(strcasestr(state->charset, "UTF-8")) state->utf8 = 1; } @@ -577,6 +579,7 @@ int parse_line(char *buf, struct _state *state, struct session_data *sdata, int memset(state->filename, 0, TINYBUFSIZE); memset(state->type, 0, TINYBUFSIZE); + snprintf(state->charset, TINYBUFSIZE-1, "unknown"); memset(state->attachment_name_buf, 0, SMALLBUFSIZE); state->anamepos = 0; @@ -617,11 +620,13 @@ int parse_line(char *buf, struct _state *state, struct session_data *sdata, int /* I believe that we can live without this function call */ //decodeURL(buf); - if(state->texthtml == 1) decodeHTML(buf); + if(state->texthtml == 1) decodeHTML(buf, state->utf8); /* encode the body if it's not utf-8 encoded */ - if(state->message_state == MSG_BODY && state->utf8 != 1) utf8_encode((unsigned char*)buf); - + if(state->message_state == MSG_BODY && state->utf8 != 1){ + result = utf8_encode(buf, strlen(buf), &tmpbuf[0], sizeof(tmpbuf), state->charset); + if(result == OK) snprintf(buf, MAXBUFSIZE-1, "%s", tmpbuf); + } translateLine((unsigned char*)buf, state); diff --git a/src/parser_utils.c b/src/parser_utils.c index cedaae02..3ab8688f 100644 --- a/src/parser_utils.c +++ b/src/parser_utils.c @@ -15,10 +15,8 @@ #include #include #include -#include #include #include "trans.h" -#include "html.h" void init_state(struct _state *state){ @@ -328,10 +326,7 @@ int extract_boundary(char *p, struct _state *state){ void fixupEncodedHeaderLine(char *buf, int buflen){ char *sb, *sq, *p, *q, *r, *s, *e, *start, *end; char v[SMALLBUFSIZE], puf[MAXBUFSIZE], encoding[SMALLBUFSIZE], tmpbuf[2*SMALLBUFSIZE]; - iconv_t cd; - size_t size, inbytesleft, outbytesleft; - char *inbuf, *outbuf; - int need_encoding; + int need_encoding, ret; if(buflen < 5) return; @@ -376,29 +371,16 @@ void fixupEncodedHeaderLine(char *buf, int buflen){ if(sq){ decodeQP(s+3); r = s + 3; for(; *r; r++){ if(*r == '_') *r = ' '; } } /* encode everything if it's not utf-8 encoded */ - //if(strncasecmp(start+1, "utf-8", 5)) utf8_encode((unsigned char*)s+3); - //strncat(puf, s+3, sizeof(puf)-1); - size = need_encoding = 0; + need_encoding = 0; + ret = ERR; if(strlen(encoding) > 2 && strcasecmp(encoding, "utf-8")){ need_encoding = 1; - memset(tmpbuf, 0, sizeof(tmpbuf)); - - cd = iconv_open("utf-8", encoding); - - if(cd != (iconv_t)-1){ - inbuf = s+3; - outbuf = &tmpbuf[0]; - inbytesleft = strlen(s+3); - outbytesleft = sizeof(tmpbuf)-1; - size = iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft); - iconv_close(cd); - } - else { syslog(LOG_PRIORITY, "unsupported encoding: '%s'", encoding); } + ret = utf8_encode(s+3, strlen(s+3), &tmpbuf[0], sizeof(tmpbuf), encoding); } - if(need_encoding == 1 && size >= 0) + if(need_encoding == 1 && ret == OK) strncat(puf, tmpbuf, sizeof(puf)-1); else strncat(puf, s+3, sizeof(puf)-1);