From b4854e312ab69e310541816f617bad2778c187d6 Mon Sep 17 00:00:00 2001 From: SJ Date: Sun, 9 Sep 2012 23:16:09 +0200 Subject: [PATCH] added odf, ms office attachment support --- configure | 151 +++++++++++++++++++++++++++++++++++++++++++++ configure.in | 45 ++++++++++++++ piler-config.h.in | 5 ++ src/defs.h | 1 + src/extract.c | 143 ++++++++++++++++++++++++++++++++++++++++-- src/extract.h | 4 +- src/parser.c | 27 ++++---- src/parser.h | 1 + src/parser_utils.c | 48 +++++++++++++- src/test.c | 7 ++- 10 files changed, 410 insertions(+), 22 deletions(-) diff --git a/configure b/configure index bb11075e..8e6be26f 100755 --- a/configure +++ b/configure @@ -3412,9 +3412,14 @@ have_clamd="no" have_antivirus="no" have_mysql="no" have_tre="no" +have_zip="no" have_zlib="no" pdftotext="no" +catdoc="no" +catppt="no" +xls2csv="no" +odt2txt="no" have_static_build="no" @@ -3782,6 +3787,98 @@ fi +for ac_header in zip.h +do : + ac_fn_c_check_header_mongrel "$LINENO" "zip.h" "ac_cv_header_zip_h" "$ac_includes_default" +if test "x$ac_cv_header_zip_h" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_ZIP_H 1 +_ACEOF + have_zip=yes +else + echo "zip.h is not found" +fi + +done + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for main in -lzip" >&5 +$as_echo_n "checking for main in -lzip... " >&6; } +if ${ac_cv_lib_zip_main+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lzip $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + + +int +main () +{ +return main (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_zip_main=yes +else + ac_cv_lib_zip_main=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_zip_main" >&5 +$as_echo "$ac_cv_lib_zip_main" >&6; } +if test "x$ac_cv_lib_zip_main" = xyes; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for zip_open in -lzip" >&5 +$as_echo_n "checking for zip_open in -lzip... " >&6; } +if ${ac_cv_lib_zip_zip_open+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lzip $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char zip_open (); +int +main () +{ +return zip_open (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_zip_zip_open=yes +else + ac_cv_lib_zip_zip_open=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_zip_zip_open" >&5 +$as_echo "$ac_cv_lib_zip_zip_open" >&6; } +if test "x$ac_cv_lib_zip_zip_open" = xyes; then : + have_zip=yes +else + echo "libzip.so is not found"; have_zip=no +fi + +fi +ac_cv_lib_zip=ac_cv_lib_zip_main + + + for ac_header in zlib.h do : ac_fn_c_check_header_mongrel "$LINENO" "zlib.h" "ac_cv_header_zlib_h" "$ac_includes_default" @@ -4144,6 +4241,16 @@ if test "$have_tre" = "yes"; then antispam_libs="$antispam_libs -ltre" fi +if test "$have_zip" = "yes"; then + echo "zip library: yes" + +cat >>confdefs.h <<_ACEOF +#define HAVE_ZIP "1" +_ACEOF + + antispam_libs="$antispam_libs -lzip" +fi + if test "$have_mysql" = "yes"; then defs="$defs -DNEED_MYSQL" fi @@ -4163,7 +4270,51 @@ _ACEOF fi +if test z`which catdoc 2>/dev/null` != "z"; then + catdoc=`which catdoc` + +cat >>confdefs.h <<_ACEOF +#define HAVE_CATDOC "$catdoc" +_ACEOF + +fi + + +if test z`which catppt 2>/dev/null` != "z"; then + catppt=`which catppt` + +cat >>confdefs.h <<_ACEOF +#define HAVE_CATPPT "$catppt" +_ACEOF + +fi + + +if test z`which xls2csv 2>/dev/null` != "z"; then + xls2csv=`which xls2csv` + +cat >>confdefs.h <<_ACEOF +#define HAVE_XLS2CSV "$xls2csv" +_ACEOF + +fi + + +if test z`which odt2txt 2>/dev/null` != "z"; then + odt2txt=`which odt2txt` + +cat >>confdefs.h <<_ACEOF +#define HAVE_ODT2TXT "$odt2txt" +_ACEOF + +fi + + echo "pdftotext: $pdftotext" +echo "catdoc: $catdoc" +echo "catppt: $catppt" +echo "xls2csv: $xls2csv" +echo "odt2txt: $odt2txt" id -u $RUNNING_USER 2>/dev/null 1>/dev/null diff --git a/configure.in b/configure.in index 9bde0229..df673a3c 100644 --- a/configure.in +++ b/configure.in @@ -38,9 +38,14 @@ have_clamd="no" have_antivirus="no" have_mysql="no" have_tre="no" +have_zip="no" have_zlib="no" pdftotext="no" +catdoc="no" +catppt="no" +xls2csv="no" +odt2txt="no" have_static_build="no" @@ -120,6 +125,12 @@ fi +dnl libzip + +AC_CHECK_HEADERS(zip.h, have_zip=yes, echo "zip.h is not found") +AC_CHECK_LIB([zip],[main],[AC_CHECK_LIB(zip, zip_open, have_zip=yes, echo "libzip.so is not found"; have_zip=no)],[],[])ac_cv_lib_zip=ac_cv_lib_zip_main + + dnl zlib AC_CHECK_HEADERS(zlib.h, have_zlib=yes, echo "zlib.h is not found") @@ -266,6 +277,12 @@ if test "$have_tre" = "yes"; then antispam_libs="$antispam_libs -ltre" fi +if test "$have_zip" = "yes"; then + echo "zip library: yes" + AC_DEFINE_UNQUOTED(HAVE_ZIP, 1, [libzip support]) + antispam_libs="$antispam_libs -lzip" +fi + if test "$have_mysql" = "yes"; then defs="$defs -DNEED_MYSQL" fi @@ -281,7 +298,35 @@ if test z`which pdftotext 2>/dev/null` != "z"; then fi +if test z`which catdoc 2>/dev/null` != "z"; then + catdoc=`which catdoc` + AC_DEFINE_UNQUOTED(HAVE_CATDOC, "$catdoc", [path to catdoc]) +fi + + +if test z`which catppt 2>/dev/null` != "z"; then + catppt=`which catppt` + AC_DEFINE_UNQUOTED(HAVE_CATPPT, "$catppt", [path to catppt]) +fi + + +if test z`which xls2csv 2>/dev/null` != "z"; then + xls2csv=`which xls2csv` + AC_DEFINE_UNQUOTED(HAVE_XLS2CSV, "$xls2csv", [path to xls2csv]) +fi + + +if test z`which odt2txt 2>/dev/null` != "z"; then + odt2txt=`which odt2txt` + AC_DEFINE_UNQUOTED(HAVE_ODT2TXT, "$odt2txt", [path to odt2txt]) +fi + + echo "pdftotext: $pdftotext" +echo "catdoc: $catdoc" +echo "catppt: $catppt" +echo "xls2csv: $xls2csv" +echo "odt2txt: $odt2txt" id -u $RUNNING_USER 2>/dev/null 1>/dev/null diff --git a/piler-config.h.in b/piler-config.h.in index cedace2f..4e13113d 100644 --- a/piler-config.h.in +++ b/piler-config.h.in @@ -10,3 +10,8 @@ #define HAVE_DAEMON 1 #undef HAVE_PDFTOTEXT +#undef HAVE_CATDOC +#undef HAVE_CATPPT +#undef HAVE_XLS2CSV +#undef HAVE_ZIP + diff --git a/src/defs.h b/src/defs.h index 0655bcdc..6db3e3e1 100644 --- a/src/defs.h +++ b/src/defs.h @@ -68,6 +68,7 @@ struct child { struct attachment { int size; char type[TINYBUFSIZE]; + char shorttype[TINYBUFSIZE]; char aname[TINYBUFSIZE]; char filename[TINYBUFSIZE]; char internalname[TINYBUFSIZE]; diff --git a/src/extract.c b/src/extract.c index f004887e..3bda0b82 100644 --- a/src/extract.c +++ b/src/extract.c @@ -4,15 +4,91 @@ #include #include +#ifdef HAVE_ZIP + #include +#endif -void extract_pdf(struct session_data *sdata, struct _state *state, char *filename, struct __config *cfg){ + +void remove_xml(char *buf, int *html){ + int i=0; + char *p; + + p = buf; + + for(; *p; p++){ + if(*p == '<'){ *html = 1; } + + if(*html == 0){ + *(buf+i) = *p; + i++; + } + + if(*p == '>'){ + *html = 0; + + if(i > 2 && *(buf+i-1) != ' '){ + *(buf+i) = ' '; i++; + } + + } + + } + + *(buf+i) = '\0'; +} + + +int extract_opendocument(struct session_data *sdata, struct _state *state, char *filename, char *prefix){ + int errorp, i=0, len=0, html=0; + char buf[MAXBUFSIZE]; + struct zip *z; + struct zip_stat sb; + struct zip_file *zf; + + z = zip_open(filename, 0, &errorp); + if(!z) return 1; + + memset(buf, 0, sizeof(buf)); + + while(zip_stat_index(z, i, 0, &sb) == 0){ + if(strncmp(sb.name, prefix, strlen(prefix)) == 0){ + + zf = zip_fopen_index(z, i, 0); + if(zf){ + while((len = zip_fread(zf, buf, sizeof(buf))) > 0){ + + remove_xml(buf, &html); + len = strlen(buf); + + if(state->bodylen < BIGBUFSIZE-len-1){ + memcpy(&(state->b_body[state->bodylen]), buf, len); + state->bodylen += len; + } + + memset(buf, 0, sizeof(buf)); + } + zip_fclose(zf); + } + + if(state->bodylen > BIGBUFSIZE-1024) break; + } + + i++; + } + + + zip_close(z); + + return 0; +} + + +void read_content_with_popen(struct session_data *sdata, struct _state *state, char *cmd){ int len; char buf[MAXBUFSIZE]; FILE *f; - snprintf(buf, sizeof(buf)-1, "%s -enc UTF-8 %s -", HAVE_PDFTOTEXT, filename); - - f = popen(buf, "r"); + f = popen(cmd, "r"); if(f){ while(fgets(buf, sizeof(buf)-1, f)){ len = strlen(buf); @@ -31,3 +107,62 @@ void extract_pdf(struct session_data *sdata, struct _state *state, char *filenam } +void extract_attachment_content(struct session_data *sdata, struct _state *state, char *filename, char *type, int *rec){ + char cmd[SMALLBUFSIZE]; + + if(strcmp(type, "other") == 0) return; + + memset(cmd, 0, sizeof(cmd)); + +#ifdef HAVE_PDFTOTEXT + if(strcmp(type, "pdf") == 0) snprintf(cmd, sizeof(cmd)-1, "%s -enc UTF-8 %s -", HAVE_PDFTOTEXT, filename); +#endif + +#ifdef HAVE_CATDOC + if(strcmp(type, "doc") == 0) snprintf(cmd, sizeof(cmd)-1, "%s -d utf-8 %s", HAVE_CATDOC, filename); +#endif + +#ifdef HAVE_CATPPT + if(strcmp(type, "ppt") == 0) snprintf(cmd, sizeof(cmd)-1, "%s -d utf-8 %s", HAVE_CATPPT, filename); +#endif + +#ifdef HAVE_XLS2CSV + if(strcmp(type, "xls") == 0) snprintf(cmd, sizeof(cmd)-1, "%s -d utf-8 %s", HAVE_XLS2CSV, filename); +#endif + + if(strlen(cmd) > 12){ + read_content_with_popen(sdata, state, cmd); + return; + } + + +#ifdef HAVE_ZIP + if(strcmp(type, "odf") == 0){ + extract_opendocument(sdata, state, filename, "content.xml"); + return; + } + + if(strcmp(type, "docx") == 0){ + extract_opendocument(sdata, state, filename, "word/document.xml"); + return; + } + + if(strcmp(type, "xlsx") == 0){ + extract_opendocument(sdata, state, filename, "xl/worksheets/sheet"); + return; + } + + if(strcmp(type, "pptx") == 0){ + extract_opendocument(sdata, state, filename, "ppt/slides/slide"); + return; + } + + if(strcmp(type, "zip") == 0 && *rec == 0){ + (*rec)++; + + } +#endif + +} + + diff --git a/src/extract.h b/src/extract.h index d7e4d73d..d184a320 100644 --- a/src/extract.h +++ b/src/extract.h @@ -1,7 +1,9 @@ #ifndef _EXTRACT_H #define _EXTRACT_H -void extract_pdf(struct session_data *sdata, struct _state *state, char *filename, struct __config *cfg); +#include "defs.h" + +void extract_attachment_content(struct session_data *sdata, struct _state *state, char *filename, char *type, int *rec); #endif /* _EXTRACT_H */ diff --git a/src/parser.c b/src/parser.c index acd07752..9a1d9da7 100644 --- a/src/parser.c +++ b/src/parser.c @@ -100,7 +100,7 @@ struct _state parse_message(struct session_data *sdata, int take_into_pieces, st void post_parse(struct session_data *sdata, struct _state *state, struct __config *cfg){ - int i, len; + int i, len, rec=0; char *p; free_list(state->boundaries); @@ -127,18 +127,12 @@ void post_parse(struct session_data *sdata, struct _state *state, struct __confi p = determine_attachment_type(state->attachments[i].filename, state->attachments[i].type); len = strlen(p); - if(strlen(sdata->attachments) < SMALLBUFSIZE-len-1 && !strstr(sdata->attachments, p)) memcpy(&(sdata->attachments[strlen(sdata->attachments)]), p, len); if(state->attachments[i].dumped == 1){ - - #ifdef HAVE_PDFTOTEXT - if( - strcmp(p, "pdf,") == 0 || - (strcmp(p, "other,") == 0 && strcasestr(state->attachments[i].filename, ".pdf")) - ) extract_pdf(sdata, state, state->attachments[i].aname, cfg); - #endif - + rec = 0; + if(state->bodylen < BIGBUFSIZE-1024) extract_attachment_content(sdata, state, state->attachments[i].aname, get_attachment_extractor_by_filename(state->attachments[i].filename), &rec); + unlink(state->attachments[i].aname); } @@ -245,22 +239,29 @@ int parse_line(char *buf, struct _state *state, struct session_data *sdata, int if(take_into_pieces == 1){ state->fd = open(state->attachments[state->n_attachments].internalname, O_CREAT|O_RDWR, S_IRUSR|S_IWUSR); - p = determine_attachment_type(state->attachments[state->n_attachments].filename, state->attachments[state->n_attachments].type); - - if(strcmp("pdf,", p) == 0 || strcmp("other,", p) == 0){ + p = get_attachment_extractor_by_filename(state->attachments[state->n_attachments].filename); + + snprintf(state->attachments[state->n_attachments].shorttype, TINYBUFSIZE-1, "%s", p); + + if(strcmp("other", p)){ state->b64fd = open(state->attachments[state->n_attachments].aname, O_CREAT|O_RDWR, S_IRUSR|S_IWUSR); state->attachments[state->n_attachments].dumped = 1; } + + if(state->fd == -1){ state->attachments[state->n_attachments].size = 0; state->attachments[state->n_attachments].dumped = 0; memset(state->attachments[state->n_attachments].type, 0, TINYBUFSIZE); + memset(state->attachments[state->n_attachments].shorttype, 0, TINYBUFSIZE); + memset(state->attachments[state->n_attachments].aname, 0, TINYBUFSIZE); memset(state->attachments[state->n_attachments].filename, 0, TINYBUFSIZE); memset(state->attachments[state->n_attachments].internalname, 0, TINYBUFSIZE); memset(state->attachments[state->n_attachments].digest, 0, 2*DIGEST_LENGTH+1); + syslog(LOG_PRIORITY, "%s: error opening %s", sdata->ttmpfile, state->attachments[state->n_attachments].internalname); state->n_attachments--; diff --git a/src/parser.h b/src/parser.h index 19d70b41..cbce2cbe 100644 --- a/src/parser.h +++ b/src/parser.h @@ -31,6 +31,7 @@ void degenerateToken(unsigned char *p); void fixURL(char *url); int extractNameFromHeaderLine(char *s, char *name, char *resultbuf); char *determine_attachment_type(char *filename, char *type); +char *get_attachment_extractor_by_filename(char *filename); void parse_reference(struct _state *state, char *s); int base64_decode_attachment_buffer(char *p, int plen, unsigned char *b, int blen); diff --git a/src/parser_utils.c b/src/parser_utils.c index efd64324..f712f8d5 100644 --- a/src/parser_utils.c +++ b/src/parser_utils.c @@ -76,6 +76,7 @@ void init_state(struct _state *state){ state->attachments[i].size = 0; state->attachments[i].dumped = 0; memset(state->attachments[i].type, 0, TINYBUFSIZE); + memset(state->attachments[i].shorttype, 0, TINYBUFSIZE); memset(state->attachments[i].aname, 0, TINYBUFSIZE); memset(state->attachments[i].filename, 0, TINYBUFSIZE); memset(state->attachments[i].internalname, 0, TINYBUFSIZE); @@ -697,23 +698,40 @@ char *determine_attachment_type(char *filename, char *type){ if(strncasecmp(type, "application/pdf", strlen("application/pdf")) == 0) return "pdf,"; + if(strncasecmp(type, "application/ms-tnef", strlen("application/ms-tnef")) == 0) return "winmail,"; if(strncasecmp(type, "application/msword", strlen("application/msword")) == 0) return "word,"; + + // a .csv file has the same type if(strncasecmp(type, "application/vnd.ms-excel", strlen("application/vnd.ms-excel")) == 0) return "excel,"; + if(strncasecmp(type, "application/vnd.ms-powerpoint", strlen("application/vnd.ms-powerpoint")) == 0) return "powerpoint,"; + if(strncasecmp(type, "application/vnd.visio", strlen("application/vnd.visio")) == 0) return "visio,"; + + if(strncasecmp(type, "application/vnd.openxmlformats-officedocument.wordprocessingml.document", strlen("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) == 0) return "word,"; + if(strncasecmp(type, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", strlen("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) == 0) return "excel,"; + if(strncasecmp(type, "application/vnd.openxmlformats-officedocument.presentationml.presentation", strlen("application/vnd.openxmlformats-officedocument.presentationml.presentation")) == 0) return "powerpoint,"; + if(strncasecmp(type, "application/x-shockwave-flash", strlen("application/x-shockwave-flash")) == 0) return "flash,"; if(strcasestr(type, "opendocument")) return "odf,"; - if(strcasecmp(type, "application/octet-stream") == 0){ - p = strrchr(type, '.'); + + if(strncasecmp(type, "application/", 12) == 0){ + + p = strrchr(filename, '.'); if(p){ p++; + if(strncasecmp(p, "pdf", 3) == 0) return "pdf,"; + if(strncasecmp(p, "zip", 3) == 0) return "compressed,"; if(strncasecmp(p, "rar", 3) == 0) return "compressed,"; + // tar.gz has the same type + if(strncasecmp(p, "x-gzip", 3) == 0) return "compressed,"; + if(strncasecmp(p, "doc", 3) == 0) return "word,"; if(strncasecmp(p, "docx", 4) == 0) return "word,"; if(strncasecmp(p, "xls", 3) == 0) return "excel,"; @@ -733,6 +751,32 @@ char *determine_attachment_type(char *filename, char *type){ } +char *get_attachment_extractor_by_filename(char *filename){ + char *p; + + p = strrchr(filename, '.'); + if(!p) return "other"; + + if(strcasecmp(p, ".pdf") == 0) return "pdf"; + if(strcasecmp(p, ".zip") == 0) return "zip"; + if(strcasecmp(p, ".gz") == 0) return "gzip"; + if(strcasecmp(p, ".rar") == 0) return "rar"; + if(strcasecmp(p, ".odt") == 0) return "odf"; + if(strcasecmp(p, ".odp") == 0) return "odf"; + if(strcasecmp(p, ".ods") == 0) return "odf"; + if(strcasecmp(p, ".doc") == 0) return "doc"; + if(strcasecmp(p, ".docx") == 0) return "docx"; + if(strcasecmp(p, ".xls") == 0) return "xls"; + if(strcasecmp(p, ".xlsx") == 0) return "xlsx"; + if(strcasecmp(p, ".ppt") == 0) return "ppt"; + if(strcasecmp(p, ".pptx") == 0) return "pptx"; + if(strcasecmp(p, ".txt") == 0) return "text"; + if(strcasecmp(p, ".csv") == 0) return "text"; + + return "other"; +} + + void parse_reference(struct _state *state, char *s){ int len; char puf[SMALLBUFSIZE]; diff --git a/src/test.c b/src/test.c index 3a660c55..7834c016 100644 --- a/src/test.c +++ b/src/test.c @@ -72,7 +72,7 @@ int main(int argc, char **argv){ snprintf(sdata.filename, SMALLBUFSIZE-1, "%s", argv[1]); snprintf(sdata.tmpframe, SMALLBUFSIZE-1, "%s.m", argv[1]); - state = parse_message(&sdata, 0, &cfg); + state = parse_message(&sdata, 1, &cfg); post_parse(&sdata, &state, &cfg); printf("message-id: %s\n", state.message_id); @@ -80,7 +80,7 @@ int main(int argc, char **argv){ printf("to: *%s (%s)*\n", state.b_to, state.b_to_domain); printf("reference: *%s*\n", state.reference); printf("subject: *%s*\n", state.b_subject); - //printf("body: *%s*\n", state.b_body); + printf("body: *%s*\n", state.b_body); printf("sent: %ld\n", sdata.sent); @@ -103,8 +103,11 @@ int main(int argc, char **argv){ for(i=1; i<=state.n_attachments; i++){ printf("i:%d, name=*%s*, type: *%s*, size: %d, int.name: %s, digest: %s\n", i, state.attachments[i].filename, state.attachments[i].type, state.attachments[i].size, state.attachments[i].internalname, state.attachments[i].digest); + unlink(state.attachments[i].internalname); } + unlink(sdata.tmpframe); + printf("attachments:%s\n", sdata.attachments); printf("direction: %d\n", sdata.direction);