added odf, ms office attachment support

This commit is contained in:
SJ 2012-09-09 23:16:09 +02:00
parent 1620f0b50f
commit b4854e312a
10 changed files with 410 additions and 22 deletions

151
configure vendored
View File

@ -3412,9 +3412,14 @@ have_clamd="no"
have_antivirus="no"
have_mysql="no"
have_tre="no"
have_zip="no"
have_zlib="no"
pdftotext="no"
catdoc="no"
catppt="no"
xls2csv="no"
odt2txt="no"
have_static_build="no"
@ -3782,6 +3787,98 @@ fi
for ac_header in zip.h
do :
ac_fn_c_check_header_mongrel "$LINENO" "zip.h" "ac_cv_header_zip_h" "$ac_includes_default"
if test "x$ac_cv_header_zip_h" = xyes; then :
cat >>confdefs.h <<_ACEOF
#define HAVE_ZIP_H 1
_ACEOF
have_zip=yes
else
echo "zip.h is not found"
fi
done
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for main in -lzip" >&5
$as_echo_n "checking for main in -lzip... " >&6; }
if ${ac_cv_lib_zip_main+:} false; then :
$as_echo_n "(cached) " >&6
else
ac_check_lib_save_LIBS=$LIBS
LIBS="-lzip $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
int
main ()
{
return main ();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
ac_cv_lib_zip_main=yes
else
ac_cv_lib_zip_main=no
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_zip_main" >&5
$as_echo "$ac_cv_lib_zip_main" >&6; }
if test "x$ac_cv_lib_zip_main" = xyes; then :
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for zip_open in -lzip" >&5
$as_echo_n "checking for zip_open in -lzip... " >&6; }
if ${ac_cv_lib_zip_zip_open+:} false; then :
$as_echo_n "(cached) " >&6
else
ac_check_lib_save_LIBS=$LIBS
LIBS="-lzip $LIBS"
cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h. */
/* Override any GCC internal prototype to avoid an error.
Use char because int might match the return type of a GCC
builtin and then its argument prototype would still apply. */
#ifdef __cplusplus
extern "C"
#endif
char zip_open ();
int
main ()
{
return zip_open ();
;
return 0;
}
_ACEOF
if ac_fn_c_try_link "$LINENO"; then :
ac_cv_lib_zip_zip_open=yes
else
ac_cv_lib_zip_zip_open=no
fi
rm -f core conftest.err conftest.$ac_objext \
conftest$ac_exeext conftest.$ac_ext
LIBS=$ac_check_lib_save_LIBS
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_zip_zip_open" >&5
$as_echo "$ac_cv_lib_zip_zip_open" >&6; }
if test "x$ac_cv_lib_zip_zip_open" = xyes; then :
have_zip=yes
else
echo "libzip.so is not found"; have_zip=no
fi
fi
ac_cv_lib_zip=ac_cv_lib_zip_main
for ac_header in zlib.h
do :
ac_fn_c_check_header_mongrel "$LINENO" "zlib.h" "ac_cv_header_zlib_h" "$ac_includes_default"
@ -4144,6 +4241,16 @@ if test "$have_tre" = "yes"; then
antispam_libs="$antispam_libs -ltre"
fi
if test "$have_zip" = "yes"; then
echo "zip library: yes"
cat >>confdefs.h <<_ACEOF
#define HAVE_ZIP "1"
_ACEOF
antispam_libs="$antispam_libs -lzip"
fi
if test "$have_mysql" = "yes"; then
defs="$defs -DNEED_MYSQL"
fi
@ -4163,7 +4270,51 @@ _ACEOF
fi
if test z`which catdoc 2>/dev/null` != "z"; then
catdoc=`which catdoc`
cat >>confdefs.h <<_ACEOF
#define HAVE_CATDOC "$catdoc"
_ACEOF
fi
if test z`which catppt 2>/dev/null` != "z"; then
catppt=`which catppt`
cat >>confdefs.h <<_ACEOF
#define HAVE_CATPPT "$catppt"
_ACEOF
fi
if test z`which xls2csv 2>/dev/null` != "z"; then
xls2csv=`which xls2csv`
cat >>confdefs.h <<_ACEOF
#define HAVE_XLS2CSV "$xls2csv"
_ACEOF
fi
if test z`which odt2txt 2>/dev/null` != "z"; then
odt2txt=`which odt2txt`
cat >>confdefs.h <<_ACEOF
#define HAVE_ODT2TXT "$odt2txt"
_ACEOF
fi
echo "pdftotext: $pdftotext"
echo "catdoc: $catdoc"
echo "catppt: $catppt"
echo "xls2csv: $xls2csv"
echo "odt2txt: $odt2txt"
id -u $RUNNING_USER 2>/dev/null 1>/dev/null

View File

@ -38,9 +38,14 @@ have_clamd="no"
have_antivirus="no"
have_mysql="no"
have_tre="no"
have_zip="no"
have_zlib="no"
pdftotext="no"
catdoc="no"
catppt="no"
xls2csv="no"
odt2txt="no"
have_static_build="no"
@ -120,6 +125,12 @@ fi
dnl libzip
AC_CHECK_HEADERS(zip.h, have_zip=yes, echo "zip.h is not found")
AC_CHECK_LIB([zip],[main],[AC_CHECK_LIB(zip, zip_open, have_zip=yes, echo "libzip.so is not found"; have_zip=no)],[],[])ac_cv_lib_zip=ac_cv_lib_zip_main
dnl zlib
AC_CHECK_HEADERS(zlib.h, have_zlib=yes, echo "zlib.h is not found")
@ -266,6 +277,12 @@ if test "$have_tre" = "yes"; then
antispam_libs="$antispam_libs -ltre"
fi
if test "$have_zip" = "yes"; then
echo "zip library: yes"
AC_DEFINE_UNQUOTED(HAVE_ZIP, 1, [libzip support])
antispam_libs="$antispam_libs -lzip"
fi
if test "$have_mysql" = "yes"; then
defs="$defs -DNEED_MYSQL"
fi
@ -281,7 +298,35 @@ if test z`which pdftotext 2>/dev/null` != "z"; then
fi
if test z`which catdoc 2>/dev/null` != "z"; then
catdoc=`which catdoc`
AC_DEFINE_UNQUOTED(HAVE_CATDOC, "$catdoc", [path to catdoc])
fi
if test z`which catppt 2>/dev/null` != "z"; then
catppt=`which catppt`
AC_DEFINE_UNQUOTED(HAVE_CATPPT, "$catppt", [path to catppt])
fi
if test z`which xls2csv 2>/dev/null` != "z"; then
xls2csv=`which xls2csv`
AC_DEFINE_UNQUOTED(HAVE_XLS2CSV, "$xls2csv", [path to xls2csv])
fi
if test z`which odt2txt 2>/dev/null` != "z"; then
odt2txt=`which odt2txt`
AC_DEFINE_UNQUOTED(HAVE_ODT2TXT, "$odt2txt", [path to odt2txt])
fi
echo "pdftotext: $pdftotext"
echo "catdoc: $catdoc"
echo "catppt: $catppt"
echo "xls2csv: $xls2csv"
echo "odt2txt: $odt2txt"
id -u $RUNNING_USER 2>/dev/null 1>/dev/null

View File

@ -10,3 +10,8 @@
#define HAVE_DAEMON 1
#undef HAVE_PDFTOTEXT
#undef HAVE_CATDOC
#undef HAVE_CATPPT
#undef HAVE_XLS2CSV
#undef HAVE_ZIP

View File

@ -68,6 +68,7 @@ struct child {
struct attachment {
int size;
char type[TINYBUFSIZE];
char shorttype[TINYBUFSIZE];
char aname[TINYBUFSIZE];
char filename[TINYBUFSIZE];
char internalname[TINYBUFSIZE];

View File

@ -4,15 +4,91 @@
#include <unistd.h>
#include <piler.h>
#ifdef HAVE_ZIP
#include <zip.h>
#endif
void extract_pdf(struct session_data *sdata, struct _state *state, char *filename, struct __config *cfg){
/*
 * Strip XML/HTML markup from a NUL-terminated buffer, in place.
 *
 * buf  - NUL-terminated text; rewritten in place and never grows.
 * html - in/out tag state: non-zero while the scanner is inside a "<...>"
 *        tag. The caller keeps it between calls so that a tag split across
 *        two consecutive chunks is still removed.
 *
 * Every tag that gets closed is replaced by one space, unless the kept text
 * already ends with a space or fewer than three characters were kept so far.
 * A '>' that does not close a tag is kept as ordinary text.
 */
void remove_xml(char *buf, int *html){
int i=0;
char *p;

if(buf == NULL || html == NULL) return;

for(p = buf; *p; p++){
if(*p == '<'){ *html = 1; }

if(*html == 0){
/*
 * Plain text. A stray '>' lands here too and must NOT trigger the
 * separator below: nothing was skipped for it, so appending a space
 * would push the write index past the read pointer, clobber unread
 * input (eventually the terminating NUL) and run off the buffer.
 */
*(buf+i) = *p;
i++;
}
else if(*p == '>'){
/* End of a real tag: at least one character was dropped, so there is room for the separator. */
*html = 0;
if(i > 2 && *(buf+i-1) != ' '){
*(buf+i) = ' '; i++;
}
}
}

*(buf+i) = '\0';
}
/*
 * Append the text content of selected members of a zip-based document
 * (OpenDocument, docx, xlsx, pptx) to state->b_body.
 *
 * sdata    - session data (not used here)
 * state    - parser state; text is appended at b_body[bodylen] and bodylen
 *            is advanced
 * filename - path of the archive to open
 * prefix   - every archive member whose name starts with this string is
 *            read and stripped of markup with remove_xml()
 *
 * Returns 1 if the archive cannot be opened, 0 otherwise.
 */
int extract_opendocument(struct session_data *sdata, struct _state *state, char *filename, char *prefix){
int errorp, i=0, len=0, html=0;
size_t prefixlen;
char buf[MAXBUFSIZE];
struct zip *z;
struct zip_stat sb;
struct zip_file *zf;

z = zip_open(filename, 0, &errorp);
if(!z) return 1;

prefixlen = strlen(prefix);

while(zip_stat_index(z, i, 0, &sb) == 0){
if(strncmp(sb.name, prefix, prefixlen) == 0){

zf = zip_fopen_index(z, i, 0);
if(zf){
/* A tag left open at the end of the previous member must not swallow this one. */
html = 0;

/*
 * Read one byte less than the buffer holds and terminate it ourselves:
 * zip_fread() may fill the whole buffer, and remove_xml()/strlen()
 * need a NUL to stop at.
 */
while((len = zip_fread(zf, buf, sizeof(buf)-1)) > 0){
buf[len] = '\0';

remove_xml(buf, &html);
len = strlen(buf);

/* Chunks that would not fit into the body buffer are dropped. */
if(state->bodylen < BIGBUFSIZE-len-1){
memcpy(&(state->b_body[state->bodylen]), buf, len);
state->bodylen += len;
}
}

zip_fclose(zf);
}

/* Stop scanning further members once the body buffer is nearly full. */
if(state->bodylen > BIGBUFSIZE-1024) break;
}

i++;
}

zip_close(z);

return 0;
}
void read_content_with_popen(struct session_data *sdata, struct _state *state, char *cmd){
int len;
char buf[MAXBUFSIZE];
FILE *f;
snprintf(buf, sizeof(buf)-1, "%s -enc UTF-8 %s -", HAVE_PDFTOTEXT, filename);
f = popen(buf, "r");
f = popen(cmd, "r");
if(f){
while(fgets(buf, sizeof(buf)-1, f)){
len = strlen(buf);
@ -31,3 +107,62 @@ void extract_pdf(struct session_data *sdata, struct _state *state, char *filenam
}
/*
 * Extract the text of a dumped attachment according to its short type.
 *
 * sdata    - session data, passed through to the extractors
 * state    - parser state receiving the extracted text
 * filename - path of the dumped attachment
 * type     - short type string ("pdf", "doc", "odf", "docx", "zip", ...);
 *            "other" means nothing to do
 * rec      - counter that is bumped from 0 to 1 for a "zip" attachment;
 *            no archive contents are processed here
 *
 * pdf/doc/ppt/xls are handled by an external converter (when it was found
 * at configure time) whose output is read through read_content_with_popen().
 * odf/docx/xlsx/pptx are read with extract_opendocument() when libzip
 * support is compiled in.
 *
 * NOTE(review): filename is placed into a shell command line unquoted;
 * presumably callers only pass internally generated names - confirm.
 */
void extract_attachment_content(struct session_data *sdata, struct _state *state, char *filename, char *type, int *rec){
char cmd[SMALLBUFSIZE];
const char *tool = NULL;
const char *opts = "-d utf-8";
const char *tail = "";
#ifdef HAVE_ZIP
/* short type -> name prefix of the archive members holding the text */
static char *zipped[][2] = {
{ "odf", "content.xml" },
{ "docx", "word/document.xml" },
{ "xlsx", "xl/worksheets/sheet" },
{ "pptx", "ppt/slides/slide" }
};
size_t k;
#endif

if(!strcmp(type, "other")) return;

memset(cmd, 0, sizeof(cmd));

#ifdef HAVE_PDFTOTEXT
if(!strcmp(type, "pdf")){
tool = HAVE_PDFTOTEXT;
opts = "-enc UTF-8";
tail = " -";
}
#endif

#ifdef HAVE_CATDOC
if(!strcmp(type, "doc")) tool = HAVE_CATDOC;
#endif

#ifdef HAVE_CATPPT
if(!strcmp(type, "ppt")) tool = HAVE_CATPPT;
#endif

#ifdef HAVE_XLS2CSV
if(!strcmp(type, "xls")) tool = HAVE_XLS2CSV;
#endif

/* "<tool> <opts> <filename><tail>" */
if(tool) snprintf(cmd, sizeof(cmd)-1, "%s %s %s%s", tool, opts, filename, tail);

/* Only run a command that was actually assembled. */
if(strlen(cmd) > 12){
read_content_with_popen(sdata, state, cmd);
return;
}

#ifdef HAVE_ZIP
for(k = 0; k < sizeof(zipped)/sizeof(zipped[0]); k++){
if(!strcmp(type, zipped[k][0])){
extract_opendocument(sdata, state, filename, zipped[k][1]);
return;
}
}

if(!strcmp(type, "zip") && *rec == 0){
(*rec)++;
}
#endif
}

View File

@ -1,7 +1,9 @@
#ifndef _EXTRACT_H
#define _EXTRACT_H
void extract_pdf(struct session_data *sdata, struct _state *state, char *filename, struct __config *cfg);
#include "defs.h"
void extract_attachment_content(struct session_data *sdata, struct _state *state, char *filename, char *type, int *rec);
#endif /* _EXTRACT_H */

View File

@ -100,7 +100,7 @@ struct _state parse_message(struct session_data *sdata, int take_into_pieces, st
void post_parse(struct session_data *sdata, struct _state *state, struct __config *cfg){
int i, len;
int i, len, rec=0;
char *p;
free_list(state->boundaries);
@ -127,17 +127,11 @@ void post_parse(struct session_data *sdata, struct _state *state, struct __confi
p = determine_attachment_type(state->attachments[i].filename, state->attachments[i].type);
len = strlen(p);
if(strlen(sdata->attachments) < SMALLBUFSIZE-len-1 && !strstr(sdata->attachments, p)) memcpy(&(sdata->attachments[strlen(sdata->attachments)]), p, len);
if(state->attachments[i].dumped == 1){
#ifdef HAVE_PDFTOTEXT
if(
strcmp(p, "pdf,") == 0 ||
(strcmp(p, "other,") == 0 && strcasestr(state->attachments[i].filename, ".pdf"))
) extract_pdf(sdata, state, state->attachments[i].aname, cfg);
#endif
rec = 0;
if(state->bodylen < BIGBUFSIZE-1024) extract_attachment_content(sdata, state, state->attachments[i].aname, get_attachment_extractor_by_filename(state->attachments[i].filename), &rec);
unlink(state->attachments[i].aname);
}
@ -245,22 +239,29 @@ int parse_line(char *buf, struct _state *state, struct session_data *sdata, int
if(take_into_pieces == 1){
state->fd = open(state->attachments[state->n_attachments].internalname, O_CREAT|O_RDWR, S_IRUSR|S_IWUSR);
p = determine_attachment_type(state->attachments[state->n_attachments].filename, state->attachments[state->n_attachments].type);
p = get_attachment_extractor_by_filename(state->attachments[state->n_attachments].filename);
if(strcmp("pdf,", p) == 0 || strcmp("other,", p) == 0){
snprintf(state->attachments[state->n_attachments].shorttype, TINYBUFSIZE-1, "%s", p);
if(strcmp("other", p)){
state->b64fd = open(state->attachments[state->n_attachments].aname, O_CREAT|O_RDWR, S_IRUSR|S_IWUSR);
state->attachments[state->n_attachments].dumped = 1;
}
if(state->fd == -1){
state->attachments[state->n_attachments].size = 0;
state->attachments[state->n_attachments].dumped = 0;
memset(state->attachments[state->n_attachments].type, 0, TINYBUFSIZE);
memset(state->attachments[state->n_attachments].shorttype, 0, TINYBUFSIZE);
memset(state->attachments[state->n_attachments].aname, 0, TINYBUFSIZE);
memset(state->attachments[state->n_attachments].filename, 0, TINYBUFSIZE);
memset(state->attachments[state->n_attachments].internalname, 0, TINYBUFSIZE);
memset(state->attachments[state->n_attachments].digest, 0, 2*DIGEST_LENGTH+1);
syslog(LOG_PRIORITY, "%s: error opening %s", sdata->ttmpfile, state->attachments[state->n_attachments].internalname);
state->n_attachments--;

View File

@ -31,6 +31,7 @@ void degenerateToken(unsigned char *p);
void fixURL(char *url);
int extractNameFromHeaderLine(char *s, char *name, char *resultbuf);
char *determine_attachment_type(char *filename, char *type);
char *get_attachment_extractor_by_filename(char *filename);
void parse_reference(struct _state *state, char *s);
int base64_decode_attachment_buffer(char *p, int plen, unsigned char *b, int blen);

View File

@ -76,6 +76,7 @@ void init_state(struct _state *state){
state->attachments[i].size = 0;
state->attachments[i].dumped = 0;
memset(state->attachments[i].type, 0, TINYBUFSIZE);
memset(state->attachments[i].shorttype, 0, TINYBUFSIZE);
memset(state->attachments[i].aname, 0, TINYBUFSIZE);
memset(state->attachments[i].filename, 0, TINYBUFSIZE);
memset(state->attachments[i].internalname, 0, TINYBUFSIZE);
@ -697,23 +698,40 @@ char *determine_attachment_type(char *filename, char *type){
if(strncasecmp(type, "application/pdf", strlen("application/pdf")) == 0) return "pdf,";
if(strncasecmp(type, "application/ms-tnef", strlen("application/ms-tnef")) == 0) return "winmail,";
if(strncasecmp(type, "application/msword", strlen("application/msword")) == 0) return "word,";
// a .csv file has the same type
if(strncasecmp(type, "application/vnd.ms-excel", strlen("application/vnd.ms-excel")) == 0) return "excel,";
if(strncasecmp(type, "application/vnd.ms-powerpoint", strlen("application/vnd.ms-powerpoint")) == 0) return "powerpoint,";
if(strncasecmp(type, "application/vnd.visio", strlen("application/vnd.visio")) == 0) return "visio,";
if(strncasecmp(type, "application/vnd.openxmlformats-officedocument.wordprocessingml.document", strlen("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) == 0) return "word,";
if(strncasecmp(type, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", strlen("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) == 0) return "excel,";
if(strncasecmp(type, "application/vnd.openxmlformats-officedocument.presentationml.presentation", strlen("application/vnd.openxmlformats-officedocument.presentationml.presentation")) == 0) return "powerpoint,";
if(strncasecmp(type, "application/x-shockwave-flash", strlen("application/x-shockwave-flash")) == 0) return "flash,";
if(strcasestr(type, "opendocument")) return "odf,";
if(strcasecmp(type, "application/octet-stream") == 0){
p = strrchr(type, '.');
if(strncasecmp(type, "application/", 12) == 0){
p = strrchr(filename, '.');
if(p){
p++;
if(strncasecmp(p, "pdf", 3) == 0) return "pdf,";
if(strncasecmp(p, "zip", 3) == 0) return "compressed,";
if(strncasecmp(p, "rar", 3) == 0) return "compressed,";
// tar.gz has the same type
if(strncasecmp(p, "x-gzip", 3) == 0) return "compressed,";
if(strncasecmp(p, "doc", 3) == 0) return "word,";
if(strncasecmp(p, "docx", 4) == 0) return "word,";
if(strncasecmp(p, "xls", 3) == 0) return "excel,";
@ -733,6 +751,32 @@ char *determine_attachment_type(char *filename, char *type){
}
/*
 * Map an attachment file name to the short name of the extractor that
 * handles it, based on the text from the last '.' onwards (compared
 * case-insensitively).
 *
 * Returns a pointer to a constant string; "other" when the name has no
 * '.' or the extension is not in the table.
 */
char *get_attachment_extractor_by_filename(char *filename){
static const struct {
const char *ext;
char *extractor;
} known[] = {
{ ".pdf", "pdf" },
{ ".zip", "zip" },
{ ".gz", "gzip" },
{ ".rar", "rar" },
{ ".odt", "odf" },
{ ".odp", "odf" },
{ ".ods", "odf" },
{ ".doc", "doc" },
{ ".docx", "docx" },
{ ".xls", "xls" },
{ ".xlsx", "xlsx" },
{ ".ppt", "ppt" },
{ ".pptx", "pptx" },
{ ".txt", "text" },
{ ".csv", "text" }
};
size_t k;
char *dot = strrchr(filename, '.');

if(dot == NULL) return "other";

for(k = 0; k < sizeof(known)/sizeof(known[0]); k++){
if(strcasecmp(dot, known[k].ext) == 0) return known[k].extractor;
}

return "other";
}
void parse_reference(struct _state *state, char *s){
int len;
char puf[SMALLBUFSIZE];

View File

@ -72,7 +72,7 @@ int main(int argc, char **argv){
snprintf(sdata.filename, SMALLBUFSIZE-1, "%s", argv[1]);
snprintf(sdata.tmpframe, SMALLBUFSIZE-1, "%s.m", argv[1]);
state = parse_message(&sdata, 0, &cfg);
state = parse_message(&sdata, 1, &cfg);
post_parse(&sdata, &state, &cfg);
printf("message-id: %s\n", state.message_id);
@ -80,7 +80,7 @@ int main(int argc, char **argv){
printf("to: *%s (%s)*\n", state.b_to, state.b_to_domain);
printf("reference: *%s*\n", state.reference);
printf("subject: *%s*\n", state.b_subject);
//printf("body: *%s*\n", state.b_body);
printf("body: *%s*\n", state.b_body);
printf("sent: %ld\n", sdata.sent);
@ -103,8 +103,11 @@ int main(int argc, char **argv){
for(i=1; i<=state.n_attachments; i++){
printf("i:%d, name=*%s*, type: *%s*, size: %d, int.name: %s, digest: %s\n", i, state.attachments[i].filename, state.attachments[i].type, state.attachments[i].size, state.attachments[i].internalname, state.attachments[i].digest);
unlink(state.attachments[i].internalname);
}
unlink(sdata.tmpframe);
printf("attachments:%s\n", sdata.attachments);
printf("direction: %d\n", sdata.direction);