Added ODF and MS Office attachment support

This commit is contained in:
SJ
2012-09-09 23:16:09 +02:00
parent 1620f0b50f
commit b4854e312a
10 changed files with 410 additions and 22 deletions

View File

@ -68,6 +68,7 @@ struct child {
struct attachment {
int size;
char type[TINYBUFSIZE];
char shorttype[TINYBUFSIZE];
char aname[TINYBUFSIZE];
char filename[TINYBUFSIZE];
char internalname[TINYBUFSIZE];

View File

@ -4,15 +4,91 @@
#include <unistd.h>
#include <piler.h>
#ifdef HAVE_ZIP
#include <zip.h>
#endif
void extract_pdf(struct session_data *sdata, struct _state *state, char *filename, struct __config *cfg){
/*
 * Strip markup tags from a NUL terminated buffer, in place.
 *
 * buf:  text to clean; rewritten in place and NUL terminated on return.
 * html: tag state owned by the caller. Non-zero means the scan is currently
 *       inside a '<' ... '>' tag. It is read on entry and updated on exit, so
 *       a tag that is split across two consecutive calls is still removed.
 *
 * Every closed tag is replaced by a single space (unless the output is at
 * most 2 characters long or already ends with a space), so words separated
 * only by markup do not get glued together.
 *
 * A '>' that appears outside of a tag is plain text: it is copied as is and
 * no separator is added after it. Adding one there would make the write
 * position overtake the read position and clobber input that has not been
 * read yet, including the terminating NUL.
 */
void remove_xml(char *buf, int *html){
   int i=0;
   char *p;

   for(p = buf; *p; p++){

      if(*p == '<'){
         *html = 1;
         continue;
      }

      if(*html){
         /* inside a tag: drop everything, a '>' ends the tag */
         if(*p == '>'){
            *html = 0;

            if(i > 2 && *(buf+i-1) != ' '){
               *(buf+i) = ' ';
               i++;
            }
         }

         continue;
      }

      *(buf+i) = *p;
      i++;
   }

   *(buf+i) = '\0';
}
/*
 * Append the text of selected members of a zip archive to the message body.
 *
 * filename: path of the zip based document to open.
 * prefix:   every archive member whose name starts with this string is read,
 *           passed through remove_xml() and appended to state->b_body.
 * sdata:    not used by this function.
 *
 * Returns 1 if the archive cannot be opened, 0 otherwise.
 *
 * A cleaned chunk is appended only if it still fits below BIGBUFSIZE, and
 * no further members are read once state->bodylen exceeds BIGBUFSIZE-1024.
 */
int extract_opendocument(struct session_data *sdata, struct _state *state, char *filename, char *prefix){
   int errorp, i=0, len=0, html=0;
   size_t prefixlen;
   char buf[MAXBUFSIZE];
   struct zip *z;
   struct zip_stat sb;
   struct zip_file *zf;

   z = zip_open(filename, 0, &errorp);
   if(!z) return 1;

   prefixlen = strlen(prefix);

   while(zip_stat_index(z, i, 0, &sb) == 0){

      if(strncmp(sb.name, prefix, prefixlen) == 0){

         zf = zip_fopen_index(z, i, 0);
         if(zf){
            /*
             * Read one byte less than the buffer holds and terminate the
             * chunk explicitly: remove_xml() and strlen() both need a NUL
             * terminated string, and a full sized read would leave none.
             */
            while((len = zip_fread(zf, buf, sizeof(buf)-1)) > 0){
               buf[len] = '\0';

               remove_xml(buf, &html);
               len = strlen(buf);

               if(state->bodylen < BIGBUFSIZE-len-1){
                  memcpy(&(state->b_body[state->bodylen]), buf, len);
                  state->bodylen += len;
               }
            }

            zip_fclose(zf);
         }

         if(state->bodylen > BIGBUFSIZE-1024) break;
      }

      i++;
   }

   zip_close(z);

   return 0;
}
void read_content_with_popen(struct session_data *sdata, struct _state *state, char *cmd){
int len;
char buf[MAXBUFSIZE];
FILE *f;
snprintf(buf, sizeof(buf)-1, "%s -enc UTF-8 %s -", HAVE_PDFTOTEXT, filename);
f = popen(buf, "r");
f = popen(cmd, "r");
if(f){
while(fgets(buf, sizeof(buf)-1, f)){
len = strlen(buf);
@ -31,3 +107,62 @@ void extract_pdf(struct session_data *sdata, struct _state *state, char *filenam
}
/*
 * Extract the text of a dumped attachment, picking the extractor by type.
 *
 * filename: path of the attachment file to read.
 * type:     extractor name; "other" means there is nothing to do.
 * rec:      counter that is bumped from 0 to 1 when a "zip" attachment is
 *           seen; the archive itself is not processed here.
 *
 * pdf, doc, ppt and xls are handed to an external converter through
 * read_content_with_popen(), each only if the matching HAVE_* macro is
 * defined. odf, docx, xlsx and pptx are read with extract_opendocument()
 * when HAVE_ZIP is defined.
 *
 * NOTE(review): filename is placed unquoted into a shell command line;
 * confirm that callers only pass internally generated file names.
 */
void extract_attachment_content(struct session_data *sdata, struct _state *state, char *filename, char *type, int *rec){
   char cmd[SMALLBUFSIZE];
#ifdef HAVE_ZIP
   char *member = NULL;
#endif

   if(!strcmp(type, "other")) return;

   memset(cmd, 0, sizeof(cmd));

#ifdef HAVE_PDFTOTEXT
   if(!strcmp(type, "pdf")){
      snprintf(cmd, sizeof(cmd)-1, "%s -enc UTF-8 %s -", HAVE_PDFTOTEXT, filename);
   }
#endif

#ifdef HAVE_CATDOC
   if(!strcmp(type, "doc")){
      snprintf(cmd, sizeof(cmd)-1, "%s -d utf-8 %s", HAVE_CATDOC, filename);
   }
#endif

#ifdef HAVE_CATPPT
   if(!strcmp(type, "ppt")){
      snprintf(cmd, sizeof(cmd)-1, "%s -d utf-8 %s", HAVE_CATPPT, filename);
   }
#endif

#ifdef HAVE_XLS2CSV
   if(!strcmp(type, "xls")){
      snprintf(cmd, sizeof(cmd)-1, "%s -d utf-8 %s", HAVE_XLS2CSV, filename);
   }
#endif

   /* only a command longer than 12 characters is run */
   if(strlen(cmd) > 12){
      read_content_with_popen(sdata, state, cmd);
      return;
   }

#ifdef HAVE_ZIP
   /* name prefix of the archive members that hold the document text */
   if(!strcmp(type, "odf")) member = "content.xml";
   else if(!strcmp(type, "docx")) member = "word/document.xml";
   else if(!strcmp(type, "xlsx")) member = "xl/worksheets/sheet";
   else if(!strcmp(type, "pptx")) member = "ppt/slides/slide";

   if(member){
      extract_opendocument(sdata, state, filename, member);
      return;
   }

   if(*rec == 0 && !strcmp(type, "zip")){
      (*rec)++;
   }
#endif
}

View File

@ -1,7 +1,9 @@
#ifndef _EXTRACT_H
#define _EXTRACT_H
void extract_pdf(struct session_data *sdata, struct _state *state, char *filename, struct __config *cfg);
#include "defs.h"
void extract_attachment_content(struct session_data *sdata, struct _state *state, char *filename, char *type, int *rec);
#endif /* _EXTRACT_H */

View File

@ -100,7 +100,7 @@ struct _state parse_message(struct session_data *sdata, int take_into_pieces, st
void post_parse(struct session_data *sdata, struct _state *state, struct __config *cfg){
int i, len;
int i, len, rec=0;
char *p;
free_list(state->boundaries);
@ -127,18 +127,12 @@ void post_parse(struct session_data *sdata, struct _state *state, struct __confi
p = determine_attachment_type(state->attachments[i].filename, state->attachments[i].type);
len = strlen(p);
if(strlen(sdata->attachments) < SMALLBUFSIZE-len-1 && !strstr(sdata->attachments, p)) memcpy(&(sdata->attachments[strlen(sdata->attachments)]), p, len);
if(state->attachments[i].dumped == 1){
#ifdef HAVE_PDFTOTEXT
if(
strcmp(p, "pdf,") == 0 ||
(strcmp(p, "other,") == 0 && strcasestr(state->attachments[i].filename, ".pdf"))
) extract_pdf(sdata, state, state->attachments[i].aname, cfg);
#endif
rec = 0;
if(state->bodylen < BIGBUFSIZE-1024) extract_attachment_content(sdata, state, state->attachments[i].aname, get_attachment_extractor_by_filename(state->attachments[i].filename), &rec);
unlink(state->attachments[i].aname);
}
@ -245,22 +239,29 @@ int parse_line(char *buf, struct _state *state, struct session_data *sdata, int
if(take_into_pieces == 1){
state->fd = open(state->attachments[state->n_attachments].internalname, O_CREAT|O_RDWR, S_IRUSR|S_IWUSR);
p = determine_attachment_type(state->attachments[state->n_attachments].filename, state->attachments[state->n_attachments].type);
if(strcmp("pdf,", p) == 0 || strcmp("other,", p) == 0){
p = get_attachment_extractor_by_filename(state->attachments[state->n_attachments].filename);
snprintf(state->attachments[state->n_attachments].shorttype, TINYBUFSIZE-1, "%s", p);
if(strcmp("other", p)){
state->b64fd = open(state->attachments[state->n_attachments].aname, O_CREAT|O_RDWR, S_IRUSR|S_IWUSR);
state->attachments[state->n_attachments].dumped = 1;
}
if(state->fd == -1){
state->attachments[state->n_attachments].size = 0;
state->attachments[state->n_attachments].dumped = 0;
memset(state->attachments[state->n_attachments].type, 0, TINYBUFSIZE);
memset(state->attachments[state->n_attachments].shorttype, 0, TINYBUFSIZE);
memset(state->attachments[state->n_attachments].aname, 0, TINYBUFSIZE);
memset(state->attachments[state->n_attachments].filename, 0, TINYBUFSIZE);
memset(state->attachments[state->n_attachments].internalname, 0, TINYBUFSIZE);
memset(state->attachments[state->n_attachments].digest, 0, 2*DIGEST_LENGTH+1);
syslog(LOG_PRIORITY, "%s: error opening %s", sdata->ttmpfile, state->attachments[state->n_attachments].internalname);
state->n_attachments--;

View File

@ -31,6 +31,7 @@ void degenerateToken(unsigned char *p);
void fixURL(char *url);
int extractNameFromHeaderLine(char *s, char *name, char *resultbuf);
char *determine_attachment_type(char *filename, char *type);
char *get_attachment_extractor_by_filename(char *filename);
void parse_reference(struct _state *state, char *s);
int base64_decode_attachment_buffer(char *p, int plen, unsigned char *b, int blen);

View File

@ -76,6 +76,7 @@ void init_state(struct _state *state){
state->attachments[i].size = 0;
state->attachments[i].dumped = 0;
memset(state->attachments[i].type, 0, TINYBUFSIZE);
memset(state->attachments[i].shorttype, 0, TINYBUFSIZE);
memset(state->attachments[i].aname, 0, TINYBUFSIZE);
memset(state->attachments[i].filename, 0, TINYBUFSIZE);
memset(state->attachments[i].internalname, 0, TINYBUFSIZE);
@ -697,23 +698,40 @@ char *determine_attachment_type(char *filename, char *type){
if(strncasecmp(type, "application/pdf", strlen("application/pdf")) == 0) return "pdf,";
if(strncasecmp(type, "application/ms-tnef", strlen("application/ms-tnef")) == 0) return "winmail,";
if(strncasecmp(type, "application/msword", strlen("application/msword")) == 0) return "word,";
// a .csv file has the same type
if(strncasecmp(type, "application/vnd.ms-excel", strlen("application/vnd.ms-excel")) == 0) return "excel,";
if(strncasecmp(type, "application/vnd.ms-powerpoint", strlen("application/vnd.ms-powerpoint")) == 0) return "powerpoint,";
if(strncasecmp(type, "application/vnd.visio", strlen("application/vnd.visio")) == 0) return "visio,";
if(strncasecmp(type, "application/vnd.openxmlformats-officedocument.wordprocessingml.document", strlen("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) == 0) return "word,";
if(strncasecmp(type, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", strlen("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) == 0) return "excel,";
if(strncasecmp(type, "application/vnd.openxmlformats-officedocument.presentationml.presentation", strlen("application/vnd.openxmlformats-officedocument.presentationml.presentation")) == 0) return "powerpoint,";
if(strncasecmp(type, "application/x-shockwave-flash", strlen("application/x-shockwave-flash")) == 0) return "flash,";
if(strcasestr(type, "opendocument")) return "odf,";
if(strcasecmp(type, "application/octet-stream") == 0){
p = strrchr(type, '.');
if(strncasecmp(type, "application/", 12) == 0){
p = strrchr(filename, '.');
if(p){
p++;
if(strncasecmp(p, "pdf", 3) == 0) return "pdf,";
if(strncasecmp(p, "zip", 3) == 0) return "compressed,";
if(strncasecmp(p, "rar", 3) == 0) return "compressed,";
// tar.gz has the same type
if(strncasecmp(p, "x-gzip", 3) == 0) return "compressed,";
if(strncasecmp(p, "doc", 3) == 0) return "word,";
if(strncasecmp(p, "docx", 4) == 0) return "word,";
if(strncasecmp(p, "xls", 3) == 0) return "excel,";
@ -733,6 +751,32 @@ char *determine_attachment_type(char *filename, char *type){
}
/*
 * Map a file name to the name of the extractor that handles it.
 *
 * Only the part starting at the last '.' of filename is looked at, and the
 * comparison ignores case. Returns a pointer to a constant string: one of
 * the extractor names in the table below, or "other" when the name has no
 * '.' or the extension is not listed.
 */
char *get_attachment_extractor_by_filename(char *filename){
   static const struct {
      const char *ext;
      char *extractor;
   } known[] = {
      { ".pdf",  "pdf"  },
      { ".zip",  "zip"  },
      { ".gz",   "gzip" },
      { ".rar",  "rar"  },
      { ".odt",  "odf"  },
      { ".odp",  "odf"  },
      { ".ods",  "odf"  },
      { ".doc",  "doc"  },
      { ".docx", "docx" },
      { ".xls",  "xls"  },
      { ".xlsx", "xlsx" },
      { ".ppt",  "ppt"  },
      { ".pptx", "pptx" },
      { ".txt",  "text" },
      { ".csv",  "text" }
   };
   size_t k;
   char *dot;

   dot = strrchr(filename, '.');

   if(dot){
      for(k = 0; k < sizeof(known)/sizeof(known[0]); k++){
         if(strcasecmp(dot, known[k].ext) == 0) return known[k].extractor;
      }
   }

   return "other";
}
void parse_reference(struct _state *state, char *s){
int len;
char puf[SMALLBUFSIZE];

View File

@ -72,7 +72,7 @@ int main(int argc, char **argv){
snprintf(sdata.filename, SMALLBUFSIZE-1, "%s", argv[1]);
snprintf(sdata.tmpframe, SMALLBUFSIZE-1, "%s.m", argv[1]);
state = parse_message(&sdata, 0, &cfg);
state = parse_message(&sdata, 1, &cfg);
post_parse(&sdata, &state, &cfg);
printf("message-id: %s\n", state.message_id);
@ -80,7 +80,7 @@ int main(int argc, char **argv){
printf("to: *%s (%s)*\n", state.b_to, state.b_to_domain);
printf("reference: *%s*\n", state.reference);
printf("subject: *%s*\n", state.b_subject);
//printf("body: *%s*\n", state.b_body);
printf("body: *%s*\n", state.b_body);
printf("sent: %ld\n", sdata.sent);
@ -103,8 +103,11 @@ int main(int argc, char **argv){
for(i=1; i<=state.n_attachments; i++){
printf("i:%d, name=*%s*, type: *%s*, size: %d, int.name: %s, digest: %s\n", i, state.attachments[i].filename, state.attachments[i].type, state.attachments[i].size, state.attachments[i].internalname, state.attachments[i].digest);
unlink(state.attachments[i].internalname);
}
unlink(sdata.tmpframe);
printf("attachments:%s\n", sdata.attachments);
printf("direction: %d\n", sdata.direction);