mirror of
https://bitbucket.org/jsuto/piler.git
synced 2025-06-13 12:37:03 +02:00
added odf, ms office attachment support
This commit is contained in:
@ -68,6 +68,7 @@ struct child {
|
||||
struct attachment {
|
||||
int size;
|
||||
char type[TINYBUFSIZE];
|
||||
char shorttype[TINYBUFSIZE];
|
||||
char aname[TINYBUFSIZE];
|
||||
char filename[TINYBUFSIZE];
|
||||
char internalname[TINYBUFSIZE];
|
||||
|
143
src/extract.c
143
src/extract.c
@ -4,15 +4,91 @@
|
||||
#include <unistd.h>
|
||||
#include <piler.h>
|
||||
|
||||
#ifdef HAVE_ZIP
|
||||
#include <zip.h>
|
||||
#endif
|
||||
|
||||
void extract_pdf(struct session_data *sdata, struct _state *state, char *filename, struct __config *cfg){
|
||||
|
||||
/*
 * Strip XML/HTML markup from a NUL-terminated buffer, in place.
 *
 * Every byte from a '<' up to and including the next '>' is dropped; all
 * other bytes are kept. After a '>' a single space is appended (unless the
 * kept text is at most 2 bytes long or already ends with a space), so the
 * text of adjacent elements is not glued together.
 *
 * buf  - NUL-terminated text, rewritten in place; the result is never
 *        longer than the input.
 * html - in/out flag, non-zero while the scan is inside a tag. The caller
 *        keeps it between calls, so a tag that is split across two
 *        consecutive buffers is still removed.
 */
void remove_xml(char *buf, int *html){
   char *src, *dst;
   char c;

   dst = buf;

   for(src = buf; *src; src++){
      /* read the byte first: dst may alias src once we start writing */
      c = *src;

      if(c == '<'){ *html = 1; }

      if(*html == 0){
         *dst = c;
         dst++;
      }

      if(c == '>'){
         *html = 0;

         /*
          * A '>' that was outside any tag has just been copied, so dst may
          * already be one byte ahead of src. Appending the separator in
          * that case would overwrite input that has not been read yet
          * (and, eventually, the terminating NUL), so it is only added
          * while the write cursor has not passed the read cursor.
          */
         if(dst - buf > 2 && *(dst-1) != ' ' && dst <= src){
            *dst = ' ';
            dst++;
         }
      }
   }

   *dst = '\0';
}
|
||||
|
||||
|
||||
int extract_opendocument(struct session_data *sdata, struct _state *state, char *filename, char *prefix){
|
||||
int errorp, i=0, len=0, html=0;
|
||||
char buf[MAXBUFSIZE];
|
||||
struct zip *z;
|
||||
struct zip_stat sb;
|
||||
struct zip_file *zf;
|
||||
|
||||
z = zip_open(filename, 0, &errorp);
|
||||
if(!z) return 1;
|
||||
|
||||
memset(buf, 0, sizeof(buf));
|
||||
|
||||
while(zip_stat_index(z, i, 0, &sb) == 0){
|
||||
if(strncmp(sb.name, prefix, strlen(prefix)) == 0){
|
||||
|
||||
zf = zip_fopen_index(z, i, 0);
|
||||
if(zf){
|
||||
while((len = zip_fread(zf, buf, sizeof(buf))) > 0){
|
||||
|
||||
remove_xml(buf, &html);
|
||||
len = strlen(buf);
|
||||
|
||||
if(state->bodylen < BIGBUFSIZE-len-1){
|
||||
memcpy(&(state->b_body[state->bodylen]), buf, len);
|
||||
state->bodylen += len;
|
||||
}
|
||||
|
||||
memset(buf, 0, sizeof(buf));
|
||||
}
|
||||
zip_fclose(zf);
|
||||
}
|
||||
|
||||
if(state->bodylen > BIGBUFSIZE-1024) break;
|
||||
}
|
||||
|
||||
i++;
|
||||
}
|
||||
|
||||
|
||||
zip_close(z);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
void read_content_with_popen(struct session_data *sdata, struct _state *state, char *cmd){
|
||||
int len;
|
||||
char buf[MAXBUFSIZE];
|
||||
FILE *f;
|
||||
|
||||
snprintf(buf, sizeof(buf)-1, "%s -enc UTF-8 %s -", HAVE_PDFTOTEXT, filename);
|
||||
|
||||
f = popen(buf, "r");
|
||||
f = popen(cmd, "r");
|
||||
if(f){
|
||||
while(fgets(buf, sizeof(buf)-1, f)){
|
||||
len = strlen(buf);
|
||||
@ -31,3 +107,62 @@ void extract_pdf(struct session_data *sdata, struct _state *state, char *filenam
|
||||
}
|
||||
|
||||
|
||||
void extract_attachment_content(struct session_data *sdata, struct _state *state, char *filename, char *type, int *rec){
|
||||
char cmd[SMALLBUFSIZE];
|
||||
|
||||
if(strcmp(type, "other") == 0) return;
|
||||
|
||||
memset(cmd, 0, sizeof(cmd));
|
||||
|
||||
#ifdef HAVE_PDFTOTEXT
|
||||
if(strcmp(type, "pdf") == 0) snprintf(cmd, sizeof(cmd)-1, "%s -enc UTF-8 %s -", HAVE_PDFTOTEXT, filename);
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_CATDOC
|
||||
if(strcmp(type, "doc") == 0) snprintf(cmd, sizeof(cmd)-1, "%s -d utf-8 %s", HAVE_CATDOC, filename);
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_CATPPT
|
||||
if(strcmp(type, "ppt") == 0) snprintf(cmd, sizeof(cmd)-1, "%s -d utf-8 %s", HAVE_CATPPT, filename);
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_XLS2CSV
|
||||
if(strcmp(type, "xls") == 0) snprintf(cmd, sizeof(cmd)-1, "%s -d utf-8 %s", HAVE_XLS2CSV, filename);
|
||||
#endif
|
||||
|
||||
if(strlen(cmd) > 12){
|
||||
read_content_with_popen(sdata, state, cmd);
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
#ifdef HAVE_ZIP
|
||||
if(strcmp(type, "odf") == 0){
|
||||
extract_opendocument(sdata, state, filename, "content.xml");
|
||||
return;
|
||||
}
|
||||
|
||||
if(strcmp(type, "docx") == 0){
|
||||
extract_opendocument(sdata, state, filename, "word/document.xml");
|
||||
return;
|
||||
}
|
||||
|
||||
if(strcmp(type, "xlsx") == 0){
|
||||
extract_opendocument(sdata, state, filename, "xl/worksheets/sheet");
|
||||
return;
|
||||
}
|
||||
|
||||
if(strcmp(type, "pptx") == 0){
|
||||
extract_opendocument(sdata, state, filename, "ppt/slides/slide");
|
||||
return;
|
||||
}
|
||||
|
||||
if(strcmp(type, "zip") == 0 && *rec == 0){
|
||||
(*rec)++;
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
@ -1,7 +1,9 @@
|
||||
#ifndef _EXTRACT_H
|
||||
#define _EXTRACT_H
|
||||
|
||||
void extract_pdf(struct session_data *sdata, struct _state *state, char *filename, struct __config *cfg);
|
||||
#include "defs.h"
|
||||
|
||||
void extract_attachment_content(struct session_data *sdata, struct _state *state, char *filename, char *type, int *rec);
|
||||
|
||||
|
||||
#endif /* _EXTRACT_H */
|
||||
|
27
src/parser.c
27
src/parser.c
@ -100,7 +100,7 @@ struct _state parse_message(struct session_data *sdata, int take_into_pieces, st
|
||||
|
||||
|
||||
void post_parse(struct session_data *sdata, struct _state *state, struct __config *cfg){
|
||||
int i, len;
|
||||
int i, len, rec=0;
|
||||
char *p;
|
||||
|
||||
free_list(state->boundaries);
|
||||
@ -127,18 +127,12 @@ void post_parse(struct session_data *sdata, struct _state *state, struct __confi
|
||||
|
||||
p = determine_attachment_type(state->attachments[i].filename, state->attachments[i].type);
|
||||
len = strlen(p);
|
||||
|
||||
if(strlen(sdata->attachments) < SMALLBUFSIZE-len-1 && !strstr(sdata->attachments, p)) memcpy(&(sdata->attachments[strlen(sdata->attachments)]), p, len);
|
||||
|
||||
if(state->attachments[i].dumped == 1){
|
||||
|
||||
#ifdef HAVE_PDFTOTEXT
|
||||
if(
|
||||
strcmp(p, "pdf,") == 0 ||
|
||||
(strcmp(p, "other,") == 0 && strcasestr(state->attachments[i].filename, ".pdf"))
|
||||
) extract_pdf(sdata, state, state->attachments[i].aname, cfg);
|
||||
#endif
|
||||
|
||||
rec = 0;
|
||||
if(state->bodylen < BIGBUFSIZE-1024) extract_attachment_content(sdata, state, state->attachments[i].aname, get_attachment_extractor_by_filename(state->attachments[i].filename), &rec);
|
||||
|
||||
unlink(state->attachments[i].aname);
|
||||
}
|
||||
|
||||
@ -245,22 +239,29 @@ int parse_line(char *buf, struct _state *state, struct session_data *sdata, int
|
||||
if(take_into_pieces == 1){
|
||||
state->fd = open(state->attachments[state->n_attachments].internalname, O_CREAT|O_RDWR, S_IRUSR|S_IWUSR);
|
||||
|
||||
p = determine_attachment_type(state->attachments[state->n_attachments].filename, state->attachments[state->n_attachments].type);
|
||||
|
||||
if(strcmp("pdf,", p) == 0 || strcmp("other,", p) == 0){
|
||||
p = get_attachment_extractor_by_filename(state->attachments[state->n_attachments].filename);
|
||||
|
||||
snprintf(state->attachments[state->n_attachments].shorttype, TINYBUFSIZE-1, "%s", p);
|
||||
|
||||
if(strcmp("other", p)){
|
||||
state->b64fd = open(state->attachments[state->n_attachments].aname, O_CREAT|O_RDWR, S_IRUSR|S_IWUSR);
|
||||
state->attachments[state->n_attachments].dumped = 1;
|
||||
}
|
||||
|
||||
|
||||
|
||||
if(state->fd == -1){
|
||||
|
||||
state->attachments[state->n_attachments].size = 0;
|
||||
state->attachments[state->n_attachments].dumped = 0;
|
||||
memset(state->attachments[state->n_attachments].type, 0, TINYBUFSIZE);
|
||||
memset(state->attachments[state->n_attachments].shorttype, 0, TINYBUFSIZE);
|
||||
memset(state->attachments[state->n_attachments].aname, 0, TINYBUFSIZE);
|
||||
memset(state->attachments[state->n_attachments].filename, 0, TINYBUFSIZE);
|
||||
memset(state->attachments[state->n_attachments].internalname, 0, TINYBUFSIZE);
|
||||
memset(state->attachments[state->n_attachments].digest, 0, 2*DIGEST_LENGTH+1);
|
||||
|
||||
|
||||
syslog(LOG_PRIORITY, "%s: error opening %s", sdata->ttmpfile, state->attachments[state->n_attachments].internalname);
|
||||
|
||||
state->n_attachments--;
|
||||
|
@ -31,6 +31,7 @@ void degenerateToken(unsigned char *p);
|
||||
void fixURL(char *url);
|
||||
int extractNameFromHeaderLine(char *s, char *name, char *resultbuf);
|
||||
char *determine_attachment_type(char *filename, char *type);
|
||||
char *get_attachment_extractor_by_filename(char *filename);
|
||||
void parse_reference(struct _state *state, char *s);
|
||||
int base64_decode_attachment_buffer(char *p, int plen, unsigned char *b, int blen);
|
||||
|
||||
|
@ -76,6 +76,7 @@ void init_state(struct _state *state){
|
||||
state->attachments[i].size = 0;
|
||||
state->attachments[i].dumped = 0;
|
||||
memset(state->attachments[i].type, 0, TINYBUFSIZE);
|
||||
memset(state->attachments[i].shorttype, 0, TINYBUFSIZE);
|
||||
memset(state->attachments[i].aname, 0, TINYBUFSIZE);
|
||||
memset(state->attachments[i].filename, 0, TINYBUFSIZE);
|
||||
memset(state->attachments[i].internalname, 0, TINYBUFSIZE);
|
||||
@ -697,23 +698,40 @@ char *determine_attachment_type(char *filename, char *type){
|
||||
|
||||
if(strncasecmp(type, "application/pdf", strlen("application/pdf")) == 0) return "pdf,";
|
||||
|
||||
if(strncasecmp(type, "application/ms-tnef", strlen("application/ms-tnef")) == 0) return "winmail,";
|
||||
if(strncasecmp(type, "application/msword", strlen("application/msword")) == 0) return "word,";
|
||||
|
||||
// a .csv file has the same type
|
||||
if(strncasecmp(type, "application/vnd.ms-excel", strlen("application/vnd.ms-excel")) == 0) return "excel,";
|
||||
|
||||
if(strncasecmp(type, "application/vnd.ms-powerpoint", strlen("application/vnd.ms-powerpoint")) == 0) return "powerpoint,";
|
||||
|
||||
if(strncasecmp(type, "application/vnd.visio", strlen("application/vnd.visio")) == 0) return "visio,";
|
||||
|
||||
if(strncasecmp(type, "application/vnd.openxmlformats-officedocument.wordprocessingml.document", strlen("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) == 0) return "word,";
|
||||
if(strncasecmp(type, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", strlen("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) == 0) return "excel,";
|
||||
if(strncasecmp(type, "application/vnd.openxmlformats-officedocument.presentationml.presentation", strlen("application/vnd.openxmlformats-officedocument.presentationml.presentation")) == 0) return "powerpoint,";
|
||||
|
||||
if(strncasecmp(type, "application/x-shockwave-flash", strlen("application/x-shockwave-flash")) == 0) return "flash,";
|
||||
|
||||
if(strcasestr(type, "opendocument")) return "odf,";
|
||||
|
||||
if(strcasecmp(type, "application/octet-stream") == 0){
|
||||
|
||||
p = strrchr(type, '.');
|
||||
|
||||
if(strncasecmp(type, "application/", 12) == 0){
|
||||
|
||||
p = strrchr(filename, '.');
|
||||
if(p){
|
||||
p++;
|
||||
|
||||
if(strncasecmp(p, "pdf", 3) == 0) return "pdf,";
|
||||
|
||||
if(strncasecmp(p, "zip", 3) == 0) return "compressed,";
|
||||
if(strncasecmp(p, "rar", 3) == 0) return "compressed,";
|
||||
|
||||
// tar.gz has the same type
|
||||
if(strncasecmp(p, "x-gzip", 3) == 0) return "compressed,";
|
||||
|
||||
if(strncasecmp(p, "doc", 3) == 0) return "word,";
|
||||
if(strncasecmp(p, "docx", 4) == 0) return "word,";
|
||||
if(strncasecmp(p, "xls", 3) == 0) return "excel,";
|
||||
@ -733,6 +751,32 @@ char *determine_attachment_type(char *filename, char *type){
|
||||
}
|
||||
|
||||
|
||||
/*
 * Map a file name to the name of the content extractor for it.
 *
 * Only the part starting at the last '.' of filename is looked at, and
 * it is compared case-insensitively against the known extensions.
 *
 * Returns a pointer to a constant string: one of "pdf", "zip", "gzip",
 * "rar", "odf", "doc", "docx", "xls", "xlsx", "ppt", "pptx", "text", or
 * "other" when there is no '.' or the extension is not known.
 */
char *get_attachment_extractor_by_filename(char *filename){
   static const struct {
      const char *ext;
      char *extractor;
   } known[] = {
      { ".pdf",  "pdf"  },
      { ".zip",  "zip"  },
      { ".gz",   "gzip" },
      { ".rar",  "rar"  },
      { ".odt",  "odf"  },
      { ".odp",  "odf"  },
      { ".ods",  "odf"  },
      { ".doc",  "doc"  },
      { ".docx", "docx" },
      { ".xls",  "xls"  },
      { ".xlsx", "xlsx" },
      { ".ppt",  "ppt"  },
      { ".pptx", "pptx" },
      { ".txt",  "text" },
      { ".csv",  "text" }
   };
   unsigned int k;
   char *dot;

   dot = strrchr(filename, '.');

   if(dot){
      for(k = 0; k < sizeof(known)/sizeof(known[0]); k++){
         if(strcasecmp(dot, known[k].ext) == 0){
            return known[k].extractor;
         }
      }
   }

   return "other";
}
|
||||
|
||||
|
||||
void parse_reference(struct _state *state, char *s){
|
||||
int len;
|
||||
char puf[SMALLBUFSIZE];
|
||||
|
@ -72,7 +72,7 @@ int main(int argc, char **argv){
|
||||
snprintf(sdata.filename, SMALLBUFSIZE-1, "%s", argv[1]);
|
||||
snprintf(sdata.tmpframe, SMALLBUFSIZE-1, "%s.m", argv[1]);
|
||||
|
||||
state = parse_message(&sdata, 0, &cfg);
|
||||
state = parse_message(&sdata, 1, &cfg);
|
||||
post_parse(&sdata, &state, &cfg);
|
||||
|
||||
printf("message-id: %s\n", state.message_id);
|
||||
@ -80,7 +80,7 @@ int main(int argc, char **argv){
|
||||
printf("to: *%s (%s)*\n", state.b_to, state.b_to_domain);
|
||||
printf("reference: *%s*\n", state.reference);
|
||||
printf("subject: *%s*\n", state.b_subject);
|
||||
//printf("body: *%s*\n", state.b_body);
|
||||
printf("body: *%s*\n", state.b_body);
|
||||
|
||||
printf("sent: %ld\n", sdata.sent);
|
||||
|
||||
@ -103,8 +103,11 @@ int main(int argc, char **argv){
|
||||
|
||||
for(i=1; i<=state.n_attachments; i++){
|
||||
printf("i:%d, name=*%s*, type: *%s*, size: %d, int.name: %s, digest: %s\n", i, state.attachments[i].filename, state.attachments[i].type, state.attachments[i].size, state.attachments[i].internalname, state.attachments[i].digest);
|
||||
unlink(state.attachments[i].internalname);
|
||||
}
|
||||
|
||||
unlink(sdata.tmpframe);
|
||||
|
||||
printf("attachments:%s\n", sdata.attachments);
|
||||
|
||||
printf("direction: %d\n", sdata.direction);
|
||||
|
Reference in New Issue
Block a user