piler/src/parser.c

791 lines
26 KiB
C
Raw Normal View History

2011-11-14 15:57:52 +01:00
/*
* parser.c, SJ
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <sys/socket.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <fcntl.h>
#include <unistd.h>
#include <piler.h>
2015-11-21 23:06:47 +01:00
struct parser_state parse_message(struct session_data *sdata, int take_into_pieces, struct __data *data, struct __config *cfg){
2011-11-14 15:57:52 +01:00
FILE *f;
2016-04-05 21:10:09 +02:00
int i;
unsigned int len;
char *p, buf[MAXBUFSIZE], puf[SMALLBUFSIZE];
2012-08-21 21:57:39 +02:00
char writebuffer[MAXBUFSIZE], abuffer[MAXBUFSIZE];
2015-11-21 23:06:47 +01:00
struct parser_state state;
2011-11-14 15:57:52 +01:00
2011-11-16 14:47:47 +01:00
init_state(&state);
2011-11-14 15:57:52 +01:00
f = fopen(sdata->filename, "r");
2011-11-14 15:57:52 +01:00
if(!f){
syslog(LOG_PRIORITY, "%s: cannot open", sdata->ttmpfile);
return state;
2014-06-04 22:20:10 +02:00
}
if(sdata->num_of_rcpt_to > 0 && cfg->process_rcpt_to_addresses == 1){
for(i=0; i<sdata->num_of_rcpt_to; i++){
2014-07-01 11:43:36 +02:00
snprintf(puf, sizeof(puf)-1, "%s ", sdata->rcptto[i]);
2014-07-01 11:43:36 +02:00
if(does_it_seem_like_an_email_address(puf) == 1){
p = strstr(puf, cfg->hostid);
if(!p){
2014-07-01 11:43:36 +02:00
strtolower(puf);
len = strlen(puf);
2014-07-01 11:43:36 +02:00
if(state.tolen < MAXBUFSIZE-len-1){
2014-07-01 11:43:36 +02:00
if(findnode(state.rcpt, puf) == NULL){
addnode(state.journal_recipient, puf);
2014-07-08 16:39:07 +02:00
addnode(state.rcpt, puf);
2014-07-01 11:43:36 +02:00
memcpy(&(state.b_journal_to[state.journaltolen]), puf, len);
state.journaltolen += len;
memcpy(&(state.b_to[state.tolen]), puf, len);
state.tolen += len;
if(state.tolen < MAXBUFSIZE-len-1){
split_email_address(puf);
2012-09-04 14:49:56 +02:00
memcpy(&(state.b_to[state.tolen]), puf, len);
state.tolen += len;
}
}
}
}
}
}
}
2012-06-01 14:25:49 +02:00
if(take_into_pieces == 1){
state.mfd = open(sdata->tmpframe, O_CREAT|O_RDWR, S_IRUSR|S_IWUSR);
if(state.mfd == -1){
syslog(LOG_PRIORITY, "%s: cannot open frame file: %s", sdata->ttmpfile, sdata->tmpframe);
fclose(f);
return state;
}
2011-11-14 15:57:52 +01:00
}
2011-11-19 21:25:44 +01:00
while(fgets(buf, sizeof(buf)-1, f)){
2013-01-06 22:16:21 +01:00
parse_line(buf, &state, sdata, take_into_pieces, &writebuffer[0], sizeof(writebuffer), &abuffer[0], sizeof(abuffer), data, cfg);
2012-08-21 21:57:39 +02:00
}
if(take_into_pieces == 1 && state.writebufpos > 0){
2015-12-28 13:28:19 +01:00
write(state.mfd, writebuffer, state.writebufpos);
2012-08-21 21:57:39 +02:00
memset(writebuffer, 0, sizeof(writebuffer));
state.writebufpos = 0;
2012-06-01 14:25:49 +02:00
}
if(take_into_pieces == 1){
close(state.mfd); state.mfd = 0;
2011-11-19 21:25:44 +01:00
}
2011-11-14 15:57:52 +01:00
fclose(f);
return state;
}
2015-11-21 23:06:47 +01:00
void post_parse(struct session_data *sdata, struct parser_state *state, struct __config *cfg){
2016-04-05 21:10:09 +02:00
int i, rec=0;
unsigned int len;
char *p;
clearhash(state->boundaries);
clearhash(state->rcpt);
clearhash(state->rcpt_domain);
clearhash(state->journal_recipient);
2011-11-19 21:25:44 +01:00
trimBuffer(state->b_subject);
2011-11-14 15:57:52 +01:00
if(sdata->internal_sender == 0) sdata->direction = DIRECTION_INCOMING;
else {
if(sdata->internal_recipient == 1) sdata->direction = DIRECTION_INTERNAL;
if(sdata->external_recipient == 1) sdata->direction = DIRECTION_OUTGOING;
if(sdata->internal_recipient == 1 && sdata->external_recipient == 1) sdata->direction = DIRECTION_INTERNAL_AND_OUTGOING;
}
2011-11-19 21:25:44 +01:00
2011-11-28 14:21:14 +01:00
for(i=1; i<=state->n_attachments; i++){
digest_file(state->attachments[i].internalname, &(state->attachments[i].digest[0]));
2011-12-30 15:52:59 +01:00
if(cfg->verbosity >= _LOG_DEBUG) syslog(LOG_PRIORITY, "%s: attachment list: i:%d, name=*%s*, type: *%s*, size: %d, int.name: %s, digest: %s", sdata->ttmpfile, i, state->attachments[i].filename, state->attachments[i].type, state->attachments[i].size, state->attachments[i].internalname, state->attachments[i].digest);
p = determine_attachment_type(state->attachments[i].filename, state->attachments[i].type);
2011-12-30 15:52:59 +01:00
len = strlen(p);
if(strlen(sdata->attachments) < SMALLBUFSIZE-len-1 && !strstr(sdata->attachments, p)) memcpy(&(sdata->attachments[strlen(sdata->attachments)]), p, len);
2012-09-07 15:08:50 +02:00
if(state->attachments[i].dumped == 1){
rec = 0;
if(cfg->extract_attachments == 1 && state->bodylen < BIGBUFSIZE-1024) extract_attachment_content(sdata, state, state->attachments[i].aname, get_attachment_extractor_by_filename(state->attachments[i].filename), &rec, cfg);
2012-09-07 15:08:50 +02:00
unlink(state->attachments[i].aname);
}
2011-11-19 21:25:44 +01:00
}
if(state->message_id[0] == 0){
if(cfg->archive_emails_not_having_message_id == 1)
snprintf(state->message_id, SMALLBUFSIZE-1, "%s", sdata->ttmpfile);
else snprintf(state->message_id, SMALLBUFSIZE-1, "null");
}
2011-11-14 15:57:52 +01:00
2014-04-25 21:17:01 +02:00
digest_string(state->message_id, &(state->message_id_hash[0]));
2015-11-10 16:06:47 +01:00
if(sdata->sent == 0) sdata->sent = sdata->now;
2011-11-14 15:57:52 +01:00
}
2015-11-21 23:06:47 +01:00
/*
 * Cancel the most recently registered attachment: switch dumping off,
 * wipe every field of the last attachment slot and decrease the
 * attachment counter. With no attachment registered only the dump
 * flag is cleared.
 */
void storno_attachment(struct parser_state *state){
   int last;

   state->has_to_dump = 0;

   last = state->n_attachments;
   if(last <= 0){
      return;
   }

   state->attachments[last].size = 0;
   state->attachments[last].dumped = 0;

   memset(state->attachments[last].type, 0, TINYBUFSIZE);
   memset(state->attachments[last].shorttype, 0, TINYBUFSIZE);
   memset(state->attachments[last].aname, 0, TINYBUFSIZE);
   memset(state->attachments[last].filename, 0, TINYBUFSIZE);
   memset(state->attachments[last].internalname, 0, TINYBUFSIZE);
   memset(state->attachments[last].digest, 0, 2*DIGEST_LENGTH+1);

   state->n_attachments = last - 1;
}
2015-11-21 23:06:47 +01:00
int parse_line(char *buf, struct parser_state *state, struct session_data *sdata, int take_into_pieces, char *writebuffer, int writebuffersize, char *abuffer, int abuffersize, struct __data *data, struct __config *cfg){
char *p, *q, puf[SMALLBUFSIZE];
2012-09-07 15:08:50 +02:00
unsigned char b64buffer[MAXBUFSIZE];
2014-08-30 21:10:29 +02:00
char tmpbuf[MAXBUFSIZE];
2016-04-05 21:10:09 +02:00
int n64, writelen, boundary_line=0, result;
unsigned int len;
2011-11-14 15:57:52 +01:00
if(cfg->debug == 1) printf("line: %s", buf);
2011-11-14 15:57:52 +01:00
state->line_num++;
2011-11-19 21:25:44 +01:00
len = strlen(buf);
/*
* check a few things in the 1st header
*/
2011-12-13 17:05:22 +01:00
if(state->is_1st_header == 1){
if(strncmp(buf, "Received: by piler", strlen("Received: by piler")) == 0){
sdata->restored_copy = 1;
}
if(*(cfg->piler_header_field) != 0 && strncmp(buf, cfg->piler_header_field, strlen(cfg->piler_header_field)) == 0){
sdata->restored_copy = 1;
}
2012-01-26 14:35:51 +01:00
if(sdata->ms_journal == 0 && strncmp(buf, "X-MS-Journal-Report:", strlen("X-MS-Journal-Report:")) == 0){
//if(sdata->import == 0){
sdata->ms_journal = 1;
memset(state->message_id, 0, SMALLBUFSIZE);
//}
}
2012-09-03 10:06:34 +02:00
}
2011-11-22 12:31:54 +01:00
if(state->message_rfc822 == 0 && (buf[0] == '\r' || buf[0] == '\n') ){
state->message_state = MSG_BODY;
if(state->is_header == 1) state->is_header = 0;
state->is_1st_header = 0;
if(state->anamepos > 0){
extractNameFromHeaderLine(state->attachment_name_buf, "name", state->filename);
}
2011-11-19 21:25:44 +01:00
}
2011-11-22 12:31:54 +01:00
2012-06-01 14:25:49 +02:00
if(take_into_pieces == 1){
2013-09-11 09:19:29 +02:00
if(state->message_state == MSG_BODY && state->fd != -1 && is_substr_in_hash(state->boundaries, buf) == 0){
2012-08-21 21:57:39 +02:00
//n = write(state->fd, buf, len); // WRITE
if(len + state->abufpos > abuffersize-1){
2012-11-03 23:42:36 +01:00
write(state->fd, abuffer, state->abufpos);
2012-09-07 15:08:50 +02:00
if(state->b64fd != -1){
abuffer[state->abufpos] = '\0';
2013-03-23 17:58:58 +01:00
if(state->base64 == 1){
2016-04-05 21:10:09 +02:00
n64 = base64_decode_attachment_buffer(abuffer, &b64buffer[0], sizeof(b64buffer));
2015-12-28 13:28:19 +01:00
write(state->b64fd, b64buffer, n64);
2013-03-23 17:58:58 +01:00
}
else {
2015-12-28 13:28:19 +01:00
write(state->b64fd, abuffer, state->abufpos);
2013-03-23 17:58:58 +01:00
}
2012-09-07 15:08:50 +02:00
}
state->abufpos = 0; memset(abuffer, 0, abuffersize);
2012-08-21 21:57:39 +02:00
}
memcpy(abuffer+state->abufpos, buf, len); state->abufpos += len;
2012-06-01 14:25:49 +02:00
state->attachments[state->n_attachments].size += len;
}
else {
state->saved_size += len;
2012-08-21 21:57:39 +02:00
//n = write(state->mfd, buf, len); // WRITE
if(len + state->writebufpos > writebuffersize-1){
2012-11-03 23:42:36 +01:00
write(state->mfd, writebuffer, state->writebufpos); state->writebufpos = 0; memset(writebuffer, 0, writebuffersize);
2012-08-21 21:57:39 +02:00
}
memcpy(writebuffer+state->writebufpos, buf, len); state->writebufpos += len;
2012-06-01 14:25:49 +02:00
}
2011-11-19 21:25:44 +01:00
}
2011-11-22 12:31:54 +01:00
if(state->message_state == MSG_BODY && state->has_to_dump == 1 && state->pushed_pointer == 0){
//printf("####name: %s, type: %s, base64: %d\n", state->filename, state->type, state->base64);
state->pushed_pointer = 1;
2013-03-23 17:58:58 +01:00
// this is a real attachment to dump, it doesn't have to be base64 encoded!
if(strlen(state->filename) > 4 && strlen(state->type) > 3 && state->n_attachments < MAX_ATTACHMENTS-1){
2011-11-22 12:31:54 +01:00
state->n_attachments++;
snprintf(state->attachments[state->n_attachments].filename, TINYBUFSIZE-1, "%s", state->filename);
snprintf(state->attachments[state->n_attachments].type, TINYBUFSIZE-1, "%s", state->type);
snprintf(state->attachments[state->n_attachments].internalname, TINYBUFSIZE-1, "%s.a%d", sdata->ttmpfile, state->n_attachments);
2012-09-07 15:08:50 +02:00
snprintf(state->attachments[state->n_attachments].aname, TINYBUFSIZE-1, "%s.a%d.bin", sdata->ttmpfile, state->n_attachments);
2011-11-22 12:31:54 +01:00
//printf("DUMP FILE: %s\n", state->attachments[state->n_attachments].internalname);
2012-06-01 14:25:49 +02:00
if(take_into_pieces == 1){
state->fd = open(state->attachments[state->n_attachments].internalname, O_CREAT|O_RDWR, S_IRUSR|S_IWUSR);
2012-09-07 15:08:50 +02:00
2014-02-11 15:34:12 +01:00
fixupEncodedHeaderLine(state->attachments[state->n_attachments].filename, TINYBUFSIZE);
p = get_attachment_extractor_by_filename(state->attachments[state->n_attachments].filename);
snprintf(state->attachments[state->n_attachments].shorttype, TINYBUFSIZE-1, "%s", p);
if(strcmp("other", p)){
2013-05-15 10:59:02 +02:00
state->b64fd = open(state->attachments[state->n_attachments].aname, O_CREAT|O_RDWR, S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
2012-09-07 15:08:50 +02:00
state->attachments[state->n_attachments].dumped = 1;
}
2012-06-01 14:25:49 +02:00
if(state->fd == -1){
2012-11-02 22:17:21 +01:00
storno_attachment(state);
2012-06-01 14:25:49 +02:00
syslog(LOG_PRIORITY, "%s: error opening %s", sdata->ttmpfile, state->attachments[state->n_attachments].internalname);
}
else {
snprintf(puf, sizeof(puf)-1, "ATTACHMENT_POINTER_%s.a%d_XXX_PILER", sdata->ttmpfile, state->n_attachments);
2012-08-21 21:57:39 +02:00
//n = write(state->mfd, puf, strlen(puf)); // WRITE
writelen = strlen(puf);
if(writelen + state->writebufpos > writebuffersize-1){
2012-11-03 23:42:36 +01:00
write(state->mfd, writebuffer, state->writebufpos); state->writebufpos = 0; memset(writebuffer, 0, writebuffersize);
2012-08-21 21:57:39 +02:00
}
memcpy(writebuffer+state->writebufpos, puf, writelen); state->writebufpos += writelen;
2012-06-01 14:25:49 +02:00
}
2011-11-23 12:24:21 +01:00
}
2012-06-01 14:25:49 +02:00
2011-11-22 12:31:54 +01:00
}
else {
state->has_to_dump = 0;
}
}
2011-11-14 15:57:52 +01:00
if(*buf == '.' && *(buf+1) == '.') buf++;
/* undefined message state */
if(state->is_header == 1 && buf[0] != ' ' && buf[0] != '\t' && strchr(buf, ':')) state->message_state = MSG_UNDEF;
/* skip empty lines */
if(state->message_rfc822 == 0 && (buf[0] == '\r' || buf[0] == '\n') ){
return 0;
}
2013-02-19 22:28:44 +01:00
trimBuffer(buf);
2011-11-14 15:57:52 +01:00
/* skip the first line, if it's a "From <email address> date" format */
if(state->line_num == 1 && strncmp(buf, "From ", 5) == 0) return 0;
if(state->is_header == 0 && buf[0] != ' ' && buf[0] != '\t') state->message_state = MSG_BODY;
// journal fix
if(state->message_state == MSG_BODY && sdata->ms_journal == 1){
state->is_header = 1;
state->is_1st_header = 1;
}
2011-11-14 15:57:52 +01:00
/* header checks */
if(state->is_header == 1){
if(*(cfg->spam_header_line) != '\0' && strncmp(buf, cfg->spam_header_line, strlen(cfg->spam_header_line)) == 0){
sdata->spam_message = 1;
}
2011-11-16 14:47:47 +01:00
if(strncasecmp(buf, "From:", strlen("From:")) == 0) state->message_state = MSG_FROM;
2013-03-24 01:20:12 +01:00
else if(strncasecmp(buf, "Content-Type:", strlen("Content-Type:")) == 0){
state->message_state = MSG_CONTENT_TYPE;
if(state->anamepos > 0){
extractNameFromHeaderLine(state->attachment_name_buf, "name", state->filename);
memset(state->attachment_name_buf, 0, SMALLBUFSIZE);
state->anamepos = 0;
}
}
2013-03-24 01:20:12 +01:00
else if(strncasecmp(buf, "Content-Transfer-Encoding:", strlen("Content-Transfer-Encoding:")) == 0) state->message_state = MSG_CONTENT_TRANSFER_ENCODING;
else if(strncasecmp(buf, "Content-Disposition:", strlen("Content-Disposition:")) == 0){
state->message_state = MSG_CONTENT_DISPOSITION;
if(state->anamepos > 0){
extractNameFromHeaderLine(state->attachment_name_buf, "name", state->filename);
memset(state->attachment_name_buf, 0, SMALLBUFSIZE);
state->anamepos = 0;
}
2013-03-24 01:20:12 +01:00
}
2011-11-14 15:57:52 +01:00
else if(strncasecmp(buf, "To:", 3) == 0) state->message_state = MSG_TO;
else if(strncasecmp(buf, "Cc:", 3) == 0) state->message_state = MSG_CC;
else if(strncasecmp(buf, "Bcc:", 4) == 0) state->message_state = MSG_CC;
2011-11-14 15:57:52 +01:00
else if(strncasecmp(buf, "Message-Id:", 11) == 0) state->message_state = MSG_MESSAGE_ID;
2012-02-08 23:14:28 +01:00
else if(strncasecmp(buf, "References:", 11) == 0) state->message_state = MSG_REFERENCES;
2011-11-14 15:57:52 +01:00
else if(strncasecmp(buf, "Subject:", strlen("Subject:")) == 0) state->message_state = MSG_SUBJECT;
2012-09-03 10:06:34 +02:00
else if(strncasecmp(buf, "Recipient:", strlen("Recipient:")) == 0) state->message_state = MSG_RECIPIENT;
if(sdata->ms_journal == 1 && (state->message_state == MSG_TO || state->message_state == MSG_RECIPIENT) ){
p = strstr(buf, "Expanded:");
if(p) *p = '\0';
}
/*
2015-11-10 16:06:47 +01:00
* by default sdata->sent = 0, and let the parser extract value from the Date: header
*/
2015-11-10 16:06:47 +01:00
else if(strncasecmp(buf, "Date:", strlen("Date:")) == 0 && state->is_1st_header == 1 && sdata->sent == 0){
2014-12-03 14:30:26 +01:00
if(strstr(buf, "=?") && strstr(buf, "?=")) fixupEncodedHeaderLine(buf, MAXBUFSIZE);
2016-04-05 21:10:09 +02:00
sdata->sent = parse_date_header(buf);
2015-11-10 16:06:47 +01:00
/* allow +2 days drift in the parsed Date: value */
if(sdata->sent - sdata->now > 2*86400) sdata->sent = sdata->now;
}
2016-04-05 21:10:09 +02:00
else if(strncasecmp(buf, "Delivery-date:", strlen("Delivery-date:")) == 0 && sdata->delivered == 0) sdata->delivered = parse_date_header(buf);
else if(strncasecmp(buf, "Received:", strlen("Received:")) == 0) state->message_state = MSG_RECEIVED;
else if(cfg->extra_to_field[0] != '\0' && strncasecmp(buf, cfg->extra_to_field, strlen(cfg->extra_to_field)) == 0) state->message_state = MSG_TO;
2011-11-14 15:57:52 +01:00
if(state->message_state == MSG_MESSAGE_ID && state->message_id[0] == 0){
p = strchr(buf+11, ' ');
if(p) p = buf + 12;
else p = buf + 11;
snprintf(state->message_id, SMALLBUFSIZE-1, "%s", p);
}
/* we are interested in only From:, To:, Subject:, Received:, Content-*: header lines */
if(state->message_state <= 0) return 0;
}
2013-03-24 01:20:12 +01:00
if(state->message_state == MSG_CONTENT_TYPE){
if((p = strcasestr(buf, "boundary"))){
extract_boundary(p, state);
}
2011-11-14 15:57:52 +01:00
}
2015-01-19 16:00:38 +01:00
/*
* A normal journal looks like this:
*
* Sender: sender@domain
* Subject: Test normal
* Message-Id: ...
* Recipient: user1@domain
* Recipient: user2@domain, Forwarded: user1@domain
*
* However if outlook forwards an email, then the journal is somewhat changed:
*
* Sender: sender@domain
* Subject: Test through outlook
* Message-Id: ...
* To: user1@domain
* To: user2@domain, Forwarded: user1@domain
*
*
* Outlook.com has the following scheme, when expanded from a distribution list:
*
* Sender: sender@domain
* Subject: Test Email
* Message-Id: ...
* To: user1@domain, Expanded: listaddress@domain
* To: user2@domain, Expanded: listaddress@domain
*
2015-01-19 16:00:38 +01:00
*/
2012-09-03 10:06:34 +02:00
2012-02-08 23:14:28 +01:00
if(state->is_1st_header == 1 && state->message_state == MSG_REFERENCES){
if(strncasecmp(buf, "References:", 11) == 0) parse_reference(state, buf+11);
else parse_reference(state, buf);
}
if(state->is_1st_header == 1){
2011-11-28 14:21:14 +01:00
if(state->message_state == MSG_SUBJECT && strlen(state->b_subject) + strlen(buf) < MAXBUFSIZE-1){
2012-10-02 15:21:16 +02:00
if(state->b_subject[0] == '\0'){
p = &buf[0];
if(strncmp(buf, "Subject:", strlen("Subject:")) == 0) p += strlen("Subject:");
if(*p == ' ') p++;
2011-11-28 14:21:14 +01:00
fixupEncodedHeaderLine(p, MAXBUFSIZE);
strncat(state->b_subject, p, MAXBUFSIZE-strlen(state->b_subject)-1);
}
else {
2015-01-09 12:00:59 +01:00
/*
* if the next subject line is encoded, then strip the whitespace characters at the beginning of the line
*/
2015-01-09 12:00:59 +01:00
p = buf;
2011-11-28 14:21:14 +01:00
if(strcasestr(buf, "?Q?") || strcasestr(buf, "?B?")){
while(isspace(*p)) p++;
}
2011-11-28 14:21:14 +01:00
fixupEncodedHeaderLine(p, MAXBUFSIZE);
2015-01-09 12:00:59 +01:00
strncat(state->b_subject, p, MAXBUFSIZE-strlen(state->b_subject)-1);
}
}
else { fixupEncodedHeaderLine(buf, MAXBUFSIZE); }
}
2011-11-16 14:47:47 +01:00
2011-11-14 15:57:52 +01:00
/* Content-type: checking */
if(state->message_state == MSG_CONTENT_TYPE){
state->message_rfc822 = 0;
/* extract Content type */
p = strchr(buf, ':');
if(p){
p++;
if(*p == ' ' || *p == '\t') p++;
2011-11-22 12:31:54 +01:00
snprintf(state->type, TINYBUFSIZE-1, "%s", p);
2013-03-24 01:20:12 +01:00
//state->content_type_is_set = 1;
2011-11-22 12:31:54 +01:00
p = strchr(state->type, ';');
2011-11-14 15:57:52 +01:00
if(p) *p = '\0';
}
if(strcasestr(buf, "text/plain") ||
strcasestr(buf, "multipart/mixed") ||
strcasestr(buf, "multipart/alternative") ||
strcasestr(buf, "multipart/report") ||
strcasestr(buf, "message/delivery-status") ||
strcasestr(buf, "text/rfc822-headers") ||
2013-09-11 09:19:29 +02:00
strcasestr(buf, "message/rfc822")
2011-11-14 15:57:52 +01:00
){
2011-11-16 14:47:47 +01:00
state->textplain = 1;
2011-11-14 15:57:52 +01:00
}
else if(strcasestr(buf, "text/html")){
2011-11-16 14:47:47 +01:00
state->texthtml = 1;
2011-11-14 15:57:52 +01:00
}
2011-11-16 14:47:47 +01:00
/* switch (back) to header mode if we encounterd an attachment with "message/rfc822" content-type */
2011-11-14 15:57:52 +01:00
if(strcasestr(buf, "message/rfc822")){
state->message_rfc822 = 1;
state->is_header = 1;
if(sdata->ms_journal == 1){
state->is_1st_header = 1;
// reset all headers, except To:
memset(state->b_subject, 0, MAXBUFSIZE);
memset(state->b_body, 0, BIGBUFSIZE);
memset(state->b_from, 0, SMALLBUFSIZE);
memset(state->b_from_domain, 0, SMALLBUFSIZE);
memset(state->message_id, 0, SMALLBUFSIZE);
sdata->ms_journal = 0;
}
2011-11-14 15:57:52 +01:00
}
2014-08-30 21:10:29 +02:00
if(strcasestr(buf, "charset")) extractNameFromHeaderLine(buf, "charset", state->charset);
if(strcasestr(state->charset, "UTF-8")) state->utf8 = 1;
2011-11-19 21:25:44 +01:00
}
2011-11-14 15:57:52 +01:00
2011-11-22 12:31:54 +01:00
if((state->message_state == MSG_CONTENT_TYPE || state->message_state == MSG_CONTENT_DISPOSITION) && strlen(state->filename) < 5){
p = &buf[0];
for(; *p; p++){
if(*p != ' ' && *p != '\t') break;
}
len = strlen(p);
if(len + state->anamepos < SMALLBUFSIZE-2){
memcpy(&(state->attachment_name_buf[state->anamepos]), p, len);
state->anamepos += len;
}
2011-11-16 14:47:47 +01:00
}
2011-11-14 15:57:52 +01:00
if(state->message_state == MSG_CONTENT_TRANSFER_ENCODING){
2011-11-22 12:31:54 +01:00
if(strcasestr(buf, "base64")) state->base64 = 1;
2011-11-14 15:57:52 +01:00
if(strcasestr(buf, "quoted-printable")) state->qp = 1;
}
2011-11-16 14:47:47 +01:00
2011-11-22 12:31:54 +01:00
/* boundary check, and reset variables */
2011-11-14 15:57:52 +01:00
2013-08-22 00:33:39 +02:00
boundary_line = is_substr_in_hash(state->boundaries, buf);
2011-11-14 15:57:52 +01:00
if(!strstr(buf, "boundary=") && !strstr(buf, "boundary =") && boundary_line == 1){
2013-03-24 01:20:12 +01:00
state->is_header = 1;
//state->content_type_is_set = 0;
2011-11-14 15:57:52 +01:00
2011-11-16 14:47:47 +01:00
if(state->has_to_dump == 1){
2012-08-21 21:57:39 +02:00
if(take_into_pieces == 1 && state->fd != -1){
if(state->abufpos > 0){
2012-11-03 23:42:36 +01:00
write(state->fd, abuffer, state->abufpos);
2012-09-07 15:08:50 +02:00
if(state->b64fd != -1){
abuffer[state->abufpos] = '\0';
2013-03-23 17:58:58 +01:00
if(state->base64 == 1){
2016-04-05 21:10:09 +02:00
n64 = base64_decode_attachment_buffer(abuffer, &b64buffer[0], sizeof(b64buffer));
2015-12-28 13:28:19 +01:00
write(state->b64fd, b64buffer, n64);
2013-03-23 17:58:58 +01:00
}
else {
2015-12-28 13:28:19 +01:00
write(state->b64fd, abuffer, state->abufpos);
2013-03-23 17:58:58 +01:00
}
2012-09-07 15:08:50 +02:00
}
state->abufpos = 0; memset(abuffer, 0, abuffersize);
2012-08-21 21:57:39 +02:00
}
close(state->fd);
2012-09-07 15:08:50 +02:00
close(state->b64fd);
2012-08-21 21:57:39 +02:00
}
2011-11-14 15:57:52 +01:00
state->fd = -1;
2012-09-07 15:08:50 +02:00
state->b64fd = -1;
2011-11-16 14:47:47 +01:00
}
2011-11-14 15:57:52 +01:00
2011-11-22 12:31:54 +01:00
state->has_to_dump = 1;
2011-11-14 15:57:52 +01:00
state->base64 = 0; state->textplain = 0; state->texthtml = state->octetstream = 0;
state->skip_html = 0;
state->utf8 = 0;
state->qp = 0;
state->realbinary = 0;
2011-11-19 21:25:44 +01:00
state->pushed_pointer = 0;
2011-11-22 12:31:54 +01:00
memset(state->filename, 0, TINYBUFSIZE);
memset(state->type, 0, TINYBUFSIZE);
2014-08-30 21:10:29 +02:00
snprintf(state->charset, TINYBUFSIZE-1, "unknown");
2011-11-22 12:31:54 +01:00
memset(state->attachment_name_buf, 0, SMALLBUFSIZE);
state->anamepos = 0;
2011-11-22 12:31:54 +01:00
state->message_state = MSG_UNDEF;
2011-11-14 15:57:52 +01:00
return 0;
}
if(boundary_line == 1){ return 0; }
/* end of boundary check */
2011-11-28 14:21:14 +01:00
/* skip irrelevant headers */
2012-09-03 10:06:34 +02:00
if(state->is_header == 1 && state->message_state != MSG_FROM && state->message_state != MSG_TO && state->message_state != MSG_CC && state->message_state != MSG_RECIPIENT) return 0;
2011-11-14 15:57:52 +01:00
2011-11-19 21:25:44 +01:00
/* don't process body if it's not a text or html part */
if(state->message_state == MSG_BODY && state->textplain == 0 && state->texthtml == 0) return 0;
2011-11-14 15:57:52 +01:00
2011-11-16 14:47:47 +01:00
if(state->base64 == 1 && state->message_state == MSG_BODY){
2012-11-03 23:42:36 +01:00
decodeBase64(buf);
2011-11-16 14:47:47 +01:00
fixupBase64EncodedLine(buf, state);
}
2011-11-14 15:57:52 +01:00
/* remove all HTML tags */
2011-11-14 15:57:52 +01:00
if(state->texthtml == 1 && state->message_state == MSG_BODY) markHTML(buf, state);
2011-12-07 15:24:52 +01:00
if(state->message_state == MSG_BODY && state->qp == 1){
fixupSoftBreakInQuotedPritableLine(buf, state); // 2011.12.07
decodeQP(buf);
2011-11-14 15:57:52 +01:00
}
/* I believe that we can live without this function call */
//decodeURL(buf);
2011-11-14 15:57:52 +01:00
2014-08-30 21:10:29 +02:00
if(state->texthtml == 1) decodeHTML(buf, state->utf8);
2011-11-14 15:57:52 +01:00
2011-12-07 15:24:52 +01:00
/* encode the body if it's not utf-8 encoded */
2014-08-30 21:10:29 +02:00
if(state->message_state == MSG_BODY && state->utf8 != 1){
result = utf8_encode(buf, strlen(buf), &tmpbuf[0], sizeof(tmpbuf), state->charset);
if(result == OK) snprintf(buf, MAXBUFSIZE-1, "%s", tmpbuf);
}
2011-11-14 15:57:52 +01:00
translateLine((unsigned char*)buf, state);
reassembleToken(buf);
if(state->is_header == 1) p = strchr(buf, ' ');
else p = buf;
//printf("a: %d/%d/%d/%d/j=%d %s\n", state->is_1st_header, state->is_header, state->message_rfc822, state->message_state, sdata->ms_journal, buf);
2011-11-28 14:21:14 +01:00
2011-11-14 15:57:52 +01:00
do {
2011-11-16 14:47:47 +01:00
memset(puf, 0, sizeof(puf));
2014-07-22 16:00:03 +02:00
p = split(p, ' ', puf, sizeof(puf)-1, &result);
2011-11-14 15:57:52 +01:00
2011-11-16 14:47:47 +01:00
if(puf[0] == '\0') continue;
2011-11-14 15:57:52 +01:00
2011-11-16 14:47:47 +01:00
degenerateToken((unsigned char*)puf);
2011-11-14 15:57:52 +01:00
2011-11-16 14:47:47 +01:00
if(puf[0] == '\0') continue;
2011-11-14 15:57:52 +01:00
2015-12-29 15:09:54 +01:00
strncat(puf, " ", sizeof(puf)-strlen(puf)-1);
2011-11-14 15:57:52 +01:00
2015-12-29 15:09:54 +01:00
if(strncasecmp(puf, "http://", 7) == 0 || strncasecmp(puf, "https://", 8) == 0) fixURL(puf, sizeof(puf)-1);
2011-11-14 15:57:52 +01:00
2015-12-29 15:09:54 +01:00
len = strlen(puf);
2011-11-14 15:57:52 +01:00
2015-12-28 13:28:19 +01:00
if(state->is_header == 0 && strncmp(puf, "__URL__", 7) && (puf[0] == ' ' || (len > MAX_WORD_LEN && cfg->enable_cjk == 0) || isHexNumber(puf)) ) continue;
2011-11-14 15:57:52 +01:00
if(state->message_state == MSG_FROM && state->is_1st_header == 1 && strlen(state->b_from) < SMALLBUFSIZE-len-1){
2014-11-25 15:54:50 +01:00
strtolower(puf);
q = strchr(puf, '@');
if(q) fix_plus_sign_in_email_address(puf, &q, &len);
2011-11-16 14:47:47 +01:00
memcpy(&(state->b_from[strlen(state->b_from)]), puf, len);
2011-11-14 15:57:52 +01:00
if(len >= MIN_EMAIL_ADDRESS_LEN && does_it_seem_like_an_email_address(puf) == 1 && state->b_from_domain[0] == '\0'){
2012-11-28 23:15:14 +01:00
if(q && strlen(q) > 5){
memcpy(&(state->b_from_domain), q+1, strlen(q+1)-1);
if(strstr(sdata->mailfrom, "<>")){
snprintf(sdata->fromemail, SMALLBUFSIZE-1, "%s", puf);
sdata->fromemail[len-1] = '\0';
}
2012-10-02 15:21:16 +02:00
}
2013-01-06 22:16:21 +01:00
if(is_email_address_on_my_domains(puf, data) == 1) sdata->internal_sender = 1;
if(strlen(state->b_from) < SMALLBUFSIZE-len-1){
split_email_address(puf);
memcpy(&(state->b_from[strlen(state->b_from)]), puf, len);
}
}
}
2012-09-03 10:06:34 +02:00
else if((state->message_state == MSG_TO || state->message_state == MSG_CC || state->message_state == MSG_RECIPIENT) && state->is_1st_header == 1 && state->tolen < MAXBUFSIZE-len-1){
strtolower(puf);
/* fix aaa+bbb@ccc.fu address to aaa@ccc.fu, 2017.02.04, SJ */
q = strchr(puf, '@');
if(q) fix_plus_sign_in_email_address(puf, &q, &len);
if(state->message_state == MSG_RECIPIENT && findnode(state->journal_recipient, puf) == NULL){
addnode(state->journal_recipient, puf);
memcpy(&(state->b_journal_to[state->journaltolen]), puf, len);
if(cfg->verbosity >= _LOG_DEBUG) syslog(LOG_PRIORITY, "%s: journal rcpt: '%s'", sdata->ttmpfile, puf);
}
2011-11-14 15:57:52 +01:00
if(findnode(state->rcpt, puf) == NULL){
/* skip any address matching ...@cfg->hostid, 2013.10.29, SJ */
if(q && strncmp(q+1, cfg->hostid, cfg->hostid_len) == 0){
continue;
}
addnode(state->rcpt, puf);
2012-08-21 21:57:39 +02:00
memcpy(&(state->b_to[state->tolen]), puf, len);
state->tolen += len;
if(len >= MIN_EMAIL_ADDRESS_LEN && does_it_seem_like_an_email_address(puf) == 1){
2013-01-06 22:16:21 +01:00
if(is_email_address_on_my_domains(puf, data) == 1) sdata->internal_recipient = 1;
else sdata->external_recipient = 1;
if(q){
if(findnode(state->rcpt_domain, q+1) == NULL){
addnode(state->rcpt_domain, q+1);
memcpy(&(state->b_to_domain[strlen(state->b_to_domain)]), q+1, strlen(q+1));
}
}
2012-08-21 21:57:39 +02:00
if(state->tolen < MAXBUFSIZE-len-1){
split_email_address(puf);
2012-08-21 21:57:39 +02:00
memcpy(&(state->b_to[state->tolen]), puf, len);
state->tolen += len;
}
}
2011-11-28 14:21:14 +01:00
}
2011-11-28 14:21:14 +01:00
}
else if(state->message_state == MSG_BODY && len >= cfg->min_word_len && state->bodylen < BIGBUFSIZE-len-1){
// 99% of email addresses are longer than 8 characters
if(len >= MIN_EMAIL_ADDRESS_LEN && does_it_seem_like_an_email_address(puf)){
fix_email_address_for_sphinx(puf);
}
2012-08-21 21:57:39 +02:00
memcpy(&(state->b_body[state->bodylen]), puf, len);
state->bodylen += len;
}
2011-11-14 15:57:52 +01:00
2011-11-16 14:47:47 +01:00
} while(p);
2011-11-14 15:57:52 +01:00
return 0;
}