mirror of
https://bitbucket.org/jsuto/piler.git
synced 2025-01-12 00:00:12 +01:00
Moved tokenization code to tokenizer.c
Signed-off-by: Janos SUTO <sj@acts.hu>
This commit is contained in:
parent
898a8674e7
commit
5a548be06c
2
configure
vendored
2
configure
vendored
@ -4847,7 +4847,7 @@ echo; echo
|
||||
|
||||
CFLAGS="$static -std=c99 -O2 -fPIC -Wall -Wextra -Wuninitialized -Wno-format-truncation -g"
|
||||
LIBS="$antispam_libs $sunos_libs "
|
||||
OBJS="dirs.o base64.o misc.o counters.o cfg.o sig.o decoder.o hash.o parser.o parser_utils.o rules.o smtp.o session.o bdat.o message.o attachment.o digest.o store.o archive.o tai.o import.o import_maildir.o import_mailbox.o import_pop3.o import_imap.o imap.o pop3.o extract.o mydomains.o $objs"
|
||||
OBJS="dirs.o base64.o misc.o counters.o cfg.o sig.o decoder.o hash.o parser.o parser_utils.o rules.o smtp.o session.o bdat.o message.o attachment.o digest.o store.o archive.o tai.o import.o import_maildir.o import_mailbox.o import_pop3.o import_imap.o imap.o pop3.o extract.o mydomains.o tokenizer.o $objs"
|
||||
|
||||
ac_config_files="$ac_config_files Makefile src/Makefile etc/Makefile util/Makefile init.d/Makefile systemd/Makefile unit_tests/Makefile webui/Makefile contrib/imap/Makefile"
|
||||
|
||||
|
@ -533,7 +533,7 @@ echo; echo
|
||||
|
||||
CFLAGS="$static -std=c99 -O2 -fPIC -Wall -Wextra -Wuninitialized -Wno-format-truncation -g"
|
||||
LIBS="$antispam_libs $sunos_libs "
|
||||
OBJS="dirs.o base64.o misc.o counters.o cfg.o sig.o decoder.o hash.o parser.o parser_utils.o rules.o smtp.o session.o bdat.o message.o attachment.o digest.o store.o archive.o tai.o import.o import_maildir.o import_mailbox.o import_pop3.o import_imap.o imap.o pop3.o extract.o mydomains.o $objs"
|
||||
OBJS="dirs.o base64.o misc.o counters.o cfg.o sig.o decoder.o hash.o parser.o parser_utils.o rules.o smtp.o session.o bdat.o message.o attachment.o digest.o store.o archive.o tai.o import.o import_maildir.o import_mailbox.o import_pop3.o import_imap.o imap.o pop3.o extract.o mydomains.o tokenizer.o $objs"
|
||||
|
||||
AC_CONFIG_FILES([Makefile src/Makefile etc/Makefile util/Makefile init.d/Makefile systemd/Makefile unit_tests/Makefile webui/Makefile contrib/imap/Makefile])
|
||||
AC_OUTPUT
|
||||
|
82
src/parser.c
82
src/parser.c
@ -634,87 +634,7 @@ int parse_line(char *buf, struct parser_state *state, struct session_data *sdata
|
||||
if(result == OK) snprintf(buf, MAXBUFSIZE-1, "%s", tmpbuf);
|
||||
}
|
||||
|
||||
translateLine((unsigned char*)buf, state);
|
||||
|
||||
reassembleToken(buf);
|
||||
|
||||
|
||||
p = buf;
|
||||
|
||||
//printf("a: %d/%d/%d/%d/j=%d %s\n", state->is_1st_header, state->is_header, state->message_rfc822, state->message_state, sdata->ms_journal, buf);
|
||||
|
||||
do {
|
||||
memset(puf, 0, sizeof(puf));
|
||||
p = split(p, ' ', puf, sizeof(puf)-1, &result);
|
||||
|
||||
if(puf[0] == '\0') continue;
|
||||
|
||||
degenerateToken((unsigned char*)puf);
|
||||
|
||||
if(puf[0] == '\0') continue;
|
||||
|
||||
strncat(puf, " ", sizeof(puf)-strlen(puf)-1);
|
||||
|
||||
if(strncasecmp(puf, "http://", 7) == 0 || strncasecmp(puf, "https://", 8) == 0) fixURL(puf, sizeof(puf)-1);
|
||||
|
||||
len = strlen(puf);
|
||||
|
||||
// skip body tokens if not an URL && (empty token || too long)
|
||||
if(state->is_header == 0 && strncmp(puf, "__URL__", 7) && (puf[0] == ' ' || (len > MAX_WORD_LEN && cfg->enable_cjk == 0)) ){
|
||||
continue;
|
||||
}
|
||||
|
||||
if(state->message_state == MSG_FROM && state->is_1st_header == 1 && strlen(state->b_from) < SMALLBUFSIZE-len-1){
|
||||
strtolower(puf);
|
||||
|
||||
q = strchr(puf, '@');
|
||||
if(q) fix_plus_sign_in_email_address(puf, &q, &len);
|
||||
|
||||
memcpy(&(state->b_from[strlen(state->b_from)]), puf, len);
|
||||
|
||||
if(len >= MIN_EMAIL_ADDRESS_LEN && does_it_seem_like_an_email_address(puf) == 1 && state->b_from_domain[0] == '\0'){
|
||||
if(q && strlen(q) > 5){
|
||||
memcpy(&(state->b_from_domain), q+1, strlen(q+1)-1);
|
||||
if(strstr(sdata->mailfrom, "<>")){
|
||||
snprintf(sdata->fromemail, SMALLBUFSIZE-1, "%s", puf);
|
||||
sdata->fromemail[len-1] = '\0';
|
||||
}
|
||||
}
|
||||
|
||||
if(is_email_address_on_my_domains(puf, data) == 1) sdata->internal_sender = 1;
|
||||
|
||||
if(strlen(state->b_from) < SMALLBUFSIZE-len-1){
|
||||
split_email_address(puf);
|
||||
memcpy(&(state->b_from[strlen(state->b_from)]), puf, len);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if((state->message_state == MSG_TO || state->message_state == MSG_CC || state->message_state == MSG_RECIPIENT || state->message_state == MSG_ENVELOPE_TO) && state->is_1st_header == 1 && state->tolen < MAXBUFSIZE-len-1){
|
||||
strtolower(puf);
|
||||
|
||||
/* fix aaa+bbb@ccc.fu address to aaa@ccc.fu, 2017.02.04, SJ */
|
||||
q = strchr(puf, '@');
|
||||
if(q) fix_plus_sign_in_email_address(puf, &q, &len);
|
||||
|
||||
if((state->message_state == MSG_RECIPIENT || state->message_state == MSG_ENVELOPE_TO) && findnode(state->journal_recipient, puf) == NULL){
|
||||
addnode(state->journal_recipient, puf);
|
||||
memcpy(&(state->b_journal_to[state->journaltolen]), puf, len);
|
||||
if(cfg->verbosity >= _LOG_DEBUG) syslog(LOG_PRIORITY, "%s: journal rcpt: '%s'", sdata->ttmpfile, puf);
|
||||
}
|
||||
|
||||
add_recipient(puf, len, sdata, state, data, cfg);
|
||||
}
|
||||
else if(state->message_state == MSG_BODY && len >= (unsigned int)(cfg->min_word_len) && state->bodylen < BIGBUFSIZE-len-1){
|
||||
// 99% of email addresses are longer than 8 characters
|
||||
if(len >= MIN_EMAIL_ADDRESS_LEN && does_it_seem_like_an_email_address(puf)){
|
||||
fix_email_address_for_sphinx(puf);
|
||||
}
|
||||
|
||||
memcpy(&(state->b_body[state->bodylen]), puf, len);
|
||||
state->bodylen += len;
|
||||
}
|
||||
|
||||
} while(p);
|
||||
tokenize(buf, state, sdata, data, cfg);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -35,5 +35,6 @@ char *get_attachment_extractor_by_filename(char *filename);
|
||||
void parse_reference(struct parser_state *state, char *s);
|
||||
int base64_decode_attachment_buffer(char *p, unsigned char *b, int blen);
|
||||
void fix_plus_sign_in_email_address(char *puf, char **at_sign, unsigned int *len);
|
||||
void tokenize(char *buf, struct parser_state *state, struct session_data *sdata, struct data *data, struct config *cfg);
|
||||
|
||||
#endif /* _PARSER_H */
|
||||
|
105
src/tokenizer.c
Normal file
105
src/tokenizer.c
Normal file
@ -0,0 +1,105 @@
|
||||
/*
|
||||
* tokenizer.c, SJ
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <ctype.h>
|
||||
#include <sys/socket.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <netinet/in.h>
|
||||
#include <arpa/inet.h>
|
||||
#include <fcntl.h>
|
||||
#include <unistd.h>
|
||||
#include <piler.h>
|
||||
|
||||
void tokenize(char *buf, struct parser_state *state, struct session_data *sdata, struct data *data, struct config *cfg){
|
||||
char *p, *q, puf[SMALLBUFSIZE];
|
||||
int result;
|
||||
unsigned int len;
|
||||
|
||||
translateLine((unsigned char*)buf, state);
|
||||
|
||||
reassembleToken(buf);
|
||||
|
||||
|
||||
p = buf;
|
||||
|
||||
//printf("a: %d/%d/%d/%d/j=%d %s\n", state->is_1st_header, state->is_header, state->message_rfc822, state->message_state, sdata->ms_journal, buf);
|
||||
|
||||
do {
|
||||
memset(puf, 0, sizeof(puf));
|
||||
p = split(p, ' ', puf, sizeof(puf)-1, &result);
|
||||
|
||||
if(puf[0] == '\0') continue;
|
||||
|
||||
degenerateToken((unsigned char*)puf);
|
||||
|
||||
if(puf[0] == '\0') continue;
|
||||
|
||||
strncat(puf, " ", sizeof(puf)-strlen(puf)-1);
|
||||
|
||||
if(strncasecmp(puf, "http://", 7) == 0 || strncasecmp(puf, "https://", 8) == 0) fixURL(puf, sizeof(puf)-1);
|
||||
|
||||
len = strlen(puf);
|
||||
|
||||
// skip body tokens if not an URL && (empty token || too long)
|
||||
if(state->is_header == 0 && strncmp(puf, "__URL__", 7) && (puf[0] == ' ' || (len > MAX_WORD_LEN && cfg->enable_cjk == 0)) ){
|
||||
continue;
|
||||
}
|
||||
|
||||
if(state->message_state == MSG_FROM && state->is_1st_header == 1 && strlen(state->b_from) < SMALLBUFSIZE-len-1){
|
||||
strtolower(puf);
|
||||
|
||||
q = strchr(puf, '@');
|
||||
if(q) fix_plus_sign_in_email_address(puf, &q, &len);
|
||||
|
||||
memcpy(&(state->b_from[strlen(state->b_from)]), puf, len);
|
||||
|
||||
if(len >= MIN_EMAIL_ADDRESS_LEN && does_it_seem_like_an_email_address(puf) == 1 && state->b_from_domain[0] == '\0'){
|
||||
if(q && strlen(q) > 5){
|
||||
memcpy(&(state->b_from_domain), q+1, strlen(q+1)-1);
|
||||
if(strstr(sdata->mailfrom, "<>")){
|
||||
snprintf(sdata->fromemail, SMALLBUFSIZE-1, "%s", puf);
|
||||
sdata->fromemail[len-1] = '\0';
|
||||
}
|
||||
}
|
||||
|
||||
if(is_email_address_on_my_domains(puf, data) == 1) sdata->internal_sender = 1;
|
||||
|
||||
if(strlen(state->b_from) < SMALLBUFSIZE-len-1){
|
||||
split_email_address(puf);
|
||||
memcpy(&(state->b_from[strlen(state->b_from)]), puf, len);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if((state->message_state == MSG_TO || state->message_state == MSG_CC || state->message_state == MSG_RECIPIENT || state->message_state == MSG_ENVELOPE_TO) && state->is_1st_header == 1 && state->tolen < MAXBUFSIZE-len-1){
|
||||
strtolower(puf);
|
||||
|
||||
/* fix aaa+bbb@ccc.fu address to aaa@ccc.fu, 2017.02.04, SJ */
|
||||
q = strchr(puf, '@');
|
||||
if(q) fix_plus_sign_in_email_address(puf, &q, &len);
|
||||
|
||||
if((state->message_state == MSG_RECIPIENT || state->message_state == MSG_ENVELOPE_TO) && findnode(state->journal_recipient, puf) == NULL){
|
||||
addnode(state->journal_recipient, puf);
|
||||
memcpy(&(state->b_journal_to[state->journaltolen]), puf, len);
|
||||
if(cfg->verbosity >= _LOG_DEBUG) syslog(LOG_PRIORITY, "%s: journal rcpt: '%s'", sdata->ttmpfile, puf);
|
||||
}
|
||||
|
||||
add_recipient(puf, len, sdata, state, data, cfg);
|
||||
}
|
||||
else if(state->message_state == MSG_BODY && len >= (unsigned int)(cfg->min_word_len) && state->bodylen < BIGBUFSIZE-len-1){
|
||||
// 99% of email addresses are longer than 8 characters
|
||||
if(len >= MIN_EMAIL_ADDRESS_LEN && does_it_seem_like_an_email_address(puf)){
|
||||
fix_email_address_for_sphinx(puf);
|
||||
}
|
||||
|
||||
memcpy(&(state->b_body[state->bodylen]), puf, len);
|
||||
state->bodylen += len;
|
||||
}
|
||||
|
||||
} while(p);
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user