Improved html parsing

Signed-off-by: Janos SUTO <sj@acts.hu>
This commit is contained in:
Janos SUTO 2021-12-09 11:27:51 +01:00
parent 740855769f
commit e3973144b4
4 changed files with 49 additions and 7 deletions

View File

@ -171,6 +171,7 @@ struct parser_state {
int qp;
int htmltag;
int style;
int meta_content_type;
int skip_html;
int has_to_dump;
int has_to_dump_whole_body;

View File

@ -641,7 +641,7 @@ int parse_line(char *buf, struct parser_state *state, struct session_data *sdata
state->pushed_pointer = 0;
memset(state->type, 0, TINYBUFSIZE);
snprintf(state->charset, TINYBUFSIZE-1, "unknown");
memset(state->charset, 0, TINYBUFSIZE);
memset(state->attachment_name_buf, 0, SMALLBUFSIZE);
state->anamepos = 0;
@ -684,7 +684,18 @@ int parse_line(char *buf, struct parser_state *state, struct session_data *sdata
if(state->texthtml == 1 && state->message_state == MSG_BODY) markHTML(buf, state);
if(state->texthtml == 1) decodeHTML(buf, state->utf8);
if(state->texthtml == 1){
size_t buflen = strlen(buf);
decodeHTML(buf, state->utf8);
/* decodeHTML converted some entities to iso-8859-1 */
if(state->utf8 != 1 && strlen(buf) != buflen){
/* no charset or us-ascii: switch to iso-8859-1 */
if (state->charset[0] == 0 || strcasecmp(state->charset, "us-ascii") == 0){
syslog(LOG_PRIORITY, "%s: assuming iso-8859-1 encoding for HTML (was '%s')", sdata->ttmpfile, state->charset);
snprintf(state->charset, TINYBUFSIZE-1, "ISO8859-1");
}
}
}
/* encode the body if it's not utf-8 encoded */
if(state->message_state == MSG_BODY && state->utf8 != 1){

View File

@ -20,7 +20,7 @@ void fixupEncodedHeaderLine(char *buf, int buflen);
void fixupSoftBreakInQuotedPritableLine(char *buf, struct parser_state *state);
void fixupBase64EncodedLine(char *buf, struct parser_state *state);
void markHTML(char *buf, struct parser_state *state);
void setStateHTMLStyle(char *htmlbuf, int pos, struct parser_state *state);
void setStateHTML(char *htmlbuf, int pos, struct parser_state *state);
void translateLine(unsigned char *p, struct parser_state *state);
void fix_email_address_for_sphinx(char *s);
void split_email_address(char *s);

View File

@ -40,6 +40,7 @@ void init_state(struct parser_state *state){
state->htmltag = 0;
state->style = 0;
state->meta_content_type = 0;
state->skip_html = 0;
@ -52,6 +53,7 @@ void init_state(struct parser_state *state){
memset(state->receivedbuf, 0, sizeof(state->receivedbuf));
memset(state->type, 0, TINYBUFSIZE);
memset(state->charset, 0, TINYBUFSIZE);
memset(state->attachment_name_buf, 0, SMALLBUFSIZE);
state->anamepos = 0;
@ -551,7 +553,7 @@ void markHTML(char *buf, struct parser_state *state){
if(isspace(*s)){
if(j > 0){
setStateHTMLStyle(html, pos, state);
setStateHTML(html, pos, state);
memset(html, 0, SMALLBUFSIZE); j=0;
}
pos++;
@ -576,23 +578,51 @@ void markHTML(char *buf, struct parser_state *state){
if(j > 0){
strncat(html, " ", SMALLBUFSIZE-1);
setStateHTMLStyle(html, pos, state);
setStateHTML(html, pos, state);
memset(html, 0, SMALLBUFSIZE); j=0;
}
state->meta_content_type = 0;
}
}
//printf("append last in line:*%s*, html=+%s+, j=%d\n", puf, html, j);
if(j > 0){ setStateHTMLStyle(html, pos, state); }
if(j > 0){ setStateHTML(html, pos, state); }
strcpy(buf, puf);
}
void setStateHTMLStyle(char *htmlbuf, int pos, struct parser_state *state){
/*
 * Update the HTML-related parser state from one token of an HTML tag.
 *
 * htmlbuf: NUL-terminated token text. The literals matched below are
 *          lowercase and unquoted (e.g. "http-equiv=content-type "), so
 *          the caller presumably normalizes the tag text first -- TODO
 *          confirm against markHTML().
 * pos:     index of the token within the current tag; the tag-name checks
 *          ("style ", "/style ", "meta ") only apply when pos == 0.
 * state:   parser state; this function updates state->style,
 *          state->meta_content_type and possibly state->charset.
 *
 * When no charset is known yet (state->charset is empty) and a
 * <meta http-equiv=content-type content=text/html; charset=...> tag is
 * seen, the charset value is copied into state->charset.
 */
void setStateHTML(char *htmlbuf, int pos, struct parser_state *state){
   /* Bit flags accumulated in state->meta_content_type while inside a meta tag */
   enum {
      META_TAG_SEEN        = 0x1,  /* token at pos 0 was "meta " */
      META_HTTP_EQUIV_SEEN = 0x2,  /* "http-equiv=content-type " found */
      META_CONTENT_SEEN    = 0x4,  /* "content=text/html;" found */
      META_ALL_SEEN        = 0x7   /* all three of the above */
   };
   static const char charset_key[] = "charset=";

   if(pos == 0 && strncmp(htmlbuf, "style ", 6) == 0) state->style = 1;
   if(pos == 0 && strncmp(htmlbuf, "/style ", 7) == 0) state->style = 0;

   /* Only start looking at a meta tag if we have no charset yet */
   if(pos == 0 && state->charset[0] == 0 && strncmp(htmlbuf, "meta ", 5) == 0)
      state->meta_content_type = META_TAG_SEEN;

   if(state->meta_content_type == 0) return;

   if((state->meta_content_type & META_HTTP_EQUIV_SEEN) == 0 && strstr(htmlbuf, "http-equiv=content-type "))
      state->meta_content_type |= META_HTTP_EQUIV_SEEN;

   if((state->meta_content_type & META_CONTENT_SEEN) == 0 && strstr(htmlbuf, "content=text/html;"))
      state->meta_content_type |= META_CONTENT_SEEN;

   if(state->meta_content_type != META_ALL_SEEN) return;

   const char *value = strstr(htmlbuf, charset_key);
   if(value == NULL) return;

   value += sizeof(charset_key) - 1;

   /*
    * Scan the charset name: letters, digits, '-' and '_'.
    * Stop explicitly at the terminating NUL: a strchr()/index() lookup
    * would match '\0' and let the scan run past the end of htmlbuf.
    * The unsigned char cast keeps isalnum() defined for bytes >= 0x80.
    */
   const char *end = value;
   while(*end != '\0' && (isalnum((unsigned char)*end) || *end == '-' || *end == '_'))
      end++;

   size_t len = (size_t)(end - value);

   if(len > 0 && len + 1 < sizeof(state->charset)){
      /* "%.*s" (precision) limits the output to the charset name itself */
      syslog(LOG_PRIORITY, "Changing HTML charset from '%s' to '%.*s' due to meta tag", state->charset, (int)len, value);

      memcpy(state->charset, value, len);
      state->charset[len] = '\0';   /* terminate right after the copied name */

      /* charset found: stop examining this meta tag */
      state->meta_content_type = 0;
   }
}