mirror of
https://bitbucket.org/jsuto/piler.git
synced 2025-01-26 11:49:59 +01:00
Improved html parsing
Signed-off-by: Janos SUTO <sj@acts.hu>
This commit is contained in:
parent
740855769f
commit
e3973144b4
@ -171,6 +171,7 @@ struct parser_state {
|
|||||||
int qp;
|
int qp;
|
||||||
int htmltag;
|
int htmltag;
|
||||||
int style;
|
int style;
|
||||||
|
int meta_content_type;
|
||||||
int skip_html;
|
int skip_html;
|
||||||
int has_to_dump;
|
int has_to_dump;
|
||||||
int has_to_dump_whole_body;
|
int has_to_dump_whole_body;
|
||||||
|
15
src/parser.c
15
src/parser.c
@ -641,7 +641,7 @@ int parse_line(char *buf, struct parser_state *state, struct session_data *sdata
|
|||||||
state->pushed_pointer = 0;
|
state->pushed_pointer = 0;
|
||||||
|
|
||||||
memset(state->type, 0, TINYBUFSIZE);
|
memset(state->type, 0, TINYBUFSIZE);
|
||||||
snprintf(state->charset, TINYBUFSIZE-1, "unknown");
|
memset(state->charset, 0, TINYBUFSIZE);
|
||||||
|
|
||||||
memset(state->attachment_name_buf, 0, SMALLBUFSIZE);
|
memset(state->attachment_name_buf, 0, SMALLBUFSIZE);
|
||||||
state->anamepos = 0;
|
state->anamepos = 0;
|
||||||
@ -684,7 +684,18 @@ int parse_line(char *buf, struct parser_state *state, struct session_data *sdata
|
|||||||
if(state->texthtml == 1 && state->message_state == MSG_BODY) markHTML(buf, state);
|
if(state->texthtml == 1 && state->message_state == MSG_BODY) markHTML(buf, state);
|
||||||
|
|
||||||
|
|
||||||
if(state->texthtml == 1) decodeHTML(buf, state->utf8);
|
if(state->texthtml == 1){
|
||||||
|
size_t buflen = strlen(buf);
|
||||||
|
decodeHTML(buf, state->utf8);
|
||||||
|
/* decodeHTML converted some entities to iso-8859-1 */
|
||||||
|
if(state->utf8 != 1 && strlen(buf) != buflen){
|
||||||
|
/* no charset or us-ascii: switch to iso-8859-1 */
|
||||||
|
if (state->charset[0] == 0 || strcasecmp(state->charset, "us-ascii") == 0){
|
||||||
|
syslog(LOG_PRIORITY, "%s: assuming iso-8859-1 encoding for HTML (was '%s')", sdata->ttmpfile, state->charset);
|
||||||
|
snprintf(state->charset, TINYBUFSIZE-1, "ISO8859-1");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/* encode the body if it's not utf-8 encoded */
|
/* encode the body if it's not utf-8 encoded */
|
||||||
if(state->message_state == MSG_BODY && state->utf8 != 1){
|
if(state->message_state == MSG_BODY && state->utf8 != 1){
|
||||||
|
@ -20,7 +20,7 @@ void fixupEncodedHeaderLine(char *buf, int buflen);
|
|||||||
void fixupSoftBreakInQuotedPritableLine(char *buf, struct parser_state *state);
|
void fixupSoftBreakInQuotedPritableLine(char *buf, struct parser_state *state);
|
||||||
void fixupBase64EncodedLine(char *buf, struct parser_state *state);
|
void fixupBase64EncodedLine(char *buf, struct parser_state *state);
|
||||||
void markHTML(char *buf, struct parser_state *state);
|
void markHTML(char *buf, struct parser_state *state);
|
||||||
void setStateHTMLStyle(char *htmlbuf, int pos, struct parser_state *state);
|
void setStateHTML(char *htmlbuf, int pos, struct parser_state *state);
|
||||||
void translateLine(unsigned char *p, struct parser_state *state);
|
void translateLine(unsigned char *p, struct parser_state *state);
|
||||||
void fix_email_address_for_sphinx(char *s);
|
void fix_email_address_for_sphinx(char *s);
|
||||||
void split_email_address(char *s);
|
void split_email_address(char *s);
|
||||||
|
@ -40,6 +40,7 @@ void init_state(struct parser_state *state){
|
|||||||
|
|
||||||
state->htmltag = 0;
|
state->htmltag = 0;
|
||||||
state->style = 0;
|
state->style = 0;
|
||||||
|
state->meta_content_type = 0;
|
||||||
|
|
||||||
state->skip_html = 0;
|
state->skip_html = 0;
|
||||||
|
|
||||||
@ -52,6 +53,7 @@ void init_state(struct parser_state *state){
|
|||||||
memset(state->receivedbuf, 0, sizeof(state->receivedbuf));
|
memset(state->receivedbuf, 0, sizeof(state->receivedbuf));
|
||||||
|
|
||||||
memset(state->type, 0, TINYBUFSIZE);
|
memset(state->type, 0, TINYBUFSIZE);
|
||||||
|
memset(state->charset, 0, TINYBUFSIZE);
|
||||||
|
|
||||||
memset(state->attachment_name_buf, 0, SMALLBUFSIZE);
|
memset(state->attachment_name_buf, 0, SMALLBUFSIZE);
|
||||||
state->anamepos = 0;
|
state->anamepos = 0;
|
||||||
@ -551,7 +553,7 @@ void markHTML(char *buf, struct parser_state *state){
|
|||||||
|
|
||||||
if(isspace(*s)){
|
if(isspace(*s)){
|
||||||
if(j > 0){
|
if(j > 0){
|
||||||
setStateHTMLStyle(html, pos, state);
|
setStateHTML(html, pos, state);
|
||||||
memset(html, 0, SMALLBUFSIZE); j=0;
|
memset(html, 0, SMALLBUFSIZE); j=0;
|
||||||
}
|
}
|
||||||
pos++;
|
pos++;
|
||||||
@ -576,23 +578,51 @@ void markHTML(char *buf, struct parser_state *state){
|
|||||||
|
|
||||||
if(j > 0){
|
if(j > 0){
|
||||||
strncat(html, " ", SMALLBUFSIZE-1);
|
strncat(html, " ", SMALLBUFSIZE-1);
|
||||||
setStateHTMLStyle(html, pos, state);
|
setStateHTML(html, pos, state);
|
||||||
memset(html, 0, SMALLBUFSIZE); j=0;
|
memset(html, 0, SMALLBUFSIZE); j=0;
|
||||||
}
|
}
|
||||||
|
state->meta_content_type = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//printf("append last in line:*%s*, html=+%s+, j=%d\n", puf, html, j);
|
//printf("append last in line:*%s*, html=+%s+, j=%d\n", puf, html, j);
|
||||||
if(j > 0){ setStateHTMLStyle(html, pos, state); }
|
if(j > 0){ setStateHTML(html, pos, state); }
|
||||||
|
|
||||||
strcpy(buf, puf);
|
strcpy(buf, puf);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void setStateHTMLStyle(char *htmlbuf, int pos, struct parser_state *state){
|
void setStateHTML(char *htmlbuf, int pos, struct parser_state *state){
|
||||||
if(pos == 0 && strncmp(htmlbuf, "style ", 6) == 0) state->style = 1;
|
if(pos == 0 && strncmp(htmlbuf, "style ", 6) == 0) state->style = 1;
|
||||||
if(pos == 0 && strncmp(htmlbuf, "/style ", 7) == 0) state->style = 0;
|
if(pos == 0 && strncmp(htmlbuf, "/style ", 7) == 0) state->style = 0;
|
||||||
|
|
||||||
|
if(pos == 0 && state->charset[0] == 0 && strncmp(htmlbuf, "meta ", 5) == 0) state->meta_content_type = 0x1;
|
||||||
|
if(state->meta_content_type){
|
||||||
|
if((state->meta_content_type & 0x2) == 0 && strstr(htmlbuf, "http-equiv=content-type "))
|
||||||
|
state->meta_content_type |= 0x2;
|
||||||
|
|
||||||
|
if((state->meta_content_type & 0x4) == 0 && strstr(htmlbuf, "content=text/html;"))
|
||||||
|
state->meta_content_type |= 0x4;
|
||||||
|
|
||||||
|
if(state->meta_content_type == 0x7){
|
||||||
|
char *p, *q;
|
||||||
|
|
||||||
|
p = strstr(htmlbuf, "charset=");
|
||||||
|
if(p){
|
||||||
|
p += 8;
|
||||||
|
for(q = p; isalnum(*q) || index("-_", *q); q++)
|
||||||
|
;
|
||||||
|
|
||||||
|
if(q > p && q-p+1 < (int) sizeof(state->charset)){
|
||||||
|
syslog(LOG_PRIORITY, "Changing HTML charset from '%s' to '%*s' due to meta tag", state->charset, (int)(q-p), p);
|
||||||
|
strncpy(state->charset, p, q-p);
|
||||||
|
state->charset[q-p+1] = '\0';
|
||||||
|
state->meta_content_type = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user