mirror of
https://bitbucket.org/jsuto/piler.git
synced 2025-01-12 11:30:13 +01:00
Improved html parsing
Signed-off-by: Janos SUTO <sj@acts.hu>
This commit is contained in:
parent
740855769f
commit
e3973144b4
@ -171,6 +171,7 @@ struct parser_state {
|
||||
int qp;
|
||||
int htmltag;
|
||||
int style;
|
||||
int meta_content_type;
|
||||
int skip_html;
|
||||
int has_to_dump;
|
||||
int has_to_dump_whole_body;
|
||||
|
15
src/parser.c
15
src/parser.c
@ -641,7 +641,7 @@ int parse_line(char *buf, struct parser_state *state, struct session_data *sdata
|
||||
state->pushed_pointer = 0;
|
||||
|
||||
memset(state->type, 0, TINYBUFSIZE);
|
||||
snprintf(state->charset, TINYBUFSIZE-1, "unknown");
|
||||
memset(state->charset, 0, TINYBUFSIZE);
|
||||
|
||||
memset(state->attachment_name_buf, 0, SMALLBUFSIZE);
|
||||
state->anamepos = 0;
|
||||
@ -684,7 +684,18 @@ int parse_line(char *buf, struct parser_state *state, struct session_data *sdata
|
||||
if(state->texthtml == 1 && state->message_state == MSG_BODY) markHTML(buf, state);
|
||||
|
||||
|
||||
if(state->texthtml == 1) decodeHTML(buf, state->utf8);
|
||||
if(state->texthtml == 1){
|
||||
size_t buflen = strlen(buf);
|
||||
decodeHTML(buf, state->utf8);
|
||||
/* decodeHTML converted some entities to iso-8859-1 */
|
||||
if(state->utf8 != 1 && strlen(buf) != buflen){
|
||||
/* no charset or us-ascii: switch to iso-8859-1 */
|
||||
if (state->charset[0] == 0 || strcasecmp(state->charset, "us-ascii") == 0){
|
||||
syslog(LOG_PRIORITY, "%s: assuming iso-8859-1 encoding for HTML (was '%s')", sdata->ttmpfile, state->charset);
|
||||
snprintf(state->charset, TINYBUFSIZE-1, "ISO8859-1");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* encode the body if it's not utf-8 encoded */
|
||||
if(state->message_state == MSG_BODY && state->utf8 != 1){
|
||||
|
@ -20,7 +20,7 @@ void fixupEncodedHeaderLine(char *buf, int buflen);
|
||||
void fixupSoftBreakInQuotedPritableLine(char *buf, struct parser_state *state);
|
||||
void fixupBase64EncodedLine(char *buf, struct parser_state *state);
|
||||
void markHTML(char *buf, struct parser_state *state);
|
||||
void setStateHTMLStyle(char *htmlbuf, int pos, struct parser_state *state);
|
||||
void setStateHTML(char *htmlbuf, int pos, struct parser_state *state);
|
||||
void translateLine(unsigned char *p, struct parser_state *state);
|
||||
void fix_email_address_for_sphinx(char *s);
|
||||
void split_email_address(char *s);
|
||||
|
@ -40,6 +40,7 @@ void init_state(struct parser_state *state){
|
||||
|
||||
state->htmltag = 0;
|
||||
state->style = 0;
|
||||
state->meta_content_type = 0;
|
||||
|
||||
state->skip_html = 0;
|
||||
|
||||
@ -52,6 +53,7 @@ void init_state(struct parser_state *state){
|
||||
memset(state->receivedbuf, 0, sizeof(state->receivedbuf));
|
||||
|
||||
memset(state->type, 0, TINYBUFSIZE);
|
||||
memset(state->charset, 0, TINYBUFSIZE);
|
||||
|
||||
memset(state->attachment_name_buf, 0, SMALLBUFSIZE);
|
||||
state->anamepos = 0;
|
||||
@ -551,7 +553,7 @@ void markHTML(char *buf, struct parser_state *state){
|
||||
|
||||
if(isspace(*s)){
|
||||
if(j > 0){
|
||||
setStateHTMLStyle(html, pos, state);
|
||||
setStateHTML(html, pos, state);
|
||||
memset(html, 0, SMALLBUFSIZE); j=0;
|
||||
}
|
||||
pos++;
|
||||
@ -576,23 +578,51 @@ void markHTML(char *buf, struct parser_state *state){
|
||||
|
||||
if(j > 0){
|
||||
strncat(html, " ", SMALLBUFSIZE-1);
|
||||
setStateHTMLStyle(html, pos, state);
|
||||
setStateHTML(html, pos, state);
|
||||
memset(html, 0, SMALLBUFSIZE); j=0;
|
||||
}
|
||||
state->meta_content_type = 0;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
//printf("append last in line:*%s*, html=+%s+, j=%d\n", puf, html, j);
|
||||
if(j > 0){ setStateHTMLStyle(html, pos, state); }
|
||||
if(j > 0){ setStateHTML(html, pos, state); }
|
||||
|
||||
strcpy(buf, puf);
|
||||
}
|
||||
|
||||
|
||||
void setStateHTMLStyle(char *htmlbuf, int pos, struct parser_state *state){
|
||||
void setStateHTML(char *htmlbuf, int pos, struct parser_state *state){
|
||||
if(pos == 0 && strncmp(htmlbuf, "style ", 6) == 0) state->style = 1;
|
||||
if(pos == 0 && strncmp(htmlbuf, "/style ", 7) == 0) state->style = 0;
|
||||
|
||||
if(pos == 0 && state->charset[0] == 0 && strncmp(htmlbuf, "meta ", 5) == 0) state->meta_content_type = 0x1;
|
||||
if(state->meta_content_type){
|
||||
if((state->meta_content_type & 0x2) == 0 && strstr(htmlbuf, "http-equiv=content-type "))
|
||||
state->meta_content_type |= 0x2;
|
||||
|
||||
if((state->meta_content_type & 0x4) == 0 && strstr(htmlbuf, "content=text/html;"))
|
||||
state->meta_content_type |= 0x4;
|
||||
|
||||
if(state->meta_content_type == 0x7){
|
||||
char *p, *q;
|
||||
|
||||
p = strstr(htmlbuf, "charset=");
|
||||
if(p){
|
||||
p += 8;
|
||||
for(q = p; isalnum(*q) || index("-_", *q); q++)
|
||||
;
|
||||
|
||||
if(q > p && q-p+1 < (int) sizeof(state->charset)){
|
||||
syslog(LOG_PRIORITY, "Changing HTML charset from '%s' to '%*s' due to meta tag", state->charset, (int)(q-p), p);
|
||||
strncpy(state->charset, p, q-p);
|
||||
state->charset[q-p+1] = '\0';
|
||||
state->meta_content_type = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user