/*
 * get_unicode()
 *
 * Decode the character found at index *pi of s->data and return its value.
 * Exactly one decoder is selected, in this priority order:
 *
 *   1. xmlin set and the current byte is '&'  -> decode_entity()
 *   2. charsetin == CHARSET_GB18030           -> gb18030_decode()
 *   3. latexin set                            -> utf8_decode() when utf8in is
 *                                                set and the current byte has
 *                                                its high bit set, otherwise
 *                                                latex2char()
 *   4. utf8in set                             -> utf8_decode()
 *   5. otherwise                              -> the raw byte; *pi is
 *                                                incremented by one
 *
 * The helper decoders receive pi; presumably they advance it past whatever
 * they consume (not visible from here -- confirm against their definitions).
 *
 * Unless the chosen decoder reported a Unicode result, or charsetin is
 * CHARSET_UNICODE, the value is finally passed through
 * charset_lookupchar( charsetin, ... ).  Note that the plain utf8in path (4)
 * does not set the Unicode flag, so its result goes through that lookup
 * whenever charsetin != CHARSET_UNICODE.
 */
static unsigned int
get_unicode( str *s, unsigned int *pi, int charsetin, int latexin, int utf8in, int xmlin )
{
	unsigned int code;
	int is_unicode = 0, status = 0;

	if ( xmlin && s->data[*pi]=='&' ) {
		code = decode_entity( s->data, pi, &is_unicode, &status );
	}
	else if ( charsetin==CHARSET_GB18030 ) {
		code = gb18030_decode( s->data, pi );
		is_unicode = 1;
	}
	else if ( latexin ) {
		/* bibtex input may itself be UTF8/Unicode, so high-bit bytes
		 * are decoded as UTF-8 rather than as LaTeX markup */
		if ( utf8in && ( s->data[*pi] & 128 ) ) {
			code = utf8_decode( s->data, pi );
			is_unicode = 1;
		}
		else {
			code = latex2char( s->data, pi, &is_unicode );
		}
	}
	else if ( utf8in ) {
		code = utf8_decode( s->data, pi );
	}
	else {
		code = (unsigned int) s->data[*pi];
		*pi += 1;
	}

	if ( is_unicode || charsetin==CHARSET_UNICODE )
		return code;

	return charset_lookupchar( charsetin, code );
}
/*
 * Build a new XMLSTRING from the characters in the half-open range [p, s).
 *
 * Ordinary characters are appended one by one with xmls_add_char().  When an
 * '&' is met, decode_entity() is called with the address of the read cursor
 * (already positioned just after the '&') and the output string; presumably
 * it appends the decoded text and advances the cursor past the entity --
 * confirm against its definition.
 *
 * The string is created with xmls_new(s - p) and returned to the caller.
 */
static XMLSTRING make_unescaped_string(char *p, char *s)
{
    XMLSTRING out = xmls_new(s - p);
    char *cur = p;

    while (cur < s) {
        char ch = *cur;

        cur++;
        if (ch != '&') {
            xmls_add_char(out, ch);
            continue;
        }
        decode_entity(&cur, out);
    }

    return out;
}
/*
 * Copy src to dest, replacing HTML entities ("&name;", "&#NNN;", "&#xHHH;")
 * by the character they denote.
 *
 * src      NUL-terminated input.
 * dest     output buffer; the code explicitly supports dest == src.
 * max      output limit, must be non-zero (asserted).
 * charset  when non-NULL and reported as UTF-8 by hts_isCharsetUTF8(), the
 *          code point is written directly as UTF-8; otherwise it is first
 *          written as UTF-8 into a scratch buffer and converted with
 *          hts_convertStringFromUTF8().
 *
 * Returns 0 on success, -1 when the output limit is hit.
 *
 * How it works: every input byte is copied to dest as it is read.  When an
 * '&' is seen, its input and output positions are remembered.  If the
 * following bytes form a well-formed entity that can be decoded and written,
 * the output position is rewound to where the '&' was copied and the decoded
 * bytes overwrite the raw entity text.  In every other case the raw text
 * that has already been copied is simply left in place.
 *
 * NOTE(review): the copy guard is "j + 1 > max", which lets j reach max, and
 * the terminator is then stored at dest[j].  So dest apparently needs room
 * for max + 1 bytes unless callers pass max as strlen(src) + 1 or similar --
 * confirm the intended meaning of max against the callers.
 */
int hts_unescapeEntitiesWithCharset(const char *src, char *dest,
                                    const size_t max, const char *charset) {
  const size_t noEntity = (size_t) -1;  /* "not inside an entity" marker */
  size_t i;
  size_t j = 0;                   /* write position in dest */
  size_t ampStart = noEntity;     /* position of the pending '&' in src */
  size_t ampStartDest = 0;        /* position where that '&' went in dest */
  int uc = -1;                    /* -1: named entity; >= 0: numeric value */
  int hex = 0;                    /* numeric entity is hexadecimal */
  unsigned int hash = 0;          /* running hash of a named entity */

  assert(max != 0);

  for (i = 0; src[i] != '\0'; i++) {
    const char c = src[i];

    if (c == '&') {
      /* a new candidate entity starts here */
      ampStart = i;
      ampStartDest = j;
      hash = 0;
      uc = -1;
    } else if (ampStart != noEntity) {
      const size_t offset = i - ampStart;   /* distance from the '&' */

      if (offset == 1 && src[ampStart + 1] == '#') {
        /* "&#": numeric entity, decimal until proven otherwise */
        uc = 0;
        hex = 0;
      } else if (offset == 2 && src[ampStart + 1] == '#' && c == 'x') {
        /* "&#x": hexadecimal numeric entity */
        hex = 1;
      } else if (c == ';') {
        /* terminator: resolve the entity */
        if (uc == -1) {
          /* named entity, looked up by hash and name length */
          uc = decode_entity(hash, i - ampStart - 1);
          /* FIXME (kept from the original): temporary hack inherited from a
             previous version, to be investigated -- code point 160 is
             replaced by a plain space (32). */
          if (uc == 160) {
            uc = 32;
          }
        }

        /* whatever happens next, this entity is finished */
        ampStart = noEntity;

        if (uc > 0) {
          const size_t maxOut = max - ampStartDest;
          size_t len = 0;

          if (charset != NULL && hts_isCharsetUTF8(charset)) {
            /* UTF-8 output: write straight over the raw entity text */
            len = hts_writeUTF8(uc, &dest[ampStartDest], maxOut);
          } else {
            /* other charset: UTF-8 scratch copy, then convert */
            char buffer[32];
            const size_t ulen = hts_writeUTF8(uc, buffer, sizeof(buffer));

            if (ulen != 0) {
              char *s;

              buffer[ulen] = '\0';
              s = hts_convertStringFromUTF8(buffer, strlen(buffer), charset);
              if (s != NULL) {
                const size_t sLen = strlen(s);

                if (sLen < maxOut) {
                  /* the trailing NUL is deliberately not copied */
                  memcpy(&dest[ampStartDest], s, sLen);
                  len = sLen;
                }
                free(s);
              }
            }
          }

          if (len > 0) {
            /* resume writing right after the decoded bytes; the ';'
               itself is not copied */
            j = ampStartDest + len;
            continue;
          }
        }
      } else if (uc != -1) {
        /* next digit of a numeric entity; -1 means "not a digit" */
        int digit;

        if (hex) {
          digit = get_hex_value(c);
        } else if (c >= '0' && c <= '9') {
          digit = c - '0';
        } else {
          digit = -1;
        }

        if (digit != -1) {
          uc *= hex ? 16 : 10;
          uc += digit;
        } else {
          /* malformed number: give up on this entity */
          ampStart = noEntity;
        }
      } else {
        /* next character of a named entity: must be alphanumeric and at
           most 10 characters away from the '&' */
        const int isAlnum = (c >= '0' && c <= '9')
                            || (c >= 'A' && c <= 'Z')
                            || (c >= 'a' && c <= 'z');

        if (i <= ampStart + 10 && isAlnum) {
          HASH_ADD(hash, (unsigned char) src[i]);
        } else {
          /* not an entity name: give up on this entity */
          ampStart = noEntity;
        }
      }
    }

    /* plain copy of the current byte */
    if (j + 1 > max) {
      /* output limit reached */
      return -1;
    }
    if (src != dest || i != j) {
      dest[j] = src[i];
    }
    j++;
  }

  dest[j] = '\0';

  return 0;
}
/*
 * Handler for an anchor-like tag: collects the text found between the end of
 * the opening tag and the tag's end string (spec_tag->end_str) into the
 * buffer stored in spec_tag->handler_arg.
 *
 * While copying:
 *   - bytes below ' ' (and non-negative) are dropped;
 *   - runs of spaces collapse into one; buffer->reserve1 == 1 records that
 *     the last byte written was a space;
 *   - the literal "&nbsp;" is handed to decode_entity() and its result is
 *     written (a resulting ' ' takes part in space collapsing);
 *   - nested "<...>" tags are skipped;
 *   - multi-byte characters are copied whole, their length coming from
 *     GBKCHLEN / UTF8CHLEN (2 for any other code type); unless
 *     maparg->is_reserve_indent is set, the GBK sequence A1 A1 and the UTF-8
 *     sequence E3 80 80 are treated as a space;
 *   - a single trailing space is appended if the text did not end with one.
 *
 * Returns:
 *   taginfo->end_position - 1  when the end string is not found;
 *   pointer to the '>' closing the end tag on success (also used when a
 *     nested '<' has no matching '>' before the end string);
 *   NULL  when the buffer runs out of room or the closing '>' is missing.
 * maparg->last_tag_end_position is updated on every path.
 */
const char *tag_handler_anchor(struct taginfo *taginfo, map_arg_t* maparg, spec_tag_t *spec_tag)
{
    buffer_t *anchor_text_buffer = spec_tag->handler_arg;
    const char *block_beg = taginfo->end_position;
    const char *block_end;
    const char *p;
    char *out;
    int off = 0;    /* read offset from block_beg */

    block_end = stristrb(block_beg, maparg->html_content_end_position - block_beg,
                         spec_tag->end_str, strlen(spec_tag->end_str));
    if (block_end == NULL)
    {
        maparg->last_tag_end_position = taginfo->end_position;
        return (taginfo->end_position - 1);
    }

    /* append after whatever the buffer already holds */
    out = anchor_text_buffer->p + anchor_text_buffer->pos;

    while (block_beg + off != block_end)
    {
        const char *cur = block_beg + off;

        /* control characters are dropped */
        if (*cur < ' ' && *cur >= 0)
        {
            off++;
            continue;
        }

        /* plain space: emitted only if the previous byte was not a space */
        if (*cur == ' ')
        {
            off++;
            if (anchor_text_buffer->reserve1 == 1)
                continue;
            *out = ' ';
            anchor_text_buffer->reserve1 = 1;
            goto advance_out;
        }

        /* the only entity that gets decoded is the no-break space */
        if (cur[0] == '&' && cur[1] == 'n' && cur[2] == 'b'
            && cur[3] == 's' && cur[4] == 'p' && cur[5] == ';')
        {
            const char *tp = cur;
            int ret = decode_entity(&tp, block_end);

            if (ret != -1)
            {
                /* tp now points to where reading must resume */
                off = tp - block_beg;
                if (ret == ' ')
                {
                    if (anchor_text_buffer->reserve1 == 1)
                        continue;
                    *out = ' ';
                    anchor_text_buffer->reserve1 = 1;
                }
                else
                {
                    *out = ret;
                    anchor_text_buffer->reserve1 = 0;
                }
                goto advance_out;
            }
            /* decoding failed: the '&' is handled as an ordinary byte below */
        }

        /* nested tag inside the anchor: skip up to and including its '>' */
        if (*cur == '<')
        {
            const char *tag_close = (char *)memchr(cur + 1, '>', block_end - (cur + 1));

            if (tag_close == NULL)
                goto find_end_tag_close;    /* malformed tag: give up on this anchor */
            off += (tag_close - cur + 1);
            continue;
        }

        if ((unsigned char)*cur < 0x80)
        {
            /* single-byte (ASCII) character */
            *out = *cur;
            anchor_text_buffer->reserve1 = 0;
            off++;
            goto advance_out;
        }
        else
        {
            /* multi-byte character */
            int chlen, k;

            if (maparg->codetype == CODETYPE_GBK)
                chlen = GBKCHLEN((unsigned char)block_beg[off]);
            else if (maparg->codetype == CODETYPE_UTF8)
                chlen = UTF8CHLEN((unsigned char)block_beg[off]);
            else
                chlen = 2;

            /* character would cross the end of the block: stop copying */
            if (cur + chlen > block_end)
                break;

            /* full-width space: A1 A1 in GBK, E3 80 80 in UTF-8 */
            if (maparg->is_reserve_indent == 0
                && ((maparg->codetype == CODETYPE_GBK && chlen == 2
                     && cur[0] == '\xa1' && cur[1] == '\xa1')
                    || (maparg->codetype == CODETYPE_UTF8 && chlen == 3
                        && cur[0] == '\xe3' && cur[1] == '\x80' && cur[2] == '\x80')))
            {
                off += chlen;
                if (anchor_text_buffer->reserve1 == 1)
                    continue;
                *out = ' ';
                anchor_text_buffer->reserve1 = 1;
                goto advance_out;
            }

            /* copy every byte of the character, accounting for each one */
            for (k = 0; k < chlen; ++k)
            {
                *out = cur[k];
                ++out;
                anchor_text_buffer->pos++;
                anchor_text_buffer->free--;
                if (anchor_text_buffer->free <= 1)
                    goto stop_parsing;
            }
            anchor_text_buffer->reserve1 = 0;
            off += chlen;
            continue;
        }

advance_out:
        /* one byte was stored at *out: account for it */
        ++out;
        anchor_text_buffer->pos++;
        anchor_text_buffer->free--;
        if (anchor_text_buffer->free <= 1)
            goto stop_parsing;
    }

    /* make sure the collected text ends with a space */
    if (anchor_text_buffer->reserve1 == 0)
    {
        *out = ' ';
        anchor_text_buffer->reserve1 = 1;
        ++out;
        anchor_text_buffer->pos++;
        anchor_text_buffer->free--;
        if (anchor_text_buffer->free == 0)
            goto stop_parsing;
    }

find_end_tag_close:
    /* resume parsing at the '>' that closes the end tag */
    p = memchr(block_end, '>', maparg->html_content_end_position - block_end);
    if (p == NULL)
        goto stop_parsing;
    maparg->last_tag_end_position = p + 1;
    return p;

stop_parsing:
    /* NULL tells the caller that page parsing is over */
    maparg->last_tag_end_position = taginfo->end_position;
    return NULL;
}
/* return NULL -- error,html page parser is end. return (taginfo->end_position - 1) -- normal. return (skip_close_tag_end_position - 1) -- html parse goto skip_close_tag_end_position - 1. arg->text_buffer_p->reserve1 //为1:表示该buffer最后一个字节是空格字符 */ const char * extract_text_mapfun(struct taginfo *taginfo, void *maparg) { const char *p; spec_tag_t *spec_tag = NULL; map_arg_t *arg = (map_arg_t *)maparg; /* process between last tag and this tag text */ /* <last tag>text<this tag> */ /* ^ ^ */ /* src_beg src_end */ { const char *src_beg, *src_end; char *dst; int i, char_cnt; if (arg->last_tag_end_position == NULL) src_beg = taginfo->start_position; else src_beg = arg->last_tag_end_position; src_end = taginfo->start_position; dst = arg->text_buffer_p->p + arg->text_buffer_p->pos; char_cnt = src_end - src_beg; for (i = 0;i < char_cnt;i++) { /* whitespace char */ if (*(src_beg + i) < ' ' && *(src_beg + i) >= 0) continue; // xssfilter时标签之间的特殊字符需要encode if(arg->xssfilter && (*(src_beg + i) == '<' || *(src_beg + i) == '>')) { // < > if (arg->text_buffer_p->free <= 5) { arg->last_tag_end_position = taginfo->end_position; return NULL; /* html page parser is end */ } if(*(src_beg + i) == '<') memcpy(dst,"<",4); if(*(src_beg + i) == '>') memcpy(dst,">",4); dst += 4; arg->text_buffer_p->reserve1 = 0; arg->text_buffer_p->pos += 4; arg->text_buffer_p->free -= 4; if (arg->text_buffer_p->free <= 1) { arg->last_tag_end_position = taginfo->end_position; return NULL; /* html page parser is end */ } continue; } /* space */ if (*(src_beg + i) == ' ') { if (arg->text_buffer_p->reserve1 == 1) continue; *dst = ' '; arg->text_buffer_p->reserve1 = 1; goto dst_add; } /* decode entities only*/ if (!arg->xssfilter && *(src_beg + i) == '&' && *(src_beg + i + 1) == 'n' && *(src_beg + i + 2) == 'b' && *(src_beg + i + 3) == 's' && *(src_beg + i + 4) == 'p' && *(src_beg + i + 5) == ';') { const char *tp = src_beg + i; int ret; ret = decode_entity(&tp, src_end); if (ret != -1) { if (ret == ' ') { if 
(arg->text_buffer_p->reserve1 == 1) { /* */ /* ^ ^tp */ i = tp - src_beg - 1; continue; } *dst = ' '; arg->text_buffer_p->reserve1 = 1; i = tp - src_beg - 1; goto dst_add; } *dst = ret; arg->text_buffer_p->reserve1 = 0; i = tp - src_beg - 1; goto dst_add; } } if (*((unsigned char*)(src_beg + i)) < 0x80) { *dst = *(src_beg + i); arg->text_buffer_p->reserve1 = 0; goto dst_add; } else //multi-byte char { int chlen,cn; if(arg->codetype == CODETYPE_GBK) chlen = GBKCHLEN((unsigned char)src_beg[i]); else if(arg->codetype == CODETYPE_UTF8) chlen = UTF8CHLEN((unsigned char)src_beg[i]); else chlen = 2; if (i + chlen - 1 < char_cnt) //防止*(src_beg+i+1)= '<' { //GBK中中文空格为A1A1,UTF-8中中文空格为E38080 if (arg->is_reserve_indent == 0 && ( \ (arg->codetype == CODETYPE_GBK && chlen == 2 && *(src_beg + i) == '\xa1' && *(src_beg + i + 1) == '\xa1')\ ||(arg->codetype == CODETYPE_UTF8 && chlen == 3 \ && *(src_beg + i) == '\xe3' && *(src_beg + i + 1) == '\x80' && *(src_beg + i + 2) == '\x80') \ )) { if (arg->text_buffer_p->reserve1 == 1) { i += (chlen-1); continue; } else { *dst = ' '; arg->text_buffer_p->reserve1 = 1; i += (chlen -1); goto dst_add; } } else { for(cn=0;cn<chlen;++cn) { *dst = *(src_beg + i + cn); ++dst; arg->text_buffer_p->pos++; arg->text_buffer_p->free--; if (arg->text_buffer_p->free <= 1) { arg->last_tag_end_position = taginfo->end_position; return NULL; /* html page parser is end */ } } arg->text_buffer_p->reserve1 = 0; i += (chlen -1); continue; } } else { //this multi byte char error. 
break; } } dst_add: ++dst; arg->text_buffer_p->pos++; arg->text_buffer_p->free--; if (arg->text_buffer_p->free <= 1) { arg->last_tag_end_position = taginfo->end_position; return NULL; /* html page parser is end */ } } } /* end of process between last tag and this tag text */ /* comment tag or declare tag or close tag ?*/ if (taginfo->end_tag_p == DECLARE_TAG /*|| taginfo->end_tag_p == CLOSE_TAG*/) { arg->last_tag_end_position = taginfo->end_position; return (taginfo->end_position - 1); } /* this tag is special tag ? */ spec_tag = hash_table_get(arg->spec_tag_ht, taginfo->name); if (!spec_tag) /* not a speical tag */ { arg->last_tag_end_position = taginfo->end_position; return (taginfo->end_position - 1); } switch (spec_tag->type) { case SKIP_TAG: if (taginfo->end_tag_p == CLOSE_TAG) { arg->last_tag_end_position = taginfo->end_position; return (taginfo->end_position - 1); } if (spec_tag->handler == NULL) { /* process block tag */ /* <block >....block....</blcok> */ /* ^ ^ ^ */ /* block_beg block_end p */ const char *block_beg, *block_end; block_beg = taginfo->end_position; block_end = stristrb(block_beg, arg->html_content_end_position - block_beg, spec_tag->end_str, strlen(spec_tag->end_str)); if (block_end == NULL) { arg->last_tag_end_position = taginfo->end_position; return (taginfo->end_position - 1); } else { p = memchr(block_end, '>', arg->html_content_end_position - block_end); if (p == NULL) { arg->last_tag_end_position = taginfo->end_position; return NULL; } arg->last_tag_end_position = p + 1; return p; } } else { return spec_tag->handler(taginfo, arg, spec_tag); } case RESERVE_TAG: if (spec_tag->handler == NULL) { int tag_len,k, blen, bpos;; const char *tag_ptr; char tagbuf[1024]; tag_attr_t * tagattr; // 保留标签,但需要过滤掉属性,只保留指定的属性,added @2011.4.10 if(taginfo->end_tag_p == START_TAG) { tagbuf[0] = 0; bpos = 0; blen = snprintf(tagbuf+bpos,sizeof(tagbuf)-bpos,"<%s",taginfo->name); bpos += blen; for(k=0;k<taginfo->nattrs;k++) { tagattr = 
hash_table_get(arg->tag_attr_ht, taginfo->attrs[k].name); if(tagattr) { do { if(strcasecmp(taginfo->name, tagattr->tag_name) == 0) break; else tagattr=tagattr->next; } while(tagattr); if(!tagattr) continue; if(arg->xssfilter) { if( (strcasecmp(taginfo->name,"img") == 0 && strcasecmp(taginfo->attrs[k].name, "src") == 0) ||(strcasecmp(taginfo->name,"a") == 0 && strcasecmp(taginfo->attrs[k].name, "href") == 0)) { if( taginfo->attrs[k].value \ && !strcasebeginwith("http://",taginfo->attrs[k].value) \ && !strcasebeginwith("https://",taginfo->attrs[k].value) ) continue; } } if(taginfo->attrs[k].value) blen = snprintf(tagbuf+bpos,sizeof(tagbuf)-bpos," %s=\"%s\"",taginfo->attrs[k].name,taginfo->attrs[k].value); else blen = snprintf(tagbuf+bpos,sizeof(tagbuf)-bpos," %s=\"%s\"",taginfo->attrs[k].name,taginfo->attrs[k].name); if(blen < 0 || blen >= sizeof(tagbuf)-bpos) { arg->last_tag_end_position = taginfo->end_position; return NULL; } bpos += blen; } } if(strcasecmp(taginfo->name,"br") == 0 || strcasecmp(taginfo->name,"img") == 0) blen = snprintf(tagbuf + bpos, sizeof(tagbuf) - bpos, " />"); else if(arg->xssfilter && strcasecmp(taginfo->name,"a") == 0) blen = snprintf(tagbuf + bpos, sizeof(tagbuf) - bpos, " target=\"_blank\" >"); else blen = snprintf(tagbuf + bpos, sizeof(tagbuf) - bpos, " >"); bpos += blen; tag_len = bpos; tag_ptr = tagbuf; } else { tag_len = taginfo->end_position - taginfo->start_position; tag_ptr = taginfo->start_position; } if (arg->text_buffer_p->free >= tag_len) { memcpy(arg->text_buffer_p->p + arg->text_buffer_p->pos, tag_ptr, tag_len); arg->text_buffer_p->pos += tag_len; arg->text_buffer_p->free -= tag_len; arg->last_tag_end_position = taginfo->end_position; return (taginfo->end_position - 1); } else { arg->last_tag_end_position = taginfo->end_position; return NULL; } } else { return spec_tag->handler(taginfo, arg, spec_tag); } case EXTRACT_TAG: if (taginfo->end_tag_p == CLOSE_TAG) { arg->last_tag_end_position = taginfo->end_position; return 
(taginfo->end_position - 1); } if (spec_tag->handler == NULL) { arg->last_tag_end_position = taginfo->end_position; return (taginfo->end_position - 1); } else { return spec_tag->handler(taginfo, arg, spec_tag); } default: arg->last_tag_end_position = taginfo->end_position; return (taginfo->end_position - 1); } }