Example #1
0
/*
 * get_unicode()
 *
 * Read the next character from s->data starting at index *pi and return
 * its code.  The decoder is chosen from the input flags, in this order:
 *
 *   xmlin and the current byte is '&'  -> decode_entity()
 *   charsetin == CHARSET_GB18030       -> gb18030_decode()
 *   latexin                            -> utf8_decode() for high-bit bytes
 *                                         when utf8in is set, otherwise
 *                                         latex2char()
 *   utf8in                             -> utf8_decode()
 *   otherwise                          -> the raw byte, *pi advanced by one
 *
 * The helper decoders receive pi and are presumably responsible for
 * advancing it past what they consume (their definitions are not in this
 * block).
 *
 * Unless a decoder reported that the value is already Unicode, or the
 * input character set is CHARSET_UNICODE, the value is finally mapped
 * through charset_lookupchar().
 */
static unsigned int
get_unicode( str *s, unsigned int *pi, int charsetin, int latexin, int utf8in, int xmlin )
{
	unsigned int ch;
	/* err is only handed to decode_entity(); its value is not inspected here */
	int unicode = 0, err = 0;
	if ( xmlin && s->data[*pi]=='&' ) {
		ch = decode_entity( s->data, pi, &unicode, &err );
	} else if ( charsetin==CHARSET_GB18030 ) {
		ch = gb18030_decode( s->data, pi );
		unicode = 1;
	} else if ( latexin ) {
		/* Must handle bibtex files in UTF8/Unicode */
		if ( utf8in && ( s->data[*pi] & 128 ) ) {
			ch = utf8_decode( s->data, pi );
			unicode = 1;
		} else ch = latex2char( s->data, pi, &unicode );
	}
	else if ( utf8in )
		ch = utf8_decode( s->data, pi );
	else {
		/*
		 * Go through unsigned char first: where plain char is signed,
		 * converting a byte >= 0x80 straight to unsigned int would
		 * sign-extend it into a huge value instead of 128..255.
		 */
		ch = (unsigned int) (unsigned char) s->data[*pi];
		*pi = *pi + 1;
	}
	if ( !unicode && charsetin!=CHARSET_UNICODE )
		ch = charset_lookupchar( charsetin, ch );
	return ch;
}
Example #2
0
/*
 * Build a new XMLSTRING from the byte range [p, s).
 *
 * Ordinary characters are appended one by one.  When a '&' is met, the
 * cursor (already moved past the '&') and the output string are handed to
 * decode_entity(), which presumably appends the decoded entity and moves
 * the cursor past it -- its definition is not part of this block.
 *
 * The string is created with xmls_new(s - p) and returned to the caller.
 */
static
XMLSTRING make_unescaped_string(char *p, char *s)
{
  XMLSTRING out = xmls_new(s-p);

  for (; p < s; ) {
    char ch = *p;

    /* step past the character before dispatching on it */
    p++;

    if (ch != '&') {
      xmls_add_char(out, ch);
      continue;
    }

    decode_entity(&p, out);
  }

  return out;
}
Example #3
0
/**
 * Copy src to dest, replacing HTML entities ("&name;", "&#NNN;", "&#xHHH;")
 * by the corresponding character.
 *
 * The input is scanned once. Every byte is first copied verbatim; when the
 * ';' closing a recognized entity is reached, the output position is rewound
 * to where the '&' was written and the decoded character is stored there
 * instead. Entities that cannot be decoded are therefore left untouched.
 *
 * The "src != dest || i != j" test below lets src and dest be the same
 * buffer.
 *
 * @param src     NUL-terminated input string.
 * @param dest    Output buffer.
 * @param max     Output capacity used for the overflow checks; must not be 0.
 * @param charset Name of the output charset. When it is non-NULL and
 *                hts_isCharsetUTF8() accepts it, characters are written as
 *                UTF-8; otherwise they are encoded to UTF-8 and converted
 *                with hts_convertStringFromUTF8().
 * @return 0 on success, -1 if the output position reaches max.
 */
int hts_unescapeEntitiesWithCharset(const char *src, char *dest, const size_t max, const char *charset) {
  /* Largest Unicode code point. Numeric references above it do not denote
     a character; capping here also keeps "uc" from overflowing. */
  enum { MAX_CODE_POINT = 0x10FFFF };
  size_t i, j, ampStart, ampStartDest;
  int uc;
  int hex;
  unsigned int hash;

  assert(max != 0);
  /* i: read index in src, j: write index in dest.
     ampStart: index of the pending '&' in src, or (size_t) -1 if none.
     ampStartDest: where that '&' was written in dest.
     uc: -1 while reading a named entity, else the numeric value so far. */
  for(i = 0, j = 0, ampStart = (size_t) -1, ampStartDest = 0,
        uc = -1, hex = 0, hash = 0 ; src[i] != '\0' ; i++) {
    /* start of entity */
    if (src[i] == '&') {
      ampStart = i;
      ampStartDest = j;
      hash = 0;
      uc = -1;
    }
    /* inside a potential entity */
    else if (ampStart != (size_t) -1) {
      /* &#..; entity */
      if (ampStart + 1 == i && src[ampStart + 1] == '#') {
        uc = 0;
        hex = 0;
      }
      /* &#x..; entity */
      else if (ampStart + 2 == i && src[ampStart + 1] == '#'
               && src[ampStart + 2] == 'x') {
        hex = 1;
      }
      /* end of entity */
      else if (src[i] == ';') {
        size_t len;

        /* decode entity */
        if (uc == -1) {
          /* &foo; : looked up from the hash of its name and its length */
          uc = decode_entity(hash, /*&src[ampStart + 1],*/
                             i - ampStart - 1);
          /* FIXME: TEMPORARY HACK FROM PREVIOUS VERSION TO BE INVESTIGATED */
          /* 160 is replaced by 32 (a plain space) */
          if (uc == 160) {
            uc = 32;
          }
        }

        /* end */
        ampStart = (size_t) -1;

        /* success ? */
        if (uc > 0) {
          const size_t maxOut = max - ampStartDest;
          /* write at position */
          if (charset != NULL && hts_isCharsetUTF8(charset)) {
            len = hts_writeUTF8(uc, &dest[ampStartDest], maxOut);
          } else {
            size_t ulen;
            char buffer[32];
            len = 0;
            if ( ( ulen = hts_writeUTF8(uc, buffer, sizeof(buffer)) ) != 0) {
              char *s;
              buffer[ulen] = '\0';
              s = hts_convertStringFromUTF8(buffer, strlen(buffer), charset);
              if (s != NULL) {
                const size_t sLen = strlen(s);
                if (sLen < maxOut) {
                  /* Do not copy \0. */
                  memcpy(&dest[ampStartDest], s, sLen);
                  len = sLen;
                }
                /* the converted string is released here with free() */
                free(s);
              }
            }
          }
          if (len > 0) {
            /* new dest position */
            j = ampStartDest + len;
            /* do not copy ; */
            continue;
          }
        }
        /* not decoded: fall through so that ';' is copied like the rest */
      }
      /* numerical entity */
      else if (uc != -1) {
        /* decimal */
        if (!hex) {
          if (src[i] >= '0' && src[i] <= '9') {
            const int h = src[i] - '0';
            /* uc <= MAX_CODE_POINT here, so this cannot overflow */
            uc *= 10;
            uc += h;
            if (uc > MAX_CODE_POINT) {
              /* not a character: abandon, entity is kept verbatim */
              ampStart = (size_t) -1;
            }
          } else {
            /* abandon */
            ampStart = (size_t) -1;
          }
        }
        /* hex */
        else {
          const int h = get_hex_value(src[i]);
          if (h != -1) {
            /* uc <= MAX_CODE_POINT here, so this cannot overflow */
            uc *= 16;
            uc += h;
            if (uc > MAX_CODE_POINT) {
              /* not a character: abandon, entity is kept verbatim */
              ampStart = (size_t) -1;
            }
          } else {
            /* abandon */
            ampStart = (size_t) -1;
          }
        }
      }
      /* alphanumerical entity */
      else {
        /* alphanum and not too far ('&thetasym;' is the longest) */
        if (i <= ampStart + 10 &&
            (
             (src[i] >= '0' && src[i] <= '9')
             || (src[i] >= 'A' && src[i] <= 'Z')
             || (src[i] >= 'a' && src[i] <= 'z')
             )
            ) {
          /* compute hash */
          HASH_ADD(hash, (unsigned char) src[i]);
        } else {
          /* abandon */
          ampStart = (size_t) -1;
        }
      }
    }

    /* copy */
    if (j + 1 > max) {
      /* overflow */
      return -1;
    }
    /* skip the store when copying in place onto the very same byte */
    if (src != dest || i != j) {
      dest[j] = src[i];
    }
    j++;
  }
  /* NOTE(review): j may be equal to max at this point, so the terminator can
     land at dest[max]; confirm that callers provide max + 1 bytes. */
  dest[j] = '\0';

  return 0;
}
Example #4
0
/*
 * Tag handler that collects the text found between a start tag and its
 * end string (spec_tag->end_str) into the buffer stored in
 * spec_tag->handler_arg.
 *
 * While copying:
 *   - bytes in the range 0 .. ' '-1 are dropped;
 *   - consecutive spaces are collapsed into one (buffer->reserve1 == 1
 *     records that the last byte written is a space);
 *   - "&nbsp;" is handed to decode_entity();
 *   - nested "<...>" tags are skipped;
 *   - multi-byte characters are copied whole; a full-width space is turned
 *     into a plain space when maparg->is_reserve_indent == 0.
 * A single space is appended after the text if it does not already end
 * with one.
 *
 * Returns:
 *   taginfo->end_position - 1  when the end string is not found;
 *   NULL                       when the buffer is (nearly) full or no '>'
 *                              follows the end string;
 *   pointer to the '>' closing the end tag otherwise.
 * maparg->last_tag_end_position is updated on every return path.
 */
const char *tag_handler_anchor(struct taginfo *taginfo, map_arg_t* maparg, spec_tag_t *spec_tag)
{
  const char *block_beg, *block_end, *p;
  buffer_t *anchor_text_buffer = spec_tag->handler_arg;

  /* <tag ...>....text....</tag>       */
  /*          ^           ^            */
  /*       block_beg   block_end       */
  block_beg = taginfo->end_position;
  /* stristrb: presumably a case-insensitive bounded search -- defined elsewhere */
  block_end = stristrb(block_beg, maparg->html_content_end_position - block_beg, spec_tag->end_str, strlen(spec_tag->end_str));
  if (block_end == NULL)
  {
    /* no end string: treat the start tag like an ordinary tag */
    maparg->last_tag_end_position = taginfo->end_position;
    return (taginfo->end_position - 1);
  }
  else
  {
    /* copy anchor text to anchor_text_buffer */
    int n = 0;              /* read offset from block_beg */
    char * ank_dst;         /* write cursor inside the buffer */
    ank_dst = anchor_text_buffer->p + anchor_text_buffer->pos;
    while (block_beg + n != block_end)
    {
      /* whitespace char */
      if (*(block_beg + n) < ' ' && *(block_beg + n) >= 0)
      {
        n++;
        continue;
      }

      /* space: written only if the previous output byte is not a space */
      if (*(block_beg + n) == ' ')
      {
        if (anchor_text_buffer->reserve1 == 1)
        {
          n++;
          continue;
        }
        else
        {
          *ank_dst = ' ';
          anchor_text_buffer->reserve1 = 1;
          n++;
          goto ank_dst_add;
        }
      }

      /* decode entities &nbsp; only */
      if (*(block_beg + n) == '&' && *(block_beg + n + 1) == 'n' &&
          *(block_beg + n + 2) == 'b' && *(block_beg + n + 3) == 's' && 
          *(block_beg + n + 4) == 'p' && *(block_beg + n + 5) == ';')
      {
        const char *tp = block_beg + n;
        int ret;
        /* decode_entity presumably advances tp past the entity -- TODO confirm */
        ret = decode_entity(&tp, block_end);
        if (ret != -1)
        {
          if (ret == ' ')
          {
            if (anchor_text_buffer->reserve1 == 1)
            {
              /* &nbsp; */
              /* ^     ^tp */
              n = tp - block_beg;
              continue;
            }
            *ank_dst = ' ';
            anchor_text_buffer->reserve1 = 1;
            n = tp - block_beg;
            goto ank_dst_add;
          }

          *ank_dst = ret;
          anchor_text_buffer->reserve1 = 0;
          n = tp - block_beg;
          goto ank_dst_add;
        }
        /* ret == -1: fall through, '&' is then copied as a plain character */
      }

      /* skip html tag between <a>...</a> */
      if (*(block_beg + n) == '<')
      {
        char *close_tag;
        close_tag = (char *)memchr(block_beg + n + 1, '>', block_end - (block_beg + n + 1));
        if (close_tag == NULL) /* error html tag,skip this anchor tag */
        {
          p = memchr(block_end, '>', maparg->html_content_end_position - block_end);
          if (p == NULL)
          {
            maparg->last_tag_end_position = taginfo->end_position;
            return NULL;
          }
          maparg->last_tag_end_position = p + 1;
          return p;
        }

        /* jump to the byte following the nested tag's '>' */
        n += (close_tag - (block_beg + n) + 1);
        continue;
      }

      /* ascii char */
      if (*((unsigned char *)(block_beg + n)) < 0x80)
      {
        *ank_dst = *(block_beg + n);
        anchor_text_buffer->reserve1 = 0;
        n++;
        goto ank_dst_add;
      }
      else  /* multi-byte char */
      {
        int chlen,cn;

        /* character length from the lead byte; 2 bytes assumed for other encodings */
        if(maparg->codetype == CODETYPE_GBK)
          chlen = GBKCHLEN((unsigned char)block_beg[n]);
        else if(maparg->codetype == CODETYPE_UTF8)
          chlen = UTF8CHLEN((unsigned char)block_beg[n]);
        else
          chlen = 2;

        if (block_beg + n + chlen <= block_end) {
          /* the full-width space is A1 A1 in GBK and E3 80 80 in UTF-8 */
          if (maparg->is_reserve_indent == 0 && ( \
            (maparg->codetype == CODETYPE_GBK  && chlen == 2 && *(block_beg + n) == '\xa1' && *(block_beg + n + 1) == '\xa1')\
          ||(maparg->codetype == CODETYPE_UTF8 && chlen == 3 \
            && *(block_beg + n) == '\xe3' && *(block_beg + n + 1) == '\x80' && *(block_beg + n + 2) == '\x80')))
          {
              /* handled exactly like an ASCII space */
              if (anchor_text_buffer->reserve1 == 1)
              {
                n += chlen;
                continue;
              }
              else
              {
                *ank_dst = ' ';
                anchor_text_buffer->reserve1 = 1;
                n += chlen;
                goto ank_dst_add;
              }
          } else {
            /* copy the character byte by byte, checking the room left after each */
            for(cn=0;cn<chlen;++cn) {
              *ank_dst = *(block_beg + n + cn);
              ++ank_dst;
              anchor_text_buffer->pos++;
              anchor_text_buffer->free--;
              if (anchor_text_buffer->free <= 1)
              {
                maparg->last_tag_end_position = taginfo->end_position;
                return NULL; /* html page parser is end */
              }
            }
            
            anchor_text_buffer->reserve1 = 0;
            n += chlen;
            continue;
          }
        } else { //this multi byte char error.
          /* the character would run past block_end: stop copying */
          break;
        }
      }

    /* one byte has just been stored at *ank_dst: account for it */
    ank_dst_add:
      ++ank_dst;
      anchor_text_buffer->pos++;
      anchor_text_buffer->free--;
      if (anchor_text_buffer->free <= 1)
      {
        maparg->last_tag_end_position = taginfo->end_position;
        return NULL; /* html page parser is end */
      }
    }/* end for while */

    /* terminate the collected text with a single space */
    if (anchor_text_buffer->reserve1 == 0)
    {
      *ank_dst = ' ';
      anchor_text_buffer->reserve1 = 1;
      ++ank_dst;
      anchor_text_buffer->pos++;
      anchor_text_buffer->free--;
      /* NOTE(review): tests == 0 here while the loop above tests <= 1 -- confirm intent */
      if (anchor_text_buffer->free == 0)
      {
        maparg->last_tag_end_position = taginfo->end_position;
        return NULL; /* html page parser is end */
      }
    }

    /* resume parsing at the '>' that closes the end tag */
    p = memchr(block_end, '>', maparg->html_content_end_position - block_end);
    if (p == NULL)
    {
      maparg->last_tag_end_position = taginfo->end_position;
      return NULL;
    }
    maparg->last_tag_end_position = p + 1;
    return p;
  }
}
Example #5
0
/*
 * Per-tag callback of the HTML text extractor.
 *
 * First copies the text lying between the previous tag and this one into
 * arg->text_buffer_p (dropping control bytes, collapsing spaces, optionally
 * escaping '<' and '>' when arg->xssfilter is set), then handles the tag
 * itself according to its entry in arg->spec_tag_ht:
 *   SKIP_TAG     the whole block up to the end string is skipped;
 *   RESERVE_TAG  the tag is copied to the output; for a start tag only the
 *                attributes found in arg->tag_attr_ht for this tag name are
 *                kept;
 *   EXTRACT_TAG  nothing is emitted for the tag itself.
 * A non-NULL spec_tag->handler takes over the processing of the tag.
 *
 * return NULL -- error, html page parsing ends.
 * return (taginfo->end_position - 1) -- normal.
 * return (skip_close_tag_end_position - 1) -- parsing resumes at
 *        skip_close_tag_end_position - 1.
 *
 * arg->text_buffer_p->reserve1 == 1 means that the last byte written to the
 * buffer is a space.
 */
const char * extract_text_mapfun(struct taginfo *taginfo, void *maparg)
{
  const char *p;
  spec_tag_t *spec_tag = NULL;
  map_arg_t *arg = (map_arg_t *)maparg;

  /* process between last tag and this tag text */
  /*  <last tag>text<this tag> */
  /*            ^   ^          */
  /*        src_beg src_end    */
  {
    const char *src_beg, *src_end;
    char *dst;
    int i, char_cnt;

    if (arg->last_tag_end_position == NULL)
      src_beg = taginfo->start_position;
    else
      src_beg = arg->last_tag_end_position;

    src_end = taginfo->start_position;
    dst = arg->text_buffer_p->p + arg->text_buffer_p->pos;
    char_cnt = src_end - src_beg;

    for (i = 0;i < char_cnt;i++)
    {
      /* whitespace char */
      if (*(src_beg + i) < ' ' && *(src_beg + i) >= 0)
        continue;

      /* with xssfilter set, '<' and '>' found between tags are encoded */
      if(arg->xssfilter && (*(src_beg + i) == '<' || *(src_beg + i) == '>')) { // &lt; &gt;
        if (arg->text_buffer_p->free <= 5) {
          arg->last_tag_end_position = taginfo->end_position;
          return NULL; /* html page parser is end */
        }
        if(*(src_beg + i) == '<') memcpy(dst,"&lt;",4);
        if(*(src_beg + i) == '>') memcpy(dst,"&gt;",4);
        dst += 4;
        arg->text_buffer_p->reserve1 = 0;
        arg->text_buffer_p->pos += 4;
        arg->text_buffer_p->free -= 4;
        if (arg->text_buffer_p->free <= 1) {
          arg->last_tag_end_position = taginfo->end_position;
          return NULL; /* html page parser is end */
        }
        continue;
      }

      /* space: written only if the previous output byte is not a space */
      if (*(src_beg + i) == ' ') {
        if (arg->text_buffer_p->reserve1 == 1)
          continue;
        *dst = ' ';
        arg->text_buffer_p->reserve1 = 1;
        goto dst_add;
      }

      /* decode entities &nbsp; only*/
      if (!arg->xssfilter && *(src_beg + i) == '&' && *(src_beg + i + 1) == 'n'
        && *(src_beg + i + 2) == 'b' && *(src_beg + i + 3) == 's'
        && *(src_beg + i + 4) == 'p' && *(src_beg + i + 5) == ';')
      {
        const char *tp = src_beg + i;
        int ret;
        ret = decode_entity(&tp, src_end);
        if (ret != -1)
        {
          /* "- 1" below compensates for the i++ of the for loop */
          if (ret == ' ')
          {
            if (arg->text_buffer_p->reserve1 == 1)
            {
              /* &nbsp; */
              /* ^     ^tp */
              i = tp - src_beg - 1;
              continue;
            }
            *dst = ' ';
            arg->text_buffer_p->reserve1 = 1;
            i = tp - src_beg - 1;
            goto dst_add;
          }

          *dst = ret;
          arg->text_buffer_p->reserve1 = 0;
          i = tp - src_beg - 1;
          goto dst_add;
        }
      }

      if (*((unsigned char*)(src_beg + i)) < 0x80)
      {
        *dst = *(src_beg + i);
        arg->text_buffer_p->reserve1 = 0;
        goto dst_add;
      }
      else /* multi-byte char */
      {
        int chlen,cn;

        /* character length from the lead byte; 2 bytes assumed for other encodings */
        if(arg->codetype == CODETYPE_GBK)
          chlen = GBKCHLEN((unsigned char)src_beg[i]);
        else if(arg->codetype == CODETYPE_UTF8)
          chlen = UTF8CHLEN((unsigned char)src_beg[i]);
        else
          chlen = 2;

        /* the whole character must lie before src_end, i.e. it must not
           run into the '<' of the current tag */
        if (i + chlen - 1 < char_cnt)
        {
          /* the full-width space is A1 A1 in GBK and E3 80 80 in UTF-8 */
          if (arg->is_reserve_indent == 0 && ( \
            (arg->codetype == CODETYPE_GBK  && chlen == 2 && *(src_beg + i) == '\xa1' && *(src_beg + i + 1) == '\xa1')\
          ||(arg->codetype == CODETYPE_UTF8 && chlen == 3 \
            && *(src_beg + i) == '\xe3' && *(src_beg + i + 1) == '\x80' && *(src_beg + i + 2) == '\x80') \
          )) {
            if (arg->text_buffer_p->reserve1 == 1) {
              i += (chlen-1);
              continue;
            } else {
              *dst = ' ';
              arg->text_buffer_p->reserve1 = 1;
              i += (chlen -1);
              goto dst_add;
            }
          } else {
            /* copy the character byte by byte, checking the room left after each */
            for(cn=0;cn<chlen;++cn) {
              *dst = *(src_beg + i + cn);
              ++dst;
              arg->text_buffer_p->pos++;
              arg->text_buffer_p->free--;
              if (arg->text_buffer_p->free <= 1) {
                arg->last_tag_end_position = taginfo->end_position;
                return NULL; /* html page parser is end */
              }
            }
            arg->text_buffer_p->reserve1 = 0;
            i += (chlen -1);
            continue;
          }
        } else { //this multi byte char error.
          break;
        }
      }

    /* one byte has just been stored at *dst: account for it */
    dst_add:
      ++dst;
      arg->text_buffer_p->pos++;
      arg->text_buffer_p->free--;
      if (arg->text_buffer_p->free <= 1) {
        arg->last_tag_end_position = taginfo->end_position;
        return NULL; /* html page parser is end */
      }
    }
  } /* end of process between last tag and this tag text */

  /* comment tag or declare tag or  close tag ?*/
  if (taginfo->end_tag_p == DECLARE_TAG /*|| taginfo->end_tag_p == CLOSE_TAG*/)
  {
    arg->last_tag_end_position = taginfo->end_position;
    return (taginfo->end_position - 1);
  }

  /* this tag is special tag ? */
  spec_tag = hash_table_get(arg->spec_tag_ht, taginfo->name);
  if (!spec_tag)  /* not a special tag */
  {
    arg->last_tag_end_position = taginfo->end_position;
    return (taginfo->end_position - 1);
  }

  switch (spec_tag->type)
  {
    case SKIP_TAG:
      if (taginfo->end_tag_p == CLOSE_TAG)
      {
        arg->last_tag_end_position = taginfo->end_position;
        return (taginfo->end_position - 1);
      }

      if (spec_tag->handler == NULL)
      {
        /* process block tag */
        /* <block >....block....</block> */
        /*         ^            ^      ^ */
        /*        block_beg    block_end p */
        const char *block_beg, *block_end;
        block_beg = taginfo->end_position;
        block_end = stristrb(block_beg, arg->html_content_end_position - block_beg, spec_tag->end_str, strlen(spec_tag->end_str));
        if (block_end == NULL)
        {
          arg->last_tag_end_position = taginfo->end_position;
          return (taginfo->end_position - 1);
        }
        else
        {
          p = memchr(block_end, '>', arg->html_content_end_position - block_end);
          if (p == NULL)
          {
            arg->last_tag_end_position = taginfo->end_position;
            return NULL;
          }

          arg->last_tag_end_position = p + 1;
          return p;
        }
      }
      else
      {
        return spec_tag->handler(taginfo, arg, spec_tag);
      }

    case RESERVE_TAG:
      if (spec_tag->handler == NULL)
      {
        int tag_len, k, blen, bpos;
        const char *tag_ptr;
        char tagbuf[1024];
        tag_attr_t * tagattr;

        /* Keep the tag but filter its attributes: only the attributes
           registered for this tag are retained (added 2011-04-10). */
        if(taginfo->end_tag_p == START_TAG) {
          tagbuf[0] = 0;
          bpos = 0;
          blen = snprintf(tagbuf+bpos,sizeof(tagbuf)-bpos,"<%s",taginfo->name);
          /* snprintf returns the length it WANTED to write; on truncation
             bpos would move past the end of tagbuf, so give up instead */
          if(blen < 0 || (size_t)blen >= sizeof(tagbuf)-bpos) {
            arg->last_tag_end_position = taginfo->end_position;
            return NULL;
          }
          bpos += blen;

          for(k=0;k<taginfo->nattrs;k++) {
            tagattr = hash_table_get(arg->tag_attr_ht, taginfo->attrs[k].name);
            if(tagattr)
            {
              /* walk the chain of entries sharing this attribute name until
                 the one registered for the current tag is found */
              do {
                if(strcasecmp(taginfo->name, tagattr->tag_name) == 0)
                  break;
                else
                  tagattr=tagattr->next;
              } while(tagattr);

              if(!tagattr)
                continue;

              /* with xssfilter set, img/src and a/href are dropped unless
                 their value starts with http:// or https:// */
              if(arg->xssfilter) {
                if( (strcasecmp(taginfo->name,"img") == 0 && strcasecmp(taginfo->attrs[k].name, "src") == 0)
                  ||(strcasecmp(taginfo->name,"a") == 0 && strcasecmp(taginfo->attrs[k].name, "href") == 0)) {
                  if(  taginfo->attrs[k].value \
                    && !strcasebeginwith("http://",taginfo->attrs[k].value) \
                    && !strcasebeginwith("https://",taginfo->attrs[k].value) )
                    continue;
                }
              }

              /* a value-less attribute is written as name="name" */
              if(taginfo->attrs[k].value)
                blen = snprintf(tagbuf+bpos,sizeof(tagbuf)-bpos," %s=\"%s\"",taginfo->attrs[k].name,taginfo->attrs[k].value);
              else
                blen = snprintf(tagbuf+bpos,sizeof(tagbuf)-bpos," %s=\"%s\"",taginfo->attrs[k].name,taginfo->attrs[k].name);

              if(blen < 0 || (size_t)blen >= sizeof(tagbuf)-bpos) {
                arg->last_tag_end_position = taginfo->end_position;
                return NULL;
              }

              bpos += blen;
            }
          }

          if(strcasecmp(taginfo->name,"br") == 0 || strcasecmp(taginfo->name,"img") == 0)
            blen = snprintf(tagbuf + bpos, sizeof(tagbuf) - bpos, " />");
          else if(arg->xssfilter && strcasecmp(taginfo->name,"a") == 0)
            blen = snprintf(tagbuf + bpos, sizeof(tagbuf) - bpos, " target=\"_blank\" >");
          else
            blen = snprintf(tagbuf + bpos, sizeof(tagbuf) - bpos, " >");
          /* the closing part must fit too, otherwise tag_len would exceed
             tagbuf and the memcpy below would read past its end */
          if(blen < 0 || (size_t)blen >= sizeof(tagbuf)-bpos) {
            arg->last_tag_end_position = taginfo->end_position;
            return NULL;
          }
          bpos += blen;

          tag_len = bpos;
          tag_ptr = tagbuf;
        } else {
          /* not a start tag: copied verbatim from the source */
          tag_len = taginfo->end_position - taginfo->start_position;
          tag_ptr = taginfo->start_position;
        }

        if (arg->text_buffer_p->free >= tag_len) {
          memcpy(arg->text_buffer_p->p + arg->text_buffer_p->pos, tag_ptr, tag_len);
          arg->text_buffer_p->pos += tag_len;
          arg->text_buffer_p->free -= tag_len;
          arg->last_tag_end_position = taginfo->end_position;
          return (taginfo->end_position - 1);
        } else {
          arg->last_tag_end_position = taginfo->end_position;
          return NULL;
        }
      }
      else
      {
        return spec_tag->handler(taginfo, arg, spec_tag);
      }

    case EXTRACT_TAG:
      if (taginfo->end_tag_p == CLOSE_TAG)
      {
        arg->last_tag_end_position = taginfo->end_position;
        return (taginfo->end_position - 1);
      }

      if (spec_tag->handler == NULL)
      {
        arg->last_tag_end_position = taginfo->end_position;
        return (taginfo->end_position - 1);
      }
      else
      {
        return spec_tag->handler(taginfo, arg, spec_tag);
      }
    default:
      arg->last_tag_end_position = taginfo->end_position;
      return (taginfo->end_position - 1);
  }
}