Ejemplo n.º 1
0
/*
 * Run the HTML tag mapper over HTML_CONTENT restricted to a single
 * tag/attribute pair, for example:
 *
 *    <base href="...">   tag "base", attribute "href"
 *    <a href="...">      tag "a",    attribute "href"
 *    <img src="...">     tag "img",  attribute "src"
 *
 * Parameters:
 *   html_content, content_len   - the document text and its length,
 *                                 handed to map_html_tags() unchanged.
 *   interesting_tag             - the tag name to look for.
 *   interesting_tag_attr        - the attribute name to look for.
 *   interesting_tag_attr_buf_p  - output buffer handed to the callback.
 *                                 Per the original note, the result is
 *                                 presumably "Url\nUrl\n...\n" (one value
 *                                 per line); the writing happens in
 *                                 extract_interesting_tag_attr_mapfun,
 *                                 which is not shown here -- confirm there.
 *   count                       - forwarded to the callback; a value of 0
 *                                 is rejected up front.  Presumably a
 *                                 limit on how many values to collect --
 *                                 TODO confirm against the callback.
 *
 * Returns 0 when the callback state ends up with is_ok set, and -1
 * otherwise (including when count is 0).
 */
int extract_interesting_tag_attr(char *html_content, int content_len,
                                 char *interesting_tag, char *interesting_tag_attr,
                                 buffer_t *interesting_tag_attr_buf_p, int count)
{
  struct interesting_tag_attr cb_state;
  struct hash_table *tag_filter;
  struct hash_table *attr_filter;

  /* Nothing was asked for, so report failure without parsing. */
  if (count == 0)
    return -1;

  /* State shared with the per-tag callback. */
  cb_state.interesting_tag_attr = interesting_tag_attr;
  cb_state.interesting_tag_attr_buf_p = interesting_tag_attr_buf_p;
  cb_state.is_ok = 0;
  cb_state.count = count;

  /* Case-insensitive, single-entry filter tables for the mapper: one keyed
     by the tag name, the other keyed by the attribute name. */
  tag_filter = make_nocase_string_hash_table(1);
  attr_filter = make_nocase_string_hash_table(1);
  hash_table_put(tag_filter, interesting_tag, interesting_tag_attr);
  hash_table_put(attr_filter, interesting_tag_attr, interesting_tag);

  map_html_tags(html_content, content_len,
                extract_interesting_tag_attr_mapfun, &cb_state,
                MHT_TRIM_VALUES, tag_filter, attr_filter);

  /* NOTE(review): both tables were already used above, so these NULL
     checks look redundant; they are kept as in the original. */
  if (tag_filter)
    hash_table_destroy(tag_filter);
  if (attr_filter)
    hash_table_destroy(attr_filter);

  return cb_state.is_ok ? 0 : -1;
}
Ejemplo n.º 2
0
struct urlpos *
get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
               struct iri *iri)
{
  struct file_memory *fm;
  struct map_context ctx;
  int flags;

  /* Load the file. */
  fm = wget_read_file (file);
  if (!fm)
    {
      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
      return NULL;
    }
  DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));

  ctx.text = fm->content;
  ctx.head = NULL;
  ctx.base = NULL;
  ctx.parent_base = url ? url : opt.base_href;
  ctx.document_file = file;
  ctx.nofollow = false;

  if (!interesting_tags)
    init_interesting ();

  /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
     generate <a href=" foo"> instead of <a href="foo"> (browsers
     ignore spaces as well.)  If you really mean space, use &32; or
     %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
     e.g. in <img src="foo.[newline]html">.  Such newlines are also
     ignored by IE and Mozilla and are presumably introduced by
     writing HTML with editors that force word wrap.  */
  flags = MHT_TRIM_VALUES;
  if (opt.strict_comments)
    flags |= MHT_STRICT_COMMENTS;

  /* the NULL here used to be interesting_tags */
  map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
                 NULL, interesting_attributes);

  /* If meta charset isn't null, override content encoding */
  if (iri && meta_charset)
    set_content_encoding (iri, meta_charset);

  DEBUGP (("no-follow in %s: %d\n", file, ctx.nofollow));
  if (meta_disallow_follow)
    *meta_disallow_follow = ctx.nofollow;

  xfree_null (ctx.base);
  wget_read_file_free (fm);
  return ctx.head;
}
Ejemplo n.º 3
0
/*
  Run the HTML tag mapper over html_content with extract_text_mapfun and
  then append whatever text follows the last tag to text_buffer_p.

  Input shapes handled after the mapper returns:
    - pure text (the last-tag position was never moved): the whole input
      is appended;
    - text <tag> (the last tag ends exactly at the end of input): nothing
      further is appended;
    - <tag> text (text trails the last tag): the trailing text is appended.

  Anything before the last tag is left to extract_text_mapfun, which is
  not shown here.  A copy is made only when text_buffer_p->free is large
  enough; otherwise the text is skipped.

  spec_tag_ht, is_reserve_indent, codetype and xssfilter are only stored in
  the callback state; tag_attr_ht is also handed to map_html_tags as the
  attribute table.
*/
void extract_text(const char *html_content, int content_len, buffer_t *text_buffer_p, const struct hash_table *spec_tag_ht, const struct hash_table *tag_attr_ht, int is_reserve_indent,int codetype,int xssfilter)
{
  map_arg_t state;
  int tail_len;

  /* NOTE(review): the meaning of reserve1 is not visible here; the
     original sets it to 1 before parsing. */
  text_buffer_p->reserve1 = 1;

  state.text_buffer_p = text_buffer_p;
  state.spec_tag_ht = spec_tag_ht;
  state.tag_attr_ht = tag_attr_ht;
  state.is_reserve_indent = is_reserve_indent;
  state.codetype = codetype;
  state.xssfilter = xssfilter;
  state.html_content_end_position = html_content + content_len;
  state.last_tag_end_position = html_content; /* start of input until a tag is seen */

  /* Note: the interesting-tag table is NULL, which means every tag is
     considered interesting. */
  map_html_tags(html_content, content_len, extract_text_mapfun, &state, MHT_TRIM_VALUES, NULL, tag_attr_ht);

  /* Pure text: the last-tag position still points at the start of input. */
  if (state.last_tag_end_position == html_content && text_buffer_p->free >= content_len)
  {
    memcpy(text_buffer_p->p + text_buffer_p->pos, html_content, content_len);
    text_buffer_p->pos += content_len;
    text_buffer_p->free -= content_len;
    return;
  }

  /* The last tag closes the input: there is no trailing text. */
  if (state.last_tag_end_position == state.html_content_end_position)
    return;

  /* <tag> text: append the trailing text if it fits. */
  tail_len = state.html_content_end_position - state.last_tag_end_position;
  if (text_buffer_p->free < tail_len)
    return;

  memcpy(text_buffer_p->p + text_buffer_p->pos, state.last_tag_end_position, tail_len);
  text_buffer_p->pos += tail_len;
  text_buffer_p->free -= tail_len;
}