Пример #1
0
/* Collect the values of a single attribute of a single tag while
   scanning an HTML document.  Typical tag/attribute pairs:

     <BASE HREF="http://msdn.microsoft.com/workshop/author/dhtml/reference/"/>
     <a href=""> text </a>
     <img src="" >

   HTML_CONTENT / CONTENT_LEN        the document handed to map_html_tags().
   INTERESTING_TAG                   the only tag registered as interesting.
   INTERESTING_TAG_ATTR              the only attribute registered as interesting.
   INTERESTING_TAG_ATTR_BUF_P, COUNT forwarded untouched to the callback
                                     extract_interesting_tag_attr_mapfun()
                                     through a struct interesting_tag_attr.

   According to the original note, the buffer ends up holding the
   collected values as "Url\nUrl\n...".

   Returns 0 when the callback left is_ok non-zero, and -1 otherwise.
   A COUNT of 0 yields -1 immediately, before any table is built.  */
int extract_interesting_tag_attr(char *html_content, int content_len,
                                 char *interesting_tag, char *interesting_tag_attr,
                                 buffer_t *interesting_tag_attr_buf_p, int count)
{
  struct interesting_tag_attr ctx;
  struct hash_table *tag_filter;
  struct hash_table *attr_filter;

  if (count == 0)
    return -1;

  /* State shared with the per-tag callback.  */
  ctx.interesting_tag_attr = interesting_tag_attr;
  ctx.interesting_tag_attr_buf_p = interesting_tag_attr_buf_p;
  ctx.is_ok = 0;
  ctx.count = count;

  /* One-entry filter tables: tag -> attribute and attribute -> tag.  */
  tag_filter = make_nocase_string_hash_table(1);
  attr_filter = make_nocase_string_hash_table(1);
  hash_table_put(tag_filter, interesting_tag, interesting_tag_attr);
  hash_table_put(attr_filter, interesting_tag_attr, interesting_tag);

  map_html_tags(html_content, content_len,
                extract_interesting_tag_attr_mapfun, &ctx,
                MHT_TRIM_VALUES, tag_filter, attr_filter);

  /* The filter tables are only needed for the duration of the walk.  */
  if (tag_filter)
    hash_table_destroy(tag_filter);
  if (attr_filter)
    hash_table_destroy(attr_filter);

  return ctx.is_ok ? 0 : -1;
}
Пример #2
0
// Build the two lookup tables used by the text extractor.
//
// *spec_tag_ht_pp receives a table mapping each spec_tag_list[i].tag_name
// to its spec_tag_list entry.
//
// *tag_attr_ht_pp receives a table mapping each attribute name to the
// FIRST tag_attr_list entry carrying that name.  Later entries with the
// same name are chained behind it through ->next, and the first entry's
// ->tail always points at the last entry of its chain.
// NOTE(review): ->next of a chain's last entry is never written here, so
// the caller presumably supplies entries with next already cleared -- confirm.
//
// tag_attr_list must not be empty: per the original note, an empty list
// means every attribute is reported during parsing.  To get none, pass a
// made-up attribute name that does not exist.
void extract_text_init(spec_tag_t *spec_tag_list, int tag_list_size, struct hash_table ** spec_tag_ht_pp,
                       tag_attr_t *tag_attr_list, int attr_list_size, struct hash_table ** tag_attr_ht_pp)
{
  struct hash_table *table;
  tag_attr_t *entry;
  tag_attr_t *head;
  int idx;

  // Tag table: name -> descriptor.
  table = make_nocase_string_hash_table(tag_list_size);
  for (idx = 0; idx < tag_list_size; idx++)
    hash_table_put(table, spec_tag_list[idx].tag_name, &spec_tag_list[idx]);
  *spec_tag_ht_pp = table;

  // Attribute table: name -> head of the chain of entries sharing that name.
  table = make_nocase_string_hash_table(attr_list_size);
  for (idx = 0; idx < attr_list_size; idx++) {
    entry = &tag_attr_list[idx];
    head = hash_table_get(table, entry->tag_attr);

    if (head) {
      // Name already present: append to the end of its chain.
      head->tail->next = entry;
      head->tail = entry;
      continue;
    }

    // First occurrence of this name: it becomes the chain head and is
    // its own tail.
    entry->tail = entry;
    hash_table_put(table, entry->tag_attr, entry);
  }
  *tag_attr_ht_pp = table;
}
Пример #3
0
/* Populate the file-level tables interesting_tags and
   interesting_attributes, which tell the HTML parser which tags and
   attributes we want to hear about.  Per the original note this is
   meant to run only once, for performance reasons.

   interesting_tags starts out as every entry of known_tags, loses the
   tags named in opt.ignore_tags (--ignore-tags) and, when
   opt.follow_tags (--follow-tags) is set, is narrowed to the tags that
   appear in both.  */
static void
init_interesting (void)
{
  int i;

  /* Start from every tag we know how to handle, each mapped to its
     entry in known_tags.  */
  interesting_tags = make_nocase_string_hash_table (countof (known_tags));
  for (i = 0; i < countof (known_tags); i++)
    hash_table_put (interesting_tags, known_tags[i].name, &known_tags[i]);

  /* Drop whatever the user excluded with --ignore-tags.  */
  if (opt.ignore_tags)
    {
      char **name = opt.ignore_tags;
      while (*name)
        {
          hash_table_remove (interesting_tags, *name);
          name++;
        }
    }

  /* With --follow-tags, keep only the tags that are both requested
     and still present in interesting_tags.  */
  if (opt.follow_tags)
    {
      struct hash_table *kept = make_nocase_string_hash_table (0);
      char **name;
      for (name = opt.follow_tags; *name; name++)
        {
          struct known_tag *known = hash_table_get (interesting_tags, *name);
          /* Unknown (or already ignored) --follow-tags entries are
             silently skipped.  */
          if (known)
            hash_table_put (kept, *name, known);
        }
      /* The narrowed table replaces the full one.  */
      hash_table_destroy (interesting_tags);
      interesting_tags = kept;
    }

  /* Register the attributes we care about; only the key matters, the
     value is a dummy "1".  */
  interesting_attributes = make_nocase_string_hash_table (10);
  for (i = 0; i < countof (additional_attributes); i++)
    hash_table_put (interesting_attributes, additional_attributes[i], "1");
  for (i = 0; i < countof (tag_url_attributes); i++)
    hash_table_put (interesting_attributes, tag_url_attributes[i].attr_name,
                    "1");
}
Пример #4
0
/* Record the address list AL for HOST in host_name_addresses_map,
   creating the map on first use.

   The map stores AL itself, so AL's reference count is bumped by one.
   The key is the string returned by xstrdup_lower (HOST), not HOST
   itself (presumably a lower-cased heap copy -- confirm against
   xstrdup_lower).  When debugging is active, the cached addresses are
   written to the debug log.  */
static void
cache_store (const char *host, struct address_list *al)
{
  /* Lazily create the cache.  */
  if (!host_name_addresses_map)
    host_name_addresses_map = make_nocase_string_hash_table (0);

  /* The table now holds a reference to AL.  */
  al->refcount++;
  hash_table_put (host_name_addresses_map, xstrdup_lower (host), al);

  IF_DEBUG
    {
      int n = 0;
      debug_logprintf ("Caching %s =>", host);
      while (n < al->count)
        {
          debug_logprintf (" %s", print_address (&al->addresses[n]));
          n++;
        }
      debug_logprintf ("\n");
    }
}
Пример #5
0
/* Remember the result AL of looking up HOST by storing it in
   host_name_addresses_map, which is created here if it does not exist
   yet.

   AL gains one reference because the map keeps a pointer to it.  The
   key is whatever xstrdup_lower (HOST) returns (presumably a
   lower-cased heap copy -- confirm).  In builds with ENABLE_DEBUG, and
   when opt.debug is set, every cached address is logged.  */
static void
cache_host_lookup (const char *host, struct address_list *al)
{
  if (!host_name_addresses_map)
    host_name_addresses_map = make_nocase_string_hash_table (0);

  /* Account for the reference held by the map.  */
  al->refcount += 1;
  hash_table_put (host_name_addresses_map, xstrdup_lower (host), al);

#ifdef ENABLE_DEBUG
  if (opt.debug)
    {
      int pos;
      debug_logprintf ("Caching %s =>", host);
      pos = 0;
      while (pos < al->count)
        {
          debug_logprintf (" %s", pretty_print_address (&al->addresses[pos]));
          ++pos;
        }
      debug_logprintf ("\n");
    }
#endif
}
Пример #6
0
/* Associate SPECS with the HOST:PORT pair in registered_specs,
   creating that table on first use.

   If the pair is already registered, the previously stored specs (when
   non-NULL) are released with free_specs() and the existing key string
   is reused for the new value.  Otherwise a fresh xstrdup'ed copy of
   the key is stored, because the key built by SET_HOSTPORT is only a
   temporary.  */
void
res_register_specs (const char *host, int port, struct robot_specs *specs)
{
  struct robot_specs *previous;
  char *key, *stored_key;

  SET_HOSTPORT (host, port, key);

  if (!registered_specs)
    registered_specs = make_nocase_string_hash_table (0);

  if (!hash_table_get_pair (registered_specs, key, &stored_key, &previous))
    {
      /* First registration for this host/port: the table needs its
         own copy of the key.  */
      hash_table_put (registered_specs, xstrdup (key), specs);
      return;
    }

  /* Replace an existing registration, keeping the key already owned
     by the table.  */
  if (previous)
    free_specs (previous);
  hash_table_put (registered_specs, stored_key, specs);
}
Пример #7
0
/* Store COOKIE in cookies_hash_table, creating the table on first use.

   Cookies are kept in chains keyed by the string SET_HOSTPORT builds
   from COOKIE's domain and port.  COOKIE always becomes the new head of
   its chain.  If find_matching_cookie() reports an existing cookie that
   COOKIE replaces, that cookie is unlinked from the chain and passed to
   delete_cookie().

   When a chain already exists its current key string is reused;
   otherwise a fresh xstrdup'ed copy of the host/port string is stored
   as the key.  */
static void
store_cookie (struct cookie *cookie)
{
  struct cookie *chain_head;
  char *hostport;
  char *chain_key;
  time_t expiry;

  if (!cookies_hash_table)
    /* If the hash table is not initialized, do so now, because we'll
       need to store things.  */
    cookies_hash_table = make_nocase_string_hash_table (0);

  /* Initialize hash table key.  */
  SET_HOSTPORT (cookie->domain, cookie->port, hostport);

  if (hash_table_get_pair (cookies_hash_table, hostport,
                           &chain_key, &chain_head))
    {
      /* There already exists a chain of cookies with this exact
         domain.  We need to check for duplicates -- if an existing
         cookie exactly matches our domain, path and name, we replace
         it.  */
      struct cookie *prev;
      struct cookie *victim = find_matching_cookie (cookie, &prev);

      if (victim)
        {
          /* Remove VICTIM from the chain.  COOKIE will be placed at
             the head. */
          if (prev)
            {
              prev->next = victim->next;
              cookie->next = chain_head;
            }
          else
            {
              /* prev is NULL; apparently VICTIM was at the head of
                 the chain.  This place will be taken by COOKIE, so
                 all we need to do is:  */
              cookie->next = victim->next;
            }
          delete_cookie (victim);
          DEBUGP (("Deleted old cookie (to be replaced.)\n"));
        }
      else
        cookie->next = chain_head;
    }
  else
    {
      /* We are now creating the chain.  Allocate the string that will
         be used as a key.  It is unsafe to use cookie->domain for
         that, because it might get deallocated by the above code at
         some point later.  */
      cookie->next = NULL;
      chain_key = xstrdup (hostport);
    }

  /* Install COOKIE as the (new) head of the chain.  */
  hash_table_put (cookies_hash_table, chain_key, cookie);

  /* localtime() must be handed a pointer to a genuine time_t.  Casting
     the address of cookie->expiry_time to time_t * reads the field
     through an incompatible type and, where time_t is wider than the
     field, reads past it.  Convert the value into a real time_t
     instead.  */
  expiry = (time_t) cookie->expiry_time;
  (void) expiry;                /* only read by the debug trace below */

  DEBUGP (("\nStored cookie %s %d %s %s %d %s %s %s\n",
           cookie->domain, cookie->port, cookie->path,
           cookie->permanent ? "permanent" : "nonpermanent",
           cookie->secure,
           asctime (localtime (&expiry)),
           cookie->attr, cookie->value));
}