Ejemplo n.º 1
0
// Copy the string content of XML node n into the buffer d.
//
// d must provide room for at least RSSMAXBUFSIZE bytes. The result is always
// NUL-terminated and holds at most RSSMAXBUFSIZE-1 characters. When n is NULL
// or xml_get_string() yields no string, d is set to the empty string.
static void
rss_read_copy (char *d, xml_node_t* n)
{
  // callers pass the result of xml_get_childnode(), which can be NULL,
  // so never hand a NULL node to xml_get_string()
  const char *p = n ? (const char *) xml_get_string (n) : NULL;

  if (p)
    strncpy (d, p, RSSMAXBUFSIZE)[RSSMAXBUFSIZE-1] = 0; // force termination on truncation
  else
    *d = 0;
}
Ejemplo n.º 2
0
  // Parse the annotation file 'xmlfile' and copy the image it names
  // (imgroot + content of its <filename> element) into 'outdir'.
  //
  // Returns false when the xml cannot be parsed (exception), when it names
  // no image, or when one of its <object> elements has a <name> listed in
  // 'exclude'. Returns true otherwise; a failed copy only prints a warning.
  // 'data_cnt' is incremented after each successful copy.
  bool pascalfull_dataset<Tdata>::process_xml(const string &xmlfile) {
    // Single-quote a path for the shell, so that spaces or shell
    // metacharacters in file names cannot split or alter the cp command.
    struct shell {
      static string quote(const string &s) {
        string q("'");
        for (string::size_type i = 0; i < s.size(); ++i) {
          if (s[i] == '\'')
            q += "'\\''"; // close quote, emit escaped quote, reopen quote
          else
            q += s[i];
        }
        q += "'";
        return q;
      }
    };

    string image_filename;
    string image_fullname;
    string obj_classname;

    // parse xml file
    try {
      DomParser parser;
      //    parser.set_validate();
      parser.parse_file(xmlfile);
      if (parser) {
        // initialize root node and list
        const Node* pNode = parser.get_document()->get_root_node();
        Node::NodeList list = pNode->get_children();
        // get image filename (only the first <filename> element is used)
        for (Node::NodeList::iterator iter = list.begin();
             iter != list.end(); ++iter) {
          if (!strcmp((*iter)->get_name().c_str(), "filename")) {
            xml_get_string(*iter, image_filename);
            break;
          }
        }
        // parse all objects in image
        for (Node::NodeList::iterator iter = list.begin();
             iter != list.end(); ++iter) {
          if (strcmp((*iter)->get_name().c_str(), "object"))
            continue;
          Node::NodeList olist = (*iter)->get_children();
          for (Node::NodeList::iterator oiter = olist.begin();
               oiter != olist.end(); ++oiter) {
            if (!strcmp((*oiter)->get_name().c_str(), "name")) {
              xml_get_string(*oiter, obj_classname);
              // if object's name matches an excluded class, stop this xml
              if (find(exclude.begin(), exclude.end(),
                       obj_classname) != exclude.end())
                return false;
            }
          }
        }
      }
    } catch (const std::exception& ex) {
      cerr << "error: Xml exception caught: " << ex.what() << endl;
      return false;
    } catch (const char *err) {
      cerr << "error: " << err << endl;
      return false;
    }
    // without a file name there is nothing to copy; do not run cp on the
    // bare image root
    if (image_filename.empty()) {
      cerr << "error: no image filename found in " << xmlfile << endl;
      return false;
    }
    image_fullname = imgroot;
    image_fullname += image_filename;
    // copy image into output directory
    ostringstream cmd;
    ostringstream tgt;
    tgt << outdir << "/" << image_filename;
    cmd << "cp " << shell::quote(image_fullname) << " "
        << shell::quote(tgt.str());
    if (std::system(cmd.str().c_str()))
      cerr << "warning: failed to execute: " << cmd.str() << endl;
    else {
      cout << data_cnt << ": copied " << tgt.str() << endl;
      data_cnt++;
    }
    return true;
  }
Ejemplo n.º 3
0
static st_rss_t *
rss_open_atom (st_rss_t *rss, const char *encoding)
{
  xml_doc_t *doc;
  xml_node_t *node;
  const char *p = NULL;

  doc = xml_parse (rss->url, encoding);
  if (!doc)
    {
      fprintf (stderr, "ERROR: cannot read %s\n", rss->url);
      return NULL;
    }

  node = xml_get_rootnode (doc);
  if (!node)
    {
      fprintf (stderr, "ERROR: empty document %s\n", rss->url);
      xml_free (doc);
      return NULL;
    }

  node = xml_get_childnode (node);
  while (node && xml_is_empty_node (node))
    node = xml_get_nextnode (node);
  if (!node)
    {
//      fprintf (stderr, "");
      return NULL;
    }

  while (node)
    {
      while (node && xml_is_empty_node (node))
        node = xml_get_nextnode (node);

      if (!node)
        break;

      if (!strcasecmp (xml_get_name (node), "title"))
        rss_read_copy (rss->title, xml_get_childnode (node));
      else if (!strcasecmp (xml_get_name (node), "description"))
        rss_read_copy (rss->desc, xml_get_childnode (node));
//      else if (!strcasecmp (xml_get_name (node), "link"))
//        rss_read_copy (rss->url, xml_get_childnode (node));
      else if (!strcasecmp (xml_get_name (node), "date") ||
               !strcasecmp (xml_get_name (node), "pubDate") ||
               !strcasecmp (xml_get_name (node), "dc:date") ||
               !strcasecmp (xml_get_name (node), "modified") ||
               !strcasecmp (xml_get_name (node), "updated"))
        rss->date = strptime2 ((const char *) xml_get_string (xml_get_childnode (node)));
      else if ((!strcasecmp (xml_get_name (node), "entry")))
        {
          xml_node_t *pnode = xml_get_childnode (node);
          st_rss_item_t *item = &rss->item[rss->item_count];
//          int found = 0;
          char link[RSSMAXBUFSIZE];

          *link = 0;

          while (pnode)
            {
              while (pnode && xml_is_empty_node (pnode))
                pnode = xml_get_nextnode (pnode);

              if (!pnode)
                break;

#ifdef  DEBUG
              printf ("%s\n", xml_get_name (pnode));
              fflush (stdout);
#endif

              if (!strcasecmp (xml_get_name (pnode), "title"))
                {
                  rss_read_copy (item->title, xml_get_childnode (pnode));
//                  found = 1;
                }
#if 0
              else if (!strcasecmp (xml_get_name (pnode), "id"))
                {
                  rss_read_copy (link, xml_get_childnode (pnode));
//                  found = 1;
                }
#endif
              else if (!strcasecmp (xml_get_name (pnode), "link") && (!(*link)))
                {
#if 0
<link rel="alternate" type="text/html" href="http://edition.cnn.com/2006/POLITICS/11/01/kerry.remarks/"/>
#endif
                  p = (const char *) xml_get_value (pnode, "href");
                  if (p)
                    {
                      strncpy (link, p, RSSMAXBUFSIZE)[RSSMAXBUFSIZE-1] = 0;
//                      found = 1;
                    }
                }
              else if (!strcasecmp (xml_get_name (pnode), "content"))
                {
                  rss_read_copy (item->desc, xml_get_childnode (pnode));
//                  found = 1;
                }
              else if (!strcasecmp (xml_get_name (pnode), "author"))
                {
                  xml_node_t *tnode = xml_get_childnode (pnode); 
                  if (!strcasecmp (xml_get_name (tnode), "name"))
                    rss_read_copy (item->user, xml_get_childnode (tnode));
//                  found = 1;
                }
              else if (!strcasecmp (xml_get_name (pnode), "modified") ||
                       !strcasecmp (xml_get_name (pnode), "updated"))
                { 
                  item->date = strptime2 ((const char *) xml_get_string (xml_get_childnode (pnode)));
//                  found = 1;
                }
//              else if (!strcasecmp (xml_get_name (pnode), "duration")) // HACK yt:duration
              else if (stristr (xml_get_name (pnode), "duration")) // HACK yt:duration
                {
                  p = (const char *) xml_get_value (pnode, "seconds");
                  if (p)
                    {
                      item->media.duration = strtol (p, NULL, 10);
//                      found = 1;
//                      break;
                    }
                }
//              else if (!strcasecmp (xml_get_name (pnode), "group")) // media:group
              else if (stristr (xml_get_name (pnode), "group")) // media:group
#if 1
                {
                  rss_open_rss_mrss (pnode, item);
                }
#else
                {
                  xml_node_t *tnode = xml_get_childnode (pnode); 
                  while (tnode)
                    {
                      if (!tnode)
                        break;

//                      if (!strcasecmp (xml_get_name (tnode), "content")) // media:content
                      if (stristr (xml_get_name (tnode), "content")) // media:content
                        {
                          p = (const char *) xml_get_value (tnode, "duration");
                          if (p)
                            {
                              item->media.duration = strtol (p, NULL, 10);
//                              found = 1;
//                              break;
                            }
                          p = (const char *) xml_get_value (tnode, "filesize");
                          if (p)
                            {
                              item->media.filesize = strtol (p, NULL, 10);
//                              found = 1;
//                              break;
                            }
                          p = (const char *) xml_get_value (tnode, "width");
                          if (p)
                            {
                              item->media.width = strtol (p, NULL, 10);
//                              found = 1;
//                              break;
                            }
                          p = (const char *) xml_get_value (tnode, "height");
                          if (p)
                            {
                              item->media.height = strtol (p, NULL, 10);
//                              found = 1;
//                              break;
                            }
                        }
//                      else if (!strcasecmp (xml_get_name (tnode), "keywords")) // media:keywords
                      else if (stristr (xml_get_name (tnode), "keywords")) // media:keywords
                        {
                          rss_read_copy (item->media.keywords, xml_get_childnode (tnode));
                        }
//                      else if (!strcasecmp (xml_get_name (tnode), "thumbnail")) // media:thumbnail
                      else if (stristr (xml_get_name (tnode), "thumbnail")) // media:thumbnail
                        {
                          p = (const char *) xml_get_value (tnode, "url");
                          if (p)
                            if (!(item->media.thumbnail[0]))
                            {
                              strncpy (item->media.thumbnail, p, RSSMAXBUFSIZE)[RSSMAXBUFSIZE-1] = 0;
//                              found = 1;
//                              break;  
                            }
                        }
                      tnode = xml_get_nextnode (tnode);
                    }
                }
#endif
              pnode = xml_get_nextnode (pnode);
            }

          if (*link)
            strcpy (item->url, link);

          rss->item_count++;

          if (rss->item_count == RSSMAXITEM)
            break;
        }
Ejemplo n.º 4
0
// Parse the RSS or RDF feed at rss->url into rss.
//
// Channel level "title", "description" and date elements are stored in
// rss->title, rss->desc and rss->date. Every "item"/"entry" element fills
// the next slot of rss->item[] (title, url, desc, date, user, media) and
// increments rss->item_count; parsing stops once RSSMAXITEM items exist.
//
// encoding is passed through unchanged to xml_parse().
//
// Returns rss on success. Returns NULL when the document cannot be read,
// has no root node, has no non-empty child below the root, or when that
// first child is not named "channel". The parsed document is released with
// xml_free() on every path after a successful parse.
static st_rss_t *
rss_open_rss (st_rss_t *rss, const char *encoding)
{
  xml_doc_t *doc;
  xml_node_t *node;
  int rdf;

  doc = xml_parse (rss->url, encoding);
  if (!doc)
    {
      fprintf (stderr, "ERROR: cannot read %s\n", rss->url);
      return NULL;
    }

  node = xml_get_rootnode (doc);
  if (!node)
    {
      fprintf (stderr, "ERROR: empty document %s\n", rss->url);
      xml_free (doc);
      return NULL;
    }

  // rdf?
  // TODO: move this to rss_demux()
  rdf = !strcasecmp (xml_get_name (node), "rdf");

  node = xml_get_childnode (node);
  while (node && xml_is_empty_node (node))
    node = xml_get_nextnode (node);

  if (!node)
    {
      xml_free (doc);
      return NULL;
    }

  if (strcasecmp (xml_get_name (node), "channel"))
    {
      fprintf (stderr, "ERROR: bad document: did not immediately find the RSS element\n");
      xml_free (doc);
      return NULL;
    }

  // RSS: walk the children of "channel".
  // RDF: stay on "channel" and walk it together with its siblings.
  if (!rdf)
    node = xml_get_childnode (node);

  while (node)
    {
      while (node && xml_is_empty_node (node))
        node = xml_get_nextnode (node);

      if (!node)
        break;

      if (!strcasecmp (xml_get_name (node), "title"))
        rss_read_copy (rss->title, xml_get_childnode (node));
      else if (!strcasecmp (xml_get_name (node), "description"))
        rss_read_copy (rss->desc, xml_get_childnode (node));
      else if (!strcasecmp (xml_get_name (node), "date") ||
               !strcasecmp (xml_get_name (node), "pubDate") ||
               !strcasecmp (xml_get_name (node), "dc:date"))
        rss->date = strptime2 ((const char *) xml_get_string (xml_get_childnode (node)));
      else if (!strcasecmp (xml_get_name (node), "channel") && rdf)
        {
          // RDF keeps the feed's own title/description/date inside "channel"
          xml_node_t *pnode = xml_get_childnode (node);

          while (pnode)
            {
              if (!strcasecmp (xml_get_name (pnode), "title"))
                rss_read_copy (rss->title, xml_get_childnode (pnode));
              else if (!strcasecmp (xml_get_name (pnode), "description"))
                rss_read_copy (rss->desc, xml_get_childnode (pnode));
              else if (!strcasecmp (xml_get_name (pnode), "date") ||
                       !strcasecmp (xml_get_name (pnode), "pubDate") ||
                       !strcasecmp (xml_get_name (pnode), "dc:date"))
                rss->date = strptime2 ((const char *) xml_get_string (xml_get_childnode (pnode)));

              pnode = xml_get_nextnode (pnode);
            }
        }
      else if (!strcasecmp (xml_get_name (node), "item") || !strcasecmp (xml_get_name (node), "entry"))
        {
          xml_node_t *pnode;
          st_rss_item_t *item;
          const char *p = NULL;
          char link[RSSMAXBUFSIZE], guid[RSSMAXBUFSIZE];

          // never address a slot at or beyond RSSMAXITEM
          // (presumably the capacity of rss->item[] -- TODO confirm)
          if (rss->item_count >= RSSMAXITEM)
            break;

          pnode = xml_get_childnode (node);
          item = &rss->item[rss->item_count];
          *link = *guid = 0;

          while (pnode)
            {
              while (pnode && xml_is_empty_node (pnode))
                pnode = xml_get_nextnode (pnode);

              if (!pnode)
                break;

#ifdef  DEBUG
              printf ("%s\n", xml_get_name (pnode));
              fflush (stdout);
#endif

              if (!strcasecmp (xml_get_name (pnode), "title"))
                {
                  rss_read_copy (item->title, xml_get_childnode (pnode));
                }
              else if (!strcasecmp (xml_get_name (pnode), "link"))
                {
                  rss_read_copy (link, xml_get_childnode (pnode));
                }
              else if (!strcasecmp (xml_get_name (pnode), "guid") && (!(*link)))
                {
                  // only remembered while no link has been seen yet
                  rss_read_copy (guid, xml_get_childnode (pnode));
                }
              else if (!strcasecmp (xml_get_name (pnode), "description"))
                {
                  rss_read_copy (item->desc, xml_get_childnode (pnode));
                }
              else if (!strcasecmp (xml_get_name (pnode), "date") ||
                       !strcasecmp (xml_get_name (pnode), "pubDate") ||
                       !strcasecmp (xml_get_name (pnode), "dc:date") ||
                       !strcasecmp (xml_get_name (pnode), "cropDate"))
                {
                  item->date = strptime2 ((const char *) xml_get_string (xml_get_childnode (pnode)));
                }
              else if (stristr (xml_get_name (pnode), "duration")) // HACK yt:duration
                {
                  p = (const char *) xml_get_value (pnode, "seconds");
                  if (p)
                    item->media.duration = strtol (p, NULL, 10);
                }
              else if (stristr (xml_get_name (pnode), "group")) // media:group
                {
                  rss_open_rss_mrss (pnode, item);
                }
              else if (!strcasecmp (xml_get_name (pnode), "author") ||
                       !strcasecmp (xml_get_name (pnode), "dc:creator") ||
                       !strcasecmp (xml_get_name (pnode), "creator"))
                {
                  rss_read_copy (item->user, xml_get_childnode (pnode));
                }

              pnode = xml_get_nextnode (pnode);
            }

          // some feeds use the guid tag for the link
          if (*link)
            strcpy (item->url, link);
          else if (*guid)
            strcpy (item->url, guid);
          else
            *(item->url) = 0;

          rss->item_count++;

          if (rss->item_count >= RSSMAXITEM)
            break;
        }

      node = xml_get_nextnode (node);
    }

#ifdef  DEBUG
  rss_st_rss_t_sanity_check (rss);
  fflush (stdout);
#endif

  // every value kept in rss was copied or converted above, so the
  // document itself is no longer needed
  xml_free (doc);

  return rss;
}
Ejemplo n.º 5
0
// Account for one annotated object described by the child nodes 'olist'.
//
// Reads the "difficult", "truncated", "occluded", "name" and "pose" children,
// registers the object's class via add_class() (unless only parts are used)
// and updates the total_* counters. When parts are enabled, each "part"
// child's "name" is registered and counted the same way, using the flags of
// the enclosing object.
void pascal_dataset<Tdata>::count_sample(Node::NodeList &olist) {
  std::string class_name, pose_name;
  uint is_difficult = 0, is_truncated = 0, is_occluded = 0;
  bool has_pose = false;
  Node::NodeList::iterator it;

  // collect the object's attributes
  for (it = olist.begin(); it != olist.end(); ++it) {
    if (strcmp((*it)->get_name().c_str(), "difficult") == 0) {
      is_difficult = xml_get_uint(*it);
    } else if (strcmp((*it)->get_name().c_str(), "truncated") == 0) {
      is_truncated = xml_get_uint(*it);
    } else if (strcmp((*it)->get_name().c_str(), "occluded") == 0) {
      is_occluded = xml_get_uint(*it);
    } else if (strcmp((*it)->get_name().c_str(), "name") == 0) {
      xml_get_string(*it, class_name);
    } else if (strcmp((*it)->get_name().c_str(), "pose") == 0) {
      xml_get_string(*it, pose_name);
      has_pose = true;
    }
  }

  ////////////////////////////////////////////////////////////////
  // object: register its class, checked before and after the optional
  // pose suffix is appended
  if (!usepartsonly
      && included_pascal(class_name, is_difficult, is_truncated,
                         is_occluded)) {
    if (usepose && has_pose)
      class_name.append("_").append(pose_name);
    if (included_pascal(class_name, is_difficult, is_truncated, is_occluded))
      this->add_class(class_name);
  }

  // the object is counted whether or not its class was registered
  total_samples++;
  if (is_difficult)
    total_difficult++;
  if (is_truncated)
    total_truncated++;
  if (is_occluded)
    total_occluded++;
  if ((is_difficult && ignore_difficult)
      || (is_truncated && ignore_truncated)
      || (is_occluded && ignore_occluded)) {
    total_ignored++;
  }

  ////////////////////////////////////////////////////////////////
  // parts
  if (!useparts && !usepartsonly)
    return;

  std::string part_name;
  for (it = olist.begin(); it != olist.end(); ++it) {
    if (strcmp((*it)->get_name().c_str(), "part") != 0)
      continue;

    // look for the part's name among its children
    Node::NodeList part_children = (*it)->get_children();
    for (Node::NodeList::iterator pit = part_children.begin();
         pit != part_children.end(); ++pit) {
      if (strcmp((*pit)->get_name().c_str(), "name") != 0)
        continue;

      xml_get_string(*pit, part_name);
      if (!included_pascal(part_name, is_difficult, is_truncated,
                           is_occluded))
        continue;

      if (usepose && has_pose)
        part_name.append("_").append(pose_name);
      if (!dataset<Tdata>::included(part_name))
        continue;

      // a registered part counts as a sample of its own
      this->add_class(part_name);
      this->total_samples++;
      if (is_difficult)
        total_difficult++;
      if (is_truncated)
        total_truncated++;
      if (is_occluded)
        total_occluded++;
      if ((is_difficult && ignore_difficult)
          || (is_truncated && ignore_truncated)
          || (is_occluded && ignore_occluded)) {
        total_ignored++;
      }
    }
  }
}