// Copy the string obtained from xml_get_string (n) into d.
//
// At most RSSMAXBUFSIZE bytes are written and the result is always
// NUL-terminated at d[RSSMAXBUFSIZE-1]; when xml_get_string () yields NULL,
// d becomes the empty string.
static void
rss_read_copy (char *d, xml_node_t* n)
{
  const char *text = (const char *) xml_get_string (n);

  if (!text)
    {
      // nothing to copy: hand back an empty string
      *d = 0;
      return;
    }

  strncpy (d, text, RSSMAXBUFSIZE);
  d[RSSMAXBUFSIZE - 1] = 0;     // strncpy () does not terminate on truncation
}
// Scan the annotation file 'xmlfile' and, unless one of its objects belongs
// to an excluded class, copy the image it names into 'outdir'.
//
// The image name is read from the root node's "filename" child and appended
// to 'imgroot'. The "name" child of every "object" child is compared against
// 'exclude'.
//
// Returns false as soon as an excluded class is found, or when parsing throws
// (std::exception or const char *). Returns true otherwise, including when
// the copy command fails (only a warning is printed in that case).
// 'data_cnt' is incremented after each successful copy.
bool pascalfull_dataset<Tdata>::process_xml(const string &xmlfile) {
  // Wraps a path in single quotes for the shell, so that spaces or shell
  // metacharacters in a name taken from the XML cannot split or alter the
  // command line. Assumes a POSIX shell, as the use of 'cp' already implies.
  struct shell {
    static string quote(const string &path) {
      string quoted("'");
      for (string::size_type i = 0; i < path.size(); ++i) {
        if (path[i] == '\'')
          quoted += "'\\''"; // close quote, escaped quote, reopen quote
        else
          quoted += path[i];
      }
      quoted += "'";
      return quoted;
    }
  };

  string image_filename;
  string image_fullname;
  string obj_classname;

  // parse xml file
  try {
    DomParser parser;
    //    parser.set_validate();
    parser.parse_file(xmlfile);
    if (parser) {
      // initialize root node and list
      const Node* pNode = parser.get_document()->get_root_node();
      Node::NodeList list = pNode->get_children();
      // get image filename (first "filename" node wins)
      for (Node::NodeList::iterator iter = list.begin();
           iter != list.end(); ++iter) {
        if (!strcmp((*iter)->get_name().c_str(), "filename")) {
          xml_get_string(*iter, image_filename);
          break;
        }
      }
      image_fullname = imgroot;
      image_fullname += image_filename;
      // parse all objects in image
      for (Node::NodeList::iterator iter = list.begin();
           iter != list.end(); ++iter) {
        if (!strcmp((*iter)->get_name().c_str(), "object")) {
          Node::NodeList olist = (*iter)->get_children();
          for (Node::NodeList::iterator oiter = olist.begin();
               oiter != olist.end(); ++oiter) {
            if (!strcmp((*oiter)->get_name().c_str(), "name")) {
              xml_get_string(*oiter, obj_classname);
              // if object's name matches an excluded class, stop this xml
              if (find(exclude.begin(), exclude.end(), obj_classname)
                  != exclude.end())
                return false;
            }
          }
        }
      }
    }
  } catch (const std::exception& ex) {
    cerr << "error: Xml exception caught: " << ex.what() << endl;
    return false;
  } catch (const char *err) {
    cerr << "error: " << err << endl;
    return false;
  }

  // copy image into output directory
  ostringstream cmd;
  ostringstream tgt;
  tgt << outdir << "/" << image_filename;
  cmd << "cp " << shell::quote(image_fullname) << " "
      << shell::quote(tgt.str());
  if (std::system(cmd.str().c_str()))
    cerr << "warning: failed to execute: " << cmd.str() << endl;
  else {
    cout << data_cnt << ": copied " << tgt.str() << endl;
    data_cnt++;
  }
  return true;
}
// rss_open_atom: read the document at rss->url (via xml_parse () with the
// given encoding) as an Atom feed and store the result in rss.
//
// Feed level: "title" and "description" are copied into rss->title and
// rss->desc; "date", "pubDate", "dc:date", "modified" and "updated" are
// converted with strptime2 () into rss->date.
//
// Each "entry" fills rss->item[rss->item_count]: "title" -> item->title,
// the "href" value of the first "link" that has one -> item->url,
// "content" -> item->desc, the "name" child of "author" -> item->user,
// "modified"/"updated" -> item->date, the "seconds" value of any node whose
// name contains "duration" -> item->media.duration, and any node whose name
// contains "group" is handed to rss_open_rss_mrss (). Entry processing stops
// once rss->item_count reaches RSSMAXITEM.
//
// Returns NULL when the document cannot be parsed, has no root node, or the
// root has no non-empty child.
//
// NOTE(review): this definition is truncated in the text below - the end of
// the outer while loop, the final return and the closing brace are missing.
// NOTE(review): the "no non-empty child" path returns NULL without calling
// xml_free (doc), unlike the empty-document path.
// NOTE(review): in the "author" branch tnode is passed to xml_get_name ()
// without a NULL check - confirm xml_get_name () accepts NULL.
static st_rss_t * rss_open_atom (st_rss_t *rss, const char *encoding) { xml_doc_t *doc; xml_node_t *node; const char *p = NULL; doc = xml_parse (rss->url, encoding); if (!doc) { fprintf (stderr, "ERROR: cannot read %s\n", rss->url); return NULL; } node = xml_get_rootnode (doc); if (!node) { fprintf (stderr, "ERROR: empty document %s\n", rss->url); xml_free (doc); return NULL; } node = xml_get_childnode (node); while (node && xml_is_empty_node (node)) node = xml_get_nextnode (node); if (!node) { // fprintf (stderr, ""); return NULL; } while (node) { while (node && xml_is_empty_node (node)) node = xml_get_nextnode (node); if (!node) break; if (!strcasecmp (xml_get_name (node), "title")) rss_read_copy (rss->title, xml_get_childnode (node)); else if (!strcasecmp (xml_get_name (node), "description")) rss_read_copy (rss->desc, xml_get_childnode (node)); // else if (!strcasecmp (xml_get_name (node), "link")) // rss_read_copy (rss->url, xml_get_childnode (node)); else if (!strcasecmp (xml_get_name (node), "date") || !strcasecmp (xml_get_name (node), "pubDate") || !strcasecmp (xml_get_name (node), "dc:date") || !strcasecmp (xml_get_name (node), "modified") || !strcasecmp (xml_get_name (node), "updated")) rss->date = strptime2 ((const char *) xml_get_string (xml_get_childnode (node))); else if ((!strcasecmp (xml_get_name (node), "entry"))) { xml_node_t *pnode = xml_get_childnode (node); st_rss_item_t *item = &rss->item[rss->item_count]; // int found = 0; char link[RSSMAXBUFSIZE]; *link = 0; while (pnode) { while (pnode && xml_is_empty_node (pnode)) pnode = xml_get_nextnode (pnode); if (!pnode) break; #ifdef DEBUG printf ("%s\n", xml_get_name (pnode)); fflush (stdout); #endif if (!strcasecmp (xml_get_name (pnode), "title")) { rss_read_copy (item->title, xml_get_childnode (pnode)); // found = 1; } #if 0 else if (!strcasecmp (xml_get_name (pnode), "id")) { rss_read_copy (link, xml_get_childnode (pnode)); // found = 1; } #endif else if (!strcasecmp (xml_get_name (pnode), "link") 
&& (!(*link))) { #if 0 <link rel="alternate" type="text/html" href="http://edition.cnn.com/2006/POLITICS/11/01/kerry.remarks/"/> #endif p = (const char *) xml_get_value (pnode, "href"); if (p) { strncpy (link, p, RSSMAXBUFSIZE)[RSSMAXBUFSIZE-1] = 0; // found = 1; } } else if (!strcasecmp (xml_get_name (pnode), "content")) { rss_read_copy (item->desc, xml_get_childnode (pnode)); // found = 1; } else if (!strcasecmp (xml_get_name (pnode), "author")) { xml_node_t *tnode = xml_get_childnode (pnode); if (!strcasecmp (xml_get_name (tnode), "name")) rss_read_copy (item->user, xml_get_childnode (tnode)); // found = 1; } else if (!strcasecmp (xml_get_name (pnode), "modified") || !strcasecmp (xml_get_name (pnode), "updated")) { item->date = strptime2 ((const char *) xml_get_string (xml_get_childnode (pnode))); // found = 1; } // else if (!strcasecmp (xml_get_name (pnode), "duration")) // HACK yt:duration else if (stristr (xml_get_name (pnode), "duration")) // HACK yt:duration { p = (const char *) xml_get_value (pnode, "seconds"); if (p) { item->media.duration = strtol (p, NULL, 10); // found = 1; // break; } } // else if (!strcasecmp (xml_get_name (pnode), "group")) // media:group else if (stristr (xml_get_name (pnode), "group")) // media:group #if 1 { rss_open_rss_mrss (pnode, item); } #else { xml_node_t *tnode = xml_get_childnode (pnode); while (tnode) { if (!tnode) break; // if (!strcasecmp (xml_get_name (tnode), "content")) // media:content if (stristr (xml_get_name (tnode), "content")) // media:content { p = (const char *) xml_get_value (tnode, "duration"); if (p) { item->media.duration = strtol (p, NULL, 10); // found = 1; // break; } p = (const char *) xml_get_value (tnode, "filesize"); if (p) { item->media.filesize = strtol (p, NULL, 10); // found = 1; // break; } p = (const char *) xml_get_value (tnode, "width"); if (p) { item->media.width = strtol (p, NULL, 10); // found = 1; // break; } p = (const char *) xml_get_value (tnode, "height"); if (p) { 
item->media.height = strtol (p, NULL, 10); // found = 1; // break; } } // else if (!strcasecmp (xml_get_name (tnode), "keywords")) // media:keywords else if (stristr (xml_get_name (tnode), "keywords")) // media:keywords { rss_read_copy (item->media.keywords, xml_get_childnode (tnode)); } // else if (!strcasecmp (xml_get_name (tnode), "thumbnail")) // media:thumbnail else if (stristr (xml_get_name (tnode), "thumbnail")) // media:thumbnail { p = (const char *) xml_get_value (tnode, "url"); if (p) if (!(item->media.thumbnail[0])) { strncpy (item->media.thumbnail, p, RSSMAXBUFSIZE)[RSSMAXBUFSIZE-1] = 0; // found = 1; // break; } } tnode = xml_get_nextnode (tnode); } } #endif pnode = xml_get_nextnode (pnode); } if (*link) strcpy (item->url, link); rss->item_count++; if (rss->item_count == RSSMAXITEM) break; }
// Read the document at rss->url (via xml_parse () with the given encoding)
// as an RSS or RDF feed and store the result in rss.
//
// The root node named "rdf" selects RDF handling; otherwise the document is
// treated as RSS. In both cases the first non-empty child of the root must
// be "channel". For RSS the items are children of that channel; for RDF the
// channel and the items are siblings.
//
// Feed level: "title" -> rss->title, "description" -> rss->desc,
// "date"/"pubDate"/"dc:date" -> rss->date (through strptime2 ()).
//
// Each "item" (or "entry") fills rss->item[rss->item_count]: title, link,
// description, date, author and media fields. When an item has no "link",
// its "guid" is used as the url. At most RSSMAXITEM items are stored.
//
// Returns rss on success. Returns NULL (after releasing the parsed document
// where one exists) when the document cannot be parsed, is empty, has no
// non-empty child below the root, or does not start with "channel".
//
// NOTE(review): the parsed document is not released on the success path;
// confirm that nothing keeps pointers into it before adding xml_free () there.
static st_rss_t *
rss_open_rss (st_rss_t *rss, const char *encoding)
{
  xml_doc_t *doc;
  xml_node_t *node;
  int rdf = 0;

  doc = xml_parse (rss->url, encoding);
  if (!doc)
    {
      fprintf (stderr, "ERROR: cannot read %s\n", rss->url);
      return NULL;
    }

  node = xml_get_rootnode (doc);
  if (!node)
    {
      fprintf (stderr, "ERROR: empty document %s\n", rss->url);
      xml_free (doc);
      return NULL;
    }

  // rdf?
  // TODO: move this to rss_demux()
  if (!strcasecmp (xml_get_name (node), "rdf"))
    rdf = 1;

  node = xml_get_childnode (node);
  while (node && xml_is_empty_node (node))
    node = xml_get_nextnode (node);
  if (!node)
    {
      // nothing below the root: release the document like the other
      // failure paths do
      xml_free (doc);
      return NULL;
    }

  if (strcasecmp (xml_get_name (node), "channel"))
    {
      fprintf (stderr, "ERROR: bad document: did not immediately find the RSS element\n");
      xml_free (doc);
      return NULL;
    }

  if (!rdf) // document is RSS
    node = xml_get_childnode (node);

  while (node)
    {
      while (node && xml_is_empty_node (node))
        node = xml_get_nextnode (node);
      if (!node)
        break;

      if (!strcasecmp (xml_get_name (node), "title"))
        rss_read_copy (rss->title, xml_get_childnode (node));
      else if (!strcasecmp (xml_get_name (node), "description"))
        rss_read_copy (rss->desc, xml_get_childnode (node));
//      else if (!strcasecmp (xml_get_name (node), "link"))
//        rss_read_copy (rss->url, xml_get_childnode (node));
      else if (!strcasecmp (xml_get_name (node), "date") ||
               !strcasecmp (xml_get_name (node), "pubDate") ||
               !strcasecmp (xml_get_name (node), "dc:date"))
        rss->date = strptime2 ((const char *) xml_get_string (xml_get_childnode (node)));
      else if (!strcasecmp (xml_get_name (node), "channel") && rdf)
        {
          // RDF: the feed-level fields live inside the channel node
          xml_node_t *pnode = xml_get_childnode (node);

          while (pnode)
            {
              if (!strcasecmp (xml_get_name (pnode), "title"))
                rss_read_copy (rss->title, xml_get_childnode (pnode));
              else if (!strcasecmp (xml_get_name (pnode), "description"))
                rss_read_copy (rss->desc, xml_get_childnode (pnode));
              else if (!strcasecmp (xml_get_name (pnode), "date") ||
                       !strcasecmp (xml_get_name (pnode), "pubDate") ||
                       !strcasecmp (xml_get_name (pnode), "dc:date"))
                rss->date = strptime2 ((const char *) xml_get_string (xml_get_childnode (pnode)));

              pnode = xml_get_nextnode (pnode);
            }
        }
      else if (!strcasecmp (xml_get_name (node), "item") ||
               !strcasecmp (xml_get_name (node), "entry"))
        {
          xml_node_t *pnode = xml_get_childnode (node);
          st_rss_item_t *item = &rss->item[rss->item_count];
          const char *p = NULL;
          char link[RSSMAXBUFSIZE], guid[RSSMAXBUFSIZE];

          // never write past the item table, even if the caller handed in
          // an rss that is already full
          if (rss->item_count >= RSSMAXITEM)
            break;

          *link = *guid = 0;

          while (pnode)
            {
              while (pnode && xml_is_empty_node (pnode))
                pnode = xml_get_nextnode (pnode);
              if (!pnode)
                break;

#ifdef  DEBUG
              printf ("%s\n", xml_get_name (pnode));
              fflush (stdout);
#endif

              if (!strcasecmp (xml_get_name (pnode), "title"))
                {
                  rss_read_copy (item->title, xml_get_childnode (pnode));
                }
              else if (!strcasecmp (xml_get_name (pnode), "link"))
                {
                  rss_read_copy (link, xml_get_childnode (pnode));
                }
#if 0
              else if (!strcasecmp (xml_get_name (pnode), "enclosure"))
                {
                  p = (const char *) xml_get_value (pnode, "url");
                  if (p)
                    {
                      strncpy (link, p, RSSMAXBUFSIZE)[RSSMAXBUFSIZE-1] = 0;
                    }
                }
#endif
              else if (!strcasecmp (xml_get_name (pnode), "guid") && (!(*link)))
                {
                  // only consulted while no link has been seen yet
                  rss_read_copy (guid, xml_get_childnode (pnode));
                }
              else if (!strcasecmp (xml_get_name (pnode), "description"))
                {
                  rss_read_copy (item->desc, xml_get_childnode (pnode));
                }
              else if (!strcasecmp (xml_get_name (pnode), "date") ||
                       !strcasecmp (xml_get_name (pnode), "pubDate") ||
                       !strcasecmp (xml_get_name (pnode), "dc:date") ||
                       !strcasecmp (xml_get_name (pnode), "cropDate"))
                {
                  item->date = strptime2 ((const char *) xml_get_string (xml_get_childnode (pnode)));
                }
              else if (stristr (xml_get_name (pnode), "duration")) // HACK yt:duration
                {
                  p = (const char *) xml_get_value (pnode, "seconds");
                  if (p)
                    {
                      item->media.duration = strtol (p, NULL, 10);
                    }
                }
              else if (stristr (xml_get_name (pnode), "group")) // media:group
                {
                  rss_open_rss_mrss (pnode, item);
                }
              else if (!strcasecmp (xml_get_name (pnode), "author") ||
                       !strcasecmp (xml_get_name (pnode), "dc:creator") ||
                       !strcasecmp (xml_get_name (pnode), "creator"))
                {
                  rss_read_copy (item->user, xml_get_childnode (pnode));
                }

              pnode = xml_get_nextnode (pnode);
            }

          // some feeds use the guid tag for the link
          if (*link)
            strcpy (item->url, link);
          else if (*guid)
            strcpy (item->url, guid);
          else
            *(item->url) = 0;

          rss->item_count++;

          if (rss->item_count >= RSSMAXITEM)
            break;
        }

      node = xml_get_nextnode (node);
    }

#ifdef  DEBUG
  rss_st_rss_t_sanity_check (rss);
  fflush (stdout);
#endif

  return rss;
}
// Registers the classes found in the node list 'olist' and updates the
// sample counters (presumably 'olist' holds the children of one annotated
// object; verify against the caller).
//
// The "difficult", "truncated", "occluded", "name" and "pose" nodes are read
// first. Unless 'usepartsonly' is set, the object's class is added when
// included_pascal() accepts it (with "_<pose>" appended when 'usepose' is set
// and a pose was found). The totals are then incremented for the object.
// When 'useparts' or 'usepartsonly' is set, the "name" of every "part" node
// is handled the same way, using the object's flags, and each accepted part
// is counted as one more sample.
void pascal_dataset<Tdata>::count_sample(Node::NodeList &olist) {
  uint is_difficult = 0, is_truncated = 0, is_occluded = 0;
  std::string class_name, pose_name;
  bool has_pose = false;
  Node::NodeList::iterator node;

  // first pass: collect the object's flags, class name and pose
  for (node = olist.begin(); node != olist.end(); ++node) {
    if (strcmp((*node)->get_name().c_str(), "difficult") == 0)
      is_difficult = xml_get_uint(*node);
    else if (strcmp((*node)->get_name().c_str(), "truncated") == 0)
      is_truncated = xml_get_uint(*node);
    else if (strcmp((*node)->get_name().c_str(), "occluded") == 0)
      is_occluded = xml_get_uint(*node);
    else if (strcmp((*node)->get_name().c_str(), "name") == 0)
      xml_get_string(*node, class_name);
    else if (strcmp((*node)->get_name().c_str(), "pose") == 0) {
      xml_get_string(*node, pose_name);
      has_pose = true;
    }
  }

  ////////////////////////////////////////////////////////////////
  // object: add its class to the dataset
  if (!usepartsonly
      && included_pascal(class_name, is_difficult, is_truncated,
                         is_occluded)) {
    if (usepose && has_pose) { // append pose to class name
      class_name += "_";
      class_name += pose_name;
    }
    if (included_pascal(class_name, is_difficult, is_truncated, is_occluded))
      this->add_class(class_name);
  }

  // increment samples numbers
  total_samples++;
  if (is_difficult)
    total_difficult++;
  if (is_truncated)
    total_truncated++;
  if (is_occluded)
    total_occluded++;
  if ((is_difficult && ignore_difficult)
      || (is_truncated && ignore_truncated)
      || (is_occluded && ignore_occluded))
    total_ignored++;

  ////////////////////////////////////////////////////////////////
  // parts
  if (!useparts && !usepartsonly)
    return;

  std::string part_name;
  for (node = olist.begin(); node != olist.end(); ++node) {
    if (strcmp((*node)->get_name().c_str(), "part") != 0)
      continue;
    // look for the part's name among its children
    Node::NodeList part_children = (*node)->get_children();
    for (Node::NodeList::iterator child = part_children.begin();
         child != part_children.end(); ++child) {
      if (strcmp((*child)->get_name().c_str(), "name") != 0)
        continue;
      xml_get_string(*child, part_name);
      // found a part and its name, add it if it is included
      if (!included_pascal(part_name, is_difficult, is_truncated,
                           is_occluded))
        continue;
      if (usepose && has_pose) { // append pose to class name
        part_name += "_";
        part_name += pose_name;
      }
      if (!dataset<Tdata>::included(part_name))
        continue;
      this->add_class(part_name);
      // increment samples numbers
      this->total_samples++;
      if (is_difficult)
        total_difficult++;
      if (is_truncated)
        total_truncated++;
      if (is_occluded)
        total_occluded++;
      if ((is_difficult && ignore_difficult)
          || (is_truncated && ignore_truncated)
          || (is_occluded && ignore_occluded))
        total_ignored++;
    }
  }
}