/**
 * Program entry point: runs the trim pipeline on a single HTML file.
 *
 * Usage: <program> in.html
 *
 * The file name is handed to pre_trim(), the file is then parsed with the
 * libxml2 HTML parser (errors and warnings suppressed, blank nodes dropped,
 * compact text nodes, recovery enabled), and the resulting document is
 * passed to trim() and output() before being freed.
 *
 * @param argc  Argument count; must be exactly 2.
 * @param argv  argv[1] is the path of the HTML file to process.
 * @return 0 on success, 1 on a usage error or if the file cannot be parsed.
 */
int main(int argc, char **argv)
{
    const int parse_flags = HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING |
                            HTML_PARSE_NOBLANKS | HTML_PARSE_COMPACT |
                            HTML_PARSE_RECOVER;
    char *filename;
    htmlDocPtr doc;

    LIBXML_TEST_VERSION

    if (argc != 2) {
        fprintf(stderr, "Usage: %s in.html\n", argv[0]);
        return 1;
    }

    /* Only look at argv[1] once we know it was actually supplied. */
    filename = argv[1];

    pre_trim(filename);

    doc = htmlReadFile(filename, NULL, parse_flags);
    if (doc == NULL) {
        fprintf(stderr, "Failed to parse %s\n", filename);
        xmlCleanupParser();
        /* The original used a bare `return;` here, which is invalid in a
         * function returning int and leaves the exit status undefined. */
        return 1;
    }

    trim(doc);
    output(doc);

    xmlFreeDoc(doc);
    xmlCleanupParser();
    return 0;
}
/**
 * ExtractBoardList constructor.
 *
 * Reads the HTML file at `file` as UTF-8 with parser recovery enabled,
 * passes the document's root element to FindBoardInfo(), and then hands
 * boardInfoArray to SQLiteAccessor::SetBoardInfoCommit().
 *
 * If the file cannot be parsed, or the parsed document has no root element,
 * nothing is committed.
 *
 * @param file  Path of the HTML file to read.
 *
 * NOTE(review): boardInfoArray is deleted before the constructor returns and
 * is left pointing at freed memory — confirm nothing else in the class reads
 * or deletes it afterwards.
 * NOTE(review): xmlCleanupParser() releases libxml2's global state; confirm
 * no other part of the program is still using libxml2 when this runs.
 */
ExtractBoardList::ExtractBoardList(const char* file) {

    // Prepare the SQLiteAccessor instance and the array that collects results.
    SQLiteAccessor* accessor = new SQLiteAccessor();
    boardInfoArray = new wxArrayString();

    // Encoding used when reading the file.
    const char* enc = "utf-8";

    // Load the HTML; a NULL document means the read failed.
    htmlDocPtr m_doc = htmlReadFile(file, enc, HTML_PARSE_RECOVER);

    // Fetch the root element (stays NULL if there is no document).
    htmlNodePtr root = NULL;
    if (NULL != m_doc) {
        root = xmlDocGetRootElement(m_doc);
    }

    // Remember whether processing happened; `root` itself must not be
    // inspected once the document that owns it has been freed.
    const bool processed = (NULL != root);
    if (processed) {
        // Normal processing.
        FindBoardInfo(root);
    }

    // Release the document on every path; the original never called
    // xmlFreeDoc() and leaked the whole parsed tree.
    if (NULL != m_doc) {
        xmlFreeDoc(m_doc);
    }
    xmlCleanupParser();
    xmlCleanupCharEncodingHandlers();

    if (processed) {
        accessor->SetBoardInfoCommit(boardInfoArray);
    }

    delete accessor;
    delete boardInfoArray;
}
int convert_station_forecacom_data(const char *station_id_with_path, const char *result_file, const char *detail_path_data ){ xmlDoc *doc = NULL; xmlNode *root_node = NULL; int days_number = -1; char buffer[1024], buffer2[1024], *delimiter = NULL; FILE *file_out; if(!station_id_with_path) return -1; /* check for new file, if it exist, than rename it */ *buffer = 0; snprintf(buffer, sizeof(buffer) - 1, "%s.new", station_id_with_path); if(!access(buffer, R_OK)) rename(buffer, station_id_with_path); /* check file accessability */ if(!access(station_id_with_path, R_OK)){ /* check that the file containe valid data */ doc = htmlReadFile(station_id_with_path, "UTF-8", HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); if(!doc) return -1; root_node = xmlDocGetRootElement(doc); if(root_node->type == XML_ELEMENT_NODE && strstr((char*)root_node->name, "err")){ xmlFreeDoc(doc); xmlCleanupParser(); return -2; } else{ /* prepare station id */ *buffer = 0; *buffer2 = 0; snprintf(buffer2, sizeof(buffer2) - 1, "%s", station_id_with_path); delimiter = strrchr(buffer2, '/'); if(delimiter){ delimiter++; /* delete '/' */ snprintf(buffer, sizeof(buffer) - 1, "%s", delimiter); delimiter = strrchr(buffer, '.'); if(!delimiter){ xmlFreeDoc(doc); xmlCleanupParser(); return -1; } *delimiter = 0; // if(get_detail_data) // days_number = parse_xml_detail_data(buffer, root_node, data); // else days_number = parse_and_write_xml_data(buffer, doc, result_file); xmlFreeDoc(doc); xmlCleanupParser(); if(!access(detail_path_data, R_OK)){ doc = htmlReadFile(detail_path_data, "UTF-8", HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING); if(doc){ root_node = NULL; root_node = xmlDocGetRootElement(doc); if(!root_node || ( root_node->type == XML_ELEMENT_NODE && strstr((char*)root_node->name, "err"))){ xmlFreeDoc(doc); xmlCleanupParser(); } else{ parse_and_write_detail_data(buffer, doc, result_file); xmlFreeDoc(doc); xmlCleanupParser(); } } } if (days_number > 0){ file_out = fopen(result_file, "a"); if (file_out){ 
fprintf(file_out,"</station>"); fclose(file_out); } } } } } else return -1;/* file isn't accessability */ return days_number; }
GPtrArray * dict_get_translate(const gchar * keyword) { gchar * key_url = NULL; htmlDocPtr html_doc = NULL; xmlDocPtr doc = NULL; xmlNodePtr clone = NULL; xmlXPathContextPtr ctx = NULL; xmlXPathObjectPtr obj = NULL; xmlNodeSetPtr nodeset = NULL; gchar * tmp_str = NULL; GPtrArray * result = NULL; gint i; key_url = g_strjoin(NULL, KEY_URL_PREFIX, keyword, KEY_URL_SUFIX, NULL); g_debug("KEY-URL: %s", key_url); html_doc = htmlReadFile(key_url, NULL, HTML_PARSE_NOWARNING | HTML_PARSE_NOERROR); if (!html_doc) { g_message("KEY-URL %s get failed", key_url); goto out; } ctx = xmlXPathNewContext((xmlDocPtr)html_doc); if (!ctx) { g_message("XPath context creat failed"); goto out; } obj = xmlXPathEvalExpression(KEY_WORD_XPATH, ctx); if (!obj) { g_message("XPath eval key word xpath failed"); goto out; } if (xmlXPathNodeSetIsEmpty(obj->nodesetval)) { g_message("XPath search keyword failed"); goto out; } nodeset = obj->nodesetval; g_debug("Key word node set have %d object", nodeset->nodeNr); clone = xmlCopyNode(nodeset->nodeTab[0], 1); if (!clone) goto out; doc = xmlNewDoc("1.0"); if (!doc) goto out; xmlDocSetRootElement(doc, clone); xmlXPathFreeContext(ctx); ctx = NULL; xmlXPathFreeObject(obj); obj = NULL; ctx = xmlXPathNewContext(doc); if (!ctx) goto out; obj = xmlXPathEvalExpression("//span[@class='keyword']", ctx); if (!obj) goto out; nodeset = obj->nodesetval; tmp_str = xmlNodeGetContent(nodeset->nodeTab[0]->xmlChildrenNode); g_debug("The word to search %s", tmp_str); xmlFree(tmp_str); tmp_str = NULL; xmlXPathFreeObject(obj); obj = NULL; obj = xmlXPathEvalExpression("//ul/li", ctx); if (!obj) goto out; if (xmlXPathNodeSetIsEmpty(obj->nodesetval)) { g_message("Result value is empty"); goto out; } nodeset = obj->nodesetval; result = g_ptr_array_sized_new(nodeset->nodeNr + 1); for (i = 0; i < nodeset->nodeNr; ++i) { tmp_str = xmlNodeGetContent(nodeset->nodeTab[i]->xmlChildrenNode); g_ptr_array_add(result, tmp_str); } g_ptr_array_add(result, NULL); out: if (doc) xmlFreeDoc(doc); 
if (key_url) g_free(key_url); if (html_doc) xmlFreeDoc((xmlDocPtr)html_doc); if (ctx) xmlXPathFreeContext(ctx); if (obj) xmlXPathFreeObject(obj); return result; }
convert_station_hkogovhk_data(const gchar *station_id_with_path, const gchar *result_file, const gchar *detail_path_data ){ xmlDoc *doc = NULL; xmlNode *root_node = NULL; gint days_number = -1; gchar buffer[1024], *delimiter = NULL; FILE *file_out; file_out = fopen(result_file, "w"); if (!file_out) return -1; /* prepare station id */ *buffer = 0; delimiter = strrchr(station_id_with_path, '/'); if(delimiter){ delimiter++; /* delete '/' */ snprintf(buffer, sizeof(buffer) - 1, "%s", delimiter); } fprintf(file_out,"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<station name=\"Station name\" id=\"%s\" xmlns=\"http://omweather.garage.maemo.org/schemas\">\n", buffer); fprintf(file_out," <units>\n <t>C</t>\n <ws>m/s</ws>\n <wg>m/s</wg>\n <d>km</d>\n"); fprintf(file_out," <h>%%</h> \n <p>mmHg</p>\n </units>\n"); fclose(file_out); parse_current_weather(detail_path_data, result_file); parse_forecast_weather(station_id_with_path, result_file); file_out = fopen(result_file, "a"); if (file_out){ fprintf(file_out,"</station>"); fclose(file_out); } return 0; if(!station_id_with_path) return -1; /* check for new file, if it exist, than rename it */ *buffer = 0; snprintf(buffer, sizeof(buffer) - 1, "%s.new", station_id_with_path); if(!access(buffer, R_OK)) rename(buffer, station_id_with_path); /* check file accessability */ if(!access(station_id_with_path, R_OK)){ /* check that the file containe valid data */ doc = htmlReadFile(station_id_with_path, "UTF-8", 0); if(!doc) return -1; root_node = xmlDocGetRootElement(doc); if(root_node->type == XML_ELEMENT_NODE && strstr((char*)root_node->name, "err")){ xmlFreeDoc(doc); xmlCleanupParser(); return -2; } else{ /* prepare station id */ *buffer = 0; delimiter = strrchr(station_id_with_path, '/'); if(delimiter){ delimiter++; /* delete '/' */ snprintf(buffer, sizeof(buffer) - 1, "%s", delimiter); delimiter = strrchr(buffer, '.'); if(!delimiter){ xmlFreeDoc(doc); xmlCleanupParser(); return -1; } *delimiter = 0; // if(get_detail_data) // 
days_number = parse_xml_detail_data(buffer, root_node, data); // else days_number = parse_and_write_xml_data(buffer, doc, result_file); xmlFreeDoc(doc); xmlCleanupParser(); if(!access(detail_path_data, R_OK)){ doc = htmlReadFile(detail_path_data, "UTF-8", 0); if(doc){ root_node = NULL; root_node = xmlDocGetRootElement(doc); if(!root_node || ( root_node->type == XML_ELEMENT_NODE && strstr((char*)root_node->name, "err"))){ xmlFreeDoc(doc); xmlCleanupParser(); } else{ parse_and_write_detail_data(buffer, doc, result_file); xmlFreeDoc(doc); xmlCleanupParser(); } } } if (days_number > 0){ file_out = fopen(result_file, "a"); if (file_out){ fprintf(file_out,"</station>"); fclose(file_out); } } } } } else return -1;/* file isn't accessability */ return days_number; }
/**
 * Download the Omnibet page, patch its markup and walk the parsed tree.
 *
 * Sequence:
 *   1. Fetch OMNIBET_URL1 to obtain cookies, then OMNIBET_URL2 with those
 *      cookies; both responses are written to "<OMNIBET_FILENAME>-<uid>".
 *   2. Load that file, replace every "<strong>" with
 *      "<strong custom=livescore>" and write the result to
 *      "<OMNIBET_FILENAME>-<uid>-a".
 *   3. Parse the patched file as HTML and pass its root element to
 *      omnibet_walk_tree() together with the caller's output parameters.
 *
 * @param feed_matches          Passed through to omnibet_walk_tree().
 * @param feed_matches_counter  Passed through to omnibet_walk_tree().
 * @return 1 if either download fails, and also after the parse/walk stage
 *         has run; 0 if the downloaded file cannot be loaded, the
 *         replacement fails, or the patched file cannot be opened for
 *         writing.
 *
 * NOTE(review): the scratch file names are predictable; if OMNIBET_FILENAME
 * lives in a shared directory this is open to symlink attacks — confirm.
 */
int feed_main(match_data **feed_matches, int *feed_matches_counter)
{
    omnibet_match_data omnibet_match;
    GSList *cookies = NULL;   /* stays NULL unless get_url() fills it in */
    char tmp_file[1024];
    char tmp_file2[1024];
    char *orig_xml = NULL;
    char *fixed_xml2 = NULL;
    FILE *fp;
    htmlDocPtr parser;
    xmlNodePtr root;
    int parse_flags;
    unsigned int uid;

    memset(&omnibet_match.match_time[0], '\0', sizeof(omnibet_match.match_time));
    memset(&omnibet_match.team_home[0], '\0', sizeof(omnibet_match.team_home));
    memset(&omnibet_match.team_away[0], '\0', sizeof(omnibet_match.team_away));
    omnibet_match.score_home = 0;
    omnibet_match.score_away = 0;
    omnibet_match.stage = -1;
    omnibet_match.skip = FALSE;

    /* The original looked the uid up through getpwuid(getuid())->pw_uid,
     * which yields the same number but crashes when the user has no passwd
     * entry. */
    uid = (unsigned int)getuid();
    snprintf(&tmp_file[0], sizeof(tmp_file), "%s-%u", OMNIBET_FILENAME, uid);
    snprintf(&tmp_file2[0], sizeof(tmp_file2), "%s-%u-a", OMNIBET_FILENAME, uid);

    // Get our cookie from main page
    if (get_url(OMNIBET_URL1, OMNIBET_USER_AGENT, &tmp_file[0], NULL, &cookies))
        return 1;

    // Fetch actual page
    if (get_url(OMNIBET_URL2, OMNIBET_USER_AGENT, &tmp_file[0], cookies, NULL)) {
        if (cookies)
            soup_cookies_free(cookies);
        return 1;
    }
    if (cookies)
        soup_cookies_free(cookies);

    orig_xml = omnibet_load_file(&tmp_file[0]);
    if (!orig_xml)
        return 0;

    fixed_xml2 = omnibet_replace(orig_xml, "<strong>", "<strong custom=livescore>");
    if (!fixed_xml2) {
        free(orig_xml);
        return 0;
    }

    fp = fopen(&tmp_file2[0], "w");
    if (!fp) {
        printf("Cannot open output file!\n");
        free(fixed_xml2);
        free(orig_xml);
        return 0;
    }
    fprintf(fp, "%s\n", fixed_xml2);
    fclose(fp);

    parse_flags = HTML_PARSE_RECOVER |
                  /* HTML_PARSE_NOBLANKS | */
                  HTML_PARSE_NOERROR |
                  HTML_PARSE_NOWARNING |
#ifdef HAVE_MATE
                  HTML_PARSE_NOIMPLIED |
#endif
                  HTML_PARSE_COMPACT;

    parser = htmlReadFile(&tmp_file2[0], OMNIBET_CHARSET, parse_flags);

    /* Only walk a tree that actually exists; a failed parse yields no
     * document and an empty document yields no root. */
    root = parser ? xmlDocGetRootElement(parser) : NULL;
    if (root)
        omnibet_walk_tree(root, &omnibet_match, feed_matches, feed_matches_counter);

    if (parser)
        xmlFreeDoc(parser);
    free(orig_xml);
    free(fixed_xml2);

    return 1;
}
/**
 * Return the text of a programme sheet, using an on-disk cache.
 *
 * The cache file is "<epgdb_root>/<url>" with every '/', '\\', '?', '&' and
 * '=' in url replaced by '_'. If that file can be opened its content is
 * returned. Otherwise url is resolved against baseUrl, the page is loaded
 * with the libxml2 HTML parser, and the children of the first <p> under the
 * <div id="box"> element are passed to append_scheda() to fill the buffer;
 * a non-empty result is then written to the cache file.
 *
 * @param baseUrl     Base URL that url is resolved against.
 * @param url         URL of the sheet, possibly relative.
 * @param epgdb_root  Directory holding the cache files.
 * @return Pointer to a static 2048-byte buffer holding the text (empty
 *         string if nothing was found or url/epgdb_root is NULL). The buffer
 *         is overwritten by the next call, so the function is not reentrant
 *         and the result must not be freed.
 */
char *
read_scheda(char *baseUrl, xmlChar *url, char *epgdb_root)
{
    static char buf[2048];

    FILE *fd;
    htmlDocPtr docScheda = NULL;
    xmlXPathContextPtr contextScheda = NULL;
    xmlXPathObjectPtr par = NULL;
    xmlChar *urlScheda = NULL;
    size_t i;

    buf[0] = '\0';
    if (url == NULL || epgdb_root == NULL)
        return buf;

    /* build cache filename: root + '/' + sanitised url + NUL */
    const size_t root_len = strlen(epgdb_root);
    const size_t url_len = strlen((char *)url);
    char cachefile[root_len + url_len + 2];

    memcpy(cachefile, epgdb_root, root_len);
    cachefile[root_len] = '/';
    for (i = 0; i < url_len; i++) {
        const xmlChar c = url[i];
        const int unsafe = (c == '/' || c == '\\' || c == '?' || c == '&' || c == '=');
        cachefile[root_len + 1 + i] = unsafe ? '_' : (char)c;
    }
    cachefile[root_len + 1 + url_len] = '\0';

    /* try to read from cache */
    fd = fopen(cachefile, "r");
    if (fd) {
        /* Read at most one byte less than the buffer and terminate
         * explicitly, so an oversized or damaged cache file can never leave
         * the buffer without a NUL. */
        const size_t got = fread(buf, 1, sizeof(buf) - 1, fd);
        buf[got] = '\0';
        fclose(fd);
        return buf;
    }

    /* ok... no cache... download it! */
    urlScheda = xmlBuildURI(url, (xmlChar *)baseUrl);
    if (urlScheda != NULL) {
        docScheda = htmlReadFile((char *)urlScheda, NULL,
                                 HTML_PARSE_RECOVER | HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING);
        if (docScheda != NULL) {
            contextScheda = xmlXPathNewContext(docScheda);
            if (contextScheda != NULL) {
                // Take the first paragraph under the div with id="box"
                par = xmlXPathEvalExpression((const xmlChar *)"//div[@id='box']/p[1]", contextScheda);
                if (par != NULL) {
                    if (!xmlXPathNodeSetIsEmpty(par->nodesetval))
                        append_scheda(buf, par->nodesetval->nodeTab[0]->children, 0, 2048);
                    /* Free the result even when nothing matched; the
                     * original leaked it in that case. */
                    xmlXPathFreeObject(par);
                }
                xmlXPathFreeContext(contextScheda);
            }
            xmlFreeDoc(docScheda);
        }
        xmlFree(urlScheda);
    }

    /* save the cache (the terminating NUL is stored as well) */
    if (buf[0] != '\0') {
        fd = fopen(cachefile, "w");
        if (fd) {
            fwrite(buf, strlen(buf) + 1, 1, fd);
            fclose(fd);
        }
    }

    return buf;
}
void test_html() { htmlDocPtr doc; doc = htmlNewDoc( "", "" ); xmlNodePtr root_node = xmlNewNode(NULL,BAD_CAST"ap"); //设置根节点 xmlDocSetRootElement(doc,root_node); // cur = xmlDocGetRootElement(doc); //获取文档根结点 //在根节点中直接创建节点 xmlNewTextChild(root_node, NULL, BAD_CAST "newNode1", BAD_CAST "newNode1 content"); xmlNewTextChild(root_node, NULL, BAD_CAST "newNode2", BAD_CAST "newNode2 content"); xmlNewTextChild(root_node, NULL, BAD_CAST "newNode3", BAD_CAST "newNode3 content"); /* xmlChar *key; key = xmlNodeListGetString(doc, cur->xmlChildrenNode, 1); //获取文本结点的文本,需用其子结点 xmlFree(key); */ //创建一个绑定在根节点的子节点 xmlNewChild(root_node, NULL, BAD_CAST "node1",BAD_CAST "content of node1"); /* xmlNodeSetContent(curNode, (xmlChar *) "content changed");//设置结点的文本内容 //得到一个节点的内容: //xmlChar *value = xmlNodeGetContent(node); //返回值value应该使用xmlFree(value)释放内存 xmlUnlinkNode(curNode); //将当前结点从文档中断链(unlink),这样本文档就不会再包含这个子结点 xmlFreeNode(curNode); //手动删除断链结点的内存, 若没有xmlDelNode或者xmlRemoveNode,使用此函数 xmlChar *uri; uri = xmlGetProp(cur, "uri"); //获取属性值 xmlFree(uri); //释放内存 xmlSetProp(curNode,BAD_(xmlChar *)"attribute", (xmlChar *) "no"); //设置当前结点的attribute属性的属性值为no */ //创建一个节点,设置其内容和属性,然后加入根结点 xmlNodePtr node = xmlNewNode(NULL,BAD_CAST"node2"); xmlNodePtr content = xmlNewText(BAD_CAST"NODE CONTENT"); xmlAddChild(root_node,node); xmlAddChild(node,content); xmlNewProp(node,BAD_CAST"attribute",BAD_CAST "yes"); //通过xmlNewProp()增加一个节点的属性 node=xmlNewChild(root_node, NULL, BAD_CAST "node3", BAD_CAST"node has attributes"); xmlNewProp(node, BAD_CAST "attribute", BAD_CAST "no"); //创建一个儿子和孙子节点 node = xmlNewNode(NULL, BAD_CAST "son"); xmlAddChild(root_node,node); xmlNodePtr grandson = xmlNewNode(NULL, BAD_CAST "grandson"); xmlAddChild(node,grandson); xmlAddChild(grandson, xmlNewText(BAD_CAST "This is a grandson node")); //存储xml文档 int nRel = xmlSaveFile("CreatedXml.xml",doc); if (nRel != -1) { printf("%s\n",d_ConvertCharset("GBK", "utf-8", "一个xml文档被创建\n")); } //保存文件 /* * xmlSaveFormatFile (docname, doc, 1); 
保存文件到磁盘,第一个参数是写入文件的名,第二个参数是一个xmlDoc结构指针,第三个参数设定为1,保证在输出上写入缩进。 */ xmlSaveFormatFileEnc( "-", doc, "UTF-8", 1); doc=htmlReadFile("ap", NULL, -1); xmlSaveFormatFileEnc( "-", doc, "UTF-8", 1); }