Exemplo n.º 1
1
std::shared_ptr<GumboOutputWrapper> GumboOutputWrapper::createShared(
    const string &htmlStr) {
    auto output = gumbo_parse(htmlStr.c_str());
    return std::make_shared<GumboOutputWrapper>(output);
}
/*
 * Reads up to 65535 bytes from the HTML file named on the command line,
 * parses them with Gumbo and prints the result of cleantext() for the root
 * node.
 *
 * Exits with EXIT_FAILURE when the argument count is wrong, the file cannot
 * be opened, or the read buffer cannot be allocated.
 */
int main(int argc, char** argv) {
  if (argc != 2) {
    puts("Usage: clean_text <html filename>");
    exit(EXIT_FAILURE);
  }
  const char* filename = argv[1];

  FILE *in = fopen(filename, "rb");
  if (!in) {
    /* Without this check a missing file would make fread() dereference NULL. */
    fprintf(stderr, "File %s not found!\n", filename);
    exit(EXIT_FAILURE);
  }

  const size_t maxBytes = 65535;
  /* One extra byte so the data can always be NUL-terminated. */
  char *contents = (char *)malloc(maxBytes + 1);
  if (!contents) {
    fclose(in);
    fputs("Out of memory\n", stderr);
    exit(EXIT_FAILURE);
  }

  size_t bytesRead = fread(contents, 1, maxBytes, in);
  fclose(in);
  /* gumbo_parse() expects a NUL-terminated string; fread() does not add one. */
  contents[bytesRead] = '\0';

  GumboOutput* output = gumbo_parse(contents);
  puts(cleantext(output->root));
  gumbo_destroy_output(&kGumboDefaultOptions, output);
  free(contents);
  return 0;
}
Exemplo n.º 3
0
/**
 * Parses `html` with Gumbo, runs search_text() on the document root with
 * `tag`, and returns whatever search_text() produced. The parse tree is
 * destroyed before returning.
 */
string HtmlParser::buscarTextoInTag(char *html, Tags *tag) {
    GumboOutput* parsedDoc = gumbo_parse(html);

    string texto = search_text(parsedDoc->root, tag);

    gumbo_destroy_output(&kGumboDefaultOptions, parsedDoc);
    return texto;
}
Exemplo n.º 4
0
/**
 * Clears listUrlInfoSong, parses `html` with Gumbo and runs
 * search_for_elem2() on the document root with the given tag/attribute
 * filters. The parse tree is destroyed before returning.
 *
 * NOTE(review): presumably search_for_elem2() fills listUrlInfoSong as a side
 * effect -- confirm against its definition.
 */
void HtmlParser::buscarElem2(char *html, Tags *tag1, string atributoElem1, string valAtributoElem1
                               , Tags *tag2, string atributoElem2) {

    listUrlInfoSong.clear();
    GumboOutput* output = gumbo_parse(html);
    // The returned node points into the tree destroyed on the next line, so it
    // is deliberately not kept.
    static_cast<void>(search_for_elem2(output->root, tag1, atributoElem1, valAtributoElem1, tag2, atributoElem2));
    gumbo_destroy_output(&kGumboDefaultOptions, output);
}
Exemplo n.º 5
0
/*
 * Fetches `url` with http_get(), parses the response body with Gumbo and
 * lets wiki_registry_find_body() add entries to a freshly created list.
 *
 * Returns the list, or NULL when the response's `ok` flag is not set.
 */
list_t *wiki_registry(const char *url) {
  response_t *response = http_get(url);
  if (!response->ok) {
    return NULL;
  }

  GumboOutput *parsed = gumbo_parse(response->data);
  list_t *packages = list_new();

  wiki_registry_find_body(parsed->root, packages);
  gumbo_destroy_output(&kGumboDefaultOptions, parsed);

  return packages;
}
Exemplo n.º 6
0
/**
 * Parses `html` with Gumbo, runs search_for_elem() on the document root with
 * the given tag/attribute filter, passes the result through
 * Constant::utf8ascii() and returns the converted text.
 *
 * NOTE(review): utf8ascii() is assumed to rewrite the buffer in place without
 * growing it, as the original length+1 scratch buffer implied -- confirm.
 */
string HtmlParser::buscarElem(char *html, Tags *tag, string atributo, string valAtributo) {
    GumboOutput* output = gumbo_parse(html);
    string ret = search_for_elem(output->root, tag, atributo, valAtributo);
    // `ret` owns its own copy of the text, so the tree can be released right
    // away; nothing below can then leak it.
    gumbo_destroy_output(&kGumboDefaultOptions, output);

    // Writable, NUL-terminated scratch copy (up to the first NUL, like the
    // strcpy() it replaces). A std::string frees itself even if utf8ascii()
    // throws, unlike the former new[]/delete[] pair. Writing through
    // &buffer[0] relies on C++11 contiguous, NUL-terminated storage.
    string buffer(ret.c_str());
    Constant::utf8ascii(&buffer[0]);

    // Re-read up to the first NUL in case the conversion shortened the text.
    return string(buffer.c_str());
}
Exemplo n.º 7
0
/**
 * Builds a DocInfo for `document`.
 *
 * The document text is passed through cleanText(), parsed with Gumbo, and the
 * content, page title and links are extracted from the parse tree (in that
 * order, sequentially). The canonical URL is curl.CNormalize(docUrl) when
 * curl.Parse(docUrl) succeeds, otherwise the URL itself.
 */
DocInfo HTMLParser::parse(RICPNS::Document &document) {

	oneurl urlTool;
	string cleanedHtml;
	cleanText(document.getText(), cleanedHtml);

	GumboOutput* parsed = gumbo_parse(cleanedHtml.c_str());
	GumboNode* rootNode = parsed->root;

	string docUrl = document.getURL();
	string bodyText;
	string titleText;
	list<pair<string, string> > foundLinks;

	// Extraction runs one step after another on the same tree.
	extractContent(rootNode, bodyText);
	extractPageTitle(rootNode, titleText);
	extractLinks(rootNode, foundLinks, docUrl);

	// The extracted values are held in the locals above, so the tree can go.
	gumbo_destroy_output(&kGumboDefaultOptions, parsed);

	DocInfo result;
	result.setContent(bodyText);

	if (urlTool.Parse(docUrl)) {
		result.setCanonicalUrl(urlTool.CNormalize(docUrl));
	} else {
		result.setCanonicalUrl(docUrl);
	}

	result.setUrl(docUrl);
	result.setTitle(titleText);
	result.setLinks(foundLinks);

	return result;
}
Exemplo n.º 8
0
/**
 * Parses the HTML held by `data` with Gumbo, collects links via getlinks()
 * and publishes them through `links`.
 *
 * When gumbo_parse() returns null the function returns early and leaves both
 * `data` and `links` untouched. Otherwise `data` is re-set to the same HTML
 * text it provided and `links` is replaced by a new CCLinks holding the
 * collected entries.
 */
void Client::Parser(CCUserDataPtr &data, CCLinksPtr &links)
{
	CCLinksPtr collected(new CCLinks);
	std::string pageHtml = data->getdata();
	std::vector<std::string> urls;

	GumboOutput *tree = gumbo_parse(pageHtml.c_str());
	if(!tree)
	{
		return;
	}

	getlinks(tree->root, urls);
	gumbo_destroy_output(&kGumboDefaultOptions, tree);

	collected->setlinks(urls);
	data->setdata(pageHtml);
	links = collected;
}
Exemplo n.º 9
0
    void t2HTMLParser::t2LabelParser::parse(const char* html)
    {
        GumboOutput* output = gumbo_parse(html);

        GumboNode *root = output->root;

        if(root->type != GUMBO_NODE_ELEMENT)
        {
            t2PrintError("html格式有误");
            return;
        }

        // find head
        GumboVector *rootChildren = &root->v.element.children;
        GumboNode *h = NULL, *b = NULL;
        for(int i = 0; i < rootChildren->length; i++)
        {
            GumboNode *child = (GumboNode *) rootChildren->data[i];
            if(child->type == GUMBO_NODE_ELEMENT && child->v.element.tag == GUMBO_TAG_HEAD)
                h = child;
            else if(child->type == GUMBO_NODE_ELEMENT && child->v.element.tag == GUMBO_TAG_BODY)
                b = child;
        }

        if(!h)
        {
            t2PrintError("html文件缺少<head>标签");
            return;
        }
        head(h);

        // find body
        if(!b)
        {
            t2PrintError("html文件缺少<body>标签");
            return;
        }
        body(b);

        gumbo_destroy_output(&kGumboDefaultOptions, output);
    }
Exemplo n.º 10
0
/// Parses `htmlStr` with Gumbo and returns a GumboOutputWrapper constructed
/// from the resulting GumboOutput pointer.
GumboOutputWrapper GumboOutputWrapper::create(
    const string &htmlStr) {
    GumboOutput *parsed = gumbo_parse(htmlStr.c_str());
    return GumboOutputWrapper(parsed);
}
Exemplo n.º 11
0
void HtmlParser::buscarElementos(char *html, Tags *tag) {
    GumboOutput* output = gumbo_parse(html);
    search_for_links(output->root, tag);
    gumbo_destroy_output(&kGumboDefaultOptions, output);
}
Exemplo n.º 12
0
bool SteamUserCrawler::run() {
	printf("Started Running User Crawler\n");
	
	gettimeofday(&start, NULL);
	// Connect To DB
	dbConn->connect();
	gettimeofday(&end, NULL);
	printf("DB Connected (time consumed : %ldms)\n", this->calTime());

	sql::Statement *stmt;
	sql::ResultSet *res;
	sql::PreparedStatement *pstmt;
	pstmt = dbConn->con->prepareStatement("INSERT INTO user(url, name, steamlv) VALUES (?, ?, ?) ON DUPLICATE KEY UPDATE name=VALUES(name), steamlv=VALUES(steamlv);");   

	string page;
	
	string userName;
	int userLevel;

    string url = this->seedURL;
    if(url == "r") {
	    gettimeofday(&start, NULL);
		stmt = dbConn->con->createStatement();
		res = stmt->executeQuery("SELECT url FROM user WHERE name IS NULL ORDER BY RAND() LIMIT 1;");
		gettimeofday(&end, NULL);
		if(res->next()) {
			url = res->getString(1);
			printf("Getting Random Seed URL from DB Done (time consumed : %ldms)\n", this->calTime());
		}
		delete stmt;
		delete res;
    }
    
	while(url != "") {
		gettimeofday(&start, NULL);
		string page = curl->getPage(url);
		gettimeofday(&end, NULL);
		printf("Getting User Profile Page Done (time consumed : %ldms)\n", this->calTime());
		
		userName = "";
		userLevel = -1;

		// Getting User Name & User Steam Level
		if(page != "") {
			gettimeofday(&start, NULL);
			GumboOutput *output = gumbo_parse(page.c_str());
			gettimeofday(&end, NULL);
			printf("User Profile Page Parsing Done (time consumed : %ldms)\n", this->calTime());
			printf("Current URL [%s]\n", url.c_str());
			
			queue<GumboNode *> nodes;
			nodes.push(output->root);
			
			gettimeofday(&start, NULL);
			while(!nodes.empty() && (userName == "" || userLevel == -1)) {
				GumboNode *node = nodes.front();
				nodes.pop();
				
				if(node->type != GUMBO_NODE_ELEMENT) {
					continue;
				}
				
				GumboAttribute *attr;
	
				// User Name
				if((node->v.element.tag == GUMBO_TAG_DIV) &&
				(attr = gumbo_get_attribute(&node->v.element.attributes, "class")) &&
				(strcmp(attr->value, "persona_name") == 0)) {
					GumboVector *aChild = &node->v.element.children;
					for(size_t i = 0; i < aChild->length; i++) {
						GumboNode *aNode = static_cast<GumboNode *>(aChild->data[i]);
						if(aNode->type == GUMBO_NODE_TEXT) {
							userName = aNode->v.text.text;
							trim(userName);
						}
					}
					continue;
				}
	
				// User Steam Level
				if((node->v.element.tag == GUMBO_TAG_DIV) &&
				(attr = gumbo_get_attribute(&node->v.element.attributes, "class")) &&
				(strcmp(attr->value, "persona_name persona_level") == 0)) {
					GumboVector *aChild = &node->v.element.children;
					for(size_t i = 0; i < aChild->length; i++) {
						GumboNode *aNode = static_cast<GumboNode *>(aChild->data[i]);
						if((aNode->type == GUMBO_NODE_ELEMENT) &&
						(aNode->v.element.tag == GUMBO_TAG_DIV)) {
							GumboVector *bChild = &aNode->v.element.children;
							for(size_t j = 0; j < bChild->length; j++) {
								GumboNode *bNode = static_cast<GumboNode *>(bChild->data[j]);
								if((bNode->type == GUMBO_NODE_ELEMENT) &&
								(bNode->v.element.tag == GUMBO_TAG_SPAN)) {
									GumboVector *cChild = &bNode->v.element.children;
									for(size_t k = 0; k < cChild->length; k++) {
										GumboNode *cNode = static_cast<GumboNode *>(cChild->data[k]);
										if(cNode->type == GUMBO_NODE_TEXT) {
											stringstream ss;
											ss << cNode->v.text.text;
											ss >> userLevel;
										}
									}
								}
							}
						}
					}
Exemplo n.º 13
0
/*
 * Parses `content` (treated as a NUL-terminated string) with Gumbo, runs
 * crawl_parse_real() on the document root, then destroys the parse tree.
 *
 * c->output holds the tree only for the duration of the call and is reset to
 * NULL afterwards so it never refers to freed memory.
 */
void crawl_parse_parse( crawl_parse_t *c, uint8_t *content ) {
	c->output = gumbo_parse( (const char *)content );
	crawl_parse_real(c, c->output->root);
	gumbo_destroy_output( &kGumboDefaultOptions, c->output );
	c->output = NULL;
}
Exemplo n.º 14
0
/**
 * Builds an Article from the file at `path`.
 *
 * Sets the article id (the part of `path` after the directory prefix and one
 * separator character), URL, MIME type and namespace. For "text/html" files
 * the document is parsed with Gumbo to pick up the <title> text and, when
 * `detectRedirects` is true, a redirect target extracted from the <head>
 * children. A redirect whose target file does not exist marks the article
 * invalid. If no title was found, one is derived from the file name.
 */
Article::Article(const std::string& path, const bool detectRedirects) {
  invalid = false;

  /* Article id and URL are the same relative path. */
  aid = path.substr(directoryPath.size() + 1);
  url = aid;

  mimeType = getMimeTypeForFile(aid);
  ns = getNamespaceForMimeType(mimeType)[0];

  /* Everything below applies to HTML content only. */
  if (mimeType.find("text/html") == std::string::npos) {
    return;
  }

  std::string html = getFileContent(path);
  GumboOutput* output = gumbo_parse(html.c_str());
  GumboNode* root = output->root;

  const bool rootUsable =
      root->type == GUMBO_NODE_ELEMENT && root->v.element.children.length >= 2;

  if (rootUsable) {
    /* Locate the first <head> child of the root element. */
    const GumboVector* rootKids = &root->v.element.children;
    GumboNode* head = NULL;
    for (unsigned int idx = 0; head == NULL && idx < rootKids->length; ++idx) {
      GumboNode* candidate = static_cast<GumboNode*>(rootKids->data[idx]);
      if (candidate->type == GUMBO_NODE_ELEMENT &&
          candidate->v.element.tag == GUMBO_TAG_HEAD) {
        head = candidate;
      }
    }

    if (head != NULL) {
      GumboVector* headKids = &head->v.element.children;

      /* Take the text of every <title> that has exactly one text child;
         a later match overwrites an earlier one. */
      for (unsigned int idx = 0; idx < headKids->length; ++idx) {
        GumboNode* entry = static_cast<GumboNode*>(headKids->data[idx]);
        if (entry->type != GUMBO_NODE_ELEMENT ||
            entry->v.element.tag != GUMBO_TAG_TITLE) {
          continue;
        }
        if (entry->v.element.children.length != 1) {
          continue;
        }
        GumboNode* titleText =
            static_cast<GumboNode*>(entry->v.element.children.data[0]);
        if (titleText->type != GUMBO_NODE_TEXT) {
          continue;
        }
        title = titleText->v.text.text;
        stripTitleInvalidChars(title);
      }

      /* Redirect detection; a std::string thrown by the extractor is logged
         and treated as "no redirect". */
      std::string targetUrl;
      if (detectRedirects) {
        try {
          targetUrl = extractRedirectUrlFromHtml(headKids);
        } catch (std::string &error) {
          std::cerr << error << std::endl;
        }
      }

      if (!targetUrl.empty()) {
        redirectAid = computeAbsolutePath(aid, decodeUrl(targetUrl));
        const bool targetExists = fileExists(directoryPath + "/" + redirectAid);
        if (!targetExists) {
          redirectAid.clear();
          invalid = true;
        }
      }
    }

    /* No usable <title>: fall back to the file name without its directory
       and extension, with underscores turned into spaces. */
    if (title.empty()) {
      const std::size_t slashPos = path.rfind("/");
      if (slashPos == std::string::npos) {
        title = path;
      } else {
        title = path.substr(slashPos + 1);
        const std::size_t dotPos = title.rfind(".");
        if (dotPos != std::string::npos) {
          title = title.substr(0, dotPos);
        }
      }
      std::replace(title.begin(), title.end(), '_', ' ');
    }
  }

  gumbo_destroy_output(&kGumboDefaultOptions, output);
}
Exemplo n.º 15
0
/// Constructor: parses a fixed sample snippet with Gumbo and immediately
/// destroys the resulting tree; nothing from the parse is kept.
HTMLParser::HTMLParser()
{
    const char* sampleHtml = "<h1>Hello, World!</h1>";
    GumboOutput* parsed = gumbo_parse(sampleHtml);
    // The parse tree (parsed->root) would be inspected here.
    gumbo_destroy_output(&kGumboDefaultOptions, parsed);
}
Exemplo n.º 16
0
/**
 * Downloads the timetable page for class `t`, parses the lesson table with
 * Gumbo, adds every lesson that is not older than two weeks to `t` and
 * pushes `t` onto `classList`.
 *
 * @param t            Class to fetch; lessons are added to this by-value copy.
 * @param classList    Queue that receives `t` after the table was parsed.
 * @param totalLessons Counter incremented once per lesson added.
 * @return true when the HTTP request succeeded (including the case where a
 *         "ttstud" class has no table; `t` is then not pushed), false when
 *         the curl handle could not be created or the request failed.
 *
 * The process is terminated with exit(1) when the table is missing for any
 * class whose cpath is not "ttstud".
 */
bool doFetchLesson(Class t, ThreadSafeQueue<Class> *classList, std::atomic<int> *totalLessons)
{
	std::string buffer;

	// Look up how many weeks to request for this department/cpath.
	weekAmountList.lock();
	int weekAmount = -1;
	for (int i = 0; i < weekAmountList.size(); i++)
	{
		if (weekAmountList.at(i).departmentStringId == t.departmentString() && weekAmountList.at(i).cpath == t.cpath())
		{
			weekAmount = weekAmountList.at(i).amountOfWeeks;
			break;
		}
	}
	if(weekAmount == -1)
		printf("ERROR: NOT FOUND: %s - %s\n", t.departmentString().c_str(), t.cpath().c_str());
	weekAmountList.unlock();

	// Build the POST body in a std::string. The previous
	// sprintf(postFields, "%s...", postFields, ...) passed the destination as
	// its own source (undefined behaviour) and could overrun the fixed
	// 1024-byte buffer.
	std::string postFields;
	for (int week = 1; week < weekAmount + 1; week++)
	{
		postFields += "weken[]=";
		postFields += std::to_string(week);
		postFields += "&";
	}
	postFields += "sleutelveld=";
	postFields += t.classIdString().c_str();
	postFields += "&object=";
	postFields += t.cpath().c_str();
	postFields += "&filter=";
	postFields += t.departmentString().c_str();

	CURL *curl = curl_easy_init();
	if (!curl)
		return false; // same outcome as before: the request cannot be performed

	curl_easy_setopt(curl, CURLOPT_URL, "https://rooster.nhl.nl/1516/rooster.php");
	curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, writer);
	curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer);
	// CURLOPT_POSTFIELDS does not copy the data; postFields stays alive until
	// the end of this function, i.e. past curl_easy_perform().
	curl_easy_setopt(curl, CURLOPT_POSTFIELDS, postFields.c_str());
	curl_easy_setopt(curl, CURLOPT_TIMEOUT, CURL_TIMEOUT);
	curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, CURL_CONNECT_TIMEOUT);

	const CURLcode res = curl_easy_perform(curl);
	if (res != CURLE_OK)
	{
		curl_easy_cleanup(curl);
		return false;
	}

	GumboOutput* output = gumbo_parse(buffer.c_str());
	GumboNode* node = GetTBodyNode(output->root);
	if (node == NULL)
	{
		const bool fatal = !(t.cpath() == "ttstud");
		if (fatal)
			printf("[%s] FAIL(%s-%s-%s), aborting program\n", currentDateTime().c_str(), t.className().c_str(), t.departmentString().c_str(), t.cpath().c_str());
		else
			printf("[%s] FAIL(%s-%s-%s)\n", currentDateTime().c_str(), t.className().c_str(), t.departmentString().c_str(), t.cpath().c_str());

		// The parse tree used to be leaked on this path.
		gumbo_destroy_output(&kGumboDefaultOptions, output);
		curl_easy_cleanup(curl);
		if (fatal)
			exit(1);
		return true;
	}

	// Column indexes learned from the most recent "weekheader" row; -1 means
	// the column is not present.
	int yearOffset = -1;
	int titleOffset = -1;
	int locationOffset = -1;
	int teacherOffset = -1;
	int typeOffset = -1;
	int commentOffset = -1;
	int endOffset = -1;

	// Date of the most recent "dagrij" row; zero until the first one is seen.
	char dayName[128] = "";
	int day = 0;
	int month = 0;
	int year = 0;

	GumboVector* rows = &node->v.element.children;
	for (unsigned int r = 0; r < rows->length; ++r)
	{
		GumboNode *row = static_cast<GumboNode*>(rows->data[r]);
		// Only element nodes carry v.element; skip text/whitespace children.
		if (row->type != GUMBO_NODE_ELEMENT || row->v.element.tag != GUMBO_TAG_TR)
			continue;

		GumboVector *cells = &row->v.element.children;

		// Text of the cell at `offset`, or "" when the column is absent or out
		// of range (previously data[-1] was read for missing columns).
		auto cellText = [cells](int offset) -> std::string {
			if (offset < 0 || static_cast<unsigned int>(offset) >= cells->length)
				return std::string();
			GumboNode *cell = static_cast<GumboNode*>(cells->data[offset]);
			if (!cell)
				return std::string();
			return GetTextFromElement(cell);
		};

		GumboAttribute *att = gumbo_get_attribute(&row->v.element.attributes, "class");
		if (att)
		{
			std::string value = att->value;
			if (value == "datarij")
			{
				// A lesson row: the first cell holds the start time.
				const std::string startDate = cellText(0);
				const std::string endDate = cellText(endOffset);
				const std::string yearType = cellText(yearOffset);
				const std::string teacher = cellText(teacherOffset);
				const std::string typeStr = cellText(typeOffset);

				int startHour = 0;
				int startMinute = 0;
				sscanf(startDate.c_str(), "%02d:%02d", &startHour, &startMinute);

				// YYYY-MM-DDTHH:MM:SS
				char newStartDate[128];
				snprintf(newStartDate, sizeof(newStartDate), "%04d-%02d-%02dT%02d:%02d:00", year, month, day, startHour, startMinute);

				char newEndDate[128];
				newEndDate[0] = '\0';
				if (endDate != "")
				{
					int endHour = 0;
					int endMinute = 0;
					sscanf(endDate.c_str(), "%02d:%02d", &endHour, &endMinute);
					snprintf(newEndDate, sizeof(newEndDate), "%04d-%02d-%02dT%02d:%02d:00", year, month, day, endHour, endMinute);
				}

				int weekNr = getWeekNrFromDate(newStartDate);

				// Age of the lesson in weeks relative to now (604800 s = 1 week).
				time_t lessonTime = getTimeStampFromDate(newStartDate);
				time_t now;
				time(&now);
				double weeks = difftime(lessonTime, now) / 604800;
				if (weeks > -2)
				{ // ignore old lessons
					t.addLesson(shared_ptr<Lesson>(new Lesson(cellText(titleOffset), cellText(commentOffset), teacher, replaceAll(cellText(locationOffset), "    ", ", "), newStartDate, newEndDate, weekNr, yearType, typeStr)));
					++*totalLessons;
				}
			}
			else if (value == "weekheader")
			{
				// A header row: re-learn which column holds which field.
				// NOTE(review): endOffset is not reset here, matching the
				// original code -- confirm whether that is intended.
				yearOffset = -1;
				titleOffset = -1;
				locationOffset = -1;
				teacherOffset = -1;
				typeOffset = -1;
				commentOffset = -1;
				for (unsigned int c = 0; c < cells->length; ++c)
				{
					GumboNode *headerCell = static_cast<GumboNode*>(cells->data[c]);
					std::string text = GetTextFromElement(headerCell);
					const int column = static_cast<int>(c);
					if (text == "jaar")
						yearOffset = column;
					else if (text == "activiteit")
						titleOffset = column;
					else if (text == "lokaal")
						locationOffset = column;
					else if (text == "docent(en)" || text == "klas(en)")
						teacherOffset = column;
					else if (text == "werkvorm")
						typeOffset = column;
					else if (text == "opmerkingen")
						commentOffset = column;
					else if (text == "eind")
						endOffset = column;
				}
			}
		}
		else
		{
			// Row without a class: a day row if its first cell is <td class="dagrij">.
			if (cells->length == 0)
				continue;
			GumboNode *td = static_cast<GumboNode*>(cells->data[0]);
			if (td->type != GUMBO_NODE_ELEMENT || td->v.element.tag != GUMBO_TAG_TD)
				continue;

			GumboAttribute *classAtt = gumbo_get_attribute(&td->v.element.attributes, "class");
			if (!classAtt)
				continue;
			std::string dayRow = classAtt->value;
			if (!(dayRow == "dagrij"))
				continue;

			if (td->v.element.children.length == 0)
				continue;
			GumboNode *dateNode = static_cast<GumboNode*>(td->v.element.children.data[0]);
			// v.text is only valid for text nodes.
			if (dateNode->type != GUMBO_NODE_TEXT)
				continue;

			// "<dayname> DD-MM-YYYY"; the width limit keeps the day name
			// inside dayName[128].
			sscanf(dateNode->v.text.text, "%127s %02d-%02d-%04d", dayName, &day, &month, &year);
		}
	}

	gumbo_destroy_output(&kGumboDefaultOptions, output);
	curl_easy_cleanup(curl);
	classList->lock();
	classList->push(t);
	classList->unlock();
	return true;
}
Exemplo n.º 17
-1
/**
 * Reads a fixed cp1251-encoded HTML file, converts it to UTF-8 with
 * cp1251ToUtf8(), parses it with Gumbo and runs searchForDivBlocks() on the
 * document root. Waits for a key press before exiting.
 *
 * Returns EXIT_FAILURE when the file cannot be opened or its size cannot be
 * determined, 0 otherwise.
 */
int main(int argc, char *argv[])
{
//	system("chcp 65001");
	system("chcp 1251");

	// Read HTML file contents
	std::string htmlFileName = "C:\\Projects\\__DATA\\test_page_30.html";
	std::ifstream htmlFileStream(htmlFileName, std::ios::in | std::ios::binary);
	if (!htmlFileStream)
	{
		std::cout << "File " << htmlFileName << " not found!";
		return( EXIT_FAILURE );
	}

	htmlFileStream.seekg(0, std::ios::end);
	const std::streamoff fileSize = htmlFileStream.tellg();
	if (fileSize < 0)
	{
		// tellg() reports failure as -1; resizing to that would throw.
		std::cout << "Cannot determine size of file " << htmlFileName;
		return( EXIT_FAILURE );
	}

	std::string htmlFileContents;
	htmlFileContents.resize(static_cast<std::size_t>(fileSize));
	htmlFileStream.seekg(0, std::ios::beg);
	if (!htmlFileContents.empty())
		htmlFileStream.read(&htmlFileContents[0], htmlFileContents.size());
	htmlFileStream.close();

	// Convert it to UTF-8
	// NOTE: Gumbo works ONLY with UTF-8 documents
	// A cp1251 byte can expand to up to 3 UTF-8 bytes, and one more byte is
	// needed for the terminator; the former length*2 buffer could overflow
	// (and always did for an empty file).
	std::string utf8Contents(htmlFileContents.length() * 3 + 1, '\0');
	cp1251ToUtf8(&utf8Contents[0], htmlFileContents.c_str());

	// Parse web page contents
	GumboOutput* output = gumbo_parse(utf8Contents.c_str());
//    search_for_links(output->root);
	searchForDivBlocks(output->root);

	gumbo_destroy_output(&kGumboDefaultOptions, output);

	_getchar_nolock();
	return 0;
}