Example #1
0
// Adds to |urls| the resolved URL string of every @import rule in this sheet
// that has an attached style sheet, then descends into that imported sheet so
// its own imports are collected too. Each import's href is resolved against
// |base|; the resolved URL then serves as the base for the nested sheet.
void CSSStyleSheet::addSubresourceURLStrings(HashSet<String>& urls, const String& base) const
{
    // The const_cast is needed to call cssRules() from this const method.
    RefPtr<CSSRuleList> rules = const_cast<CSSStyleSheet*>(this)->cssRules();

    for (unsigned index = 0; index < rules->length(); ++index) {
        CSSRule* candidate = rules->item(index);

        // Only @import rules contribute subresource URLs here.
        if (candidate->type() == CSSRule::IMPORT_RULE) {
            CSSImportRule* importedBy = static_cast<CSSImportRule*>(candidate);

            // Import rules without a style sheet are skipped entirely.
            if (CSSStyleSheet* importedSheet = importedBy->styleSheet()) {
                KURL resolvedURL(KURL(base), importedBy->href());
                urls.add(resolvedURL.string());
                importedSheet->addSubresourceURLStrings(urls, resolvedURL.string());
            }
        }
    }
}
Example #2
0
void page_handler::process() {
	//Ok we need to download the page.
	//Get an instance of curl
	CURL *curl = 0; //Just cause
	cout << "Processing " << myArgs->URL << endl;
	CURLcode res;
	std::stringstream buffer;
	curl = curl_easy_init();
	if (curl) {
		//URL Encode the string as to not break anything.
		string loc(curl_easy_escape(curl, this->myArgs->URL.c_str(), 0));
		//Set download URL
		string get = html_helpers::getDomParserServerURL(loc);
		curl_easy_setopt(curl, CURLOPT_URL, get.c_str());
		//Set our user agent, cause He wants us to.
		curl_easy_setopt(curl, CURLOPT_USERAGENT, "cs3505_NullPointerExceptionBot");
		//No progress bar, just in case
		curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1);
		// Set the callback function to send data to as "write_function" aka convert to stringstream
		curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, this->write_data);
		// Set the buffer to put temp data into as buffer this will allow for string conversion.
		curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buffer);
		//Download that page! 
		res = curl_easy_perform(curl);
		if(res != 0){
			cout << "I died" << endl;
			cout << "Error found " << res << endl;
		}
		
		// always cleanup
		curl_easy_cleanup(curl);
		//Get the results as a string.
		string htmlPage = buffer.str();
		//JSON TIME!
		cJSON *root = cJSON_Parse(htmlPage.c_str());
		if(cJSON_GetObjectItem(root, "Error") != 0){
			//I think this is the error condition if we get a pointer to something
			//searchflag would we like to track errors and log this?
			this->myArgs->Success = false;

			return;
		}

		/*
			images
			links
			rawText
			isAmazon
			isReview
			isProduct
			ReviewDescription
			ReviewerName
			ProductReviewed
		*/

		cJSON *isAmazonNode = cJSON_GetObjectItem(root, "isAmazon");

		cJSON *isProductReviewNode = NULL;
		cJSON *isProductPageNode = NULL;
		cJSON *imagesNode = cJSON_GetObjectItem(root, "images");
		cJSON *linksNode = cJSON_GetObjectItem(root, "links");
		cJSON *rawTextNode = cJSON_GetObjectItem(root, "rawText");
		cJSON *reviewDescriptionNode = NULL;
		cJSON *reviewerNameNode = NULL;
		cJSON *productReviewedNode = NULL;
		cJSON *pageSizeNode = cJSON_GetObjectItem(root, "somethinghere");

		if (isAmazonNode == 0)
		{
			cout << "Error: isAmazonNode is null" << endl;
		}

		if (imagesNode == 0)
		{
			cout << "Error: imagesNode is null" << endl;
		}

		if (linksNode == 0)
		{
			cout << "Error: linksNode is null" << endl;
		}

		if (rawTextNode == 0)
		{
			cout << "Error: rawTextNode is null" << endl;
		}

		// Not used for now
		//if (pageSizeNode == 0)
		//{
		//	cout << "Error: pageSizeNode is null" << endl;
		//}

		if (isAmazonNode != 0)
		{
			myArgs->pageStats.setIsAmazon(isAmazonNode->valueint == 1);

			isProductReviewNode = cJSON_GetObjectItem(root, "isReview");

			if (isProductReviewNode != 0)
			{
				myArgs->pageStats.setIsAmazonReviewPage(isProductReviewNode->valueint == 1);

				reviewDescriptionNode = cJSON_GetObjectItem(root, "reviewDescription");
				reviewerNameNode = cJSON_GetObjectItem(root, "reviewerName");
				productReviewedNode = cJSON_GetObjectItem(root, "productReviewed");
			}

			isProductPageNode = cJSON_GetObjectItem(root, "isProduct");

			if (isProductPageNode != 0)
			{
				myArgs->pageStats.setIsAmazonProductPage(isProductPageNode->valueint == 1);
			}
		}

		int imageCount = imagesNode->valueint;
		//The hard part >.< the array of images.
		//cout << "Found " << imageCount << " images";
		myArgs->pageStats.setTotalImageCount(imageCount);

		int totalLinkCount = cJSON_GetArraySize(linksNode);

		myArgs->pageStats.setTotalLinkCount(totalLinkCount);

		// searchflag set these values
		myArgs->pageStats.setTotalKiloBytes(0);

		string rawText(rawTextNode->valuestring);

		//Ok time to extract the children... Yeah, I hate the lack of C++ Objects but it is so easy.
		//cJSON *linksRoot = cJSON_GetObjectItem(root, "links");
		//if we begin to run down like a mad man i guess.
		//cout << "Found " << cJSON_GetArraySize(linksRoot) << " links" << endl;
		string fullURL("");
		for(int i = 0; i < totalLinkCount; i++){
			fullURL = html_helpers::getFullURL(string(cJSON_GetArrayItem(linksNode,i)->valuestring), this->myArgs->URL);
			this->myArgs->links.push_back(fullURL);
			//cout << "Link to: " << fullURL << endl;
		}

		// send rawText to file reader
		stringstream ss(rawText);

		vector<string> *words;
		int misspelledWordsCount = 0;
		words = FileReader::readLine(&ss);
		misspelledWordsCount += getMisspelledWordsCount(words, myDictionary);

		ss.clear();

		myArgs->pageStats.setTotalWordCount(words->size());
		myArgs->pageStats.setTotalMisspelledWordCount(misspelledWordsCount);

		delete words;

		if (myArgs->pageStats.getIsAmazonReviewPage())
		{
			// Use file reader to get words
			misspelledWordsCount = 0;
			int totalWordsCount = 0;

			if(reviewDescriptionNode != 0)
			{
				ss << reviewDescriptionNode->valuestring;
			}

			words = FileReader::readLine(&ss);
			misspelledWordsCount += getMisspelledWordsCount(words, myDictionary);
			totalWordsCount += words->size();

			myArgs->productReviewer.update(misspelledWordsCount, totalWordsCount, 0, 0);

			delete words;
		}

		if(reviewDescriptionNode != 0)
		{
			string reviewText(reviewDescriptionNode->valuestring);
		}
	}
	else
	{
		this->myArgs->Success = false;
	}
}