Exemplo n.º 1
0
/**
 * Downloads the document at the given URL, tokenizes it as HTML, and builds a
 * Page from its contents.
 *
 * While walking the token stream this method:
 *   - skips comment tokens,
 *   - updates the parser's start/end flags via MarkStartFlags / MarkEndFlags,
 *   - hands the HREF of every <a> start tag to PreAddChecks,
 *   - sets inH when a start tag's name is 'h' followed by a digit,
 *   - forwards every text token to processBlockOfText.
 *
 * After the loop, if the page has no description yet, one is taken from
 * BuildPossibleDescription().
 *
 * @param _urlToParse  URL of the document to fetch and parse. It is also
 *                     stored in the urlToParse member.
 * @return A Page allocated with new (presumably owned by the caller -- verify
 *         against call sites), or NULL when a CS240Exception was raised while
 *         downloading or parsing; in that case the exception's message is
 *         written to cout.
 *
 * Exceptions of any other type propagate to the caller unchanged. In every
 * failure path the partially built Page is released before leaving the
 * function.
 */
Page * HTMLParser::Parse(URL _urlToParse)
	{
		// Declared outside the try block so the handlers below can release it
		// if anything throws after the allocation.
		Page * pageToReturn = NULL;
		try
		{
			urlToParse = _urlToParse;
			pageToReturn = new Page(urlToParse);
			URLInputStream stream(urlToParse.getURL());
			HTMLTokenizer Tokenizer(&stream);
			StringURLResolver Resolver;
			while(Tokenizer.HasNextToken())
			{
				HTMLToken current_tok = Tokenizer.GetNextToken();
				if(current_tok.GetType() == COMMENT) continue; //If comment, skip.
				if(current_tok.GetType() == TAG_START) //Marks flags for when we see an opening tag.
				{
					string tag_value = current_tok.GetValue();
					StringUtil::ToLower(tag_value);

					MarkStartFlags(tag_value);

					if(tag_value == "a") //Extract, resolve, and add links.
					{
						string url = current_tok.GetAttribute("HREF");
						URL urlToAdd(url);

						//Before adding a URL, do some checks, clean URL up, then add.
						PreAddChecks(url, urlToAdd, Resolver, _urlToParse);
					}
					// Header tags: 'h' followed by a digit. The length guard avoids
					// indexing past a one-character name, and the unsigned char cast
					// keeps isdigit well-defined for bytes with the high bit set.
					if(tag_value.size() > 1 && tag_value[0] == 'h'
						&& isdigit(static_cast<unsigned char>(tag_value[1])))
						inH = true;
				}
				MarkEndFlags(current_tok); //Let the end-flag logic inspect every non-comment token.
				if(current_tok.GetType() == TEXT) //If it's text, send it off for processing
					processBlockOfText(current_tok.GetValue(), pageToReturn);
			}
			if(!pageToReturn->hasDescription()) //Pre-return check: fall back to a built description.
				pageToReturn->SetDescription(BuildPossibleDescription());
			return pageToReturn;
		}
		catch (CS240Exception & e) //If the page could not be downloaded or parsed, report it.
		{
			cout << e.GetMessage() << endl;
			// The Page was allocated before the failure; free it so it does not leak.
			delete pageToReturn;
		}
		catch (...)
		{
			// Unknown failure: release the Page, then let the exception propagate
			// exactly as it would have before.
			delete pageToReturn;
			throw;
		}
		return NULL;
	}