/**
 * Downloads the page at the given URL, tokenizes its HTML and builds a Page.
 *
 * For every token produced by the tokenizer:
 *   - COMMENT tokens are skipped entirely.
 *   - TAG_START tokens have their tag name lower-cased and passed to
 *     MarkStartFlags(); for "a" tags the HREF attribute is handed to
 *     PreAddChecks(), and for tags of the form 'h' + digit the inH flag is set.
 *   - Every non-comment token is passed to MarkEndFlags().
 *   - TEXT tokens are forwarded to processBlockOfText() together with the page.
 *
 * After the loop, if the page reports no description, one is set from
 * BuildPossibleDescription().
 *
 * @param _urlToParse  URL of the page to download and parse; also stored in
 *                     the urlToParse member.
 * @return A heap-allocated Page (presumably owned by the caller -- confirm
 *         against call sites), or NULL if a CS240Exception was thrown while
 *         downloading or parsing. In the NULL case the exception message is
 *         printed to cout and the partially built Page is freed.
 */
Page * HTMLParser::Parse(URL _urlToParse)
{
    // Declared outside the try block so the handlers can release it; the
    // original allocated it inside the try and leaked it on any exception.
    Page * pageToReturn = NULL;

    try
    {
        urlToParse = _urlToParse;
        pageToReturn = new Page(urlToParse);

        URLInputStream stream(urlToParse.getURL());
        HTMLTokenizer Tokenizer(&stream);
        StringURLResolver Resolver;

        while (Tokenizer.HasNextToken())
        {
            HTMLToken current_tok = Tokenizer.GetNextToken();

            // Comments carry nothing we index.
            if (current_tok.GetType() == COMMENT)
                continue;

            // Opening tag: update the parser flags and harvest links.
            if (current_tok.GetType() == TAG_START)
            {
                string tag_value = current_tok.GetValue();
                StringUtil::ToLower(tag_value);
                MarkStartFlags(tag_value);

                if (tag_value == "a")
                {
                    // Extract the link target; PreAddChecks receives the raw
                    // string, the URL built from it, the resolver and the
                    // page URL, and decides what to do with the link.
                    string url = current_tok.GetAttribute("HREF");
                    URL urlToAdd(url);
                    PreAddChecks(url, urlToAdd, Resolver, _urlToParse);
                }

                // Header tags (h + digit). The length guard avoids indexing
                // past the end of a one-character tag, and the unsigned char
                // cast keeps isdigit() well-defined for negative char values.
                if (tag_value.size() > 1
                    && tag_value[0] == 'h'
                    && isdigit(static_cast<unsigned char>(tag_value[1])))
                {
                    inH = true;
                }
            }

            // Let the end-tag handling see every non-comment token.
            MarkEndFlags(current_tok);

            // Text is sent off for processing.
            if (current_tok.GetType() == TEXT)
                processBlockOfText(current_tok.GetValue(), pageToReturn);
        }

        // Pre-return check: fall back to a generated description if the page
        // does not have one yet.
        if (!pageToReturn->hasDescription())
            pageToReturn->SetDescription(BuildPossibleDescription());

        return pageToReturn;
    }
    catch (CS240Exception & e)
    {
        // Typically the page could not be downloaded. Report it, release the
        // partially built page and signal failure with NULL.
        cout << e.GetMessage() << endl;
        delete pageToReturn;
    }
    catch (...)
    {
        // Any other exception still propagates to the caller as before, but
        // without leaking the page.
        delete pageToReturn;
        throw;
    }

    return NULL;
}