void WebPage::parse () { ifstream fin; fin.open(infile.c_str()); if(!fin){ throw infile; } string temp; while(getline(fin,temp)) //while there are lines to get lfile.push_back(temp); //add it to the list fin.close(); string line; for(unsigned int i =0; i < lfile.size(); i++){ line = lfile.at(i); //for each item i the list, set to line while(!line.empty()){ //while each line still has characters string temp; int k=0; for( ;isalnum(line[k]) || line[k] == '['; k++){ //checks if alphanumeric if(line[k] == '['){ //appropriately parses anchor text k++; } temp.push_back(line[k]); //pushes back character until next word } if(line[k] == ']' && line[k+1] == '('){ string link; k+=2; while(line[k] != ')'){ link.push_back(line[k]); k++; } //find matching strings, //if found just add the mapped webpage as outgoing link //if not create a dummy link, add that to the map with the filename //then add the dummy link to the outgoing links map<string,WebPage*>::iterator it = allLinks->find(link); if(it!=allLinks->end()){ addOutgoingLink(it->second); } else{ WebPage *dummy = new WebPage(link,allLinks); (*allLinks)[link] = dummy; addOutgoingLink(dummy); } } for(int j = 0; temp[j]; j++){ //converts to lower case temp[j] = tolower(temp[j]); } if(words.find(temp)==words.end() && !temp.empty()) //makes sure there is no duplicate word words.insert(temp); line.erase(0,k+1); //moves to the next word in the string } } }
/**
 * Constructs a page from the file at `filename`.
 *
 * The file is read as whitespace-separated tokens, which are joined with
 * single spaces and lower-cased into `data`. Every token that contains a
 * '[' followed later by a ']' contributes the text between its last '[' and
 * last ']' as an outgoing link via addOutgoingLink().
 *
 * Because tokens are split on whitespace, a bracketed name containing a space
 * is never seen as a single link.
 *
 * @param filename path of the file to load; also stored in `name`.
 * @throws std::runtime_error if the file cannot be opened.
 *
 * Fixes over the previous version: the bracket positions are no longer read
 * uninitialised for tokens without '[', a token with '[' but no matching
 * later ']' no longer yields a link through unsigned underflow, and
 * characters are passed to tolower() as unsigned char.
 */
WebPage::WebPage (std::string filename)
{
    name = filename;

    std::ifstream file;
    file.open(filename);
    if (file.fail()) {
        throw(std::runtime_error(std::string("Failed to open file.")));
    }
    fileGiven = true;

    // Collapse all whitespace: tokens are re-joined with one trailing space each.
    std::string buffer;
    while (file >> buffer) {
        data += buffer;
        data += ' ';
    }
    file.close();

    for (std::string::size_type i = 0; i < data.length(); i++) {
        data[i] = static_cast<char>(tolower(static_cast<unsigned char>(data[i])));
    }

    std::stringstream ss;
    ss.str(data);

    std::string word;
    while (ss >> word) {
        const std::string::size_type open = word.rfind('[');
        const std::string::size_type close = word.rfind(']');
        if (open == std::string::npos || close == std::string::npos || close < open) {
            continue;   // not a complete "[...]" token
        }

        std::string otherFileName = word.substr(open + 1, close - open - 1);
        addOutgoingLink(otherFileName);
    }
}
/**
 * Extracts links from raw page text and indexes the remaining words.
 *
 * 1. Every "[anchor](target)" sequence is removed from the text (anchor
 *    included) and `target` is passed to addOutgoingLink(), in order of
 *    appearance.
 * 2. The text without links is stored in `totalWebPage`.
 * 3. Punctuation is replaced by spaces, and each whitespace-separated token is
 *    lower-cased with the default-constructed locale and inserted into `words`.
 *
 * @param total the page text to parse (taken by value; modified locally).
 * @param temp  scratch buffer used while extracting tokens; its incoming
 *              value is not read.
 *
 * Fixes over the previous version: bracket/parenthesis scans are bounded, so
 * unterminated "[" or "(" no longer read past the end of the string; scanning
 * resumes at the erase position, so consecutive links are no longer skipped
 * (the old two-pass workaround still missed some); and a "[...]" that is not
 * followed by '(' no longer swallows a link that starts right after it.
 */
void WebPage::parse (string total, string temp)
{
    // --- 1. pull out "[anchor](target)" links --------------------------------
    string::size_type scanFrom = 0;
    while (scanFrom < total.length()) {
        const string::size_type open = total.find('[', scanFrom);
        if (open == string::npos) {
            break;
        }

        const string::size_type close = total.find(']', open + 1);
        if (close == string::npos) {
            break;   // no ']' remains, so no further link can be complete
        }

        if (close + 1 >= total.length() || total[close + 1] != '(') {
            scanFrom = open + 1;   // plain brackets; keep looking after this '['
            continue;
        }

        const string::size_type targetEnd = total.find(')', close + 2);
        if (targetEnd == string::npos) {
            break;   // no ')' remains, so no further link can be complete
        }

        string linkFile = total.substr(close + 2, targetEnd - (close + 2));
        total.erase(open, targetEnd + 1 - open);
        addOutgoingLink(linkFile);

        scanFrom = open;   // text shifted into the gap must be scanned too
    }

    totalWebPage = total;

    // --- 2. punctuation -> spaces --------------------------------------------
    // Same character set the original replaced one call at a time
    // (note: the backtick is deliberately left untouched, as before).
    const string punctuation = "!\\#$%&'()*+,-./:;<>=?@[]\"^_{}|~";
    for (string::size_type i = 0; i < total.length(); i++) {
        if (punctuation.find(total[i]) != string::npos) {
            total[i] = ' ';
        }
    }

    // --- 3. index lower-cased words ------------------------------------------
    stringstream buffer(total);
    const locale loc;
    while (buffer >> temp) {
        string lowerTemp;
        lowerTemp.reserve(temp.length());
        for (string::size_type j = 0; j < temp.length(); j++) {
            lowerTemp += tolower(temp[j], loc);
        }
        words.insert(lowerTemp);
    }
}