Ejemplo n.º 1
0
void WebPage::parse () {
  ifstream fin;
  fin.open(infile.c_str());
  if(!fin){
    throw infile;
  }
  string temp;
  while(getline(fin,temp))  //while there are lines to get
    lfile.push_back(temp);       //add it to the list
  fin.close();
  string line;
  for(unsigned int i =0; i < lfile.size(); i++){
    line = lfile.at(i);      //for each item i the list, set to line
    while(!line.empty()){     //while each line still has characters
      string temp;
      int k=0;

      for( ;isalnum(line[k]) || line[k] == '['; k++){              //checks if alphanumeric
        if(line[k] == '['){                         //appropriately parses anchor text
          k++;  
        }
        temp.push_back(line[k]);            //pushes back character until next word
      }

      if(line[k] == ']' && line[k+1] == '('){ 
        string link;
        k+=2;
        while(line[k] != ')'){
          link.push_back(line[k]);
          k++;
        }
        //find matching strings, 
        //if found just add the mapped webpage as outgoing link
        //if not create a dummy link, add that to the map with the filename 
        //then add the dummy link to the outgoing links
        map<string,WebPage*>::iterator it = allLinks->find(link);
        if(it!=allLinks->end()){
          addOutgoingLink(it->second);
        }
        else{
          WebPage *dummy = new WebPage(link,allLinks);
          (*allLinks)[link] = dummy;
          addOutgoingLink(dummy);
        }
      }

      for(int j = 0; temp[j]; j++){      //converts to lower case
        temp[j] = tolower(temp[j]);
      }

      if(words.find(temp)==words.end() && !temp.empty())  //makes sure there is no duplicate word
        words.insert(temp);
      line.erase(0,k+1);        //moves to the next word in the string
    }
}
}
Ejemplo n.º 2
0
/**
 * Builds a page from the file `filename`.
 *
 * The file is read as whitespace-separated tokens which are joined with a
 * single space into `data`, and `data` is then lower-cased. Every token of
 * `data` that contains a '[' followed later by a ']' contributes the text
 * between the last '[' and the last ']' as an outgoing link (an empty name
 * such as "[]" is still passed on). Tokens without such a pair are ignored.
 *
 * @param filename path of the file to read; also stored in `name`.
 * @throws std::runtime_error when the file cannot be opened.
 */
WebPage::WebPage (std::string filename) {
	name = filename;

	std::ifstream file;
	file.open(filename);
	if (file.fail()) {
		throw(std::runtime_error(std::string("Failed to open file.")));
	}
	fileGiven = true;

	// Extraction in the loop condition stops exactly at the first failed
	// read, so no empty token is ever appended.
	std::string buffer;
	while (file >> buffer) {
		data += (buffer + " ");
	}
	file.close();

	for (std::string::size_type i = 0; i < data.length(); i++) {
		// <cctype> functions require a value representable as unsigned char.
		data[i] = static_cast<char>(tolower(static_cast<unsigned char>(data[i])));
	}

	std::stringstream ss;
	ss.str(data);
	std::string word;
	while (ss >> word) {
		// The last '[' and the last ']' in the token delimit the link name.
		const std::string::size_type open = word.rfind('[');
		const std::string::size_type close = word.rfind(']');
		if (open == std::string::npos || close == std::string::npos || close < open) {
			continue;	// not a bracketed link
		}
		std::string otherFileName = word.substr(open + 1, close - open - 1);
		addOutgoingLink(otherFileName);
	}
}
Ejemplo n.º 3
0
/**
 * Extracts links and words from the raw page text.
 *
 * Links have the form "[anchor](file)". Each one is removed from the text
 * and its file part is passed to addOutgoingLink(). The text that remains is
 * stored in `totalWebPage`. After that the listed punctuation characters are
 * replaced by spaces and every whitespace-separated token is lower-cased and
 * inserted into `words`.
 *
 * A '[' whose ']' is not directly followed by '(' is left in the text.
 * Scanning for links stops at the first '[' that has no ']' after it, or at
 * the first "](" that has no ')' after it, because no complete link can
 * follow either of them.
 *
 * @param total the page text; taken by value and modified locally.
 * @param temp  scratch string used to hold each token; its incoming value is
 *              not read.
 */
void WebPage::parse (string total, string temp)
{
	// Remove every "[anchor](file)" occurrence. After an erase the scan
	// resumes at the position of the removed link, so text that moved into
	// the gap (including an immediately following link) is examined too.
	string::size_type scanFrom = 0;
	while (scanFrom < total.length())
	{
		const string::size_type open = total.find('[', scanFrom);
		if (open == string::npos)
		{
			break;
		}
		const string::size_type close = total.find(']', open + 1);
		if (close == string::npos)
		{
			break;
		}
		if (close + 1 >= total.length() || total[close + 1] != '(')
		{
			// Plain bracketed text: keep it and continue right after the ']'.
			scanFrom = close + 1;
			continue;
		}
		const string::size_type paren = total.find(')', close + 2);
		if (paren == string::npos)
		{
			break;
		}

		string linkFile = total.substr(close + 2, paren - (close + 2));
		total.erase(open, paren + 1 - open);
		addOutgoingLink(linkFile);
		scanFrom = open;
	}

	totalWebPage = total;

	// Characters that separate words.
	// NOTE(review): the backtick is the only ASCII punctuation character not
	// in this set; confirm whether that omission is intentional.
	const string punctuation = "!\\#$%&'()*+,-./:;<>=?@[]\"^_{}|~";
	for (string::size_type pos = total.find_first_of(punctuation);
	     pos != string::npos;
	     pos = total.find_first_of(punctuation, pos + 1))
	{
		total[pos] = ' ';
	}

	locale loc;
	stringstream buffer(total);
	while (buffer >> temp)
	{
		string lowerTemp;
		for (string::size_type j = 0; j < temp.length(); j++)
		{
			lowerTemp += tolower(temp[j], loc);
		}
		words.insert(lowerTemp);
	}
}