Example #1
0
/*
 * Walks the file at `path` line by line, splits each line into tokens and
 * appends every accepted term to the output vector.
 *
 * path         - file to parse
 * nro_doc      - document number stamped on every generated term
 * terminos     - output vector receiving one TerminoRegister per kept term
 * memoriaUsada - incremented once per stored term (memory-usage counter)
 *
 * A token is kept only if it is not a number, is longer than one character
 * and is not a stop word. `posicion` advances for every token (kept or
 * skipped) so stored positions reflect the token's place in the document.
 */
void Parser::processFile(const char* path, short nro_doc,
		vector<TerminoRegister>* terminos, int* memoriaUsada) {
	string line;
	ifstream fin;
	fin.open(path);
	long posicion = 1;
	// getline as the loop condition avoids the classic `!eof()` bug where
	// the last line is processed twice (or a failed read is processed).
	while (getline(fin, line)) {
		if (line.empty()) {
			continue;
		}
		// strtok needs a writable buffer; &line[0] is the string's own
		// storage, which may legally be mutated (unlike casting away the
		// const of c_str(), which the original code did).
		char* token = strtok(&line[0], kSEPARADORES);
		while (token != NULL) {
			Utilidades::string_a_minusculas(token);
			if (!Utilidades::isNumber(token) && strlen(token) > 1 && !isStopWord(token)) {
				TerminoRegister termino;
				termino.setDocumento(nro_doc);
				termino.setTermino(token);
				termino.setPosicion(posicion);
				terminos->push_back(termino);
				(*memoriaUsada)++;
			}
			// Next token of the current line
			token = strtok(NULL, kSEPARADORES);
			posicion++;
		}
	}
	fin.close();
}
//Searches the documents for words entered by the user (interactive loop).
//stopWordsHash - hash table of stop words (size M1)
//wordHash      - hash table of indexed words (size M2), each entry listing
//                the documents (textName/textCount) the word appears in
void inputWord(HASHTABLE *stopWordsHash, HASHTABLE2 *wordHash){
	char inputWord[20];
	int key, address;
	int i;
	int searchCount; //number of probes performed during the search
	
	do{
		printf("\n\nkelime giriniz:\n");
		//Field width %19s prevents overflowing the 20-byte buffer; a failed
		//read (EOF) exits the loop, which the old unchecked scanf never did.
		if(scanf("%19s", inputWord) != 1)
			break;
		
		toLowerCase(inputWord);
		
		//First check whether the word is a stop word
		if(isStopWord(inputWord, stopWordsHash, &searchCount, M1) == 1){
			printf("\n kelime stop word, arama sayisi:%d", searchCount);
		}
		else{
			key = keyForHash(inputWord);
			i=0;
			address = hash(key,i,M2);
	
			if(strcmp(wordHash[address].value, inputWord) == 0){
				printf("ilk aramada bulundu");
				for(i=0; i<wordHash[address].textCount; i++)
					printf("\n%s",wordHash[address].textName[i]);
			}
			else{
				//Collision: keep probing until the word is found or all
				//M2 slots have been tried
				do{
					i++;
					address = hash(key,i,M2);
					searchCount = i;
				}while(i<M2 && strcmp(wordHash[address].value, inputWord) != 0);

				if(i == M2)
					printf("\nkelime bulunamadi arama sayisi %d", searchCount); 
				else{
					printf("%d aramada bulundu", i+1);
					for(i=0; i<wordHash[address].textCount; i++)
						printf("\n%s",wordHash[address].textName[i]);
				} 
			}
		}
		
	}while(strcmp(inputWord, "") != 0);
}
Example #3
0
/*
 * Splits `str` on whitespace and appends Query / QueryOperator nodes to
 * p_querySet: non-stop-words become Query terms separated by an implicit
 * " " operator, and the stop words "and" / "or" become explicit operators.
 * All other stop words are dropped.
 *
 * Returns false when nothing was appended (query set still empty).
 */
bool QueryProcessor::translate(const string &str)
{
    stringstream istr(str);
    bool previous_is_operator = true;
    string t;
    // Extraction-as-condition skips the bogus empty token the old
    // eof()-based loop produced after the last word.
    while(istr >> t)
    {
        if(!isStopWord(t))
        {
            // Insert the implicit " " operator between two consecutive
            // query terms.
            if(!previous_is_operator)
            {
                Query *p_Space = new QueryOperator(" ");
                p_querySet->push_back(p_Space);
            }

            Query *p = new Query(t);
            p_querySet->push_back(p);
            // Bug fix: the flag was never cleared after a term, so the
            // implicit space operator was never emitted at all.
            previous_is_operator = false;
        }
        else if(t == "and" || t == "or")
        {
            Query *p = new QueryOperator(t);
            p_querySet->push_back(p);
            previous_is_operator = true;
        }
    }
    return !p_querySet->empty();
}
Example #4
0
 /*
  * Returns true when the term-id sequence ends with a stop-word phrase.
  * The ids are walked back-to-front through the suffix structure rooted at
  * db_.getRootID(); isSuffix() reports (path continues, stop word reached).
  */
 bool endWithStopWord(const std::vector<uint32_t>& termIdList)
 {
     if (termIdList.empty()) return false;
     if (termIdList.size() == 1) return isStopWord(termIdList[0]);

     uint32_t node = db_.getRootID();
     for (std::vector<uint32_t>::const_reverse_iterator rit = termIdList.rbegin();
          rit != termIdList.rend(); ++rit)
     {
         uint32_t next = 0;
         std::pair<bool, bool> step = db_.isSuffix(*rit, node, next);
         if (!step.first)
             return false;   // the walk fell off the structure
         if (step.second)
             return true;    // reached a stop-word terminal
         node = next;
     }
     return false;
 }
/*
 * Builds a word index over a list of documents using two open-addressing
 * hash tables (stop words in stopWordsHash[M1], document words in
 * wordHash[M2]) and then enters the interactive search loop.
 */
int main(){
	char **stopWords;          //matrix holding the stop words
	char documentList[15][15]; //names of the documents to index
	int stopWordsCount;        //number of stop words
	int i, j, k=0;             //general-purpose loop indices
	int key;                   //hash key used while inserting
	int address;               //table address used while inserting
	int fileCount;             //number of documents
	int searchCount;           //number of probes a lookup needed
	HASHTABLE *stopWordsHash;  //hash table holding the stop words
	HASHTABLE2 *wordHash;      //hash table holding the document words
	
	//read the words of the stop-word list
	stopWords = readStopWords(stopWords, &stopWordsCount);
	
	stopWordsHash = (HASHTABLE *) malloc (M1 * sizeof(HASHTABLE));
	wordHash = (HASHTABLE2 *) malloc (M2 * sizeof(HASHTABLE2));
	if(stopWordsHash == NULL || wordHash == NULL){
		printf("memory allocation failed\n");
		return 1;
	}
	
	//mark every slot of both tables as empty
	for(i=0; i<M1; i++){
		stopWordsHash[i].key = -1;
		strcpy(stopWordsHash[i].value, "bos");
	}
	
	for(i=0; i<M2; i++){
		wordHash[i].key = -1;
		wordHash[i].textCount = 0;
		strcpy(wordHash[i].value, "bos");
	}
	
	//build the hash table for the stop words
	//NO LONGER NEEDED - it is read from a file instead
	/*for(i=0; i<stopWordsCount; i++){
		key = keyForHash(stopWords[i]);
		address = hash(key, 0, M1); //the hash2 part does not count on the first try
		
		//slot empty: place the word
		if(stopWordsHash[address].key == -1){
			strcpy(stopWordsHash[address].value, stopWords[i]);
			stopWordsHash[address].key = key;
		}
		else{//collision: probe for a new slot
			j = 1;
			k = 0;
			k++;
			do{//probe until an empty slot is seen
				address = hash(key, j, M1);
				j++;
				k++;
			}while(stopWordsHash[address].key != -1 && k<541);
			
			if(k==541)
				printf("\n%s kelimesi yerlestirilemedi", stopWords[i]);
			else{
				strcpy(stopWordsHash[address].value, stopWords[i]);
				stopWordsHash[address].key = key;
			}
		}
	}*/
	
	//write the stop-word hash to a file
	//writeFileStopWordsHash(stopWordsHash, M1);
	
	//read the stop-word hash table stored in the file
	readStopWordsHashTable(stopWordsHash, M1);
	
	//read the list of documents to process
	readDocumentList(documentList, &fileCount);
	
	for(i=0; i<fileCount; i++){
		FILE *fp = fopen(documentList[i], "r");
		char word[20];
		
		//bug fix: a missing document used to crash fscanf on a NULL stream
		if(fp == NULL){
			printf("\n%s acilamadi", documentList[i]);
			continue;
		}
		
		while(fscanf(fp, "%19s", word) != EOF){
			toLowerCase(word);
			//bug fix: the stop-word table has M1 slots, not M2 (inputWord
			//already probes it with M1)
			if(isStopWord(word, stopWordsHash, &searchCount, M1) == 0){
				key = keyForHash(word);
				address = hash(key,0,M2);
				
				if(wordHash[address].key == -1){
					//empty slot: insert the word and record this document
					strcpy(wordHash[address].value, word);
					wordHash[address].key = key;
					wordHash[address].where[i] = 1;
					strcpy(wordHash[address].textName[wordHash[address].textCount], documentList[i]);
					wordHash[address].textCount++;
				}
				else if(strcmp(word, wordHash[address].value) == 0){ //same word inserted before
					wordHash[address].where[i] = 1;
					if(strcmp(documentList[i],wordHash[address].textName[wordHash[address].textCount-1]) != 0){
						strcpy(wordHash[address].textName[wordHash[address].textCount], documentList[i]);
						wordHash[address].textCount++;
					}
				}
				else{//collision: probe for a new slot
					j = 1;
					k = 0;
					k++;
					do{//probe until an empty slot or the same word is found
						address = hash(key, j, M2);
						j++;
						k++;
					}while((wordHash[address].key != -1 && strcmp(word, wordHash[address].value) != 0) && k<809);
			
					//bug fix: the probe loop stops at k==809, so testing
					//k==541 could never detect a full table
					if(k==809)
						printf("\n%s kelimesi yerlestirilemedi", word);
					else if(wordHash[address].key == -1){
						strcpy(wordHash[address].value, word);
						wordHash[address].key = key;
						wordHash[address].where[i] = 1;
						strcpy(wordHash[address].textName[wordHash[address].textCount], documentList[i]);
						wordHash[address].textCount++;
					}
					//the word is in the table but not yet linked to this text
					else if(strcmp(word, wordHash[address].value) == 0 && wordHash[address].where[i] ==0){
						wordHash[address].where[i] = 1;
						strcpy(wordHash[address].textName[wordHash[address].textCount], documentList[i]);
						wordHash[address].textCount++;
					}
				}
			}
		}
		//bug fix: the file handle used to leak on every iteration
		fclose(fp);
	}//end of for
	
	inputWord(stopWordsHash, wordHash);
	
	free(stopWordsHash);
	free(wordHash);
	
	return 0;
}//end of main
Example #6
0
/*
Reads a file called fileName into an IndexInterface using rapidxml
@param fileName - the name of the file to index
@param index - the index to populate with information from the file
@return true on success, false if the file could not be opened or is not a
        mediawiki dump
*/
bool XMLParser::readFileToIndex(string fileName, IndexInterface* index){

   // rapidxml parses in place, so the whole file is slurped into a
   // mutable NUL-terminated buffer first.
   rapidxml::xml_document<> doc;
   ifstream theFile (fileName);
   if(!theFile)
      return false;   // bug fix: parsing an unopened stream used to crash

   vector<char> buffer((istreambuf_iterator<char>(theFile)), istreambuf_iterator<char>());
   buffer.push_back('\0');
   // Parse the buffer using the xml file parsing library into doc
   doc.parse<0>(&buffer[0]);
   // Find our root node
   rapidxml::xml_node<> * root_node = doc.first_node("mediawiki");
   if(root_node == nullptr)
      return false;   // not a mediawiki dump

   //Number of documents indexed so far (printed per document below)
   int num = 0;
   // Bug fix: advance with next_sibling("page") so a stray non-<page>
   // sibling is skipped instead of being dereferenced as a page.
   for (rapidxml::xml_node<> * document_node = root_node->first_node("page"); document_node; document_node = document_node->next_sibling("page")){

      rapidxml::xml_node<> * revision = document_node->first_node("revision");
      rapidxml::xml_node<> * title = document_node->first_node("title");
      if(revision == nullptr || title == nullptr)
         continue;   // malformed page; the old code would null-deref here

      //Grab the text and document name of the current document
      rapidxml::xml_node<> * text = revision->first_node("text");
      string docName = title->value();

      //Ignore documents called User or File. They're garbage
      if(docName.substr(0, 4) == "User" || docName.substr(0,4) == "File") continue;

      //Find and save the author of the last revision
      string docAuthor;
      if(revision->first_node("contributor") != nullptr
      	&& revision->first_node("contributor")->first_node("username") != nullptr)
     	 docAuthor = revision->first_node("contributor")->first_node("username")->value();
      else
      	docAuthor = "No author information given";

      //Find and save the timestamp for the last revision
      string timestamp;
      if(revision->first_node("timestamp") != nullptr)
      	timestamp = revision->first_node("timestamp")->value();
      else
      	timestamp = "No timestamp given";

      //Plain-text mirror of the document written under SearchDocs/.
      //(Local renamed from `fileName`, which shadowed the parameter.)
      string docNameCopy = docName;
      replace(docNameCopy.begin(), docNameCopy.end(), '/', '.');
      replace(docNameCopy.begin(), docNameCopy.end(), ' ', '_');
      string docFilePath = "SearchDocs/" + docNameCopy;
      ofstream writeDocFile(docFilePath);

      //Add this document to the index and save its index in the
      //number-name registry
      int indexOfDoc = index->addDocument(docName, docAuthor, timestamp);

      //If we've successfully found a text node for this document
      if(text){

      	 //Index the words of the title too (slashes/colons normalized)
      	 docNameCopy = docName;
      	 replace(docNameCopy.begin(), docNameCopy.end(), '/', ' ');
      	 replace(docNameCopy.begin(), docNameCopy.end(), ':', '.');
      	 stringstream titleStream(docNameCopy);

      	 string word;
      	 //Extraction-as-condition avoids the empty trailing token the old
      	 //eof()-based loops produced after the last word.
      	 while(titleStream >> word){
      	 	writeDocFile << word << " ";
      	 	//Check if the word is a stop word or xml garbage
      	 	if(!isStopWord(word) && !isXMLTag(word)){
      	 		//If not, remove capitalization, stem it, and index it
      	 		word[0] = tolower(word[0]);
      	 		Porter2Stemmer::stem(word);
      	 		index->addWordForDocument(indexOfDoc, word);
      	 	}
      	 }

         //Parse the whole text element of the document node the same way
         stringstream ss(text->value());
         while(ss >> word){
         	writeDocFile << word << " ";
         	if(!isStopWord(word) && !isXMLTag(word)){
         		word[0] = tolower(word[0]);
         		Porter2Stemmer::stem(word);
         		index->addWordForDocument(indexOfDoc, word);
         	}
         }
       }

       //close ofstream for this doc
       writeDocFile.close();
       cout << num++ << docName << endl;
	}

   return true;
}