/* * Recorre el archivo y va separando en tokens cada linea. Asigna los terminos al vector de terminos */ void Parser::processFile(const char* path, short nro_doc, vector<TerminoRegister>* terminos, int* memoriaUsada) { string line; ifstream fin; fin.open(path); long posicion = 1; while (!fin.eof()) { getline(fin, line); char* token = strtok((char*) line.c_str(), kSEPARADORES); if (token != NULL) { Utilidades::string_a_minusculas(token); } while (token != NULL) { if (!Utilidades::isNumber(token) && strlen(token) > 1 && !isStopWord(token)) { TerminoRegister termino; termino.setDocumento(nro_doc); termino.setTermino(token); termino.setPosicion(posicion); terminos->push_back(termino); (*memoriaUsada)++; } //Tomo el siguiente token token = strtok(NULL, kSEPARADORES); if (token != NULL) { Utilidades::string_a_minusculas(token); } posicion++; } } fin.close(); }
//kullanicinin girdigi kelimelerin dokumanlarda aranmasi void inputWord(HASHTABLE *stopWordsHash, HASHTABLE2 *wordHash){ char inputWord[20]; int key, address; int i; int searchCount; //kac defa arandigini tutan degisken do{ printf("\n\nkelime giriniz:\n"); scanf("%s",inputWord); toLowerCase(inputWord); //once stop word mu diye bak if(isStopWord(inputWord, stopWordsHash, &searchCount, M1) == 1){ printf("\n kelime stop word, arama sayisi:%d", searchCount); } else{ key = keyForHash(inputWord); i=0; address = hash(key,i,M2); if(strcmp(wordHash[address].value, inputWord) == 0){ printf("ilk aramada bulundu"); for(i=0; i<wordHash[address].textCount; i++) printf("\n%s",wordHash[address].textName[i]); } else{ do{ i++; address = hash(key,i,M2); searchCount = i; }while(i<M2 && strcmp(wordHash[address].value, inputWord) != 0); if(i == M2) printf("\nkelime bulunamadi arama sayisi %d", searchCount); else{ printf("%d aramada bulundu", i+1); for(i=0; i<wordHash[address].textCount; i++) printf("\n%s",wordHash[address].textName[i]); } } } }while(strcmp(inputWord, "") != 0); }
bool QueryProcessor::translate(const string &str) { stringstream istr(str); bool previous_is_operator = true; while(!istr.eof()) { string t; istr >> t; if(!isStopWord(t)) { if(!previous_is_operator) { Query *p_Space = new QueryOperator(" "); p_querySet->push_back(p_Space); } Query *p = new Query(t); p_querySet->push_back(p); } else if(t == "and" || t == "or") { Query *p = new QueryOperator(t); p_querySet->push_back(p); previous_is_operator = true; } } if(p_querySet->empty()) { return false; } else { return true; } }
// Reports whether the term-id list ends with a stop word, by walking
// the list back-to-front through db_'s suffix structure starting at the
// root. A single-term list is answered directly by isStopWord.
bool endWithStopWord(const std::vector<uint32_t>& termIdList) {
    if (termIdList.empty())
        return false;
    if (termIdList.size() == 1)
        return isStopWord(termIdList[0]);

    uint32_t parent = db_.getRootID();
    uint32_t child = 0;
    for (std::vector<uint32_t>::const_reverse_iterator rit = termIdList.rbegin();
         rit != termIdList.rend(); ++rit) {
        // first: the walk can continue; second: a stop-word suffix ends here.
        const std::pair<bool, bool> step = db_.isSuffix(*rit, parent, child);
        if (!step.first)
            return false;
        if (step.second)
            return true;
        parent = child;
    }
    return false;
}
int main(){ char **stopWords; //stop wordleri tutacak matris char documentList[15][15]; int stopWordsCount; //stop word sayisi int i, j, k=0; //genel amacli dongu indisleri int key; //tabloya yerlestirirken kullanilacak key degiskeni int address; //tabloya yerlestirirken kullanilacak adres degiskeni int fileCount; int searchCount; //kac adimda bulundu HASHTABLE *stopWordsHash; //stop wordleri tutacak hash table HASHTABLE2 *wordHash; //stop word listesindeki kelimeleri oku stopWords = readStopWords(stopWords, &stopWordsCount); stopWordsHash = (HASHTABLE *) malloc (M1 * sizeof(HASHTABLE)); wordHash = (HASHTABLE2 *) malloc (M2 * sizeof(HASHTABLE2)); for(i=0; i<M1; i++){ stopWordsHash[i].key = -1; strcpy(stopWordsHash[i].value, "bos"); } for(i=0; i<M2; i++){ wordHash[i].key = -1; wordHash[i].textCount = 0; strcpy(wordHash[i].value, "bos"); } //stop words icin hash table olustur //ARTIK GEREK KALMADI DOSYADAN OKUNACAK /*for(i=0; i<stopWordsCount; i++){ key = keyForHash(stopWords[i]); address = hash(key, 0, M1); //hash2 fonksiyonunun degeri ilk denemede sayilmaz //adres bos kelimeyi yerlestir if(stopWordsHash[address].key == -1){ strcpy(stopWordsHash[address].value, stopWords[i]); stopWordsHash[address].key = key; } else{//cakisma var yeni yer ara j = 1; k = 0; //printf("\ncakisma durumu\n"); k++; do{//bos yer gorene kadar don address = hash(key, j, M1); j++; //printf("\n%d",stopWordsHash[address].key); k++; }while(stopWordsHash[address].key != -1 && k<541); if(k==541) printf("\n%s kelimesi yerlestirilemedi", stopWords[i]); else{ strcpy(stopWordsHash[address].value, stopWords[i]); stopWordsHash[address].key = key; } } }*/ //stop words hash i dosyaya yaz //writeFileStopWordsHash(stopWordsHash, M1); //dosyaya yazili stop words hash table i oku readStopWordsHashTable(stopWordsHash, M1); //okunacak dosya listesini okur readDocumentList(documentList, &fileCount); for(i=0; i<fileCount; i++){ FILE *fp = fopen(documentList[i], "r"); char word[20]; while(fscanf(fp, "%s", word) != 
EOF){ toLowerCase(word); if(isStopWord(word, stopWordsHash, &searchCount, M2) == 0){ key = keyForHash(word); address = hash(key,0,M2); if(wordHash[address].key == -1){ strcpy(wordHash[address].value, word); wordHash[address].key = key; wordHash[address].where[i] = 1; strcpy(wordHash[address].textName[wordHash[address].textCount], documentList[i]); wordHash[address].textCount++; } else if(strcmp(word, wordHash[address].value) == 0){ //ayni kelime daha once eklenmis wordHash[address].where[i] = 1; if(strcmp(documentList[i],wordHash[address].textName[wordHash[address].textCount-1]) != 0){ strcpy(wordHash[address].textName[wordHash[address].textCount], documentList[i]); wordHash[address].textCount++; } } else{//cakisma var yeni yer ara j = 1; k = 0; //printf("\ncakisma durumu\n"); k++; do{//bos yer gorene kadar veya ayni kelimeyi bulana kadar don address = hash(key, j, M2); j++; //printf("\n%d",stopWordsHash[address].key); k++; }while((wordHash[address].key != -1 && strcmp(word, wordHash[address].value) != 0) && k<809); //sil //sil if(k==541) printf("\n%s kelimesi yerlestirilemedi", word); else if(wordHash[address].key == -1){ strcpy(wordHash[address].value, word); wordHash[address].key = key; wordHash[address].where[i] = 1; strcpy(wordHash[address].textName[wordHash[address].textCount], documentList[i]); wordHash[address].textCount++; } //kelime tabloda var ve ayni textte değil else if(strcmp(word, wordHash[address].value) == 0 && wordHash[address].where[i] ==0){ //printf("ayni kelime var %s %s \n", word, documentList[i] ); wordHash[address].where[i] = 1; strcpy(wordHash[address].textName[wordHash[address].textCount], documentList[i]); wordHash[address].textCount++; } } } } }//for sonu inputWord(stopWordsHash, wordHash); return 0; }//main sonu
/* Reads a file called fileName into an IndexInterface using rapidxml @param fileName - the name of the file to index @param index - the index to populate with information from the file */ bool XMLParser::readFileToIndex(string fileName, IndexInterface* index){ /* This is a block of setup code for rapidxml so that I can parse through the xml file efficiently */ rapidxml::xml_document<> doc; rapidxml::xml_node<> * root_node; ifstream theFile (fileName); vector<char> buffer((istreambuf_iterator<char>(theFile)), istreambuf_iterator<char>()); buffer.push_back('\0'); // Parse the buffer using the xml file parsing library into doc doc.parse<0>(&buffer[0]); // Find our root node root_node = doc.first_node("mediawiki"); //Save the number of documents that have been indexed int num = 0; for (rapidxml::xml_node<> * document_node = root_node->first_node("page"); document_node; document_node = document_node->next_sibling()){ //The text element of the current document node rapidxml::xml_node<> * text; //Grab the text and document name of the current document text = document_node->first_node("revision")->first_node("text"); string docName = document_node->first_node("title")->value(); //Ignore documents called user or file. 
They're garbage if(docName.substr(0, 4) == "User" || docName.substr(0,4) == "File") continue; //Find and save the author of the last revision string docAuthor; if(document_node->first_node("revision")->first_node("contributor") != nullptr && document_node->first_node("revision")->first_node("contributor")->first_node("username") != nullptr) docAuthor = document_node->first_node("revision")->first_node("contributor")->first_node("username")->value(); else docAuthor = "No author information given"; //Find and save the timestamp for the last revision string timestamp; if(document_node->first_node("revision")->first_node("timestamp") != nullptr) timestamp = document_node->first_node("revision")->first_node("timestamp")->value(); else timestamp = "No timestamp given"; string fileName = "SearchDocs/"; string docNameCopy = docName; replace(docNameCopy.begin(), docNameCopy.end(), '/', '.'); replace(docNameCopy.begin(), docNameCopy.end(), ' ', '_'); fileName = fileName + docNameCopy; ofstream writeDocFile(fileName); //Add this document to the index and save it's index in the index number-name registry int indexOfDoc = index->addDocument(docName, docAuthor, timestamp); //If we've successfully found a text node for this document if(text){ //Calculate file name for document file docNameCopy = docName; replace(docNameCopy.begin(), docNameCopy.end(), '/', ' '); replace(docNameCopy.begin(), docNameCopy.end(), ':', '.'); stringstream titleStream(docNameCopy); while(!titleStream.eof()){ //save the current word and add it to the index for the current document number string word; titleStream >> word; writeDocFile << word << " "; //Check if the word is a stop word or xml garbage if(!isStopWord(word) && !isXMLTag(word)){ //If not, remove capitalization, stem it, and index it word[0] = tolower(word[0]); Porter2Stemmer::stem(word); index->addWordForDocument(indexOfDoc, word); } } //Make a stringstream of the text element of the document node stringstream ss(text->value()); //Parse 
through the whole stringstream while(!ss.eof()){ //save the current word and add it to the index for the current document number string word; ss >> word; writeDocFile << word << " "; //Check if the word is a stop word or xml garbage if(!isStopWord(word) && !isXMLTag(word)){ //If not, remove capitalization, stem it, and index it word[0] = tolower(word[0]); Porter2Stemmer::stem(word); index->addWordForDocument(indexOfDoc, word); } } } //close ofstream for this doc writeDocFile.close(); cout << num++ << docName << endl; } return true; }