// Headers used by the routines below.
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <iterator>
#include <set>
#include <string>
#include <utility>
#include <vector>
#include <boost/unordered_map.hpp>

/* This API takes the user input and looks up the word(s) in the inverted
   index. The candidate documents are then ranked with a tf-idf bag-of-words
   (BOW) model. */
std::vector<std::pair<int,double> > ServeIndex(const std::string& word, int topK)
{
    // Tokenize and normalize the user text. Copy the tokens instead of binding
    // a non-const reference to the word breaker's result.
    std::vector<std::string> word_tokens = _wordBreaker->BreakEnglishText(word.c_str());

    // Generate the candidate document set: the intersection of the posting
    // lists of every query token that exists in the index.
    std::set<int> candSet;
    bool foundAny = false;
    for (size_t i = 0; i < word_tokens.size(); ++i)
    {
        boost::unordered_map<std::string,IndexEntry>::iterator itor =
            _indexPtr->_wordIndex.find(word_tokens[i]);
        if (itor == _indexPtr->_wordIndex.end())
            continue;

        if (!foundAny)
        {
            // The first token found in the index seeds the candidate set.
            candSet = itor->second._docSet;
            foundAny = true;
        }
        else
        {
            // Intersect the running candidate set with this token's posting list.
            std::set<int> temp;
            std::set_intersection(candSet.begin(), candSet.end(),
                                  itor->second._docSet.begin(),
                                  itor->second._docSet.end(),
                                  std::inserter(temp, temp.begin()));
            candSet.swap(temp);
        }
    }

    // Score and order the candidates, returning the topK results.
    return Rank(word_tokens, candSet, topK);
}
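/* Rank is called above but not shown in this listing. Below is a minimal
   sketch of a tf-idf bag-of-words ranker over the candidate set, under stated
   assumptions: each IndexEntry carries a hypothetical _docTf map (per-document
   raw term frequency) alongside its _docSet posting list, _totalDocs holds the
   corpus size, and <cmath>/<map> are available. It is an illustration, not the
   original Rank implementation. */
struct ScoreGreater
{
    bool operator()(const std::pair<int,double>& a,
                    const std::pair<int,double>& b) const
    { return a.second > b.second; }
};

std::vector<std::pair<int,double> > Rank(const std::vector<std::string>& tokens,
                                         const std::set<int>& candSet, int topK)
{
    std::vector<std::pair<int,double> > scored;
    for (std::set<int>::const_iterator doc = candSet.begin(); doc != candSet.end(); ++doc)
    {
        double score = 0.0;
        for (size_t i = 0; i < tokens.size(); ++i)
        {
            boost::unordered_map<std::string,IndexEntry>::const_iterator itor =
                _indexPtr->_wordIndex.find(tokens[i]);
            if (itor == _indexPtr->_wordIndex.end())
                continue;
            const IndexEntry& entry = itor->second;

            // tf: raw count of the token in this document (hypothetical _docTf field).
            std::map<int,int>::const_iterator tfIt = entry._docTf.find(*doc);
            if (tfIt == entry._docTf.end())
                continue;
            double tf = static_cast<double>(tfIt->second);

            // idf: log of total documents over documents containing the token.
            double idf = std::log(static_cast<double>(_totalDocs) /
                                  static_cast<double>(entry._docSet.size()));
            score += tf * idf;
        }
        scored.push_back(std::make_pair(*doc, score));
    }

    // Keep only the topK highest-scoring documents.
    std::sort(scored.begin(), scored.end(), ScoreGreater());
    if (topK >= 0 && scored.size() > static_cast<size_t>(topK))
        scored.resize(topK);
    return scored;
}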
/* This API takes a TSV file containing 'documents' and generates an in-memory
   inverted index. The index is dumped to disk using the Serialize API. */
void BuildIndex()
{
    std::ifstream inFile(docPath.c_str(), std::ifstream::in);
    if (!inFile.good())
    {
        std::cerr << "Unable to read file: " << docPath << std::endl;
        exit(-1);
    }
    // Note: sync_with_stdio is a static ios_base call and only affects the
    // standard streams, not this ifstream.
    std::ios_base::sync_with_stdio(false);

    std::string line;
    int docId = 0;
    while (getline(inFile, line))
    {
        if (docId % 10000 == 0)
            std::cerr << "At document: " << docId << std::endl;

        // Split the line on the delimiter. strtok modifies its input, so point
        // it at the string's own buffer instead of casting away the constness
        // of c_str().
        std::vector<char*> tempVector;
        char* parts = strtok(&line[0], delim);
        while (parts != NULL)
        {
            tempVector.push_back(parts);
            parts = strtok(NULL, delim);
        }
        // Each line is expected to carry exactly three fields.
        assert(tempVector.size() == 3);

        // Tokenize the document text; copy the tokens instead of binding a
        // non-const reference to the word breaker's result.
        std::vector<std::string> tokens = _wordBreaker->BreakEnglishText(tempVector[2]);
        AddDocToIndex(docId, tempVector[1], tokens);
        ++docId;
    }
    _totalDocs = docId;
}
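/* AddDocToIndex is called above but not shown in this listing. Below is a
   minimal sketch under the same assumptions as the Rank sketch: IndexEntry
   holds the _docSet posting list plus a hypothetical _docTf per-document
   term-frequency map, and _docTitles is a hypothetical map that remembers the
   document's second TSV field. It is an illustration, not the original
   implementation. */
void AddDocToIndex(int docId, const std::string& title,
                   const std::vector<std::string>& tokens)
{
    // Remember the document metadata (hypothetical _docTitles member).
    _indexPtr->_docTitles[docId] = title;

    for (size_t i = 0; i < tokens.size(); ++i)
    {
        IndexEntry& entry = _indexPtr->_wordIndex[tokens[i]];
        entry._docSet.insert(docId);   // posting list: documents that contain the token
        ++entry._docTf[docId];         // raw term frequency of the token in this document
    }
}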