Beispiel #1
0
/**
 * Collect the ids of the sentences in which a phrase occurs.
 *
 * Asks my_sa for every exact occurrence of phrase, gathers the
 * sentIdInCorpus of each hit, then sorts the ids with std::sort and
 * drops duplicates so every sentence id appears once.
 *
 * When there are no hits a notice is written to stderr and an empty
 * set is returned.
 */
SentIdSet lookup_phrase(const std::string & phrase, C_SuffixArraySearchApplicationBase & my_sa)
{
    vector<S_SimplePhraseLocationElement> hits = my_sa.locateExactPhraseInCorpus(phrase.c_str());
    if (hits.empty()) {
        cerr<<"No occurrences found!!\n";
    }

    // One entry per occurrence; a sentence with several hits shows up repeatedly here.
    SentIdSet sent_ids;
    for (vector<S_SimplePhraseLocationElement>::size_type idx = 0; idx < hits.size(); ++idx) {
        sent_ids.push_back(hits[idx].sentIdInCorpus);
    }

    // std::unique only collapses adjacent duplicates, hence the sort first.
    std::sort(sent_ids.begin(), sent_ids.end());
    sent_ids.erase(std::unique(sent_ids.begin(), sent_ids.end()), sent_ids.end());

    return sent_ids;
}
Beispiel #2
0
/**
* \ingroup search
*
* Locate an n-gram in the indexed corpus and return its locations as <sentId, offsetInSent> pairs.
* Both sentId and offset are 1-based.
*
* Note:
*		The offset of the n-gram in a sentence is represented as "char" in the returned structure S_SimplePhraseLocationElement
*		To output it as a number, one needs to cast it to integer type for proper display
*
*
* Revision $Rev: 3794 $
* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
**/
int main(int argc, char * argv[]){
	//-----------------------------------------------------------------------------
	//check parameter
	if(argc<2){		
		fprintf(stderr,"\nOutput all the locations of an n-gram in an indexed corpus\n");
		fprintf(stderr,"\nUsage:\n");
		fprintf(stderr,"\n%s corpusFileNameStem < list of n-grams\n\n",argv[0]);
		
		exit(-1);
	}

	//-----------------------------------------------------------------------------	

	C_SuffixArraySearchApplicationBase saObj;
	
	//load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false)
	saObj.loadData_forSearch(argv[1], false, false);


	cerr<<"Input N-grams:\n";
	char tmpString[10000];
	while(!cin.eof()){
	  cin.getline(tmpString,10000,'\n');
	  if(strlen(tmpString)>0){
		  vector<S_SimplePhraseLocationElement> locations;

		  locations = saObj.locateExactPhraseInCorpus(tmpString);
		  
		  if(locations.size()==0){
			  cout<<"No occurrences found.\n";
		  }
		  else{
			  for(int i=0;i<locations.size(); i++){
				  cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl;
			  }
		  }
		  cout<<endl;
	  }
	}

	return 0;
}