Пример #1
0
/**
* \ingroup search
*
* Locate an n-gram in the indexed corpus and return its locations as <sentId, offsetInSent> pairs.
* Both the sentId and the offset are 1-based.
*
* Note:
*		The offset of the n-gram in a sentence is represented as "char" in the returned structure S_SimplePhraseLocationElement
*		To output it as a number, one needs to cast it to integer type for proper display
*
*
* Revision $Rev: 3794 $
* Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
**/
int main(int argc, char * argv[]){
	//-----------------------------------------------------------------------------
	//check parameter
	if(argc<2){		
		fprintf(stderr,"\nOutput all the locations of an n-gram in an indexed corpus\n");
		fprintf(stderr,"\nUsage:\n");
		fprintf(stderr,"\n%s corpusFileNameStem < list of n-grams\n\n",argv[0]);
		
		exit(-1);
	}

	//-----------------------------------------------------------------------------	

	C_SuffixArraySearchApplicationBase saObj;
	
	//load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false)
	saObj.loadData_forSearch(argv[1], false, false);


	cerr<<"Input N-grams:\n";
	char tmpString[10000];
	while(!cin.eof()){
	  cin.getline(tmpString,10000,'\n');
	  if(strlen(tmpString)>0){
		  vector<S_SimplePhraseLocationElement> locations;

		  locations = saObj.locateExactPhraseInCorpus(tmpString);
		  
		  if(locations.size()==0){
			  cout<<"No occurrences found.\n";
		  }
		  else{
			  for(int i=0;i<locations.size(); i++){
				  cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl;
			  }
		  }
		  cout<<endl;
	  }
	}

	return 0;
}
Пример #2
0
/**
 * Look up every exact occurrence of `phrase` through `my_sa` and return the
 * ids of the sentences containing it, sorted ascending with duplicates removed.
 * Writes a diagnostic to stderr (and returns an empty set) when there is no
 * occurrence.
 */
SentIdSet lookup_phrase(const std::string & phrase, C_SuffixArraySearchApplicationBase & my_sa)
{
    typedef vector<S_SimplePhraseLocationElement> LocationList;

    LocationList hits = my_sa.locateExactPhraseInCorpus(phrase.c_str());
    if (hits.empty()) {
        cerr<<"No occurrences found!!\n";
    }

    // One sentence id per hit; a sentence shows up once for each occurrence.
    SentIdSet sent_ids;
    for (LocationList::size_type k = 0; k < hits.size(); ++k) {
        sent_ids.push_back(hits[k].sentIdInCorpus);
    }

    // Sort, then collapse the runs of equal ids down to a single entry.
    std::sort(sent_ids.begin(), sent_ids.end());
    sent_ids.erase(std::unique(sent_ids.begin(), sent_ids.end()), sent_ids.end());

    return sent_ids;
}
Пример #3
0
int main(int argc, char * argv[])
{
  int c;
  const char* efile=0;
  const char* ffile=0;
  int pfe_index = 2;
  while ((c = getopt(argc, argv, "cpf:e:i:n:l:m:h")) != -1) {
    switch (c) {
    case 'e':
      efile = optarg;
      break;
    case 'f':
      ffile = optarg;
      break;
    case 'i':  // index of pfe in phrase table
      pfe_index = atoi(optarg);
      break;
    case 'n':  // keep only the top n entries in phrase table sorted by p(f|e) (0=all)
      pfe_filter_limit = atoi(optarg);
      std::cerr << "P(f|e) filter limit: " << pfe_filter_limit << std::endl;
      break;
    case 'c':
      print_cooc_counts = true;
      break;
    case 'p':
      print_neglog_significance = true;
      break;
    case 'h':
      hierarchical = true;
      break;
    case 'm':
      max_cache = atoi(optarg);
      break;
    case 'l':
      std::cerr << "-l = " << optarg << "\n";
      if (strcmp(optarg,"a+e") == 0) {
        sig_filter_limit = ALPHA_PLUS_EPS;
      } else if (strcmp(optarg,"a-e") == 0) {
        sig_filter_limit = ALPHA_MINUS_EPS;
      } else {
        char *x;
        sig_filter_limit = strtod(optarg, &x);
        if (sig_filter_limit < 0.0) {
          std::cerr << "Filter limit (-l) must be either 'a+e', 'a-e' or a real number >= 0.0\n";
          usage();
        }
      }
      break;
    default:
      usage();
    }
  }
  if (sig_filter_limit == 0.0) pef_filter_only = true;
  //-----------------------------------------------------------------------------
  if (optind != argc || ((!efile || !ffile) && !pef_filter_only)) {
    usage();
  }

  //load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false)
  if (!pef_filter_only) {
    e_sa.loadData_forSearch(efile, false, false);
    f_sa.loadData_forSearch(ffile, false, false);
    size_t elines = e_sa.returnTotalSentNumber();
    size_t flines = f_sa.returnTotalSentNumber();
    if (elines != flines) {
      std::cerr << "Number of lines in e-corpus != number of lines in f-corpus!\n";
      usage();
    } else {
      std::cerr << "Training corpus: " << elines << " lines\n";
      num_lines = elines;
    }
    p_111 = -log(fisher_exact(1,1,1));
    std::cerr << "\\alpha = " << p_111 << "\n";
    if (sig_filter_limit == ALPHA_MINUS_EPS) {
      sig_filter_limit = p_111 - 0.001;
    } else if (sig_filter_limit == ALPHA_PLUS_EPS) {
      sig_filter_limit = p_111 + 0.001;
    }
    std::cerr << "Sig filter threshold is = " << sig_filter_limit << "\n";
  } else {
    std::cerr << "Filtering using P(e|f) only. n=" << pfe_filter_limit << std::endl;
  }

  char tmpString[10000];
  std::string prev = "";
  std::vector<PTEntry*> options;
  size_t pt_lines = 0;
  while(!cin.eof()) {
    cin.getline(tmpString,10000,'\n');
    if(++pt_lines%10000==0) { 
      std::cerr << ".";
      
      prune_cache(esets);
      prune_cache(fsets);
      
      if(pt_lines%500000==0) 
        std::cerr << "[n:"<<pt_lines<<"]\n";
    }

    if(strlen(tmpString)>0) {
      PTEntry* pp = new PTEntry(tmpString, pfe_index);
      if (prev != pp->f_phrase) {
        prev = pp->f_phrase;

        if (!options.empty()) {  // always true after first line
          compute_cooc_stats_and_filter(options);
        }
        for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) {
          std::cout << **i << std::endl;
          delete *i;
        }
        options.clear();
        options.push_back(pp);

      } else {
        options.push_back(pp);
      }
      //			  for(int i=0;i<locations.size(); i++){
      //				  cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl;
      //			  }
    }
  }
  compute_cooc_stats_and_filter(options);
  for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) {
    std::cout << **i << std::endl;
    delete *i;
  }
  float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
  float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
  std::cerr << "\n\n------------------------------------------------------\n"
            << "  unfiltered phrases pairs: " << pt_lines << "\n"
            << "\n"
            << "     P(f|e) filter [first]: " << nremoved_pfefilter << "   (" << pfefper << "%)\n"
            << "       significance filter: " << nremoved_sigfilter << "   (" << sigfper << "%)\n"
            << "            TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << "   (" << (sigfper + pfefper) << "%)\n"
            << "\n"
            << "     FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << "   (" << (100.0-sigfper - pfefper) << "%)\n"
            << "------------------------------------------------------\n";

  return 0;
}