/**
 * \ingroup search
 *
 * Locate an n-gram in the indexed corpus and return its locations as
 * <sentId, offsetInSent> pairs. Both the sentence ID and the offset are 1-based.
 *
 * Note:
 * The offset of the n-gram in a sentence is stored as a "char" in the returned
 * structure S_SimplePhraseLocationElement. To output it as a number, cast it to
 * an integer type for proper display.
 *
 * Revision $Rev: 3794 $
 * Last Modified $LastChangedDate: 2007-06-29 02:17:32 -0400 (Fri, 29 Jun 2007) $
 **/

// Standard headers used by this example; the SALM header below is assumed to
// be on the include path of the suffix-array (SALM) toolkit.
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <vector>

#include "_SuffixArraySearchApplicationBase.h"

using namespace std;

int main(int argc, char* argv[])
{
  //---------------------------------------------------------------------------
  // check parameters
  if (argc < 2) {
    fprintf(stderr, "\nOutput all the locations of an n-gram in an indexed corpus\n");
    fprintf(stderr, "\nUsage:\n");
    fprintf(stderr, "\n%s corpusFileNameStem < list of n-grams\n\n", argv[0]);
    exit(-1);
  }

  //---------------------------------------------------------------------------
  C_SuffixArraySearchApplicationBase saObj;

  // load the indexed corpus with vocabulary (noVoc=false) and with offsets (noOffset=false)
  saObj.loadData_forSearch(argv[1], false, false);

  cerr << "Input N-grams:\n";

  char tmpString[10000];
  while (!cin.eof()) {
    cin.getline(tmpString, 10000, '\n');

    if (strlen(tmpString) > 0) {
      vector<S_SimplePhraseLocationElement> locations;
      locations = saObj.locateExactPhraseInCorpus(tmpString);

      if (locations.size() == 0) {
        cout << "No occurrences found.\n";
      } else {
        for (size_t i = 0; i < locations.size(); i++) {
          cout << "SentId=" << locations[i].sentIdInCorpus
               << " Pos=" << (int) locations[i].posInSentInCorpus << endl;
        }
      }
      cout << endl;
    }
  }

  return 0;
}
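// Example invocation (the binary and file names below are illustrative, not
// prescribed by the code): if the corpus was indexed under the stem
// "corpus.en", piping a file with one n-gram per line into
//
//   ./LocateNgramInCorpus corpus.en < ngrams.txt
//
// prints one "SentId=<id> Pos=<offset>" line per occurrence of each n-gram,
// with both indices 1-based as documented above, or "No occurrences found."
// when an n-gram does not appear in the corpus.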
int main(int argc, char* argv[])
{
  int c;
  const char* efile = 0;
  const char* ffile = 0;
  int pfe_index = 2;

  while ((c = getopt(argc, argv, "cpf:e:i:n:l:m:h")) != -1) {
    switch (c) {
    case 'e':
      efile = optarg;
      break;
    case 'f':
      ffile = optarg;
      break;
    case 'i':  // index of pfe in phrase table
      pfe_index = atoi(optarg);
      break;
    case 'n':  // keep only the top n entries in phrase table sorted by p(f|e) (0=all)
      pfe_filter_limit = atoi(optarg);
      std::cerr << "P(f|e) filter limit: " << pfe_filter_limit << std::endl;
      break;
    case 'c':
      print_cooc_counts = true;
      break;
    case 'p':
      print_neglog_significance = true;
      break;
    case 'h':
      hierarchical = true;
      break;
    case 'm':
      max_cache = atoi(optarg);
      break;
    case 'l':
      std::cerr << "-l = " << optarg << "\n";
      if (strcmp(optarg, "a+e") == 0) {
        sig_filter_limit = ALPHA_PLUS_EPS;
      } else if (strcmp(optarg, "a-e") == 0) {
        sig_filter_limit = ALPHA_MINUS_EPS;
      } else {
        char* x;
        sig_filter_limit = strtod(optarg, &x);
        if (sig_filter_limit < 0.0) {
          std::cerr << "Filter limit (-l) must be either 'a+e', 'a-e' or a real number >= 0.0\n";
          usage();
        }
      }
      break;
    default:
      usage();
    }
  }

  if (sig_filter_limit == 0.0) pef_filter_only = true;

  //---------------------------------------------------------------------------
  if (optind != argc || ((!efile || !ffile) && !pef_filter_only)) {
    usage();
  }

  // load the indexed corpora with vocabulary (noVoc=false) and with offsets (noOffset=false)
  if (!pef_filter_only) {
    e_sa.loadData_forSearch(efile, false, false);
    f_sa.loadData_forSearch(ffile, false, false);

    size_t elines = e_sa.returnTotalSentNumber();
    size_t flines = f_sa.returnTotalSentNumber();
    if (elines != flines) {
      std::cerr << "Number of lines in e-corpus != number of lines in f-corpus!\n";
      usage();
    } else {
      std::cerr << "Training corpus: " << elines << " lines\n";
      num_lines = elines;
    }

    p_111 = -log(fisher_exact(1, 1, 1));
    std::cerr << "\\alpha = " << p_111 << "\n";
    if (sig_filter_limit == ALPHA_MINUS_EPS) {
      sig_filter_limit = p_111 - 0.001;
    } else if (sig_filter_limit == ALPHA_PLUS_EPS) {
      sig_filter_limit = p_111 + 0.001;
    }
    std::cerr << "Sig filter threshold is = " << sig_filter_limit << "\n";
  } else {
    std::cerr << "Filtering using P(e|f) only. n=" << pfe_filter_limit << std::endl;
  }

  char tmpString[10000];
  std::string prev = "";
  std::vector<PTEntry*> options;
  size_t pt_lines = 0;

  while (!cin.eof()) {
    cin.getline(tmpString, 10000, '\n');
    if (++pt_lines % 10000 == 0) {
      std::cerr << ".";
      prune_cache(esets);
      prune_cache(fsets);
      if (pt_lines % 500000 == 0) std::cerr << "[n:" << pt_lines << "]\n";
    }

    if (strlen(tmpString) > 0) {
      PTEntry* pp = new PTEntry(tmpString, pfe_index);
      if (prev != pp->f_phrase) {
        prev = pp->f_phrase;

        if (!options.empty()) {  // always true after the first line
          compute_cooc_stats_and_filter(options);
        }
        for (std::vector<PTEntry*>::iterator i = options.begin(); i != options.end(); ++i) {
          std::cout << **i << std::endl;
          delete *i;
        }
        options.clear();
        options.push_back(pp);
      } else {
        options.push_back(pp);
      }
    }
  }

  // flush the last block of entries sharing the same source phrase
  compute_cooc_stats_and_filter(options);
  for (std::vector<PTEntry*>::iterator i = options.begin(); i != options.end(); ++i) {
    std::cout << **i << std::endl;
    delete *i;
  }

  float pfefper = (100.0 * (float) nremoved_pfefilter) / (float) pt_lines;
  float sigfper = (100.0 * (float) nremoved_sigfilter) / (float) pt_lines;

  std::cerr << "\n\n------------------------------------------------------\n"
            << "  unfiltered phrase pairs: " << pt_lines << "\n"
            << "\n"
            << "  P(f|e) filter [first]: " << nremoved_pfefilter << " (" << pfefper << "%)\n"
            << "  significance filter:   " << nremoved_sigfilter << " (" << sigfper << "%)\n"
            << "  TOTAL FILTERED:        " << (nremoved_pfefilter + nremoved_sigfilter) << " (" << (sigfper + pfefper) << "%)\n"
            << "\n"
            << "  FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << " (" << (100.0 - sigfper - pfefper) << "%)\n"
            << "------------------------------------------------------\n";

  return 0;
}
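// Example pipeline (binary and file names are illustrative): with suffix-array
// indexed corpora for the two sides of the bitext and a phrase table grouped by
// source phrase on stdin, a typical run keeps pairs whose significance exceeds
// alpha + epsilon and at most the 30 highest-P(f|e) entries per source phrase:
//
//   cat phrase-table | ./filter-pt -e corpus.en -f corpus.fr -l a+e -n 30 > phrase-table.filtered
//
// The code above calls fisher_exact(1,1,1) without showing its body. The sketch
// below is NOT the toolkit's implementation; it is a self-contained illustration
// of the statistic such a function is usually expected to compute: the right-tail
// p-value of Fisher's exact test on the 2x2 contingency table formed by the
// co-occurrence count, the two marginal counts, and the corpus size. The explicit
// parameter N stands in for the global num_lines used above, and the name
// fisher_exact_sketch is invented for this illustration.

#include <algorithm>
#include <cmath>

// log C(n, k), computed via lgamma for numerical stability
static double log_choose(double n, double k) {
  return std::lgamma(n + 1.0) - std::lgamma(k + 1.0) - std::lgamma(n - k + 1.0);
}

// log P(X = k) under the hypergeometric distribution: k sentence pairs contain
// both phrases when c_e of the N sentences contain one phrase and c_f the other
static double hypergeom_log_pmf(double N, double c_e, double c_f, double k) {
  return log_choose(c_e, k) + log_choose(N - c_e, c_f - k) - log_choose(N, c_f);
}

// Right-tail Fisher exact p-value: probability of observing `cooc` or more
// joint occurrences by chance, given the marginals c_e, c_f and corpus size N.
double fisher_exact_sketch(double N, double c_e, double c_f, double cooc) {
  double p = 0.0;
  for (double k = cooc; k <= std::min(c_e, c_f); k += 1.0) {
    p += std::exp(hypergeom_log_pmf(N, c_e, c_f, k));
  }
  return p;
}

// With this definition, -log(fisher_exact_sketch(N, 1, 1, 1)) equals log(N),
// which matches the role of p_111 (alpha) above: the significance of a phrase
// pair seen exactly once on each side of the corpus and once together.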