// input: unordered list of translation options for a single source phrase void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options) { if (pfe_filter_limit>0 && options.size() > pfe_filter_limit) { nremoved_pfefilter += (options.size() - pfe_filter_limit); std::nth_element(options.begin(), options.begin()+pfe_filter_limit, options.end(), PfeComparer()); for (std::vector<PTEntry*>::iterator i=options.begin()+pfe_filter_limit; i != options.end(); ++i) delete *i; options.erase(options.begin()+pfe_filter_limit,options.end()); } if (pef_filter_only) return; // std::cerr << "f phrase: " << options.front()->f_phrase << "\n"; SentIdSet fset; fset = find_occurrences(options.front()->f_phrase, f_sa, fsets); size_t cf = fset.size(); for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) { const std::string& e_phrase = (*i)->e_phrase; size_t cef=0; ClockedSentIdSet& clocked_eset = esets[e_phrase]; SentIdSet & eset = clocked_eset.first; clocked_eset.second = clock(); if (eset.empty()) { eset = find_occurrences(e_phrase, e_sa, esets); //std::cerr << "Looking up e-phrase: " << e_phrase << "\n"; } size_t ce=eset.size(); if (ce < cf) { for (SentIdSet::iterator i=eset.begin(); i != eset.end(); ++i) { if (std::binary_search(fset.begin(), fset.end(), *i)) cef++; } } else { for (SentIdSet::iterator i=fset.begin(); i != fset.end(); ++i) { if (std::binary_search(eset.begin(), eset.end(), *i)) cef++; } } double nlp = -log(fisher_exact(cef, cf, ce)); (*i)->set_cooc_stats(cef, cf, ce, nlp); if (ce < MINIMUM_SIZE_TO_KEEP) { esets.erase(e_phrase); } } std::vector<PTEntry*>::iterator new_end = std::remove_if(options.begin(), options.end(), NlogSigThresholder(sig_filter_limit)); nremoved_sigfilter += (options.end() - new_end); options.erase(new_end,options.end()); }
void getFstFromMs() { std::cerr << "Calculating Fst using variants from: " << opt::msFile << std::endl; std::cerr << "and outputting chi-sq test p-vals < " << opt::msPvalCutoff << std::endl; std::ifstream* msFile = new std::ifstream(opt::msFile.c_str()); string fileRoot = stripExtension(opt::msFile); string PvalFileName = fileRoot + "_" + opt::runName + "_pvals.txt"; std::ofstream* pValFile; if (opt::msPvalCutoff > 0) { pValFile = new std::ofstream(PvalFileName.c_str()); *pValFile << "Fisher p-val" << "\t" << "chi-sq pval" << "\t" << "set1Alt" << "\t" << "set1Ref" << "\t" << "set2Alt" << "\t" << "set2Ref" << "\t" << "Fst" << std::endl; } std::vector<int> set1_loci; std::vector<int> set2_loci; srand((int)time(NULL)); if (opt::msSet1FstSample == 0) { opt::msSet1FstSample = opt::msSet1Size; for (int i = 0; i != opt::msSet1FstSample; i++) { set1_loci.push_back(i); } } else { // Randomly sample individuals from population 1 for Fst calculation for (int i = 0; i != opt::msSet1FstSample; i++) { int rand_sample = (rand()%opt::msSet1Size); while (std::find(set1_loci.begin(),set1_loci.end(),rand_sample) != set1_loci.end()) { rand_sample = (rand()%opt::msSet1Size); } set1_loci.push_back(rand_sample); } } // Do the same for set2 if (opt::msSet2FstSample == 0) { opt::msSet2FstSample = opt::msSet2Size; for (int i = 0; i != opt::msSet2FstSample; i++) { set2_loci.push_back(i+opt::msSet1Size); } } else { // Randomly sample individuals from population 2 for Fst calculation for (int i = 0; i != opt::msSet2FstSample; i++) { int rand_sample = (rand()%opt::msSet2Size)+opt::msSet1Size; while (std::find(set2_loci.begin(),set2_loci.end(),rand_sample) != set2_loci.end()) { rand_sample = (rand()%opt::msSet2Size)+opt::msSet1Size; } set2_loci.push_back(rand_sample); } } std::cerr << "Selected population 1 individuals: "; print_vector_stream(set1_loci, std::cerr); std::cerr << "Selected population 2 individuals: "; print_vector_stream(set2_loci, std::cerr); if (opt::msSet1Size != opt::msSet1FstSample || opt::msSet2Size != opt::msSet2FstSample) { std::cerr << "Warning: the Fst column is going to contain '-1' values where the site is not a segregating site in the sampled individuals for Fst calcultation" << std::endl; } std::vector<double> fstNumerators; fstNumerators.reserve(500000000); std::vector<double> fstDenominators; fstDenominators.reserve(500000000); string line; int numFixedSites = 0; int numNearlyFixedSites = 0; std::vector<double> nullForChisq; std::vector<int> moreSet1; std::vector<int> lessSet1; std::vector<int> moreSet2; std::vector<int> lessSet2; SetCounts counts; while (getline(*msFile, line)) { counts.reset(); double thisFst = -1; for (std::vector<int>::iterator it = set1_loci.begin(); it != set1_loci.end(); it++) { // std::cerr << line[*it] << std::endl; if (line[*it] == '1') { counts.set1Count++; } } for (std::vector<int>::iterator it = set2_loci.begin(); it != set2_loci.end(); it++) { if (line[*it] == '1') { counts.set2Count++; } } //std::cerr << "counts.set1Count" << counts.set1Count << "\t" << "counts.set2Count" << counts.set2Count << std::endl; if (counts.set1Count > 0 || counts.set2Count > 0) { double FstNum = calculateFstNumerator(counts, opt::msSet1FstSample, opt::msSet2FstSample); double FstDenom = calculateFstDenominator(counts, opt::msSet1FstSample, opt::msSet2FstSample); thisFst = FstNum/FstDenom; if (thisFst < 0) thisFst = 0; fstNumerators.push_back(FstNum); fstDenominators.push_back(FstDenom); } if ((counts.set1Count == 0 && counts.set2Count == opt::msSet2FstSample) || (counts.set1Count == opt::msSet1FstSample && counts.set2Count == 0)) { numFixedSites++; } if ((counts.set1Count == 1 && counts.set2Count == opt::msSet2FstSample) || (counts.set1Count == 0 && counts.set2Count == opt::msSet2FstSample-1) || (counts.set1Count == opt::msSet1FstSample-1 && counts.set2Count == 0) || (counts.set1Count == opt::msSet1FstSample && counts.set2Count == 1)) { numNearlyFixedSites++; } int set1WithoutVariant = opt::msSet1FstSample-counts.set1Count; int set2WithoutVariant = opt::msSet2FstSample-counts.set2Count; if (counts.set1Count >= set1WithoutVariant) { moreSet1.push_back(counts.set1Count); lessSet1.push_back(set1WithoutVariant); moreSet2.push_back(counts.set2Count); lessSet2.push_back(set2WithoutVariant); } else { moreSet1.push_back(set1WithoutVariant); lessSet1.push_back(counts.set1Count); moreSet2.push_back(set2WithoutVariant); lessSet2.push_back(counts.set2Count); } // std::cerr << counts.set1Count << "\t" << set1WithoutVariant << "\t" << counts.set2Count << "\t" << set2WithoutVariant << std::endl; if ((counts.set1Count != 0 || counts.set2Count != 0) && (set1WithoutVariant != 0 || set2WithoutVariant != 0)) { if (opt::msSet1FstSample + opt::msSet2FstSample <= 60) { counts.fisher_pval = fisher_exact(counts.set1Count,set1WithoutVariant , counts.set2Count, set2WithoutVariant); // std::cerr << "Fisher: " << counts.fisher_pval << std::endl; counts.chi_sq_pval = pearson_chi_sq_indep(counts.set1Count,set1WithoutVariant , counts.set2Count, set2WithoutVariant); } else { counts.chi_sq_pval = pearson_chi_sq_indep(counts.set1Count,set1WithoutVariant , counts.set2Count, set2WithoutVariant); } } if (counts.fisher_pval < opt::msPvalCutoff || counts.chi_sq_pval < opt::msPvalCutoff) { *pValFile << counts.fisher_pval << "\t" << counts.chi_sq_pval << "\t" << counts.set1Count << "\t" << set1WithoutVariant << "\t" << counts.set2Count << "\t" << set2WithoutVariant << "\t" << thisFst << std::endl; } } double Fst = calculateFst(fstNumerators, fstDenominators); std::cerr << "Fst: " << Fst << std::endl; std::cerr << "Fixed sites: " << numFixedSites << std::endl; std::cerr << "Tier2 sites: " << numNearlyFixedSites << std::endl; std::cerr << "Null ChiSq 1:" << vector_average(moreSet1)/opt::msSet1FstSample << "\t" << vector_average(lessSet1)/opt::msSet1FstSample << std::endl; std::cerr << "Null ChiSq 2:" << vector_average(moreSet2)/opt::msSet2FstSample << "\t" << vector_average(lessSet2)/opt::msSet2FstSample << std::endl; }
int main(int argc, char * argv[]) { int c; const char* efile=0; const char* ffile=0; int pfe_index = 2; while ((c = getopt(argc, argv, "cpf:e:i:n:l:m:h")) != -1) { switch (c) { case 'e': efile = optarg; break; case 'f': ffile = optarg; break; case 'i': // index of pfe in phrase table pfe_index = atoi(optarg); break; case 'n': // keep only the top n entries in phrase table sorted by p(f|e) (0=all) pfe_filter_limit = atoi(optarg); std::cerr << "P(f|e) filter limit: " << pfe_filter_limit << std::endl; break; case 'c': print_cooc_counts = true; break; case 'p': print_neglog_significance = true; break; case 'h': hierarchical = true; break; case 'm': max_cache = atoi(optarg); break; case 'l': std::cerr << "-l = " << optarg << "\n"; if (strcmp(optarg,"a+e") == 0) { sig_filter_limit = ALPHA_PLUS_EPS; } else if (strcmp(optarg,"a-e") == 0) { sig_filter_limit = ALPHA_MINUS_EPS; } else { char *x; sig_filter_limit = strtod(optarg, &x); if (sig_filter_limit < 0.0) { std::cerr << "Filter limit (-l) must be either 'a+e', 'a-e' or a real number >= 0.0\n"; usage(); } } break; default: usage(); } } if (sig_filter_limit == 0.0) pef_filter_only = true; //----------------------------------------------------------------------------- if (optind != argc || ((!efile || !ffile) && !pef_filter_only)) { usage(); } //load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false) if (!pef_filter_only) { e_sa.loadData_forSearch(efile, false, false); f_sa.loadData_forSearch(ffile, false, false); size_t elines = e_sa.returnTotalSentNumber(); size_t flines = f_sa.returnTotalSentNumber(); if (elines != flines) { std::cerr << "Number of lines in e-corpus != number of lines in f-corpus!\n"; usage(); } else { std::cerr << "Training corpus: " << elines << " lines\n"; num_lines = elines; } p_111 = -log(fisher_exact(1,1,1)); std::cerr << "\\alpha = " << p_111 << "\n"; if (sig_filter_limit == ALPHA_MINUS_EPS) { sig_filter_limit = p_111 - 0.001; } else if (sig_filter_limit == ALPHA_PLUS_EPS) { sig_filter_limit = p_111 + 0.001; } std::cerr << "Sig filter threshold is = " << sig_filter_limit << "\n"; } else { std::cerr << "Filtering using P(e|f) only. n=" << pfe_filter_limit << std::endl; } char tmpString[10000]; std::string prev = ""; std::vector<PTEntry*> options; size_t pt_lines = 0; while(!cin.eof()) { cin.getline(tmpString,10000,'\n'); if(++pt_lines%10000==0) { std::cerr << "."; prune_cache(esets); prune_cache(fsets); if(pt_lines%500000==0) std::cerr << "[n:"<<pt_lines<<"]\n"; } if(strlen(tmpString)>0) { PTEntry* pp = new PTEntry(tmpString, pfe_index); if (prev != pp->f_phrase) { prev = pp->f_phrase; if (!options.empty()) { // always true after first line compute_cooc_stats_and_filter(options); } for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) { std::cout << **i << std::endl; delete *i; } options.clear(); options.push_back(pp); } else { options.push_back(pp); } // for(int i=0;i<locations.size(); i++){ // cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl; // } } } compute_cooc_stats_and_filter(options); for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) { std::cout << **i << std::endl; delete *i; } float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines; float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines; std::cerr << "\n\n------------------------------------------------------\n" << " unfiltered phrases pairs: " << pt_lines << "\n" << "\n" << " P(f|e) filter [first]: " << nremoved_pfefilter << " (" << pfefper << "%)\n" << " significance filter: " << nremoved_sigfilter << " (" << sigfper << "%)\n" << " TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << " (" << (sigfper + pfefper) << "%)\n" << "\n" << " FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << " (" << (100.0-sigfper - pfefper) << "%)\n" << "------------------------------------------------------\n"; return 0; }