예제 #1
0
/**
 * Filters the translation options of a single source phrase in place.
 *
 * Stage 1 (only when pfe_filter_limit > 0 and there are more options than
 * the limit): keeps the pfe_filter_limit entries that PfeComparer orders
 * first, deletes the remaining entries and adds their number to
 * nremoved_pfefilter.
 *
 * Stage 2 (skipped when pef_filter_only is set): for every remaining option
 * the following counts are computed
 *   cf  - size of the SentIdSet found for the source phrase,
 *   ce  - size of the SentIdSet found for the target phrase,
 *   cef - number of elements present in both sets,
 * and stored on the entry together with -log(fisher_exact(cef, cf, ce)).
 * Entries rejected by NlogSigThresholder(sig_filter_limit) are then removed
 * from the list and counted in nremoved_sigfilter.
 *
 * @param options unordered list of translation options for a single source
 *                phrase. Only the first entry's f_phrase is looked up, so all
 *                entries are expected to share it. An empty list is a no-op.
 */
void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options)
{
  // Nothing to filter; this also protects the options.front() access below,
  // which would be undefined behaviour on an empty vector.
  if (options.empty()) return;

  if (pfe_filter_limit>0 && options.size() > pfe_filter_limit) {
    nremoved_pfefilter += (options.size() - pfe_filter_limit);
    // Partial ordering is enough: only membership in the first
    // pfe_filter_limit positions matters, not the order inside them.
    std::nth_element(options.begin(), options.begin()+pfe_filter_limit, options.end(), PfeComparer());
    for (std::vector<PTEntry*>::iterator dropped=options.begin()+pfe_filter_limit; dropped != options.end(); ++dropped)
      delete *dropped;
    options.erase(options.begin()+pfe_filter_limit,options.end());
  }
  if (pef_filter_only) return;

  SentIdSet fset;
  fset = find_occurrences(options.front()->f_phrase, f_sa, fsets);
  size_t cf = fset.size();

  for (std::vector<PTEntry*>::iterator entry=options.begin(); entry != options.end(); ++entry) {
    const std::string& e_phrase = (*entry)->e_phrase;
    size_t cef=0;

    // operator[] creates an empty cache slot on first use; the timestamp is
    // refreshed on every access.
    ClockedSentIdSet& clocked_eset = esets[e_phrase];
    SentIdSet & eset = clocked_eset.first;
    clocked_eset.second = clock();
    if (eset.empty()) {
      eset = find_occurrences(e_phrase, e_sa, esets);
    }
    size_t ce=eset.size();

    // Walk the smaller set and probe the larger one.
    // NOTE(review): std::binary_search requires both sets to be sorted -
    // presumably find_occurrences guarantees that; confirm.
    if (ce < cf) {
      for (SentIdSet::iterator s=eset.begin(); s != eset.end(); ++s) {
        if (std::binary_search(fset.begin(), fset.end(), *s)) cef++;
      }
    } else {
      for (SentIdSet::iterator s=fset.begin(); s != fset.end(); ++s) {
        if (std::binary_search(eset.begin(), eset.end(), *s)) cef++;
      }
    }

    double nlp = -log(fisher_exact(cef, cf, ce));
    (*entry)->set_cooc_stats(cef, cf, ce, nlp);

    // Small sets are dropped from the cache again. `eset` and `clocked_eset`
    // dangle after this erase and must not be used below.
    if (ce < MINIMUM_SIZE_TO_KEEP) {
      esets.erase(e_phrase);
    }
  }

  // NOTE(review): std::remove_if does not free the rejected PTEntry objects;
  // presumably NlogSigThresholder releases them itself - confirm, otherwise
  // they leak here.
  std::vector<PTEntry*>::iterator new_end =
    std::remove_if(options.begin(), options.end(), NlogSigThresholder(sig_filter_limit));
  nremoved_sigfilter += (options.end() - new_end);
  options.erase(new_end,options.end());
}
예제 #2
0
void getFstFromMs() {
    std::cerr << "Calculating Fst using variants from: " << opt::msFile << std::endl;
    std::cerr << "and outputting chi-sq test p-vals < " << opt::msPvalCutoff << std::endl;
    
    std::ifstream* msFile = new std::ifstream(opt::msFile.c_str());
    string fileRoot = stripExtension(opt::msFile);
    string PvalFileName = fileRoot + "_" + opt::runName + "_pvals.txt";
    std::ofstream* pValFile;
    if (opt::msPvalCutoff > 0) {
        pValFile = new std::ofstream(PvalFileName.c_str());
        *pValFile << "Fisher p-val" << "\t" << "chi-sq pval" << "\t" << "set1Alt" << "\t" << "set1Ref" << "\t" << "set2Alt" << "\t" << "set2Ref" << "\t" << "Fst" << std::endl;
    }
    
    
    std::vector<int> set1_loci;
    std::vector<int> set2_loci;
    srand((int)time(NULL));
    if (opt::msSet1FstSample == 0) {
        opt::msSet1FstSample = opt::msSet1Size;
        for (int i = 0; i != opt::msSet1FstSample; i++) {
            set1_loci.push_back(i);
        }
    } else { // Randomly sample individuals from population 1 for Fst calculation
        for (int i = 0; i != opt::msSet1FstSample; i++) {
            int rand_sample = (rand()%opt::msSet1Size);
            while (std::find(set1_loci.begin(),set1_loci.end(),rand_sample) != set1_loci.end()) {
                rand_sample = (rand()%opt::msSet1Size);
            }
            set1_loci.push_back(rand_sample);
        }
    }
    // Do the same for set2
    if (opt::msSet2FstSample == 0) {
        opt::msSet2FstSample = opt::msSet2Size;
        for (int i = 0; i != opt::msSet2FstSample; i++) {
            set2_loci.push_back(i+opt::msSet1Size);
        }
    } else { // Randomly sample individuals from population 2 for Fst calculation
        for (int i = 0; i != opt::msSet2FstSample; i++) {
            int rand_sample = (rand()%opt::msSet2Size)+opt::msSet1Size;
            while (std::find(set2_loci.begin(),set2_loci.end(),rand_sample) != set2_loci.end()) {
                rand_sample = (rand()%opt::msSet2Size)+opt::msSet1Size;
            }
            set2_loci.push_back(rand_sample);
        }
    }

    std::cerr << "Selected population 1 individuals: "; print_vector_stream(set1_loci, std::cerr);
    std::cerr << "Selected population 2 individuals: "; print_vector_stream(set2_loci, std::cerr);
    
    if (opt::msSet1Size != opt::msSet1FstSample || opt::msSet2Size != opt::msSet2FstSample) {
        std::cerr << "Warning: the Fst column is going to contain '-1' values where the site is not a segregating site in the sampled individuals for Fst calcultation" << std::endl;
    }
    
    std::vector<double> fstNumerators; fstNumerators.reserve(500000000);
    std::vector<double> fstDenominators; fstDenominators.reserve(500000000);
    
    
    string line;
    int numFixedSites = 0;
    int numNearlyFixedSites = 0;
    std::vector<double> nullForChisq;
    std::vector<int> moreSet1;
    std::vector<int> lessSet1;
    std::vector<int> moreSet2;
    std::vector<int> lessSet2;
    SetCounts counts;
    while (getline(*msFile, line)) {
        counts.reset();
        double thisFst = -1;
        for (std::vector<int>::iterator it = set1_loci.begin(); it != set1_loci.end(); it++) {
            // std::cerr << line[*it] << std::endl;
            if (line[*it] == '1') {
                counts.set1Count++;
            }
        }
        for (std::vector<int>::iterator it = set2_loci.begin(); it != set2_loci.end(); it++) {
            if (line[*it] == '1') {
                counts.set2Count++;
            }
        }
        
        //std::cerr << "counts.set1Count" << counts.set1Count << "\t" << "counts.set2Count" << counts.set2Count << std::endl;
        
        if (counts.set1Count > 0 || counts.set2Count > 0) {
            double FstNum = calculateFstNumerator(counts, opt::msSet1FstSample, opt::msSet2FstSample);
            double FstDenom = calculateFstDenominator(counts, opt::msSet1FstSample, opt::msSet2FstSample);
            thisFst = FstNum/FstDenom; if (thisFst < 0) thisFst = 0;
            fstNumerators.push_back(FstNum);
            fstDenominators.push_back(FstDenom);
            
        }
        
        if ((counts.set1Count == 0 && counts.set2Count == opt::msSet2FstSample) || (counts.set1Count == opt::msSet1FstSample && counts.set2Count == 0)) {
            numFixedSites++;
        }
        
        if ((counts.set1Count == 1 && counts.set2Count == opt::msSet2FstSample) || (counts.set1Count == 0 && counts.set2Count == opt::msSet2FstSample-1) ||
            (counts.set1Count == opt::msSet1FstSample-1 && counts.set2Count == 0) || (counts.set1Count == opt::msSet1FstSample && counts.set2Count == 1)) {
            numNearlyFixedSites++;
        }
        
        
        int set1WithoutVariant = opt::msSet1FstSample-counts.set1Count;
        int set2WithoutVariant = opt::msSet2FstSample-counts.set2Count;
        
        if (counts.set1Count >= set1WithoutVariant) {
            moreSet1.push_back(counts.set1Count);
            lessSet1.push_back(set1WithoutVariant);
            moreSet2.push_back(counts.set2Count);
            lessSet2.push_back(set2WithoutVariant);
        } else {
            moreSet1.push_back(set1WithoutVariant);
            lessSet1.push_back(counts.set1Count);
            moreSet2.push_back(set2WithoutVariant);
            lessSet2.push_back(counts.set2Count);
        }
        
       // std::cerr << counts.set1Count << "\t" << set1WithoutVariant << "\t" << counts.set2Count << "\t" << set2WithoutVariant << std::endl;
        if ((counts.set1Count != 0 || counts.set2Count != 0) && (set1WithoutVariant != 0 || set2WithoutVariant != 0)) {
            if (opt::msSet1FstSample + opt::msSet2FstSample <= 60) {
                counts.fisher_pval = fisher_exact(counts.set1Count,set1WithoutVariant , counts.set2Count, set2WithoutVariant);
   //             std::cerr << "Fisher: " << counts.fisher_pval << std::endl;
                counts.chi_sq_pval = pearson_chi_sq_indep(counts.set1Count,set1WithoutVariant , counts.set2Count, set2WithoutVariant);
            } else {
                counts.chi_sq_pval = pearson_chi_sq_indep(counts.set1Count,set1WithoutVariant , counts.set2Count, set2WithoutVariant);
            }
        }
        
        if (counts.fisher_pval < opt::msPvalCutoff || counts.chi_sq_pval < opt::msPvalCutoff) {
            *pValFile << counts.fisher_pval << "\t" << counts.chi_sq_pval << "\t" << counts.set1Count << "\t" << set1WithoutVariant << "\t" << counts.set2Count << "\t" << set2WithoutVariant << "\t" << thisFst << std::endl;
        }
    }
    
    double Fst = calculateFst(fstNumerators, fstDenominators);
    std::cerr << "Fst: " << Fst << std::endl;
    std::cerr << "Fixed sites: " << numFixedSites << std::endl;
    std::cerr << "Tier2 sites: " << numNearlyFixedSites << std::endl;
    std::cerr << "Null ChiSq 1:" << vector_average(moreSet1)/opt::msSet1FstSample << "\t" << vector_average(lessSet1)/opt::msSet1FstSample << std::endl;
    std::cerr << "Null ChiSq 2:" << vector_average(moreSet2)/opt::msSet2FstSample << "\t" << vector_average(lessSet2)/opt::msSet2FstSample << std::endl;
}
예제 #3
0
int main(int argc, char * argv[])
{
  int c;
  const char* efile=0;
  const char* ffile=0;
  int pfe_index = 2;
  while ((c = getopt(argc, argv, "cpf:e:i:n:l:m:h")) != -1) {
    switch (c) {
    case 'e':
      efile = optarg;
      break;
    case 'f':
      ffile = optarg;
      break;
    case 'i':  // index of pfe in phrase table
      pfe_index = atoi(optarg);
      break;
    case 'n':  // keep only the top n entries in phrase table sorted by p(f|e) (0=all)
      pfe_filter_limit = atoi(optarg);
      std::cerr << "P(f|e) filter limit: " << pfe_filter_limit << std::endl;
      break;
    case 'c':
      print_cooc_counts = true;
      break;
    case 'p':
      print_neglog_significance = true;
      break;
    case 'h':
      hierarchical = true;
      break;
    case 'm':
      max_cache = atoi(optarg);
      break;
    case 'l':
      std::cerr << "-l = " << optarg << "\n";
      if (strcmp(optarg,"a+e") == 0) {
        sig_filter_limit = ALPHA_PLUS_EPS;
      } else if (strcmp(optarg,"a-e") == 0) {
        sig_filter_limit = ALPHA_MINUS_EPS;
      } else {
        char *x;
        sig_filter_limit = strtod(optarg, &x);
        if (sig_filter_limit < 0.0) {
          std::cerr << "Filter limit (-l) must be either 'a+e', 'a-e' or a real number >= 0.0\n";
          usage();
        }
      }
      break;
    default:
      usage();
    }
  }
  if (sig_filter_limit == 0.0) pef_filter_only = true;
  //-----------------------------------------------------------------------------
  if (optind != argc || ((!efile || !ffile) && !pef_filter_only)) {
    usage();
  }

  //load the indexed corpus with vocabulary(noVoc=false) and with offset(noOffset=false)
  if (!pef_filter_only) {
    e_sa.loadData_forSearch(efile, false, false);
    f_sa.loadData_forSearch(ffile, false, false);
    size_t elines = e_sa.returnTotalSentNumber();
    size_t flines = f_sa.returnTotalSentNumber();
    if (elines != flines) {
      std::cerr << "Number of lines in e-corpus != number of lines in f-corpus!\n";
      usage();
    } else {
      std::cerr << "Training corpus: " << elines << " lines\n";
      num_lines = elines;
    }
    p_111 = -log(fisher_exact(1,1,1));
    std::cerr << "\\alpha = " << p_111 << "\n";
    if (sig_filter_limit == ALPHA_MINUS_EPS) {
      sig_filter_limit = p_111 - 0.001;
    } else if (sig_filter_limit == ALPHA_PLUS_EPS) {
      sig_filter_limit = p_111 + 0.001;
    }
    std::cerr << "Sig filter threshold is = " << sig_filter_limit << "\n";
  } else {
    std::cerr << "Filtering using P(e|f) only. n=" << pfe_filter_limit << std::endl;
  }

  char tmpString[10000];
  std::string prev = "";
  std::vector<PTEntry*> options;
  size_t pt_lines = 0;
  while(!cin.eof()) {
    cin.getline(tmpString,10000,'\n');
    if(++pt_lines%10000==0) { 
      std::cerr << ".";
      
      prune_cache(esets);
      prune_cache(fsets);
      
      if(pt_lines%500000==0) 
        std::cerr << "[n:"<<pt_lines<<"]\n";
    }

    if(strlen(tmpString)>0) {
      PTEntry* pp = new PTEntry(tmpString, pfe_index);
      if (prev != pp->f_phrase) {
        prev = pp->f_phrase;

        if (!options.empty()) {  // always true after first line
          compute_cooc_stats_and_filter(options);
        }
        for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) {
          std::cout << **i << std::endl;
          delete *i;
        }
        options.clear();
        options.push_back(pp);

      } else {
        options.push_back(pp);
      }
      //			  for(int i=0;i<locations.size(); i++){
      //				  cout<<"SentId="<<locations[i].sentIdInCorpus<<" Pos="<<(int)locations[i].posInSentInCorpus<<endl;
      //			  }
    }
  }
  compute_cooc_stats_and_filter(options);
  for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) {
    std::cout << **i << std::endl;
    delete *i;
  }
  float pfefper = (100.0*(float)nremoved_pfefilter)/(float)pt_lines;
  float sigfper = (100.0*(float)nremoved_sigfilter)/(float)pt_lines;
  std::cerr << "\n\n------------------------------------------------------\n"
            << "  unfiltered phrases pairs: " << pt_lines << "\n"
            << "\n"
            << "     P(f|e) filter [first]: " << nremoved_pfefilter << "   (" << pfefper << "%)\n"
            << "       significance filter: " << nremoved_sigfilter << "   (" << sigfper << "%)\n"
            << "            TOTAL FILTERED: " << (nremoved_pfefilter + nremoved_sigfilter) << "   (" << (sigfper + pfefper) << "%)\n"
            << "\n"
            << "     FILTERED phrase pairs: " << (pt_lines - nremoved_pfefilter - nremoved_sigfilter) << "   (" << (100.0-sigfper - pfefper) << "%)\n"
            << "------------------------------------------------------\n";

  return 0;
}