int main()
{
	for (size_t n = 0; n <= 20; ++n)
	{
		for (int i = 0; i < 1000; ++i)
		{
			std::string str = random_string(n);

			suffix_tree S(str);

			for (const std::string& substr : substrings(str))
			{
				assert(S.find(substr) == find_occurrences(substr, str));
			}

			/* search for random strings in str */
			for (size_t m = 0; m <= n; ++m)
			{
				std::string rnd_str = random_string(m);
				assert(S.find(rnd_str) == find_occurrences(rnd_str, str));
			}
		}

		std::cout << "passed random tests for strings of length " << n << std::endl;
	}

	return 0;
}
// Example #2
// 0
// input: unordered list of translation options for a single source phrase
//
// Filters `options` in place and annotates the surviving entries with
// co-occurrence statistics:
//   1. If pfe_filter_limit is positive and there are more options than that,
//      only the pfe_filter_limit entries that std::nth_element places first
//      under PfeComparer are kept; the others are deleted and counted in
//      nremoved_pfefilter.
//   2. If pef_filter_only is set, or no options are left, the function stops.
//   3. For every option, cf (size of the occurrence set of the source phrase),
//      ce (size of the occurrence set of the target phrase) and cef (number
//      of ids present in both sets) are computed and stored through
//      set_cooc_stats together with -log(fisher_exact(cef, cf, ce)).
//   4. Every option for which NlogSigThresholder(sig_filter_limit) returns
//      true is removed from the vector and counted in nremoved_sigfilter.
//
// NOTE(review): the std::binary_search calls below require both occurrence
// sets to be sorted; presumably find_occurrences guarantees this - confirm.
void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options)
{
  if (pfe_filter_limit>0 && options.size() > pfe_filter_limit) {
    nremoved_pfefilter += (options.size() - pfe_filter_limit);
    std::nth_element(options.begin(), options.begin()+pfe_filter_limit, options.end(), PfeComparer());
    // The entries behind the cut-off are owned by this vector: free them
    // before dropping the pointers.
    for (std::vector<PTEntry*>::iterator cut=options.begin()+pfe_filter_limit; cut != options.end(); ++cut)
      delete *cut;
    options.erase(options.begin()+pfe_filter_limit,options.end());
  }
  if (pef_filter_only) return;
  // Nothing to score. This guard also keeps options.front() below from being
  // called on an empty vector, which would be undefined behaviour.
  if (options.empty()) return;
//   std::cerr << "f phrase: " << options.front()->f_phrase << "\n";
  // All options share the same source phrase, so its occurrence set is
  // looked up once, using the first entry.
  SentIdSet fset = find_occurrences(options.front()->f_phrase, f_sa, fsets);
  size_t cf = fset.size();
  for (std::vector<PTEntry*>::iterator opt=options.begin(); opt != options.end(); ++opt) {
    const std::string& e_phrase = (*opt)->e_phrase;
    size_t cef=0;
    // operator[] creates an empty cache slot on first use; the time stamp
    // records when the slot was last touched.
    ClockedSentIdSet& clocked_eset = esets[e_phrase];
    SentIdSet & eset = clocked_eset.first;
    clocked_eset.second = clock();
    if (eset.empty()) {
        eset = find_occurrences(e_phrase, e_sa, esets);
        //std::cerr << "Looking up e-phrase: " << e_phrase << "\n";
    }
    size_t ce=eset.size();
    // Count the ids common to both sets by walking the smaller set and
    // binary-searching the larger one.
    if (ce < cf) {
      for (SentIdSet::iterator id=eset.begin(); id != eset.end(); ++id) {
        if (std::binary_search(fset.begin(), fset.end(), *id)) cef++;
      }
    } else {
      for (SentIdSet::iterator id=fset.begin(); id != fset.end(); ++id) {
        if (std::binary_search(eset.begin(), eset.end(), *id)) cef++;
      }
    }
    double nlp = -log(fisher_exact(cef, cf, ce));
    (*opt)->set_cooc_stats(cef, cf, ce, nlp);
    // Sets smaller than MINIMUM_SIZE_TO_KEEP are dropped from the cache
    // again. This must stay the last use of eset / clocked_eset in this
    // iteration: both references dangle once the slot is erased.
    if (ce < MINIMUM_SIZE_TO_KEEP) {
      esets.erase(e_phrase);
    }

  }
  // NOTE(review): entries rejected here are not deleted by this function;
  // presumably NlogSigThresholder or the caller takes care of them - confirm.
  std::vector<PTEntry*>::iterator new_end =
    std::remove_if(options.begin(), options.end(), NlogSigThresholder(sig_filter_limit));
  nremoved_sigfilter += (options.end() - new_end);
  options.erase(new_end,options.end());
}