// Randomized consistency test for suffix_tree.
//
// For every length n in [0, 20] it runs 1000 trials. Each trial draws
// random_string(n), builds a suffix_tree over it and asserts that
// suffix_tree::find returns the same result as find_occurrences:
//   * for every entry produced by substrings(str), and
//   * for one freshly drawn random_string(m) per m in [0, n].
// A progress line is printed after all trials for a given n have passed.
int main()
{
  const size_t max_length = 20;
  const int trials_per_length = 1000;

  for (size_t n = 0; n <= max_length; ++n) {
    for (int remaining = trials_per_length; remaining > 0; --remaining) {
      std::string str = random_string(n);
      suffix_tree S(str);

      // Patterns taken from the text itself.
      for (const std::string& substr : substrings(str)) {
        assert(S.find(substr) == find_occurrences(substr, str));
      }

      // search for random strings in str
      size_t pattern_length = 0;
      while (pattern_length <= n) {
        std::string rnd_str = random_string(pattern_length);
        assert(S.find(rnd_str) == find_occurrences(rnd_str, str));
        ++pattern_length;
      }
    }
    std::cout << "passed random tests for strings of length " << n << std::endl;
  }

  return 0;
}
// input: unordered list of translation options for a single source phrase void compute_cooc_stats_and_filter(std::vector<PTEntry*>& options) { if (pfe_filter_limit>0 && options.size() > pfe_filter_limit) { nremoved_pfefilter += (options.size() - pfe_filter_limit); std::nth_element(options.begin(), options.begin()+pfe_filter_limit, options.end(), PfeComparer()); for (std::vector<PTEntry*>::iterator i=options.begin()+pfe_filter_limit; i != options.end(); ++i) delete *i; options.erase(options.begin()+pfe_filter_limit,options.end()); } if (pef_filter_only) return; // std::cerr << "f phrase: " << options.front()->f_phrase << "\n"; SentIdSet fset; fset = find_occurrences(options.front()->f_phrase, f_sa, fsets); size_t cf = fset.size(); for (std::vector<PTEntry*>::iterator i=options.begin(); i != options.end(); ++i) { const std::string& e_phrase = (*i)->e_phrase; size_t cef=0; ClockedSentIdSet& clocked_eset = esets[e_phrase]; SentIdSet & eset = clocked_eset.first; clocked_eset.second = clock(); if (eset.empty()) { eset = find_occurrences(e_phrase, e_sa, esets); //std::cerr << "Looking up e-phrase: " << e_phrase << "\n"; } size_t ce=eset.size(); if (ce < cf) { for (SentIdSet::iterator i=eset.begin(); i != eset.end(); ++i) { if (std::binary_search(fset.begin(), fset.end(), *i)) cef++; } } else { for (SentIdSet::iterator i=fset.begin(); i != fset.end(); ++i) { if (std::binary_search(eset.begin(), eset.end(), *i)) cef++; } } double nlp = -log(fisher_exact(cef, cf, ce)); (*i)->set_cooc_stats(cef, cf, ce, nlp); if (ce < MINIMUM_SIZE_TO_KEEP) { esets.erase(e_phrase); } } std::vector<PTEntry*>::iterator new_end = std::remove_if(options.begin(), options.end(), NlogSigThresholder(sig_filter_limit)); nremoved_sigfilter += (options.end() - new_end); options.erase(new_end,options.end()); }