// Entry point: loads every line of a dictionary file as a pattern into an
// AhoCorasick instance, then searches each line of a text file.
//
// Usage: <program> dictionary_file text_file
//
// For every text line the program echoes the line after "> ", calls
// ac.search() with a callback that prints each reported word shifted right by
// its reported position, and finishes with an 80-character rule of dashes.
//
// Returns 0 on success and 1 on a usage error or when either input file
// cannot be opened (previously an unreadable file silently produced no output).
int main( int argc, char * argv[] )
{
    if( argc != 3 ){
        std::cerr << "Usage: " << argv[ 0 ] << " dictionary_file text_file" << std::endl;
        return 1;
    }

    std::ifstream dictFile( argv[ 1 ] );
    if( !dictFile ){
        std::cerr << "Cannot open dictionary file: " << argv[ 1 ] << std::endl;
        return 1;
    }

    std::ifstream textFile( argv[ 2 ] );
    if( !textFile ){
        std::cerr << "Cannot open text file: " << argv[ 2 ] << std::endl;
        return 1;
    }

    AhoCorasick ac;

    // Callback handed to ac.search(): prints the word under the echoed line,
    // padded with `pos` spaces after the same "> " prefix.
    auto const print = []( size_t const pos, std::string const & word ){
        std::cout << "> " << std::string( pos, ' ' ) << word << std::endl;
    };

    std::string line;

    // One pattern per dictionary line.
    while( std::getline( dictFile, line ) ){
        ac.insert( line );
    }

    while( std::getline( textFile, line ) ){
        std::cout << "> " << line << "\n";
        ac.search( line, print );
        std::cout << std::string( 80, '-' ) << "\n";
    }

    return 0;
}
void loadPattern(const string &filename) { FILE* in = tryOpen(filename, "r"); for (;getLine(in);) { vector<string> tokens = splitBy(line, ','); string pattern = tolower(tokens[0]); int occurrence; fromString(tokens[1], occurrence); patterns[pattern] = occurrence; prob[pattern] = occurrence; quote[pattern] = parenthesis[pattern] = dash[pattern] = capital[pattern] = total[pattern] = 0; size_t tokensN = splitBy(pattern, ' ').size(); f[pattern].resize(tokensN, 0); sumOutside[pattern].resize(tokensN, 0); tree.add(" " + pattern + " "); } fclose(in); //cerr << "# Pattern = " << prob.size() << endl; tree.make(); //cerr << "Tree is built" << endl; }
void eskoTest() { unsigned step = 5000; unsigned start = 30000; unsigned stop = 120000; unsigned topL = 300; unsigned minL = 5; for(unsigned cur = start; cur <= stop; cur += step) { takeSubstring("data/ecoli.seq", "data/tmpseq.seq", 0, cur); //takeSubstring("data/ecoli.seq", "data/tmpseq.seq", 278000, 293000); unsigned L = minL; unsigned R = topL; unsigned ans = 0U; while (L<=R) { unsigned mid = (L+R) / 2U; unsigned k = mid-1U; Genome genome; readGenome("data/tmpseq.seq", mid, genome); AhoCorasick * aho = new AhoCorasick(genome); aho->filterOverlaps(k); NFA_Automata nfa(*aho, genome, 0, genome.generatedReads.size()-1); delete aho; DFA_Automata dfa(nfa, genome, 1.0); // remove reads since we do not need them anymore genome.generatedReads.resize(0); genome.sequence.resize(0); bool uniqueOk = dfa.isCOAUnique(); if (uniqueOk) { R = mid-1U; ans = mid; } else { L = mid+1U; } } std::string seq = readSequence("data/tmpseq.seq"); std::cout << cur << " " << ans << " " << getLongestSingleRepeat(seq) << std::endl; } }
void selftest() { // self test AhoCorasick tree; tree.add("a"); tree.add("aa"); tree.add("ab"); tree.make(); // 0123456789 vector< pair<int, int> > positions; tree.search("aa ab baa", positions); /*FOR (iter, positions) { cerr << iter->first << " " << iter->second << endl; }*/ assert(positions.size() == 8); //cerr << "self test on AC automaton passed" << endl; }
void run(size_t kFilter) { Genome genome; readGenome(sequenceFile, patternLen, genome); size_t numReads = genome.generatedReads.size(); std::cout << genome.generatedReads.size() << " " << (genome.generatedReads[0].second - genome.generatedReads[0].first) << " " << kFilter << std::endl; std::cout << "Longest repeat: " << getLongestSingleRepeat(genome.sequence) << std::endl; double coverage = double(numReads * patternLen) / double(genome.sequence.length()); cout << numReads * patternLen << endl; std::cout << "START AC" << std::endl; AhoCorasick * aho = new AhoCorasick(genome); std::cout << "END AC" << std::endl; std::cout << "START FILTER" << std::endl; aho->filterOverlaps(kFilter); std::cout << "END FILTER" << std::endl; std::cout << "START NFA" << std::endl; NFA_Automata nfa(*aho, genome, 0, genome.generatedReads.size()-1); std::cout << "END NFA" << std::endl; // aho corasick is not needed once we have the nfa FOR NOW delete aho; #ifdef DEBUG nfa.saveAsSVG("data/nfa.svg"); #endif // create DFA from NFA std::cout << "START DFA" << std::endl; DFA_Automata dfa(nfa, genome, coverage); // remove reads since we do not need them anymore genome.generatedReads.resize(0); genome.sequence.resize(0); std::cout << "END DFA" << std::endl; std::cout << "UNIQUE ? " << dfa.isCOAUnique() << std::endl; size_t numLoops = Statistics::getNumberOfLoops(&dfa); std::cout << numLoops << std::endl; #ifdef DEBUG dfa.saveAsSVG("data/dfa.svg"); #endif }
int main() { AhoCorasick ak; ak.AddString("test"); ///initialization ak.AddString("st"); ak.AddString("ring"); ak.AddString("stringed"); ak.Init(); ak.Search("test string", print); getchar(); }
int main(){ ios_base::sync_with_stdio(false); int N; while (cin >> N && N) { ac.reset(); vector<string> words(N); map<string, int> ms; for (int i = 0; i < N; i++) cin >> words[i]; for (int i = 0; i < N; i++) { ac.insert(words[i].c_str(), i + 1); ms[words[i]] = i + 1; } ac.getFail(); string text; cin >> text; ac.find(text.c_str()); int best = *max_element(ac.cnt, ac.cnt + N + 1); cout << best << endl; for (int i = 0; i < N; i++){ if (ac.cnt[ms[words[i]]] == best){ cout << words[i] << endl; } } } }
int main() { AhoCorasick ak; ak.AddString("test"); ///initialization ak.AddString("rok"); /// ak.AddString("sto"); /// ak.AddString("st"); /// ak.Init(); ak.Search("testovaya_stroka", print); }
// Demo driver: registers five patterns, initialises the automaton, searches
// the text "testovaya_stroka!" with the `print` callback and then waits for
// a character on standard input before exiting with status 0.
int main()
{
    AhoCorasick ak;

    // Register the patterns.
    ak.addString("test");
    ak.addString("rok");
    ak.addString("roka");
    ak.addString("strok");
    ak.addString("t");

    // Finish construction before searching.
    ak.init();

    ak.search("testovaya_stroka!", print);

    // Wait for one character on stdin before returning.
    cin.get();
    return 0;
}
int main(int argc, char* argv[]) { if (argc != 6) { cerr << "[usage] <sentencesText.buf> <patterns.csv> <stopwords.txt> <stopwordsFromText.txt> <final.csv>" << endl; return -1; } selftest(); loadSentences(argv[1]); loadPattern(argv[2]); loadStopwords(argv[3], argv[4]); int corpusTokensN = 0; for (size_t sentenceID = 0; sentenceID < sentences.size(); ++ sentenceID) { const string &text = sentences[sentenceID]; string alpha = text; for (size_t i = 0; i < alpha.size(); ++ i) { if (isalpha(alpha[i])) { alpha[i] = tolower(alpha[i]); } else { if (alpha[i] != '\'') { alpha[i] = ' '; } } } corpusTokensN += splitBy(alpha, ' ').size(); string outsideText = alpha; if (sentenceID > 0) { outsideText += " " + sentences[sentenceID - 1]; } if (sentenceID + 1 < sentences.size()) { outsideText += " " + sentences[sentenceID + 1]; } for (size_t i = 0; i < outsideText.size(); ++ i) { if (isalpha(outsideText[i])) { outsideText[i] = tolower(outsideText[i]); } else { outsideText[i] = ' '; } } vector<string> outside = splitBy(outsideText, ' '); unordered_map<string, int> outsideCnt; FOR (token, outside) { ++ outsideCnt[*token]; } vector< pair<int, int> > positions; tree.search(" " + alpha + " ", positions); unordered_map<string, int> patternCnt; FOR (pos, positions) { int st = pos->first; int ed = pos->second - 2; string pattern = alpha.substr(st, ed - st); ++ patternCnt[pattern]; } FOR (pos, positions) { int st = pos->first; int ed = pos->second - 2; string pattern = alpha.substr(st, ed - st); vector<string> tokens = splitBy(pattern, ' '); unordered_map<string, int> tokenCnt; int delta = patternCnt[pattern]; for (size_t i = 0; i < tokens.size(); ++ i) { tokenCnt[tokens[i]] += delta; } for (size_t i = 0; i < tokens.size(); ++ i) { if (outsideCnt[tokens[i]] > tokenCnt[tokens[i]]) { f[pattern][i] += 1; sumOutside[pattern][i] += outsideCnt[tokens[i]] - tokenCnt[tokens[i]]; } } total[pattern] += 1; if (st > 0 && ed < (int)text.size()) { if (text[st - 1] == '(' && text[ed] == ')') { 
parenthesis[pattern] += 1; } if (text[st - 1] == '"' && text[ed] == '"') { quote[pattern] += 1; } } bool found = false; for (int i = st; i < ed && !found; ++ i) { found |= text[i] == '-'; } dash[pattern] += found; bool valid = true; for (int i = st; i < ed && valid; ++ i) { if (isalpha(alpha[i]) && (i == st || alpha[i - 1] == ' ')) { if (text[i] < 'A' && text[i] > 'Z') { valid = false; } } } capital[pattern] += valid; }
void runBenchmark() { typedef std::pair<size_t, size_t> TestEntry; std::ofstream out("data/benchmark.txt"); int tval = 17; out << std::setw(tval) << std::left << "AC_SZ" << std::setw(tval) << std::left << "AC_TIME" << std::setw(tval) << std::left << "NOA_ST_SZ" << std::setw(tval) << std::left << "NOA_TR_SZ" << std::setw(tval) << std::left << "NOA_TIME" << std::setw(tval) << std::left << "DOA_ST_SZ" << std::setw(tval) << std::left << "DOA_TR_SZ" << std::setw(tval) << std::left << "DOA_PATH_LEN" << std::setw(tval) << std::left << "DOA_CNT_LOOPS" << std::setw(tval) << std::left << "DOA_CNT_BR_NO" << std::setw(tval) << std::left << "DOA_CNT_ACC_NO" << std::setw(tval) << std::left << "DOA_TIME" << std::setw(tval) << std::left << "REF_LEN" << std::setw(tval) << std::left << "NUM_READS" << std::setw(tval) << std::left << "LEN_READS" << std::setw(tval) << std::left << "SAMPLING" << std::endl; // seq length size_t R = 300000; std::vector<TestEntry> Tests = { TestEntry(60, 180), TestEntry(120, 180), }; std::vector<double> Results; Timer timer; double dt; size_t statesCnt; size_t transitionsCnt; for (size_t i = 0; i < Tests.size(); ++i) { // prepare and load test size_t K = Tests[i].first; size_t L = Tests[i].second; takePrefix("data/ecoli.seq", "data/test.seq", R); Genome genome; readGenome("data/test.seq", L, genome); double coverage = double(genome.generatedReads.size() * L) / double(R); TestResult tmpResult; // general stuff tmpResult.REFERENCE_LENGTH = R; tmpResult.NUMBER_OF_READS = genome.generatedReads.size(); tmpResult.SAMPLING = SAMPLING_TYPE::REGULARLY_SPACED; tmpResult.TOTAL_LENGTH_OF_READS = genome.generatedReads.size() * L; std::cout << "START AHO" << std::endl; // AHO timer.start(); AhoCorasick * aho = new AhoCorasick(genome); aho->filterOverlaps(K); timer.stop(); dt = timer.getElapsedTimeInSeconds(); Statistics::calcACStates(aho, statesCnt); tmpResult.AHO_SIZE = statesCnt; tmpResult.AHO_TIME = dt; std::cout << "END AHO" << std::endl; std::cout << "START NFA" << 
std::endl; timer.start(); NFA_Automata * nfa = new NFA_Automata(*aho, genome, 0, genome.generatedReads.size()-1); timer.stop(); dt = timer.getElapsedTimeInSeconds(); std::cout << "END NFA" << std::endl; // clean not needed data Statistics::calcNOAStatesAndTransitions(nfa, statesCnt, transitionsCnt); tmpResult.NOA_STATES_SIZE = statesCnt; tmpResult.NOA_TRANSITIONS_SIZE = transitionsCnt; tmpResult.NOA_TIME = dt; // clean not needed data delete aho; std::cout << "START DFA" << std::endl; timer.start(); DFA_Automata * dfa = new DFA_Automata(*nfa, genome, coverage); timer.stop(); dt = timer.getElapsedTimeInSeconds(); std::cout << "END DFA" << std::endl; // clean not needed data delete nfa; genome.generatedReads.clear(); genome.sequence.clear(); std::cout << "START STORING REV EDGES" << std::endl; dfa->storeReversedEdges(); std::cout << "END STORING REV EDGES" << std::endl; std::cout << "START UPDATING UNITIG PATHS" << std::endl; dfa->updateUnitigPaths(); std::cout << "STOP UPDATING UNITIG PATHS" << std::endl; Statistics::calcDOAStatesAndTransitions(dfa, statesCnt, transitionsCnt); tmpResult.DOA_STATES_SIZE = statesCnt; tmpResult.DOA_TRANSITIONS_SIZE = transitionsCnt; tmpResult.DOA_TIME = dt; tmpResult.DOA_NUMBER_BRANCHING_NODES = Statistics::getBranchingNodesCount(dfa); tmpResult.DOA_NUMBER_OF_LOOPS = Statistics::getNumberOfLoops(dfa); tmpResult.DOA_SHORTEST_PATH_LEN = Statistics::getShortestPathToAllAcceptStates(dfa); tmpResult.DOA_NUM_ACCEPT_STATES = dfa->m_acceptStates.size(); // clean not needed data delete dfa; std::cout << "DONE" << std::endl; out << tmpResult << std::endl; } out.close(); }
// Expands every entry of the NULL-terminated args.source_text_files array
// with glob() and searches each resulting path for args.patterns.
//
// Strategy selection per path:
//   - args.allowed_edit_distance != 0: approximate search (Sellers) for each
//     pattern in turn;
//   - otherwise, more than one pattern: a single AhoCorasick search;
//   - otherwise: KnuthMorrisPratt if args.kmp_flag is set, else BoyerMoore,
//     on args.patterns[0].
//
// A glob failure is reported on stderr and processing continues with the
// next entry. The glob result is released after every glob() call; the
// original released it only once after the loop, which leaked all but the
// last result and freed an uninitialised glob_t when the list was empty.
void search_files(program_args &args)
{
    const int flags = 0;

    for (int argIndex = 0; args.source_text_files[argIndex]; argIndex++) {
        glob_t results = glob_t();  // zero-initialised so globfree() is always safe
        const int ret = glob(args.source_text_files[argIndex], flags, glob_error, &results);

        if (ret != 0) {
            fprintf(stderr, "pmt: problem with %s (%s)\n",
                    args.source_text_files[argIndex],
                    (ret == GLOB_ABORTED ? "filesystem problem" :
                     ret == GLOB_NOMATCH ? "no match of pattern" :
                     ret == GLOB_NOSPACE ? "no dynamic memory" :
                     "unknown problem"));
            // continues even if it spots a problem
        } else {
            for (size_t pathIndex = 0; pathIndex < results.gl_pathc; ++pathIndex) {
                char* path = results.gl_pathv[pathIndex];

                // Check if it really is a file
                // NOTE(review): a non-regular path is only reported; it is
                // still searched below. Confirm whether it should be skipped.
                if (!is_regular_file(path)) {
                    cout << path << " isn't a regular file" << endl;
                }

                // call search algorithm
                if (args.allowed_edit_distance) {
                    // approximate search
                    ApproximateSearchStrategy* searchStrategy = new Sellers(args.allowed_edit_distance);
                    vector<Occurrence> result;
                    for (size_t j = 0; j < args.patterns.size(); j++) {
                        result = searchStrategy->search(args.patterns[j], path);
                        if (!result.size()) {
                            cout << "No occurrences found." << endl;
                        }
                        for (size_t k = 0; k < result.size(); k++) {
                            cout << "Occurrence at line " << result[k].lineNumber
                                 << ", ending at position " << result[k].position
                                 << " with error " << result[k].error << endl;
                        }
                    }
                    delete searchStrategy;
                } else {
                    // exact search
                    if (args.patterns.size() > 1) {
                        AhoCorasick ahoCorasick;
                        vector<OccurrenceMultiplePatterns> result;
                        result = ahoCorasick.search(args.patterns, path);
                        if (!result.size()) {
                            cout << "No occurrences found." << endl;
                        }
                        for (size_t j = 0; j < result.size(); j++) {
                            printf ("%s: Occurrence for pattern %s at line %d starting at position %d \n",
                                    path, result[j].value.c_str(),
                                    result[j].lineNumber, result[j].position);
                        }
                    } else {
                        ExactSearchStrategy* searchStrategy;
                        if (args.kmp_flag) {
                            searchStrategy = new KnuthMorrisPratt();
                        } else {
                            searchStrategy = new BoyerMoore();
                        }
                        vector<Occurrence> result;
                        result = searchStrategy->search(args.patterns[0], path);
                        if (!result.size()) {
                            cout << "No occurrences found." << endl;
                        }
                        for (size_t k = 0; k < result.size(); k++) {
                            printf ("%s: Occurrence at line %d starting at position %d \n",
                                    path, result[k].lineNumber, result[k].position);
                        }
                        delete searchStrategy;
                    }
                }
            }
        }

        // Release whatever this glob() call allocated before the next one.
        globfree(&results);
    }
}
void Test() { //----------------------------------------------------- //Default Test AhoCorasick aho; aho.AddPattern("he"); aho.AddPattern("she"); aho.AddPattern("his"); aho.AddPattern("hers"); aho.Compile(); aho.Print(); AhoCorasickMatch match = aho.SearchText("ushers"); match = aho.SearchText("ushers"); match.PrintMatches(); //----------------------------------------------------- //Homework aho.ClearPatterns(); aho.AddPattern("ccddc"); aho.AddPattern("cdcdc"); aho.AddPattern("cdc"); aho.AddPattern("cd"); aho.Compile(); aho.Print(); match = aho.SearchText("ccdcddcdcdddcdcdcdccdcd"); match.PrintMatches(); }
int main(int argc, char* argv[]) { if(argc == 2) { std::string szArg1 = argv[1]; if(szArg1 == "test") { Test(); return 0; } } if(argc != 5) { Help(argv[0]); return 0; } //Arguments std::string szArg1 = argv[1]; std::string szArg2 = argv[2]; std::string szArg3 = argv[3]; std::string szArg4 = argv[4]; //Streams std::ifstream fileInputStream; std::string szPatternLine; AhoCorasick ahoAutomat; AhoCorasickMatch ahoMatch; if(szArg1 != "-p" && szArg3 != "-t") { Help(argv[0]); return 0; } fileInputStream.open(szArg2.c_str(), std::ios::in); if(fileInputStream.fail()) { std::cout << "Failed at opening file [" << szArg2 << "]" << std::endl; return -1; } //Load Patterns while(std::getline(fileInputStream, szPatternLine)) { if(szPatternLine.length() == 0) continue; std::cout << "[Loading Patterns][" << szPatternLine << "]" << std::endl; ahoAutomat.AddPattern(szPatternLine); } //Compile and Print std::cout << "[Compiling Aho-Corasick State Machine]" << std::endl << std::endl; ahoAutomat.Compile(); ahoAutomat.Print(); //Find Matches and Print std::cout << "[Matching Results]" << std::endl << std::endl; ahoMatch = ahoAutomat.SearchText(szArg4); ahoMatch.PrintMatches(); fileInputStream.close(); return 0; }