/**
 * Command-line driver for the AhoCorasick matcher.
 *
 * Usage: <program> dictionary_file text_file
 *
 * Every line of the dictionary file is inserted into the matcher as one
 * pattern. Then, for every line of the text file, the line is echoed after
 * "> ", each match reported by the matcher is printed on its own line
 * (the word preceded by `pos` spaces so it lines up under the echoed text),
 * and an 80-dash separator closes the section.
 *
 * @param argc number of command-line arguments; must be exactly 3.
 * @param argv argv[1] is the dictionary path, argv[2] the text path.
 * @return 0 on success; 1 when the argument count is wrong or when either
 *         input file cannot be opened.
 */
int main( int argc, char * argv[] )
{
    if( argc != 3 ){
        std::cerr << "Usage: " << argv[ 0 ] << " dictionary_file text_file" << std::endl;
        return 1;
    }

    // Fail loudly on unreadable inputs: an unopened stream makes getline()
    // return false immediately, which previously produced an empty run that
    // still exited with status 0.
    std::ifstream dictFile( argv[ 1 ] );
    if( !dictFile ){
        std::cerr << "Error: cannot open dictionary file '" << argv[ 1 ] << "'" << std::endl;
        return 1;
    }

    std::ifstream textFile( argv[ 2 ] );
    if( !textFile ){
        std::cerr << "Error: cannot open text file '" << argv[ 2 ] << "'" << std::endl;
        return 1;
    }

    AhoCorasick ac;

    // Match callback: indent the word by `pos` columns beneath the "> " prefix.
    auto const print = []( size_t const pos, std::string const & word ){
        std::cout << "> " << std::string( pos, ' ' ) << word << '\n';
    };

    std::string line;

    // One pattern per dictionary line.
    while( std::getline( dictFile, line ) ){
        ac.insert( line );
    }

    // Search the text line by line, separating the per-line reports.
    while( std::getline( textFile, line ) ){
        std::cout << "> " << line << "\n";
        ac.search( line, print );
        std::cout << std::string( 80, '-' ) << "\n";
    }

    return 0;
}
int main() { AhoCorasick ak; ak.addString("test"); ak.addString("rok"); ak.addString("roka"); ak.addString("strok"); ak.addString("t"); ak.init(); ak.search("testovaya_stroka!", print); cin.get(); return 0; }
void selftest() { // self test AhoCorasick tree; tree.add("a"); tree.add("aa"); tree.add("ab"); tree.make(); // 0123456789 vector< pair<int, int> > positions; tree.search("aa ab baa", positions); /*FOR (iter, positions) { cerr << iter->first << " " << iter->second << endl; }*/ assert(positions.size() == 8); //cerr << "self test on AC automaton passed" << endl; }
int main(int argc, char* argv[]) { if (argc != 6) { cerr << "[usage] <sentencesText.buf> <patterns.csv> <stopwords.txt> <stopwordsFromText.txt> <final.csv>" << endl; return -1; } selftest(); loadSentences(argv[1]); loadPattern(argv[2]); loadStopwords(argv[3], argv[4]); int corpusTokensN = 0; for (size_t sentenceID = 0; sentenceID < sentences.size(); ++ sentenceID) { const string &text = sentences[sentenceID]; string alpha = text; for (size_t i = 0; i < alpha.size(); ++ i) { if (isalpha(alpha[i])) { alpha[i] = tolower(alpha[i]); } else { if (alpha[i] != '\'') { alpha[i] = ' '; } } } corpusTokensN += splitBy(alpha, ' ').size(); string outsideText = alpha; if (sentenceID > 0) { outsideText += " " + sentences[sentenceID - 1]; } if (sentenceID + 1 < sentences.size()) { outsideText += " " + sentences[sentenceID + 1]; } for (size_t i = 0; i < outsideText.size(); ++ i) { if (isalpha(outsideText[i])) { outsideText[i] = tolower(outsideText[i]); } else { outsideText[i] = ' '; } } vector<string> outside = splitBy(outsideText, ' '); unordered_map<string, int> outsideCnt; FOR (token, outside) { ++ outsideCnt[*token]; } vector< pair<int, int> > positions; tree.search(" " + alpha + " ", positions); unordered_map<string, int> patternCnt; FOR (pos, positions) { int st = pos->first; int ed = pos->second - 2; string pattern = alpha.substr(st, ed - st); ++ patternCnt[pattern]; } FOR (pos, positions) { int st = pos->first; int ed = pos->second - 2; string pattern = alpha.substr(st, ed - st); vector<string> tokens = splitBy(pattern, ' '); unordered_map<string, int> tokenCnt; int delta = patternCnt[pattern]; for (size_t i = 0; i < tokens.size(); ++ i) { tokenCnt[tokens[i]] += delta; } for (size_t i = 0; i < tokens.size(); ++ i) { if (outsideCnt[tokens[i]] > tokenCnt[tokens[i]]) { f[pattern][i] += 1; sumOutside[pattern][i] += outsideCnt[tokens[i]] - tokenCnt[tokens[i]]; } } total[pattern] += 1; if (st > 0 && ed < (int)text.size()) { if (text[st - 1] == '(' && text[ed] == ')') { 
parenthesis[pattern] += 1; } if (text[st - 1] == '"' && text[ed] == '"') { quote[pattern] += 1; } } bool found = false; for (int i = st; i < ed && !found; ++ i) { found |= text[i] == '-'; } dash[pattern] += found; bool valid = true; for (int i = st; i < ed && valid; ++ i) { if (isalpha(alpha[i]) && (i == st || alpha[i - 1] == ' ')) { if (text[i] < 'A' && text[i] > 'Z') { valid = false; } } } capital[pattern] += valid; }
void search_files(program_args &args) { int i; int flags = 0; glob_t results; int ret; for (i = 0; args.source_text_files[i]; i++) { ret = glob(args.source_text_files[i], flags, glob_error, & results); if (ret != 0) { fprintf(stderr, "pmt: problem with %s (%s)\n", args.source_text_files[i], (ret == GLOB_ABORTED ? "filesystem problem" : ret == GLOB_NOMATCH ? "no match of pattern" : ret == GLOB_NOSPACE ? "no dynamic memory" : "unknown problem")); // continues even if it spots a problem } else { for (int i = 0; i < results.gl_pathc; ++i) { // Check if it really is a file if (!is_regular_file(results.gl_pathv[i])) { cout << results.gl_pathv[i] << " isn't a regular file" << endl; } // else { // cout << results.gl_pathv[i] << endl; // } // call search algorithm if (args.allowed_edit_distance) { // approximate search ApproximateSearchStrategy* searchStrategy = new Sellers(args.allowed_edit_distance); vector<Occurrence> result; for (int j = 0; j < args.patterns.size(); j++) { result = searchStrategy->search(args.patterns[j], results.gl_pathv[i]); if (!result.size()) { cout << "No occurrences found." << endl; } for (int k = 0; k < result.size(); k++) { cout << "Occurrence at line " << result[k].lineNumber << ", ending at position " << result[k].position << " with error " << result[k].error << endl; } } delete searchStrategy; } else { // exact search if (args.patterns.size() > 1) { AhoCorasick ahoCorasick; vector<OccurrenceMultiplePatterns> result; result = ahoCorasick.search(args.patterns, results.gl_pathv[i]); if (!result.size()) { cout << "No occurrences found." 
<< endl; } for (int j = 0; j < result.size(); j++) { printf ("%s: Occurrence for pattern %s at line %d starting at position %d \n", results.gl_pathv[i], result[j].value.c_str(), result[j].lineNumber, result[j].position); //cout << "Occurrence for pattern " << result[j].value << // " at line " << result[j].lineNumber << // ", starting at position " << result[j].position << endl; } } else { ExactSearchStrategy* searchStrategy; if (args.kmp_flag) { searchStrategy = new KnuthMorrisPratt(); } else { searchStrategy = new BoyerMoore(); } vector<Occurrence> result; result = searchStrategy->search(args.patterns[0], results.gl_pathv[i]); if (!result.size()) { cout << "No occurrences found." << endl; } for (int k = 0; k < result.size(); k++) { printf ("%s: Occurrence at line %d starting at position %d \n", results.gl_pathv[i],result[k].lineNumber, result[k].position); //cout << "Occurrence at line " << result[k].lineNumber << ", starting at position " << result[k].position << endl; } delete searchStrategy; } } } } } globfree(&results); }