Пример #1
0
/**
 * Builds an Aho-Corasick automaton from a dictionary file (one word per line)
 * and reports the dictionary words found in every line of a text file.
 *
 * Usage: <program> dictionary_file text_file
 *
 * For each text line the program prints "> " followed by the line, then one
 * output line per reported match ("> ", `pos` spaces, the word), and finally
 * a separator of 80 dashes.
 *
 * @return 0 on success; 1 on wrong usage or when an input file cannot be
 *         opened.
 */
int main( int argc, char * argv[] )
{
    if( argc != 3 ){
        std::cerr << "Usage: " << argv[ 0 ] << " dictionary_file text_file"
            << std::endl;
        return 1;
    }

    // Fail loudly on unreadable inputs instead of silently printing nothing.
    std::ifstream dictFile( argv[ 1 ] );
    if( !dictFile.is_open() ){
        std::cerr << "Cannot open dictionary file: " << argv[ 1 ] << std::endl;
        return 1;
    }

    std::ifstream textFile( argv[ 2 ] );
    if( !textFile.is_open() ){
        std::cerr << "Cannot open text file: " << argv[ 2 ] << std::endl;
        return 1;
    }

    AhoCorasick ac;

    // Match callback: pads the word with `pos` spaces so it is printed
    // `pos` columns to the right of the "> " prefix (presumably `pos` is the
    // match offset within the line -- confirm against AhoCorasick::search).
    auto const print = []( size_t const pos, std::string const & word ){
        std::cout << "> " << std::string( pos, ' ' ) << word << std::endl;
    };

    std::string line;

    // Every dictionary line becomes one pattern.
    while( std::getline( dictFile, line ) ){
        ac.insert( line );
    }

    while( std::getline( textFile, line ) ){
        std::cout << "> " << line << "\n";
        ac.search( line, print );
        std::cout << std::string( 80, '-' ) << "\n";
    }
}
Пример #2
0
/**
 * Loads the pattern list from `filename` and registers every pattern in the
 * global statistics tables and in the global automaton `tree`.
 *
 * Each input line is split on ','. Field 0 is the pattern text (lower-cased
 * before use) and field 1 is parsed into an integer occurrence count. Lines
 * that do not provide both fields are skipped.
 *
 * For every accepted pattern:
 *   - `patterns` and `prob` are set to the parsed occurrence count;
 *   - `quote`, `parenthesis`, `dash`, `capital` and `total` are reset to 0;
 *   - `f` and `sumOutside` get one zero-initialised slot per space-separated
 *     token of the pattern;
 *   - the pattern, wrapped in single spaces, is added to `tree`.
 *
 * After the whole file is consumed the automaton is finalised via
 * `tree.make()`.
 */
void loadPattern(const string &filename)
{
	FILE* in = tryOpen(filename, "r");
	for (;getLine(in);) {
		vector<string> tokens = splitBy(line, ',');
		if (tokens.size() < 2) {
			// Malformed line (no "pattern,count" pair): reading tokens[1]
			// would be out of bounds, so skip it.
			continue;
		}
		string pattern = tolower(tokens[0]);
		int occurrence = 0; // defined value even if parsing leaves it untouched
		fromString(tokens[1], occurrence);
		
		patterns[pattern] = occurrence;
		prob[pattern] = occurrence;
		
		quote[pattern] = parenthesis[pattern] = dash[pattern] = capital[pattern] = total[pattern] = 0;
		
		// One counter per word of the pattern.
		size_t tokensN = splitBy(pattern, ' ').size();
		f[pattern].resize(tokensN, 0);
		sumOutside[pattern].resize(tokensN, 0);
		
		// The surrounding spaces make the automaton match the pattern only
		// when it is delimited by spaces in the searched text.
		tree.add(" " + pattern + " ");
	}
	fclose(in);
	//cerr << "# Pattern = " << prob.size() << endl;
	
	tree.make();
	//cerr << "Tree is built" << endl;
}
Пример #3
0
void eskoTest() {
    unsigned step = 5000;
    unsigned start = 30000;
    unsigned stop = 120000;
    unsigned topL = 300;
    unsigned minL = 5;

    for(unsigned cur = start; cur <= stop; cur += step) {
        takeSubstring("data/ecoli.seq", "data/tmpseq.seq", 0, cur);
        //takeSubstring("data/ecoli.seq", "data/tmpseq.seq", 278000, 293000);

        unsigned L = minL;
        unsigned R = topL;
        unsigned ans = 0U;

        while (L<=R) {
            unsigned mid = (L+R) / 2U;
            unsigned k = mid-1U;
            Genome genome;
            readGenome("data/tmpseq.seq", mid, genome);

            AhoCorasick * aho = new AhoCorasick(genome);

            aho->filterOverlaps(k);

            NFA_Automata nfa(*aho, genome, 0, genome.generatedReads.size()-1);

            delete aho;

            DFA_Automata dfa(nfa, genome, 1.0);
            // remove reads since we do not need them anymore
            genome.generatedReads.resize(0);
            genome.sequence.resize(0);

            bool uniqueOk = dfa.isCOAUnique();

            if (uniqueOk) {
                R = mid-1U;
                ans = mid;
            } else {
                L = mid+1U;
            }

        }

        std::string seq = readSequence("data/tmpseq.seq");

        std::cout << cur << " " << ans << " " << getLongestSingleRepeat(seq) << std::endl;



    }

}
Пример #4
0
void selftest()
{
    // self test
    AhoCorasick tree;
    tree.add("a");
    tree.add("aa");
    tree.add("ab");
    tree.make();
    //                                                0123456789
    vector< pair<int, int> > positions;
    tree.search("aa ab baa", positions);
    /*FOR (iter, positions) {
        cerr << iter->first << " " << iter->second << endl;
    }*/
    assert(positions.size() == 8);
    
    //cerr << "self test on AC automaton passed" << endl;
}
Пример #5
0
/**
 * Runs the full pipeline on the globally configured input: reads the genome
 * from `sequenceFile` with reads of length `patternLen`, builds the
 * Aho-Corasick structure, filters overlaps, builds the NFA and then the DFA,
 * printing progress markers and a few statistics to standard output.
 *
 * @param kFilter value forwarded to AhoCorasick::filterOverlaps.
 *
 * If no reads were generated the function reports that on standard error and
 * returns, because the rest of the pipeline indexes the first and the last
 * read.
 */
void run(size_t kFilter) {
    Genome genome;
    readGenome(sequenceFile, patternLen, genome);
    size_t numReads = genome.generatedReads.size();
    if (numReads == 0) {
        // generatedReads[0] and numReads - 1 below would be invalid.
        std::cerr << "No reads generated, nothing to do" << std::endl;
        return;
    }

    // "<number of reads> <length of the first read> <kFilter>"
    std::cout << numReads << " " <<
        (genome.generatedReads[0].second - genome.generatedReads[0].first) << " " << kFilter << std::endl;

    std::cout << "Longest repeat: " << getLongestSingleRepeat(genome.sequence) << std::endl;

    // Total read length divided by the sequence length.
    double coverage = double(numReads * patternLen) / double(genome.sequence.length());
    std::cout << numReads * patternLen << std::endl;

    std::cout << "START AC" << std::endl;
    AhoCorasick * aho = new AhoCorasick(genome);
    std::cout << "END AC" << std::endl;

    std::cout << "START FILTER" << std::endl;
    aho->filterOverlaps(kFilter);
    std::cout << "END FILTER" << std::endl;

    std::cout << "START NFA" << std::endl;
    NFA_Automata nfa(*aho, genome, 0, numReads - 1);
    std::cout << "END NFA" << std::endl;

    // aho corasick is not needed once we have the nfa FOR NOW
    delete aho;
    #ifdef DEBUG
        nfa.saveAsSVG("data/nfa.svg");
    #endif

    // create DFA from NFA
    std::cout << "START DFA" << std::endl;
    DFA_Automata dfa(nfa, genome, coverage);
    // remove reads since we do not need them anymore
    genome.generatedReads.resize(0);
    genome.sequence.resize(0);

    std::cout << "END DFA" << std::endl;
    std::cout << "UNIQUE ? " << dfa.isCOAUnique() << std::endl;
    size_t numLoops = Statistics::getNumberOfLoops(&dfa);
    std::cout << numLoops << std::endl;

    #ifdef DEBUG
        dfa.saveAsSVG("data/dfa.svg");
    #endif
}
Пример #6
0
int main()
{
    AhoCorasick ak;
    ak.AddString("test");					///initialization
    ak.AddString("st");					
    ak.AddString("ring");					
    ak.AddString("stringed");						
    ak.Init();
    ak.Search("test string", print);
	getchar();
}
Пример #7
0
int main(){
  ios_base::sync_with_stdio(false); 
  int N;
  while (cin >> N && N) {
    ac.reset();
    vector<string> words(N);
    map<string, int> ms;
    for (int i = 0; i < N; i++) cin >> words[i];
    for (int i = 0; i < N; i++) {
      ac.insert(words[i].c_str(), i + 1); 
      ms[words[i]] = i + 1;
    }
    ac.getFail();
    string text; cin >> text;
    ac.find(text.c_str());
    int best = *max_element(ac.cnt, ac.cnt + N + 1);
    cout << best << endl; 
    for (int i = 0; i < N; i++){
      if (ac.cnt[ms[words[i]]] == best){
        cout << words[i] << endl;
      }
    }
  }
}
Пример #8
0
int main()
{
    AhoCorasick ak;
    ak.AddString("test");					///initialization
    ak.AddString("rok");					///
    ak.AddString("sto");					///
    ak.AddString("st");						///
    ak.Init();
    ak.Search("testovaya_stroka", print);
}
Пример #9
0
int main()
{
    AhoCorasick ak;
 
    ak.addString("test");
    ak.addString("rok");
    ak.addString("roka");
    ak.addString("strok");
    ak.addString("t");
 
    ak.init();
 
    ak.search("testovaya_stroka!", print);
 
    cin.get();
 
    return 0;
}
Пример #10
0
// Entry point: collects per-pattern statistics over a sentence corpus.
//
// Arguments: <sentencesText.buf> <patterns.csv> <stopwords.txt>
//            <stopwordsFromText.txt> <final.csv>
//
// For every sentence the code normalises the text, finds all pattern
// occurrences with the global automaton `tree`, and updates the global tables
// `f`, `sumOutside`, `total`, `parenthesis`, `quote`, `dash` and `capital`.
//
// NOTE(review): the rest of this function lies outside the visible excerpt;
// only the part shown here is documented.
int main(int argc, char* argv[])
{
    if (argc != 6) {
        cerr << "[usage] <sentencesText.buf> <patterns.csv> <stopwords.txt> <stopwordsFromText.txt> <final.csv>" << endl;
        return -1;
    }
    selftest();
    
    loadSentences(argv[1]);
    loadPattern(argv[2]);
    loadStopwords(argv[3], argv[4]);
    
    int corpusTokensN = 0;
    for (size_t sentenceID = 0; sentenceID < sentences.size(); ++ sentenceID) {
        const string &text = sentences[sentenceID];

        // `alpha` is `text` with letters lower-cased and every other
        // character except the apostrophe replaced by a space. It has the
        // same length as `text`, so indices are interchangeable.
        string alpha = text;
        for (size_t i = 0; i < alpha.size(); ++ i) {
            if (isalpha(alpha[i])) {
                alpha[i] = tolower(alpha[i]);
            } else {
                if (alpha[i] != '\'') {
                    alpha[i] = ' ';
                }
            }
        }
        corpusTokensN += splitBy(alpha, ' ').size();
        
        // Context: this sentence plus its neighbours, with letters
        // lower-cased and every non-letter (apostrophes included) replaced
        // by a space.
        string outsideText = alpha;
        if (sentenceID > 0) {
            outsideText += " " + sentences[sentenceID - 1];
        }
        if (sentenceID + 1 < sentences.size()) {
            outsideText += " " + sentences[sentenceID + 1];
        }
        for (size_t i = 0; i < outsideText.size(); ++ i) {
            if (isalpha(outsideText[i])) {
                outsideText[i] = tolower(outsideText[i]);
            } else {
                outsideText[i] = ' ';
            }
        }
        
        // Token frequencies over the context.
        vector<string> outside = splitBy(outsideText, ' ');
        unordered_map<string, int> outsideCnt;
        FOR (token, outside) {
            ++ outsideCnt[*token];
        }
        
        // The sentence is padded with a space on both sides before the
        // search. NOTE(review): the "- 2" below presumably converts the
        // reported end in the padded text into an exclusive end index in
        // `alpha` -- confirm against AhoCorasick::search.
        vector< pair<int, int> > positions;
        tree.search(" " + alpha + " ", positions);
        
        // First pass: how often each pattern occurs in this sentence.
        unordered_map<string, int> patternCnt;
        FOR (pos, positions) {
            int st = pos->first;
            int ed = pos->second - 2;
            string pattern = alpha.substr(st, ed - st);
            ++ patternCnt[pattern];
        }

        // Second pass: update the statistics for every occurrence.
        FOR (pos, positions) {
            int st = pos->first;
            int ed = pos->second - 2;
            string pattern = alpha.substr(st, ed - st);
            
            // A token counts as appearing "outside" the pattern when the
            // context holds more copies of it than the pattern's own
            // occurrences in this sentence account for.
            vector<string> tokens = splitBy(pattern, ' ');
            unordered_map<string, int> tokenCnt;
            int delta = patternCnt[pattern];
            for (size_t i = 0; i < tokens.size(); ++ i) {
                tokenCnt[tokens[i]] += delta;
            }
            for (size_t i = 0; i < tokens.size(); ++ i) {
                if (outsideCnt[tokens[i]] > tokenCnt[tokens[i]]) {
                    f[pattern][i] += 1;
                    sumOutside[pattern][i] += outsideCnt[tokens[i]] - tokenCnt[tokens[i]];
                }
            }
            
            total[pattern] += 1;
            
            // Occurrence directly enclosed in (...) or "..." in the raw text.
            if (st > 0 && ed < (int)text.size()) {
                if (text[st - 1] == '(' && text[ed] == ')') {
                    parenthesis[pattern] += 1;
                }
                if (text[st - 1] == '"' && text[ed] == '"') {
                    quote[pattern] += 1;
                }
            }
            
            // Occurrence contains a dash in the raw text.
            bool found = false;
            for (int i = st; i < ed && !found; ++ i) {
                found |= text[i] == '-';
            }
            dash[pattern] += found;
            
            // Occurrence is capitalised: every word of it starts with an
            // upper-case ASCII letter in the raw text.
            bool valid = true;
            for (int i = st; i < ed && valid; ++ i) {
                if (isalpha(alpha[i]) && (i == st || alpha[i - 1] == ' ')) {
                    // Outside 'A'..'Z' means "not upper case". The previous
                    // `&&` could never be true, so every occurrence was
                    // counted as capitalised.
                    if (text[i] < 'A' || text[i] > 'Z') {
                        valid = false;
                    }
                }
            }
            capital[pattern] += valid;
        }
Пример #11
0
// Benchmarks the AhoCorasick -> NFA -> DFA pipeline on a 300000-character
// prefix of data/ecoli.seq for each (K, L) pair in `Tests` and writes one
// result row per pair to data/benchmark.txt, below a header row of
// left-aligned, 17-character-wide column names.
void runBenchmark() {
    // (K, L): K is passed to filterOverlaps, L is passed to readGenome.
    typedef std::pair<size_t, size_t> TestEntry;

    std::ofstream out("data/benchmark.txt");
    int tval = 17;
    // Header row.
    out << std::setw(tval) << std::left << "AC_SZ" << std::setw(tval) << std::left
        << "AC_TIME" << std::setw(tval) << std::left
        << "NOA_ST_SZ" << std::setw(tval) << std::left
        << "NOA_TR_SZ" << std::setw(tval) << std::left
        << "NOA_TIME" << std::setw(tval) << std::left
        << "DOA_ST_SZ" << std::setw(tval) << std::left
        << "DOA_TR_SZ" << std::setw(tval) << std::left
        << "DOA_PATH_LEN" << std::setw(tval) << std::left
        << "DOA_CNT_LOOPS" << std::setw(tval) << std::left
        << "DOA_CNT_BR_NO" << std::setw(tval) << std::left
        << "DOA_CNT_ACC_NO" << std::setw(tval) << std::left
        << "DOA_TIME" << std::setw(tval) << std::left
        << "REF_LEN" << std::setw(tval) << std::left
        << "NUM_READS" << std::setw(tval) << std::left
        << "LEN_READS" << std::setw(tval) << std::left
        << "SAMPLING" << std::endl;

    // seq length
    size_t R = 300000;

    std::vector<TestEntry> Tests = {
        TestEntry(60, 180),
        TestEntry(120, 180),
    };

    // NOTE(review): Results is never used in this function.
    std::vector<double> Results;

    Timer timer;
    double dt;
    size_t statesCnt;
    size_t transitionsCnt;

    for (size_t i = 0; i < Tests.size(); ++i) {
        // prepare and load test
        size_t K = Tests[i].first;
        size_t L = Tests[i].second;
        takePrefix("data/ecoli.seq", "data/test.seq", R);
        Genome genome;
        readGenome("data/test.seq", L, genome);

        // Total read length divided by the reference length.
        double coverage = double(genome.generatedReads.size() * L) / double(R);

        TestResult tmpResult;

        // general stuff
        tmpResult.REFERENCE_LENGTH = R;
        tmpResult.NUMBER_OF_READS = genome.generatedReads.size();
        tmpResult.SAMPLING = SAMPLING_TYPE::REGULARLY_SPACED;
        tmpResult.TOTAL_LENGTH_OF_READS = genome.generatedReads.size() * L;

        std::cout << "START AHO" << std::endl;
        // AHO
        // The timed section covers construction and overlap filtering.
        timer.start();
        AhoCorasick * aho = new AhoCorasick(genome);
        aho->filterOverlaps(K);
        timer.stop();
        dt = timer.getElapsedTimeInSeconds();
        Statistics::calcACStates(aho, statesCnt);
        tmpResult.AHO_SIZE = statesCnt;
        tmpResult.AHO_TIME = dt;
        std::cout << "END AHO" << std::endl;

        std::cout << "START NFA" << std::endl;
        // The timed section covers NFA construction only.
        timer.start();
        NFA_Automata * nfa = new NFA_Automata(*aho, genome, 0, genome.generatedReads.size()-1);
        timer.stop();
        dt = timer.getElapsedTimeInSeconds();
        std::cout << "END NFA" << std::endl;
        // clean not needed data

        Statistics::calcNOAStatesAndTransitions(nfa, statesCnt, transitionsCnt);
        tmpResult.NOA_STATES_SIZE = statesCnt;
        tmpResult.NOA_TRANSITIONS_SIZE = transitionsCnt;
        tmpResult.NOA_TIME = dt;

        // clean not needed data
        delete aho;

        std::cout << "START DFA" << std::endl;
        // The timed section covers DFA construction only; the reversed-edge
        // and unitig-path steps below run after timer.stop().
        timer.start();
        DFA_Automata * dfa = new DFA_Automata(*nfa, genome, coverage);
        timer.stop();
        dt = timer.getElapsedTimeInSeconds();
        std::cout << "END DFA" << std::endl;
        // clean not needed data
        delete nfa;
        genome.generatedReads.clear();
        genome.sequence.clear();

        std::cout << "START STORING REV EDGES" << std::endl;
        dfa->storeReversedEdges();
        std::cout << "END STORING REV EDGES" << std::endl;

        std::cout << "START UPDATING UNITIG PATHS" << std::endl;
        dfa->updateUnitigPaths();
        std::cout << "STOP UPDATING UNITIG PATHS" << std::endl;

        Statistics::calcDOAStatesAndTransitions(dfa, statesCnt, transitionsCnt);
        tmpResult.DOA_STATES_SIZE = statesCnt;
        tmpResult.DOA_TRANSITIONS_SIZE = transitionsCnt;
        // dt still holds the DFA construction time measured above.
        tmpResult.DOA_TIME = dt;
        tmpResult.DOA_NUMBER_BRANCHING_NODES = Statistics::getBranchingNodesCount(dfa);
        tmpResult.DOA_NUMBER_OF_LOOPS = Statistics::getNumberOfLoops(dfa);
        tmpResult.DOA_SHORTEST_PATH_LEN = Statistics::getShortestPathToAllAcceptStates(dfa);
        tmpResult.DOA_NUM_ACCEPT_STATES = dfa->m_acceptStates.size();

        // clean not needed data
        delete dfa;

        std::cout << "DONE" << std::endl;
        // One result row per test entry.
        out << tmpResult << std::endl;
    }
    out.close();
}
Пример #12
0
void search_files(program_args &args) {
  int i;
  int flags = 0;
  glob_t results;
  int ret;

  for (i = 0; args.source_text_files[i]; i++) {
    ret = glob(args.source_text_files[i], flags, glob_error, & results);
    if (ret != 0) {
      fprintf(stderr, "pmt: problem with %s (%s)\n",
        args.source_text_files[i],
        (ret == GLOB_ABORTED ? "filesystem problem" :
         ret == GLOB_NOMATCH ? "no match of pattern" :
         ret == GLOB_NOSPACE ? "no dynamic memory" :
         "unknown problem"));
      // continues even if it spots a problem
    } else {
      for (int i = 0; i < results.gl_pathc; ++i) {
        // Check if it really is a file
        if (!is_regular_file(results.gl_pathv[i])) {
          cout << results.gl_pathv[i] << " isn't a regular file" << endl;
        } // else {
        //   cout << results.gl_pathv[i] << endl;
        // }

        // call search algorithm
        if (args.allowed_edit_distance) { // approximate search
          ApproximateSearchStrategy* searchStrategy = new Sellers(args.allowed_edit_distance);
          vector<Occurrence> result;

          for (int j = 0; j < args.patterns.size(); j++) {
            result = searchStrategy->search(args.patterns[j], results.gl_pathv[i]);

            if (!result.size()) {
              cout << "No occurrences found." << endl;
            }

            for (int k = 0; k < result.size(); k++) {
              cout << "Occurrence at line " << result[k].lineNumber <<
                ", ending at position " << result[k].position << " with error " << result[k].error << endl;
            }
          }

          delete searchStrategy;
        } else { // exact search
          if (args.patterns.size() > 1) {
            AhoCorasick ahoCorasick;
            vector<OccurrenceMultiplePatterns> result;

            result = ahoCorasick.search(args.patterns, results.gl_pathv[i]);

            if (!result.size()) {
              cout << "No occurrences found." << endl;
            }

            for (int j = 0; j < result.size(); j++) {
              printf ("%s: Occurrence for pattern %s at line %d starting at position %d \n", results.gl_pathv[i], result[j].value.c_str(), result[j].lineNumber, result[j].position);
              //cout << "Occurrence for pattern " << result[j].value <<
              //  " at line " << result[j].lineNumber <<
              //  ", starting at position " << result[j].position << endl;
            }
          } else {
            ExactSearchStrategy* searchStrategy;

            if (args.kmp_flag) {
              searchStrategy = new KnuthMorrisPratt();
            } else {
              searchStrategy = new BoyerMoore();
            }

            vector<Occurrence> result;

            result = searchStrategy->search(args.patterns[0], results.gl_pathv[i]);

            if (!result.size()) {
              cout << "No occurrences found." << endl;
            }

            for (int k = 0; k < result.size(); k++) {
              printf ("%s: Occurrence at line %d  starting at position %d \n", results.gl_pathv[i],result[k].lineNumber, result[k].position);
              //cout << "Occurrence at line " << result[k].lineNumber << ", starting at position " << result[k].position << endl;
            }

            delete searchStrategy;
          }
        }
      }
    }
  }
  globfree(&results);
}
Пример #13
0
void Test()
{
	//-----------------------------------------------------
	//Default Test
	AhoCorasick aho;

	aho.AddPattern("he");
	aho.AddPattern("she");
	aho.AddPattern("his");
	aho.AddPattern("hers");

	aho.Compile();
	aho.Print();

	AhoCorasickMatch match = aho.SearchText("ushers");
	match = aho.SearchText("ushers");
	match.PrintMatches();

	//-----------------------------------------------------
	//Homework
	aho.ClearPatterns();

	aho.AddPattern("ccddc");
	aho.AddPattern("cdcdc");
	aho.AddPattern("cdc");
	aho.AddPattern("cd");

	aho.Compile();
	aho.Print();

	match = aho.SearchText("ccdcddcdcdddcdcdcdccdcd");
	match.PrintMatches();
}
Пример #14
0
// Command-line front end for the Aho-Corasick matcher.
//
//   <program> test                       runs the built-in Test()
//   <program> -p <pattern_file> -t <text>
//       loads one pattern per non-empty line of <pattern_file>, compiles and
//       prints the state machine, then prints the matches found in <text>.
//
// Any other argument shape prints the help text and returns 0. Returns -1
// when the pattern file cannot be opened.
int main(int argc, char* argv[])
{
	if(argc == 2)
	{
		std::string szArg1 = argv[1];
		if(szArg1 == "test")
		{
			Test();
			return 0;
		}
	}

	if(argc != 5)
	{
		Help(argv[0]);
		return 0;
	}

	//Arguments
	std::string szArg1 = argv[1];
	std::string szArg2 = argv[2];
	std::string szArg3 = argv[3];
	std::string szArg4 = argv[4];

	//Streams
	std::ifstream fileInputStream;
	std::string szPatternLine;

	AhoCorasick ahoAutomat;
	AhoCorasickMatch ahoMatch;

	// Both flags must be exactly as documented. The previous `&&` accepted
	// the command line as long as either one of them was right.
	if(szArg1 != "-p" || szArg3 != "-t") 
	{
		Help(argv[0]);
		return 0;
	}

	fileInputStream.open(szArg2.c_str(), std::ios::in);
	if(fileInputStream.fail())
	{
		std::cout << "Failed at opening file [" << szArg2 << "]" << std::endl;
		return -1;
	}
	
	//Load Patterns (empty lines are ignored)
	while(std::getline(fileInputStream, szPatternLine))
	{
		if(szPatternLine.length() == 0)
			continue;

		std::cout << "[Loading Patterns][" << szPatternLine << "]" << std::endl;
		ahoAutomat.AddPattern(szPatternLine);
	}

	//Compile and Print
	std::cout << "[Compiling Aho-Corasick State Machine]" << std::endl << std::endl;
	ahoAutomat.Compile();
	ahoAutomat.Print();

	//Find Matches and Print
	std::cout << "[Matching Results]" << std::endl << std::endl;
	ahoMatch = ahoAutomat.SearchText(szArg4);
	ahoMatch.PrintMatches();

	fileInputStream.close();

	return 0;
}