示例#1
0
/// Command-line driver: loads one pattern per line from a dictionary file
/// into an AhoCorasick instance, then searches every line of a text file.
///
/// Usage: <program> dictionary_file text_file
///
/// For each text line the line itself is echoed after "> ", every match is
/// printed by the callback indented by its reported position, and an
/// 80-character dashed separator follows.
///
/// @return 0 on success, 1 on a usage error or when an input file cannot be
///         opened.
int main( int argc, char * argv[] )
{
    if( argc != 3 ){
        std::cerr << "Usage: " << argv[ 0 ] << " dictionary_file text_file"
            << std::endl;
        return 1;
    }

    // Fail loudly on unreadable inputs; an unopened stream would otherwise
    // just make the read loops below do nothing and the run look successful.
    std::ifstream dictFile( argv[ 1 ] );
    if( !dictFile ){
        std::cerr << "Cannot open dictionary file: " << argv[ 1 ] << std::endl;
        return 1;
    }

    std::ifstream textFile( argv[ 2 ] );
    if( !textFile ){
        std::cerr << "Cannot open text file: " << argv[ 2 ] << std::endl;
        return 1;
    }

    AhoCorasick ac;

    // Match callback: pads with `pos` spaces so the word lines up under the
    // echoed text line (both are prefixed with "> ").
    auto const print = []( size_t const pos, std::string const & word ){
        std::cout << "> " << std::string( pos, ' ' ) << word << std::endl;
    };

    std::string line;

    // Every dictionary line is inserted as one pattern.
    while( std::getline( dictFile, line ) ){
        ac.insert( line );
    }

    while( std::getline( textFile, line ) ){
        std::cout << "> " << line << "\n";
        ac.search( line, print );
        std::cout << std::string( 80, '-' ) << "\n";
    }

    return 0;
}
示例#2
0
int main()
{
    AhoCorasick ak;
 
    ak.addString("test");
    ak.addString("rok");
    ak.addString("roka");
    ak.addString("strok");
    ak.addString("t");
 
    ak.init();
 
    ak.search("testovaya_stroka!", print);
 
    cin.get();
 
    return 0;
}
void selftest()
{
    // self test
    AhoCorasick tree;
    tree.add("a");
    tree.add("aa");
    tree.add("ab");
    tree.make();
    //                                                0123456789
    vector< pair<int, int> > positions;
    tree.search("aa ab baa", positions);
    /*FOR (iter, positions) {
        cerr << iter->first << " " << iter->second << endl;
    }*/
    assert(positions.size() == 8);
    
    //cerr << "self test on AC automaton passed" << endl;
}
// Entry point of the pattern feature extractor.
//
// Arguments: <sentencesText.buf> <patterns.csv> <stopwords.txt>
//            <stopwordsFromText.txt> <final.csv>
//
// For every sentence, each pattern occurrence reported by the global
// automaton `tree` updates the per-pattern statistics: `f`, `sumOutside`,
// `total`, `parenthesis`, `quote`, `dash` and `capital`.
//
// Returns -1 when the argument count is wrong.
//
// NOTE(review): this listing ends inside the per-sentence loop; the code
// that closes the loop and writes <final.csv> is not visible here.
int main(int argc, char* argv[])
{
    if (argc != 6) {
        cerr << "[usage] <sentencesText.buf> <patterns.csv> <stopwords.txt> <stopwordsFromText.txt> <final.csv>" << endl;
        return -1;
    }
    selftest();

    loadSentences(argv[1]);
    loadPattern(argv[2]);
    loadStopwords(argv[3], argv[4]);

    int corpusTokensN = 0;
    for (size_t sentenceID = 0; sentenceID < sentences.size(); ++ sentenceID) {
        const string &text = sentences[sentenceID];

        // `alpha` is a same-length normalised copy of `text`: letters are
        // lower-cased, apostrophes kept, everything else becomes a space.
        // Indices into `alpha` therefore map 1:1 onto `text`.
        string alpha = text;
        for (size_t i = 0; i < alpha.size(); ++ i) {
            // <cctype> functions need an unsigned char value; a negative
            // plain char is undefined behaviour.
            const unsigned char ch = static_cast<unsigned char>(alpha[i]);
            if (isalpha(ch)) {
                alpha[i] = tolower(ch);
            } else {
                if (alpha[i] != '\'') {
                    alpha[i] = ' ';
                }
            }
        }
        corpusTokensN += splitBy(alpha, ' ').size();

        // Context text: this sentence plus its immediate neighbours,
        // normalised like `alpha` except that apostrophes become spaces too.
        string outsideText = alpha;
        if (sentenceID > 0) {
            outsideText += " " + sentences[sentenceID - 1];
        }
        if (sentenceID + 1 < sentences.size()) {
            outsideText += " " + sentences[sentenceID + 1];
        }
        for (size_t i = 0; i < outsideText.size(); ++ i) {
            const unsigned char ch = static_cast<unsigned char>(outsideText[i]);
            if (isalpha(ch)) {
                outsideText[i] = tolower(ch);
            } else {
                outsideText[i] = ' ';
            }
        }

        // Token frequencies over the context text.
        vector<string> outside = splitBy(outsideText, ' ');
        unordered_map<string, int> outsideCnt;
        FOR (token, outside) {
            ++ outsideCnt[*token];
        }

        // The sentence is padded with one space on each side before searching.
        // NOTE(review): the "- 2" below presumably converts the reported end
        // offset in the padded string back to an index into `alpha` - confirm
        // against AhoCorasick::search.
        vector< pair<int, int> > positions;
        tree.search(" " + alpha + " ", positions);

        // First pass: how often each matched pattern text occurs in this sentence.
        unordered_map<string, int> patternCnt;
        FOR (pos, positions) {
            int st = pos->first;
            int ed = pos->second - 2;
            string pattern = alpha.substr(st, ed - st);
            ++ patternCnt[pattern];
        }

        // Second pass: update the per-pattern features for every occurrence.
        FOR (pos, positions) {
            int st = pos->first;
            int ed = pos->second - 2;
            string pattern = alpha.substr(st, ed - st);

            // Token counts contributed by all occurrences of this pattern.
            vector<string> tokens = splitBy(pattern, ' ');
            unordered_map<string, int> tokenCnt;
            int delta = patternCnt[pattern];
            for (size_t i = 0; i < tokens.size(); ++ i) {
                tokenCnt[tokens[i]] += delta;
            }
            // A token also occurs "outside" when the context holds more
            // copies of it than the pattern occurrences account for.
            for (size_t i = 0; i < tokens.size(); ++ i) {
                if (outsideCnt[tokens[i]] > tokenCnt[tokens[i]]) {
                    f[pattern][i] += 1;
                    sumOutside[pattern][i] += outsideCnt[tokens[i]] - tokenCnt[tokens[i]];
                }
            }

            total[pattern] += 1;

            // Occurrence wrapped directly in (...) or "..." in the raw text.
            if (st > 0 && ed < (int)text.size()) {
                if (text[st - 1] == '(' && text[ed] == ')') {
                    parenthesis[pattern] += 1;
                }
                if (text[st - 1] == '"' && text[ed] == '"') {
                    quote[pattern] += 1;
                }
            }

            // Occurrence contains a dash in the raw text.
            bool found = false;
            for (int i = st; i < ed && !found; ++ i) {
                found |= text[i] == '-';
            }
            dash[pattern] += found;

            // Occurrence counts as capitalised when every word-initial letter
            // is an upper-case ASCII letter in the raw text.
            bool valid = true;
            for (int i = st; i < ed && valid; ++ i) {
                if (isalpha(static_cast<unsigned char>(alpha[i])) && (i == st || alpha[i - 1] == ' ')) {
                    // Fixed: the original used '&&', which can never be true,
                    // so `valid` was never cleared.
                    if (text[i] < 'A' || text[i] > 'Z') {
                        valid = false;
                    }
                }
            }
            capital[pattern] += valid;
        }
示例#5
0
文件: utils.cpp 项目: peaonunes/pmt
/**
 * Expands every glob pattern in args.source_text_files (a null-terminated
 * array) and runs the configured search on each matching regular file,
 * printing the occurrences that are found.
 *
 * Strategy selection:
 *   - args.allowed_edit_distance != 0  -> approximate search (Sellers),
 *     once per pattern;
 *   - more than one pattern            -> AhoCorasick over all patterns;
 *   - otherwise                        -> KnuthMorrisPratt when
 *     args.kmp_flag is set, else BoyerMoore, on the single pattern.
 *
 * A glob failure is reported on stderr and processing continues with the
 * next source pattern. Paths that are not regular files are reported and
 * skipped.
 *
 * @param args parsed command-line options; only read here.
 */
void search_files(program_args &args) {
  const int flags = 0;

  for (int srcIdx = 0; args.source_text_files[srcIdx]; srcIdx++) {
    // A fresh, zero-initialised glob_t per pattern, released before the next
    // iteration: glob() without GLOB_APPEND overwrites the previous result,
    // so a single globfree() after the loop would leak the earlier ones and
    // would act on an uninitialised struct if the list were empty.
    glob_t results = {};
    const int ret = glob(args.source_text_files[srcIdx], flags, glob_error, &results);
    if (ret != 0) {
      fprintf(stderr, "pmt: problem with %s (%s)\n",
        args.source_text_files[srcIdx],
        (ret == GLOB_ABORTED ? "filesystem problem" :
         ret == GLOB_NOMATCH ? "no match of pattern" :
         ret == GLOB_NOSPACE ? "no dynamic memory" :
         "unknown problem"));
      // continues even if it spots a problem
      globfree(&results);
      continue;
    }

    for (size_t pathIdx = 0; pathIdx < results.gl_pathc; ++pathIdx) {
      char* path = results.gl_pathv[pathIdx];

      // Only regular files are searched; anything else is reported and skipped.
      if (!is_regular_file(path)) {
        cout << path << " isn't a regular file" << endl;
        continue;
      }

      if (args.allowed_edit_distance) { // approximate search
        // NOTE(review): deleting through the base pointer relies on
        // ApproximateSearchStrategy having a virtual destructor - confirm.
        ApproximateSearchStrategy* searchStrategy = new Sellers(args.allowed_edit_distance);
        vector<Occurrence> result;

        for (size_t j = 0; j < args.patterns.size(); j++) {
          result = searchStrategy->search(args.patterns[j], path);

          if (!result.size()) {
            cout << "No occurrences found." << endl;
          }

          for (size_t k = 0; k < result.size(); k++) {
            cout << "Occurrence at line " << result[k].lineNumber <<
              ", ending at position " << result[k].position << " with error " << result[k].error << endl;
          }
        }

        delete searchStrategy;
      } else if (args.patterns.size() > 1) { // exact search, several patterns
        AhoCorasick ahoCorasick;
        vector<OccurrenceMultiplePatterns> result;

        result = ahoCorasick.search(args.patterns, path);

        if (!result.size()) {
          cout << "No occurrences found." << endl;
        }

        for (size_t j = 0; j < result.size(); j++) {
          printf ("%s: Occurrence for pattern %s at line %d starting at position %d \n", path, result[j].value.c_str(), result[j].lineNumber, result[j].position);
        }
      } else { // exact search, single pattern
        ExactSearchStrategy* searchStrategy;

        if (args.kmp_flag) {
          searchStrategy = new KnuthMorrisPratt();
        } else {
          searchStrategy = new BoyerMoore();
        }

        vector<Occurrence> result;

        result = searchStrategy->search(args.patterns[0], path);

        if (!result.size()) {
          cout << "No occurrences found." << endl;
        }

        for (size_t k = 0; k < result.size(); k++) {
          printf ("%s: Occurrence at line %d  starting at position %d \n", path, result[k].lineNumber, result[k].position);
        }

        delete searchStrategy;
      }
    }

    globfree(&results);
  }
}