/// Parses the "funny" and the "normal" review sets.
///
/// @param funny   review set filled by parsing `conf` under `prefix`
/// @param normal  review set filled by parsing `nope` under `prefix`
/// @param prefix  forwarded unchanged to both parse() calls
/// @param conf    forwarded to parse() for the funny set
/// @param nope    forwarded to parse() for the normal set
void initial_parse(ReviewParser<std::istream>::sets &funny,
                   ReviewParser<std::istream>::sets &normal,
                   const std::string &prefix,
                   const std::string &conf,
                   const std::string &nope)
{
    // None of the three locals below is used afterwards in this function.
    // NOTE(review): the tokenizer and stemmer are kept because their
    // constructors may have side effects on `nltk` - confirm before removing.
    auto &funnyReviewList = funny.rs;
    NLTKInstance::Word_Tokenizer tokenizer(nltk);
    NLTKInstance::Stemmer wordStemmer(nltk);

    parse(funny, prefix, conf);
    parse(normal, prefix, nope);

    std::cout << "parsing done" << std::endl;
}
void FTSSpec::scoreDocument( const BSONObj& obj, TermFrequencyMap* term_freqs ) const { if ( _textIndexVersion == TEXT_INDEX_VERSION_1 ) { return _scoreDocumentV1( obj, term_freqs ); } FTSElementIterator it( *this, obj ); while ( it.more() ) { FTSIteratorValue val = it.next(); Stemmer stemmer( *val._language ); Tools tools( *val._language, &stemmer, StopWords::getStopWords( *val._language ) ); _scoreStringV2( tools, val._text, term_freqs, val._weight ); } }
void FTSSpec::scoreDocument( const BSONObj& obj, const FTSLanguage& parentLanguage, const string& parentPath, bool isArray, TermFrequencyMap* term_freqs ) const { if ( _textIndexVersion == TEXT_INDEX_VERSION_1 ) { dassert( parentPath == "" ); dassert( !isArray ); return _scoreDocumentV1( obj, term_freqs ); } const FTSLanguage& language = _getLanguageToUseV2( obj, parentLanguage ); Stemmer stemmer( language ); Tools tools( language, &stemmer, StopWords::getStopWords( language ) ); // Perform a depth-first traversal of obj, skipping fields not touched by this spec. BSONObjIterator j( obj ); while ( j.more() ) { BSONElement elem = j.next(); string fieldName = elem.fieldName(); // Skip "language" specifier fields if wildcard. if ( wildcard() && languageOverrideField() == fieldName ) { continue; } // Compose the dotted name of the current field: // 1. parent path empty (top level): use the current field name // 2. parent path non-empty and obj is an array: use the parent path // 3. parent path non-empty and obj is a sub-doc: append field name to parent path string dottedName = ( parentPath.empty() ? fieldName : isArray ? parentPath : parentPath + '.' + fieldName ); // Find lower bound of dottedName in _weights. lower_bound leaves us at the first // weight that could possibly match or be a prefix of dottedName. And if this // element fails to match, then no subsequent weight can match, since the weights // are lexicographically ordered. Weights::const_iterator i = _weights.lower_bound( elem.type() == Object ? dottedName + '.' : dottedName ); // possibleWeightMatch is set if the weight map contains either a match or some item // lexicographically larger than fieldName. This boolean acts as a guard on // dereferences of iterator 'i'. bool possibleWeightMatch = ( i != _weights.end() ); // Optimize away two cases, when not wildcard: // 1. lower_bound seeks to end(): no prefix match possible // 2. 
lower_bound seeks to a name which is not a prefix if ( !wildcard() ) { if ( !possibleWeightMatch ) { continue; } else if ( !_matchPrefix( dottedName, i->first ) ) { continue; } } // Is the current field an exact match on a weight? bool exactMatch = ( possibleWeightMatch && i->first == dottedName ); double weight = ( possibleWeightMatch ? i->second : DEFAULT_WEIGHT ); switch ( elem.type() ) { case String: // Only index strings on exact match or wildcard. if ( exactMatch || wildcard() ) { _scoreStringV2( tools, elem.valuestr(), term_freqs, weight ); } break; case Object: // Only descend into a sub-document on proper prefix or wildcard. Note that // !exactMatch is a sufficient test for proper prefix match, because of // matchPrefix() continue block above. if ( !exactMatch || wildcard() ) { scoreDocument( elem.Obj(), language, dottedName, false, term_freqs ); } break; case Array: // Only descend into arrays from non-array parents or on wildcard. if ( !isArray || wildcard() ) { scoreDocument( elem.Obj(), language, dottedName, true, term_freqs ); } break; default: // Skip over all other BSON types. break; } } }
/*
 * ParseRankedQuery
 *
 * Splits QueryLine into words, stems each one with the stemming method
 * recorded in the dictionary header, and collects the words that exist in
 * the stemmed dictionary `sd` into a newly made TermList.  A word that is
 * seen again (either by its stemmed text or by its dictionary word number)
 * only has its Count incremented.  Words that FindWord() does not find are
 * dropped.
 *
 * If Sort is non-zero the resulting entries are sorted with qsort() using
 * doc_count_comp.
 *
 * Returns the term list.
 */
static TermList *
ParseRankedQuery (stemmed_dict * sd, char *QueryLine, int Sort)
{
  u_char Word[MAXSTEMLEN + 1];
  u_char *end, *s_in;
  TermList *Terms = MakeTermList (0);

  s_in = (u_char *) QueryLine;
  end = s_in + strlen ((char *) s_in) - 1;

  /* Advance to the first word if the line starts with a separator */
  if (!INAWORD (*s_in))
    PARSE_NON_STEM_WORD (s_in, end);

  while (s_in <= end)
    {
      int idx;
      long word_num;
      unsigned long count, doc_count, invf_ptr, invf_len;
      TermEntry entry;

      /* Extract the next word and reduce it to its stem */
      PARSE_STEM_WORD (Word, s_in, end);
      stemmer (sd->sdh.stem_method, Word);

      /* Step over the separator that follows the word */
      PARSE_NON_STEM_WORD (s_in, end);

      /* Already collected under the same stemmed text?  Bump its count. */
      idx = 0;
      while (idx < Terms->num && compare (Terms->TE[idx].Word, Word) != 0)
	idx++;
      if (idx < Terms->num)
	{
	  Terms->TE[idx].Count++;
	  continue;
	}

      /* Not seen before: it must exist in the stemmed dictionary */
      word_num = FindWord (sd, Word, &count, &doc_count, &invf_ptr, &invf_len);
      if (word_num == -1)
	continue;

      /* Already collected under the same dictionary word number? */
      idx = 0;
      while (idx < Terms->num && Terms->TE[idx].WE.word_num != word_num)
	idx++;
      if (idx < Terms->num)
	{
	  Terms->TE[idx].Count++;
	  continue;
	}

      /* First occurrence: append a new entry to the list */
      entry.WE.word_num = word_num;
      entry.WE.count = count;
      entry.WE.doc_count = doc_count;
      entry.WE.invf_ptr = invf_ptr;
      entry.WE.invf_len = invf_len;
      entry.Count = 1;
      entry.Word = copy_string (Word);
      if (!entry.Word)
	FatalError (1, "Could NOT create memory to add term");

      AddTermEntry (&Terms, &entry);
    }

  /* Optional ordering of the collected terms via doc_count_comp */
  if (Sort)
    qsort (Terms->TE, Terms->num, sizeof (TermEntry), doc_count_comp);

  return (Terms);
}
int main(int argc, char **argv) { if(argc < 2) { usage(argv); return 1; } try { char *action = argv[1]; char *db_path = argv[2]; if(!strcmp(action, "index")) { Xapian::WritableDatabase db(db_path, Xapian::DB_CREATE_OR_OPEN); Xapian::TermGenerator indexer; Xapian::Stem stemmer("english"); indexer.set_stemmer(stemmer); std::string doc_txt; while(true) { if(std::cin.eof()) break; std::string line; getline(std::cin, line); doc_txt += line; } if(!doc_txt.empty()) { Xapian::Document doc; doc.set_data(doc_txt); indexer.set_document(doc); indexer.index_text(doc_txt); db.add_document(doc); std::cout << "Indexed: " << indexer.get_description() << std::endl; } db.commit(); } else if(!strcmp(action, "search")) { if(argc < 4) { std::cerr << "You must supply a query string" << std::endl; return 1; } Xapian::Database db(db_path); Xapian::Enquire enquire(db); std::string query_str = argv[3]; argv+= 4; while(*argv) { query_str += ' '; query_str += *argv++; } Xapian::QueryParser qp; Xapian::Stem stemmer("english"); qp.set_stemmer(stemmer); qp.set_database(db); qp.set_stemming_strategy(Xapian::QueryParser::STEM_SOME); Xapian::Query query = qp.parse_query(query_str); std::cout << "Parsed query is: " << query.get_description() << std::endl; enquire.set_query(query); Xapian::MSet matches = enquire.get_mset(0, 10); std::cout << matches.get_matches_estimated() << " results found.\n"; std::cout << "Matches 1-" << matches.size() << ":\n" << std::endl; for (Xapian::MSetIterator i = matches.begin(); i != matches.end(); ++i) { std::cout << i.get_rank() + 1 << ": " << i.get_percent() << "% docid=" << *i << " [" << i.get_document().get_data()<< "]" << std::endl << std::endl; } } else { std::cerr << "Invalid action " << action << std::endl; usage(argv); return 1; } } catch (const Xapian::Error &error) { std::cout << "Exception: " << error.get_msg() << std::endl; } }
int main(int argc, char *argv[]) { FILE *inputfiles,*posting_file; int i=0; char *fileinput,*stemming_file,*posting_filename; if(argc < 2) { printf("\nIncorrect Usage. ./keywordengine <filelist.txt>\n"); return 0; } if((inputfiles=fopen(argv[1],"r+"))==NULL) { printf("\nCould not open %s. Exiting\n",argv[1]); exit(0); } if((posting_file=fopen("../output/posting_list_file_input.txt","w"))==NULL) { printf("\nFatal Error! Could not open/create posting_list_file_input.txt. Check output directory.\nErrorcode : %d\n",errno); exit(0); } int after_stemming=0; int before_stemming=0; double ratio=0.0; double array[20]; for(i=0; i<20; i++) { array[i]=0; } while(!feof(inputfiles)) { fileinput=(char *)malloc(sizeof(char)*FILENAME); fscanf(inputfiles,"%s\n",fileinput); stemming_file=(char *)malloc(sizeof(char)*(strlen(fileinput)+8)); strcpy(stemming_file,"output_"); strcat(stemming_file,fileinput); posting_filename=(char *)malloc(sizeof(char)*(strlen(stemming_file)+6)); strcpy(posting_filename,"stem_"); strcat(posting_filename,stemming_file); fprintf(posting_file,"%s\n",posting_filename); /* Tokenise and remove stopwords */ getwords(fileinput); /* Add to postings list */ initialize(); before_stemming=add_document_to_postingslist(stemming_file); /* Apply Porter's Stemmer */ stemmer(stemming_file); /* Add to postings list */ initialize(); after_stemming=add_document_to_postingslist(posting_filename); ratio=(double)after_stemming/before_stemming; //printf("\nbefore=%d and after=%d Ratio= %lf\n",before_stemming,after_stemming,ratio); if(0 <= ratio && 0.05 > ratio ) array[0]++; else if(0.75 <= ratio && 0.765 > ratio ) array[1]++; else if(0.765 <= ratio && 0.780 > ratio ) array[2]++; else if(0.780 <= ratio && 0.795 > ratio ) array[3]++; else if(0.795 <= ratio && 0.810 > ratio ) array[4]++; else if(0.810 <= ratio && 0.825 > ratio ) array[5]++; else if(0.825 <= ratio && 0.840 > ratio ) array[6]++; else if(0.840 <= ratio && 0.855 > ratio ) array[7]++; else if(0.855 <= ratio && 0.870 > 
ratio ) array[8]++; else if(0.870 <= ratio && 0.885 > ratio ) array[9]++; else if(0.885 <= ratio && 0.9 > ratio ) array[10]++; else if(0.9 <= ratio && 0.915 > ratio ) array[11]++; else if(0.915 <= ratio && 0.930 > ratio ) array[12]++; else if(0.930 <= ratio && 0.945 > ratio ) array[13]++; else if(0.945 <= ratio && 0.960 > ratio ) array[14]++; else if(0.960 <= ratio && 0.975 > ratio ) array[15]++; else if(0.975 <= ratio && 0.990 > ratio ) array[16]++; else if(0.990 <= ratio && 1.05 > ratio ) array[17]++; else if(1.05 <= ratio && 1.20 > ratio ) array[18]++; else if(1.20 <= ratio && 1.35 >= ratio ) array[19]++; i++; free(fileinput); fileinput=NULL; } fclose(posting_file); fclose(inputfiles); gnuplot_ctrl *h1; h1 = gnuplot_init() ; gnuplot_setstyle(h1, "lines"); gnuplot_set_xlabel(h1, "Compression"); gnuplot_set_ylabel(h1, "Frequency"); gnuplot_plot_x(h1, array ,20, "Ratio Graph") ; getchar(); /*Closing the files*/ gnuplot_close(h1); return 0; }