void initial_parse(ReviewParser<std::istream>::sets &funny, ReviewParser<std::istream>::sets &normal, const std::string &prefix, const std::string &conf, const std::string &nope){
		// Parse the "funny" review set from `conf` and the control set from `nope`
		// into the caller-supplied containers, then report completion on stdout.
		//
		// NOTE(review): the tokenizer/stemmer handles below are never used in this
		// function; they are retained in case constructing them initializes shared
		// NLTK state — TODO confirm and remove if they are truly inert.
		// (Removed: an unused reference binding `funny_reviews = funny.rs` and a
		// redundant enclosing scope block.)
		NLTKInstance::Word_Tokenizer wt(nltk);
		NLTKInstance::Stemmer stemmer(nltk);
		parse(funny, prefix, conf);
		parse(normal, prefix, nope);
		std::cout << "parsing done" << std::endl;
	}
Example #2
0
        void FTSSpec::scoreDocument( const BSONObj& obj, TermFrequencyMap* term_freqs ) const {
            if ( _textIndexVersion == TEXT_INDEX_VERSION_1 ) {
                return _scoreDocumentV1( obj, term_freqs );
            }

            FTSElementIterator it( *this, obj );

            while ( it.more() ) {
                FTSIteratorValue val = it.next();
                Stemmer stemmer( *val._language );
                Tools tools( *val._language, &stemmer, StopWords::getStopWords( *val._language ) );
                _scoreStringV2( tools, val._text, term_freqs, val._weight );
            }
        }
Example #3
0
// Score every text field of 'obj' into 'term_freqs' (V2 text indexes) via a
// depth-first traversal that descends into sub-documents and arrays.
// 'parentLanguage' is the language inherited from the enclosing document,
// 'parentPath' the dotted path to 'obj' ("" at top level), and 'isArray'
// whether 'obj' is the contents of an array.  V1 indexes are delegated to
// _scoreDocumentV1, which takes none of the traversal state.
void FTSSpec::scoreDocument( const BSONObj& obj,
                             const FTSLanguage& parentLanguage,
                             const string& parentPath,
                             bool isArray,
                             TermFrequencyMap* term_freqs ) const {

    if ( _textIndexVersion == TEXT_INDEX_VERSION_1 ) {
        dassert( parentPath == "" );
        dassert( !isArray );
        return _scoreDocumentV1( obj, term_freqs );
    }

    // NOTE(review): presumably a per-document language override in 'obj'
    // takes precedence over parentLanguage here — confirm in
    // _getLanguageToUseV2.
    const FTSLanguage& language = _getLanguageToUseV2( obj, parentLanguage );
    Stemmer stemmer( language );
    Tools tools( language, &stemmer, StopWords::getStopWords( language ) );

    // Perform a depth-first traversal of obj, skipping fields not touched by this spec.
    BSONObjIterator j( obj );
    while ( j.more() ) {

        BSONElement elem = j.next();
        string fieldName = elem.fieldName();

        // Skip "language" specifier fields if wildcard.
        if ( wildcard() && languageOverrideField() == fieldName ) {
            continue;
        }

        // Compose the dotted name of the current field:
        // 1. parent path empty (top level): use the current field name
        // 2. parent path non-empty and obj is an array: use the parent path
        // 3. parent path non-empty and obj is a sub-doc: append field name to parent path
        string dottedName = ( parentPath.empty() ? fieldName
                              : isArray ? parentPath
                              : parentPath + '.' + fieldName );

        // Find lower bound of dottedName in _weights.  lower_bound leaves us at the first
        // weight that could possibly match or be a prefix of dottedName.  And if this
        // element fails to match, then no subsequent weight can match, since the weights
        // are lexicographically ordered.
        Weights::const_iterator i = _weights.lower_bound( elem.type() == Object
                                    ? dottedName + '.'
                                    : dottedName );

        // possibleWeightMatch is set if the weight map contains either a match or some item
        // lexicographically larger than fieldName.  This boolean acts as a guard on
        // dereferences of iterator 'i'.
        bool possibleWeightMatch = ( i != _weights.end() );

        // Optimize away two cases, when not wildcard:
        // 1. lower_bound seeks to end(): no prefix match possible
        // 2. lower_bound seeks to a name which is not a prefix
        if ( !wildcard() ) {
            if ( !possibleWeightMatch ) {
                continue;
            }
            else if ( !_matchPrefix( dottedName, i->first ) ) {
                continue;
            }
        }

        // Is the current field an exact match on a weight?
        bool exactMatch = ( possibleWeightMatch && i->first == dottedName );

        // With a wildcard spec, fields absent from _weights still get scored,
        // at DEFAULT_WEIGHT (the !possibleWeightMatch case is only reachable
        // when wildcard() — see the continue block above).
        double weight = ( possibleWeightMatch ? i->second : DEFAULT_WEIGHT );

        switch ( elem.type() ) {
        case String:
            // Only index strings on exact match or wildcard.
            if ( exactMatch || wildcard() ) {
                _scoreStringV2( tools, elem.valuestr(), term_freqs, weight );
            }
            break;
        case Object:
            // Only descend into a sub-document on proper prefix or wildcard.  Note that
            // !exactMatch is a sufficient test for proper prefix match, because of
            // matchPrefix() continue block above.
            if ( !exactMatch || wildcard() ) {
                scoreDocument( elem.Obj(), language, dottedName, false, term_freqs );
            }
            break;
        case Array:
            // Only descend into arrays from non-array parents or on wildcard.
            if ( !isArray || wildcard() ) {
                scoreDocument( elem.Obj(), language, dottedName, true, term_freqs );
            }
            break;
        default:
            // Skip over all other BSON types.
            break;
        }
    }
}
Example #4
0
/*
 * ParseRankedQuery: split QueryLine into words, stem each one, and build a
 * TermList of distinct stemmed terms that exist in the stemmed dictionary
 * 'sd'.  A repeated term bumps the existing entry's Count instead of adding
 * a new entry.  If Sort is non-zero, the resulting terms are sorted in
 * ascending doc_count order.  Returns the (possibly reallocated) TermList.
 *
 * NOTE(review): PARSE_STEM_WORD / PARSE_NON_STEM_WORD are macros that
 * advance s_in in place — the loop terminates because each iteration
 * consumes at least one word plus its separator.
 */
static TermList *
ParseRankedQuery (stemmed_dict * sd, char *QueryLine, int Sort)
{
  u_char Word[MAXSTEMLEN + 1];
  u_char *end, *s_in;
  TermList *Terms = MakeTermList (0);
  s_in = (u_char *) QueryLine;
  end = s_in + strlen ((char *) s_in) - 1;


  /* find the start of the first word */
  if (!INAWORD (*s_in))
    PARSE_NON_STEM_WORD (s_in, end);

  while (s_in <= end)
    {
      int j;
      long word_num;
      unsigned long count, doc_count, invf_ptr, invf_len;

      /* Get a word and stem it */
      PARSE_STEM_WORD (Word, s_in, end);
      stemmer (sd->sdh.stem_method, Word);

      /* Skip over the non word separator */
      PARSE_NON_STEM_WORD (s_in, end);

      /* Look for the word in the already identified terms */
      for (j = 0; j < Terms->num; j++)
	if (compare (Terms->TE[j].Word, Word) == 0)
	  break;

      /* Increment the weight if the word is in the list */
      if (j < Terms->num)
	Terms->TE[j].Count++;
      else
	{

	  /* Look for it in the stemmed dictionary */
	  if ((word_num = FindWord (sd, Word, &count, &doc_count,
				    &invf_ptr, &invf_len)) != -1)
	    {
	      /* Search the list for the word by dictionary word number.
	         NOTE(review): this looks redundant with the string search
	         above — TODO confirm whether two spellings can map to the
	         same word_num; if not, this loop never matches. */
	      for (j = 0; j < Terms->num; j++)
		if (Terms->TE[j].WE.word_num == word_num)
		  break;

	      /* Increment the weight if the word is in the list */
	      if (j < Terms->num)
		Terms->TE[j].Count++;
	      else
		{
		  /* Create a new entry in the list for the new word */
		  TermEntry te;

		  te.WE.word_num = word_num;
		  te.WE.count = count;
		  te.WE.doc_count = doc_count;
		  te.WE.invf_ptr = invf_ptr;
		  te.WE.invf_len = invf_len;
		  te.Count = 1;
		  te.Word = copy_string (Word);
		  if (!te.Word)
		    FatalError (1, "Could NOT create memory to add term");

		  AddTermEntry (&Terms, &te);
		}
	    }
	}
    }
  if (Sort)
    /* Sort the terms in ascending order by doc_count */
    qsort (Terms->TE, Terms->num, sizeof (TermEntry), doc_count_comp);
  return (Terms);
}
Example #5
0
// Minimal Xapian demo.
//   <prog> index  <db_path>            — index one document read from stdin
//   <prog> search <db_path> <terms...> — run a stemmed query against the db
// Returns 0 on success, 1 on usage error or unknown action.
int main(int argc, char **argv)
{
    // Both actions need at least <action> and <db_path>.  The original
    // `argc < 2` check let "./prog index" through and then dereferenced
    // argv[2], which is the NULL argv terminator when argc == 2.
    if(argc < 3) {
        usage(argv);
        return 1;
    }

    try {
        char *action = argv[1];
        char *db_path = argv[2];

        if(!strcmp(action, "index")) {
            Xapian::WritableDatabase db(db_path, Xapian::DB_CREATE_OR_OPEN);

            Xapian::TermGenerator indexer;
            Xapian::Stem stemmer("english");
            indexer.set_stemmer(stemmer);

            // Slurp all of stdin into one document body (getline strips the
            // newlines, as the original loop did).  Testing the stream in the
            // loop condition replaces the eof()-before-read pattern.
            std::string doc_txt;
            std::string line;
            while(std::getline(std::cin, line)) {
                doc_txt += line;
            }

            if(!doc_txt.empty()) {
                Xapian::Document doc;
                doc.set_data(doc_txt);

                indexer.set_document(doc);
                indexer.index_text(doc_txt);

                db.add_document(doc);

                std::cout << "Indexed: " << indexer.get_description() << std::endl;
            }

            db.commit();
        } else if(!strcmp(action, "search")) {
            if(argc < 4) {
                std::cerr << "You must supply a query string" << std::endl;
                return 1;
            }

            Xapian::Database db(db_path);
            Xapian::Enquire enquire(db);

            // Join argv[3..] into a single space-separated query string.
            std::string query_str = argv[3];
            argv += 4;
            while(*argv) {
                query_str += ' ';
                query_str += *argv++;
            }

            Xapian::QueryParser qp;
            Xapian::Stem stemmer("english");
            qp.set_stemmer(stemmer);
            qp.set_database(db);
            qp.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);

            Xapian::Query query = qp.parse_query(query_str);
            std::cout << "Parsed query is: " << query.get_description() <<
                         std::endl;

            enquire.set_query(query);
            Xapian::MSet matches = enquire.get_mset(0, 10);

            std::cout << matches.get_matches_estimated() << " results found.\n";
            std::cout << "Matches 1-" << matches.size() << ":\n" << std::endl;

            for (Xapian::MSetIterator i = matches.begin();
                    i != matches.end(); ++i) {
                std::cout << i.get_rank() + 1 << ": " << i.get_percent() <<
                        "% docid=" << *i << " [" <<
                        i.get_document().get_data()<< "]" << std::endl <<
                        std::endl;
            }
        } else {
            std::cerr << "Invalid action " << action << std::endl;
            usage(argv);
            return 1;
        }

    } catch (const Xapian::Error &error) {
        // Report on stderr and exit non-zero; the original printed to stdout
        // and fell off the end of main, reporting success to the shell.
        std::cerr << "Exception: " << error.get_msg() << std::endl;
        return 1;
    }
    return 0;
}
/*
 * Keyword engine driver: reads a list of input file names from argv[1],
 * builds a postings list for each file before and after Porter stemming,
 * histograms the after/before vocabulary ratio into 20 buckets, and plots
 * the histogram with gnuplot.  Returns 0 in all cases (fatal setup errors
 * exit(0), preserving the original exit behavior).
 */
int main(int argc, char *argv[])
{
    FILE *inputfiles, *posting_file;
    int i;
    char *fileinput, *stemming_file, *posting_filename;

    if(argc < 2)
    {
        printf("\nIncorrect Usage. ./keywordengine <filelist.txt>\n");
        return 0;
    }

    if((inputfiles=fopen(argv[1],"r+"))==NULL)
    {
        printf("\nCould not open %s. Exiting\n",argv[1]);
        exit(0);
    }

    if((posting_file=fopen("../output/posting_list_file_input.txt","w"))==NULL)
    {
        printf("\nFatal Error! Could not open/create posting_list_file_input.txt. Check output directory.\nErrorcode : %d\n",errno);
        fclose(inputfiles);
        exit(0);
    }

    int after_stemming=0;
    int before_stemming=0;
    double ratio=0.0;

    /* Histogram buckets for the compression ratio; bucket edges below. */
    double array[20];
    for(i=0; i<20; i++)
        array[i]=0;

    /* Bucket edges: [lower, upper) per bucket, last bucket closed on the
       right — identical to the original 20-branch if/else chain (including
       the intentionally uncounted gap between 0.05 and 0.75). */
    static const double lower[20] = {
        0.000, 0.750, 0.765, 0.780, 0.795, 0.810, 0.825, 0.840, 0.855, 0.870,
        0.885, 0.900, 0.915, 0.930, 0.945, 0.960, 0.975, 0.990, 1.050, 1.200 };
    static const double upper[20] = {
        0.050, 0.765, 0.780, 0.795, 0.810, 0.825, 0.840, 0.855, 0.870, 0.885,
        0.900, 0.915, 0.930, 0.945, 0.960, 0.975, 0.990, 1.050, 1.200, 1.350 };

    for(;;)
    {
        fileinput=(char *)malloc(sizeof(char)*FILENAME);
        if(fileinput==NULL)
            break;
        /* Checking the fscanf return replaces the original while(!feof())
           loop, which could act on a stale/uninitialized name when the
           read failed at end of file. */
        if(fscanf(inputfiles,"%s\n",fileinput)!=1)
        {
            free(fileinput);
            break;
        }

        /* "output_" (7) + name + NUL fits in strlen+8. */
        stemming_file=(char *)malloc(sizeof(char)*(strlen(fileinput)+8));
        if(stemming_file==NULL)
        {
            free(fileinput);
            break;
        }
        strcpy(stemming_file,"output_");
        strcat(stemming_file,fileinput);

        /* "stem_" (5) + name + NUL fits in strlen+6. */
        posting_filename=(char *)malloc(sizeof(char)*(strlen(stemming_file)+6));
        if(posting_filename==NULL)
        {
            free(stemming_file);
            free(fileinput);
            break;
        }
        strcpy(posting_filename,"stem_");
        strcat(posting_filename,stemming_file);
        fprintf(posting_file,"%s\n",posting_filename);

        /* Tokenise and remove stopwords */
        getwords(fileinput);
        /* Add to postings list */
        initialize();
        before_stemming=add_document_to_postingslist(stemming_file);

        /* Apply Porter's Stemmer */
        stemmer(stemming_file);
        /* Add to postings list */
        initialize();
        after_stemming=add_document_to_postingslist(posting_filename);

        /* Guard the division: an empty vocabulary would otherwise yield
           inf/NaN, which silently matched no histogram bucket before. */
        ratio = (before_stemming != 0)
                    ? (double)after_stemming/before_stemming : 0.0;

        for(i=0; i<20; i++)
        {
            if(lower[i] <= ratio &&
               (i == 19 ? ratio <= upper[i] : ratio < upper[i]))
            {
                array[i]++;
                break;
            }
        }

        /* Leak fix: stemming_file and posting_filename were never freed. */
        free(posting_filename);
        free(stemming_file);
        free(fileinput);
        fileinput=NULL;
    }
    fclose(posting_file);
    fclose(inputfiles);

    /* Plot the ratio histogram and wait for a keypress before closing. */
    gnuplot_ctrl *h1;
    h1 = gnuplot_init() ;
    gnuplot_setstyle(h1, "lines");
    gnuplot_set_xlabel(h1, "Compression");
    gnuplot_set_ylabel(h1, "Frequency");

    gnuplot_plot_x(h1, array ,20, "Ratio Graph") ;
    getchar();
    /*Closing the files*/
    gnuplot_close(h1);

    return 0;
}