Exemple #1
0
        void FTSSpec::_scoreStringV2( const Tools& tools,
                                      StringData raw,
                                      TermFrequencyMap* docScores,
                                      double weight ) const {

            ScoreHelperMap terms;

            unsigned numTokens = 0;

            Tokenizer i( tools.language, raw );
            while ( i.more() ) {
                Token t = i.next();
                if ( t.type != Token::TEXT )
                    continue;

                string term = t.data.toString();
                makeLower( &term );
                if ( tools.stopwords->isStopWord( term ) ) {
                    continue;
                }
                term = tools.stemmer->stem( term );

                ScoreHelperStruct& data = terms[term];

                if ( data.exp ) {
                    data.exp *= 2;
                }
                else {
                    data.exp = 1;
                }
                data.count += 1;
                data.freq += ( 1 / data.exp );
                numTokens++;
            }

            for ( ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i ) {

                const string& term = i->first;
                const ScoreHelperStruct& data = i->second;

                // in order to adjust weights as a function of term count as it
                // relates to total field length. ie. is this the only word or
                // a frequently occuring term? or does it only show up once in
                // a long block of text?

                double coeff = ( 0.5 * data.count / numTokens ) + 0.5;

                // if term is identical to the raw form of the
                // field (untokenized) give it a small boost.
                double adjustment = 1;
                if ( raw.size() == term.length() && raw.equalCaseInsensitive( term ) )
                    adjustment += 0.1;

                double& score = (*docScores)[term];
                score += ( weight * data.freq * coeff * adjustment );
                verify( score <= MAX_WEIGHT );
            }
        }
Exemple #2
0
void FTSSpec::_scoreStringV2(FTSTokenizer* tokenizer,
                             StringData raw,
                             TermFrequencyMap* docScores,
                             double weight) const {
    ScoreHelperMap terms;

    unsigned numTokens = 0;

    tokenizer->reset(raw.rawData(), FTSTokenizer::kFilterStopWords);

    while (tokenizer->moveNext()) {
        StringData term = tokenizer->get();

        ScoreHelperStruct& data = terms[term];

        if (data.exp) {
            data.exp *= 2;
        } else {
            data.exp = 1;
        }
        data.count += 1;
        data.freq += (1 / data.exp);
        numTokens++;
    }

    for (ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i) {
        const string& term = i->first;
        const ScoreHelperStruct& data = i->second;

        // in order to adjust weights as a function of term count as it
        // relates to total field length. ie. is this the only word or
        // a frequently occuring term? or does it only show up once in
        // a long block of text?

        double coeff = (0.5 * data.count / numTokens) + 0.5;

        // if term is identical to the raw form of the
        // field (untokenized) give it a small boost.
        double adjustment = 1;
        if (raw.size() == term.length() && raw.equalCaseInsensitive(term))
            adjustment += 0.1;

        double& score = (*docScores)[term];
        score += (weight * data.freq * coeff * adjustment);
        verify(score <= MAX_WEIGHT);
    }
}