void FTSSpec::_scoreStringV2( const Tools& tools, StringData raw, TermFrequencyMap* docScores, double weight ) const { ScoreHelperMap terms; unsigned numTokens = 0; Tokenizer i( tools.language, raw ); while ( i.more() ) { Token t = i.next(); if ( t.type != Token::TEXT ) continue; string term = t.data.toString(); makeLower( &term ); if ( tools.stopwords->isStopWord( term ) ) { continue; } term = tools.stemmer->stem( term ); ScoreHelperStruct& data = terms[term]; if ( data.exp ) { data.exp *= 2; } else { data.exp = 1; } data.count += 1; data.freq += ( 1 / data.exp ); numTokens++; } for ( ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i ) { const string& term = i->first; const ScoreHelperStruct& data = i->second; // in order to adjust weights as a function of term count as it // relates to total field length. ie. is this the only word or // a frequently occuring term? or does it only show up once in // a long block of text? double coeff = ( 0.5 * data.count / numTokens ) + 0.5; // if term is identical to the raw form of the // field (untokenized) give it a small boost. double adjustment = 1; if ( raw.size() == term.length() && raw.equalCaseInsensitive( term ) ) adjustment += 0.1; double& score = (*docScores)[term]; score += ( weight * data.freq * coeff * adjustment ); verify( score <= MAX_WEIGHT ); } }
void FTSSpec::_scoreStringV2(FTSTokenizer* tokenizer, StringData raw, TermFrequencyMap* docScores, double weight) const { ScoreHelperMap terms; unsigned numTokens = 0; tokenizer->reset(raw.rawData(), FTSTokenizer::kFilterStopWords); while (tokenizer->moveNext()) { StringData term = tokenizer->get(); ScoreHelperStruct& data = terms[term]; if (data.exp) { data.exp *= 2; } else { data.exp = 1; } data.count += 1; data.freq += (1 / data.exp); numTokens++; } for (ScoreHelperMap::const_iterator i = terms.begin(); i != terms.end(); ++i) { const string& term = i->first; const ScoreHelperStruct& data = i->second; // in order to adjust weights as a function of term count as it // relates to total field length. ie. is this the only word or // a frequently occuring term? or does it only show up once in // a long block of text? double coeff = (0.5 * data.count / numTokens) + 0.5; // if term is identical to the raw form of the // field (untokenized) give it a small boost. double adjustment = 1; if (raw.size() == term.length() && raw.equalCaseInsensitive(term)) adjustment += 0.1; double& score = (*docScores)[term]; score += (weight * data.freq * coeff * adjustment); verify(score <= MAX_WEIGHT); } }