void matchOptions(const std::vector<std::string>& vec_options,
                  std::vector<struct Option_Info>& options,
                  const std::vector<std::string>& buf,
                  const std::vector<struct AnnotationRange>& range,
                  const std::map<std::string, double>& entropy_dict) {
    for (size_t j = 0; j < buf.size(); j++) {
        for (size_t i = 0; i < vec_options.size(); i++) {
            // Quote the option name before matching so the match length is exact,
            // which avoids matching a longer option by prefix
            // (e.g. AllowOverride matching inside AllowOverrideList).
            std::string::size_type column = buf[j].find("\"" + vec_options[i] + "\"");
            if (column != std::string::npos) {
                Option_Info op_info;
                op_info.option_name = vec_options[i];
                op_info.line_num = j + 1;
                // The search pattern starts with an extra '"', so shift the
                // recorded column one position to the right.
                op_info.column = column + 1;

                // Tokenize the option name and record its vocabulary.
                getVocabulary(vec_options[i].c_str(), op_info.dict);

                struct Word word[MAX_WORD_NUM];
                int word_cnt = lex_analyze(buf, j, word, range);
                // printf("word_cnt = %d\n", word_cnt);

                // Mark the token that is the option name itself. (The loop
                // variable is k here; the original shadowed the outer j.)
                for (int k = 0; k < word_cnt; k++) {
                    if (std::string(word[k].word_name) == vec_options[i] &&
                        word[k].line_num == op_info.line_num &&
                        word[k].column == op_info.column)
                        word[k].isOptionSelf = true;
                    else
                        word[k].isOptionSelf = false;
                }
                // printf("%s\n", vec_options[i].c_str());
                // for (int k = 0; k < word_cnt; k++) {
                //     printf("%d %s %d %d %d\n", k, word[k].word_name,
                //            word[k].line_num, word[k].column, word[k].isOptionSelf);
                // }

                // Compute the relevance of each token to the option.
                calcRelativity(op_info, word, word_cnt, entropy_dict);
                // Record the useful information for this occurrence.
                recordVariableInfo(op_info, word, j, word_cnt);
                options.push_back(op_info);
            }
        }
    }
    return;
}
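To make the quoting trick in matchOptions concrete, here is a minimal, self-contained sketch (the config line is made up for illustration) of why searching for the quoted name avoids prefix false positives such as AllowOverride matching inside AllowOverrideList:

#include <iostream>
#include <string>

int main() {
    std::string line = "\"AllowOverrideList\" None";
    // Unquoted search: "AllowOverride" matches inside "AllowOverrideList".
    std::cout << line.find("AllowOverride") << "\n";                        // prints 1 (false positive)
    // Quoted search: "\"AllowOverride\"" requires the exact name.
    std::cout << (line.find("\"AllowOverride\"") == std::string::npos) << "\n"; // prints 1 (no match)
    return 0;
}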
std::string indri::query::TFIDFExpander::expand( std::string originalQuery,
                                                 std::vector<indri::api::ScoredExtentResult>& results ) {
  int fbDocs = _param.get( "fbDocs", 10 );
  int fbTerms = _param.get( "fbTerms", 10 );
  double fbOrigWt = _param.get( "fbOrigWeight", 0.5 );
  double k1;
  double b;
  double k3 = 0;
  std::string smoothing = _param.get( "rule" );

  // need the tfidf parameters.
  indri::api::Parameters spec;
  _parseSmoothing( spec, smoothing );
  std::string method = spec.get( "method", "tfidf" );
  k1 = spec.get( "k1", 1.2 );
  b = spec.get( "b", 0.75 );
  if( method == "okapi" ) {
    k3 = spec.get( "k3", 7 );
  }

  std::vector<indri::api::DocumentVector*> docVectors = getDocumentVectors( results, fbDocs );
  std::vector<std::string>* rm_vocab = getVocabulary( docVectors );

  INT64 documentCount = _env->documentCount();
  UINT64 colLen = _env->termCount();
  double avgDocLength = colLen / double(documentCount);

  std::map<std::string, TFIDFTerm> query_model;
  std::map<std::string, TFIDFTerm> orig_model;

  // initialize all TFIDFTerm structures
  for( size_t i = 0; i < rm_vocab->size(); i++ ) {
    TFIDFTerm pterm;
    pterm.stem = (*rm_vocab)[i];
    pterm.relevance = 0;
    query_model[ pterm.stem ] = pterm;
  }
  delete rm_vocab;

  // need original query term counts to initialize the relevance
  // for the existing terms
  const std::vector<indri::server::QueryServer*>& servers = _env->getServers();
  lemur::api::TERMID_T id = 0;
  std::string qTerm;
  std::istringstream qTerms( originalQuery );
  while( qTerms >> qTerm ) {
    // take the first id returned
    id = 0;
    for( size_t i = 0; (id == 0) && (i < servers.size()); i++ ) {
      id = servers[i]->termID( qTerm );
      if( id ) qTerm = servers[i]->termName( id );
    }
    if( id == 0 ) continue;
    if( orig_model.find( qTerm ) == orig_model.end() ) {
      TFIDFTerm pterm;
      pterm.stem = qTerm;
      pterm.relevance = 0;
      orig_model[ pterm.stem ] = pterm;
    }
    TFIDFTerm& term = orig_model[ qTerm ];
    term.relevance++; // count occurrences
  }

  for( std::map<std::string, TFIDFTerm>::iterator iter = orig_model.begin();
       iter != orig_model.end(); iter++ ) {
    // update the query term weight
    TFIDFTerm& term = iter->second;
    if( term.relevance != 0 ) {
      double queryK1 = 1000; // fixed constant in lemur
      INT64 documentOccurrences = _env->documentStemCount( term.stem );
      double idf = log( ( documentCount + 1 ) / ( documentOccurrences + 0.5 ) );
      // need to test for okapi here...
      term.relevance = ( idf * queryK1 * term.relevance ) / ( term.relevance + queryK1 );
    }
  }

  // gather document vectors / statistics for top fbDocs ranked documents
  if( fbDocs > (int)results.size() )
    fbDocs = (int)results.size();
  for( size_t doc = 0; (int)doc < fbDocs; doc++ ) {
    indri::api::DocumentVector* docVec = docVectors[ doc ];
    indri::utility::greedy_vector<int> positions = docVec->positions();
    const std::vector<std::string>& stems = docVec->stems();
    indri::utility::count_iterator<int> iter( positions.begin(), positions.end() );
    int docLen = int(positions.size());

    // accumulate the term scores
    for( ; iter != positions.end(); ++iter ) {
      const std::string& stem = stems[ (*iter).object ];
      // update the TFIDFTerm structure with computed probabilities
      TFIDFTerm& term = query_model[ stem ];
      int occurrences = (*iter).count;
      INT64 documentOccurrences = _env->documentStemCount( term.stem );
      double idf = log( ( documentCount + 1 ) / ( documentOccurrences + 0.5 ) );
      double score = _BM25TF( occurrences, k1, b, docLen, avgDocLength ) * idf;
      //double score = _BM25TF( occurrences, k1, b, docLen, avgDocLength );
      term.relevance += score;
    }
    delete docVec;
  }

  // shove into a vector and sort terms by TFIDF metric
  std::vector<TFIDFTerm> sortedModel;
  std::transform( query_model.begin(), query_model.end(),
                  std::back_inserter( sortedModel ), TFIDFTerm::pair_projection() );

  // weight[t] /= fbDocs
  // weight[t] *= fbOrigWt (fbPosCoeff in lemur; default 0.5)
  for( size_t i = 0; i < sortedModel.size(); i++ ) {
    sortedModel[i].relevance /= fbDocs;
    sortedModel[i].relevance *= fbOrigWt;
  }

  std::sort( sortedModel.begin(), sortedModel.end(), TFIDFTerm::relevance_greater() );

  // update scores for the top k terms
  int numAdded = 0;
  for( size_t i = 0; numAdded < fbTerms && i < sortedModel.size(); i++ ) {
    TFIDFTerm& term = sortedModel[ i ];
    if( term.stem == "[OOV]" )
      continue;
    if( orig_model.find( term.stem ) != orig_model.end() ) {
      orig_model[ term.stem ].relevance += term.relevance;
    } else {
      orig_model[ term.stem ] = term;
    }
    numAdded++;
  }

  sortedModel.clear();
  std::transform( orig_model.begin(), orig_model.end(),
                  std::back_inserter( sortedModel ), TFIDFTerm::pair_projection() );

  // copy into a vector with only the relevance weights
  std::vector< std::pair<std::string, double> > probabilities;
  std::transform( sortedModel.begin(), sortedModel.end(),
                  std::back_inserter( probabilities ), TFIDFTerm::relevance_projection() );

  // need to add K terms, with some of the original query possibly
  // remaining in addition.
  return _buildQuery( fbOrigWt, probabilities );
}
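_BM25TF is not defined in this excerpt. As a point of reference only, the sketch below assumes the classic Okapi BM25 term-frequency normalization that the call signature suggests; bm25_tf and the sample statistics are hypothetical stand-ins, not Indri's actual implementation:

#include <cmath>
#include <cstdio>

// Hypothetical stand-in for the expander's _BM25TF helper (an assumption:
// the classic Okapi TF normalization; Indri's real code may differ).
static double bm25_tf(double tf, double k1, double b,
                      double docLen, double avgDocLen) {
    return (k1 * tf) / (tf + k1 * (1.0 - b + b * docLen / avgDocLen));
}

int main() {
    // Mirrors the feedback-term score above: BM25 TF times the smoothed idf,
    // using made-up collection statistics.
    double documentCount = 100000, documentOccurrences = 250;
    double idf = std::log((documentCount + 1) / (documentOccurrences + 0.5));
    double score = bm25_tf(/*tf=*/3, /*k1=*/1.2, /*b=*/0.75,
                           /*docLen=*/180, /*avgDocLen=*/220) * idf;
    std::printf("score = %f\n", score);
    return 0;
}

Note how the same idf expression, log((documentCount + 1) / (documentOccurrences + 0.5)), appears both in the query-term weighting loop and in the feedback-document accumulation loop above.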