Code Example #1
void matchOptions( std::vector<std::string> vec_options, std::vector<struct Option_Info>& options, std::vector<std::string> buf, std::vector<struct AnnotationRange> range, std::map<std::string, double> entropy_dict ){
	for( size_t j = 0; j < buf.size(); j++ ){
		for( size_t i = 0; i < vec_options.size(); i++ ){
			std::string::size_type column = buf[j].find( "\"" + vec_options[i] + "\"" );
			if( column != std::string::npos ){	// Quote the option name so the match has a fixed length, reducing spurious matches (e.g. AllowOverride matching inside AllowOverrideList)
				Option_Info op_info;
				op_info.option_name = vec_options[i];
				op_info.line_num = j + 1;
				op_info.column = column + 1;	// The match starts at the added leading '"', so shift the column one position to the right
				getVocabulary( vec_options[i].c_str(), op_info.dict );

				// Tokenize the line and record each word's information
				struct Word word[MAX_WORD_NUM];
				int word_cnt = lex_analyze(buf, j, word, range);
				for( int k = 0; k < word_cnt; k++ ){
					if( std::string(word[k].word_name) == vec_options[i] &&
					    word[k].line_num == op_info.line_num &&
					    word[k].column == op_info.column )
						word[k].isOptionSelf = true;
					else word[k].isOptionSelf = false;
				}

				// printf("%s\n", vec_options[i].c_str());
				// for(int j = 0 ; j < word_cnt ;  j++){
				// 	printf("%d %s %d %d %d\n", j ,  word[j].word_name, word[j].line_num , word[j].column, word[j].isOptionSelf);
				// }

				//对单词进行相关性计算
				calcRelativity(op_info, word, word_cnt, entropy_dict);

				// Record the useful information for this option
				recordVariableInfo(op_info, word, j, word_cnt);

				options.push_back(op_info);
				
			}
		}
	}
}
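
The quoted-match trick above is easy to check in isolation. Below is a minimal, self-contained sketch (the config line and option list are made up for illustration) showing why wrapping the option name in quotes keeps AllowOverride from matching inside AllowOverrideList:

#include <iostream>
#include <string>
#include <vector>

int main() {
    // Hypothetical config line and option list, for illustration only.
    std::string line = "SetOption \"AllowOverrideList\" on";
    std::vector<std::string> options = { "AllowOverride", "AllowOverrideList" };

    for (const std::string& opt : options) {
        // Bare search: "AllowOverride" also matches inside "AllowOverrideList".
        bool bare = line.find(opt) != std::string::npos;
        // Quoted search: the closing quote pins the match to the exact name.
        bool quoted = line.find("\"" + opt + "\"") != std::string::npos;
        std::cout << opt << ": bare=" << bare << " quoted=" << quoted << "\n";
    }
    // Prints: AllowOverride: bare=1 quoted=0
    //         AllowOverrideList: bare=1 quoted=1
}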
Code Example #2
File: TFIDFExpander.cpp Project: Hosssein/WE
std::string indri::query::TFIDFExpander::expand( std::string originalQuery , std::vector<indri::api::ScoredExtentResult>& results ) {
  int fbDocs = _param.get( "fbDocs" , 10 );
  int fbTerms = _param.get( "fbTerms" , 10 );
  double fbOrigWt = _param.get( "fbOrigWeight", 0.5 );

  double k1;
  double b;
  double k3 = 0;

  std::string smoothing = _param.get("rule"); // need the tfidf parameters.
  indri::api::Parameters spec;
  _parseSmoothing( spec, smoothing );
  std::string method = spec.get( "method", "tfidf" );
  k1 = spec.get( "k1", 1.2 );
  b = spec.get( "b", 0.75 );
  if (method == "okapi") {
    k3 = spec.get( "k3", 7 );
  }

  std::vector<indri::api::DocumentVector*> docVectors = getDocumentVectors( results, fbDocs );
  std::vector<std::string> * rm_vocab = getVocabulary( docVectors );
  INT64 documentCount = _env->documentCount();
  UINT64 colLen = _env->termCount();
  double  avgDocLength = colLen / double(documentCount);

  std::map<std::string, TFIDFTerm> query_model;
  std::map<std::string, TFIDFTerm> orig_model;
  // initialize all TFIDFTerm structures
  for( size_t i = 0; i < rm_vocab->size(); i++ ) {
    TFIDFTerm pterm;
    pterm.stem = (*rm_vocab)[i];
    pterm.relevance = 0;
    query_model[ pterm.stem ] = pterm;
  }
  delete rm_vocab;
  // need original query term counts to initialize the relevance
  // for the existing terms
  const std::vector<indri::server::QueryServer*>& servers = _env->getServers();
  lemur::api::TERMID_T id = 0;
  std::string qTerm;
  std::istringstream qTerms(originalQuery);
  
  while (qTerms >> qTerm) {
        // take the first id returned
    id = 0;
    for (size_t i = 0; (id == 0) && (i < servers.size()); i++) {
      id = servers[i]->termID(qTerm);
      if (id) qTerm = servers[i]->termName(id);
    }
    if (id == 0) continue;
    if (orig_model.find(qTerm) == orig_model.end() ) {
      TFIDFTerm pterm;
      pterm.stem = qTerm;
      pterm.relevance = 0;
      orig_model[ pterm.stem ] = pterm;
    }
    TFIDFTerm& term = orig_model[ qTerm ];
    term.relevance++; // count occurrences
  }
  for (  std::map<std::string, TFIDFTerm>::iterator iter = orig_model.begin();
         iter != orig_model.end(); iter++ ) {
      // update the query term weight
       TFIDFTerm& term = iter->second;
       if (term.relevance != 0 ) {
           double queryK1 = 1000; // fixed constant in lemur
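           // with queryK1 = 1000 far above typical query-term counts, the weight below is approximately idf * tf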
           INT64 documentOccurrences = _env->documentStemCount(term.stem);
           double idf = log( ( documentCount + 1 ) / ( documentOccurrences + 0.5 ) );
           // need to test for okapi here...
           term.relevance = ( idf * queryK1 * term.relevance ) / ( term.relevance + queryK1 );
         }
    }
  
  // gather document vectors / statistics for top fbDocs ranked documents
  if (fbDocs > (int)results.size()) fbDocs = (int)results.size();
  for( int doc = 0; doc < fbDocs; doc++ ) {
    indri::api::DocumentVector * docVec = docVectors[ doc ];
    indri::utility::greedy_vector<int> positions = docVec->positions();
    const std::vector<std::string>& stems = docVec->stems();
    indri::utility::count_iterator<int> iter( positions.begin(), positions.end() );
    int docLen = int(positions.size());
    // accumulate the term scores
    for( ; iter != positions.end(); ++iter ) {
      const std::string& stem = stems[ (*iter).object ];
      // update the TFIDFTerm structure with computed probabilities
      TFIDFTerm& term = query_model[ stem ];
      int occurrences = (*iter).count;
      INT64 documentOccurrences = _env->documentStemCount(term.stem);
      double idf = log( ( documentCount + 1 ) / ( documentOccurrences + 0.5 ) );
      double score = _BM25TF(occurrences, k1, b, docLen, avgDocLength) * idf;
      //double score = _BM25TF(occurrences, k1, b, docLen, avgDocLength) ;
      term.relevance += score;
    }
    delete docVec;
  }

  // shove into a vector and sort terms by TFIDF metric
  std::vector<TFIDFTerm> sortedModel;
  std::transform( query_model.begin(),
                  query_model.end(),
                  std::back_inserter( sortedModel ),
                  TFIDFTerm::pair_projection() );
  // weight[t] /= fbDocs
  // weight[t] *= fbOrigWeight (default 0.5)
  for (size_t i = 0; i < sortedModel.size(); i++) {
    sortedModel[i].relevance /= fbDocs;
    sortedModel[i].relevance *= fbOrigWt;
  }
  
  std::sort( sortedModel.begin(), sortedModel.end(), TFIDFTerm::relevance_greater() );

  // update scores for the top k terms
  int numAdded = 0;
  for (size_t i = 0; numAdded < fbTerms && i < sortedModel.size(); i++) {
      TFIDFTerm& term = sortedModel[ i ];
      if (term.stem == "[OOV]") continue;
      if (orig_model.find(term.stem) != orig_model.end() ) {
        orig_model[term.stem].relevance += term.relevance;
      } else {
        orig_model[term.stem] = term;
      }
      numAdded++;
  }
  sortedModel.clear();
  std::transform( orig_model.begin(),
                  orig_model.end(),
                  std::back_inserter( sortedModel ),
                  TFIDFTerm::pair_projection() );
  
  // copy into a vector containing only the relevance weights
  std::vector< std::pair<std::string, double> > probabilities;
                     
  std::transform( sortedModel.begin(),
                  sortedModel.end(),
                  std::back_inserter( probabilities ),
                  TFIDFTerm::relevance_projection() );
  // need to add K terms, with some of the original query possibly
  // remaining in addition.
  return _buildQuery( fbOrigWt, probabilities );
}
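
For reference, the feedback score above combines a BM25-style term frequency with the IDF form log((N + 1) / (df + 0.5)). The _BM25TF helper is not shown in this file, so the sketch below assumes the standard Lemur/Okapi form k1 * tf / (tf + k1 * (1 - b + b * docLen / avgDocLen)); all statistics are made up for illustration:

#include <cmath>
#include <cstdio>

// Assumed BM25-style term-frequency component; the real _BM25TF helper is
// not shown in this file, so this standard Lemur form is an assumption.
double bm25_tf(double tf, double k1, double b, double docLen, double avgDocLen) {
    return k1 * tf / (tf + k1 * (1.0 - b + b * docLen / avgDocLen));
}

int main() {
    // Made-up collection statistics, for illustration only.
    double documentCount = 100000, documentOccurrences = 120;
    double k1 = 1.2, b = 0.75, docLen = 250, avgDocLen = 300;
    int occurrences = 3;

    // Same IDF form as the expander: log((N + 1) / (df + 0.5)).
    double idf = std::log((documentCount + 1) / (documentOccurrences + 0.5));
    double score = bm25_tf(occurrences, k1, b, docLen, avgDocLen) * idf;
    std::printf("idf = %f, score = %f\n", idf, score);
}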