// Builds the linearly interpolated target phrase collection for `src`.
//
// Steps, in order:
//   1. fetch the per-feature model weights via getWeights(),
//   2. gather the per-target statistics for `src` into a temporary map,
//   3. combine them with CreateTargetPhraseCollectionLinearInterpolation(),
//   4. release the temporary statistics,
//   5. partially order the result by m_tableLimit (for pruning later) and
//      hand it to CacheForCleanup().
//
// Returns the collection produced in step 3. It is passed to
// CacheForCleanup() before being returned -- presumably so that it is freed
// after sentence processing; confirm against CacheForCleanup().
const TargetPhraseCollection *PhraseDictionaryMultiModel::GetTargetPhraseCollectionLEGACY(const Phrase& src) const
{
  typedef std::map<std::string,multiModelStatistics*> StatsMap;

  std::vector<std::vector<float> > weights = getWeights(m_numScoreComponents, true);

  // Temporary statistics: the map and every value stored in it are released
  // below, once the interpolated collection has been built.
  StatsMap *stats = new StatsMap;
  CollectSufficientStatistics(src, stats);
  TargetPhraseCollection *collection = CreateTargetPhraseCollectionLinearInterpolation(src, stats, weights);
  RemoveAllInMap(*stats);
  delete stats;

  // sort the phrases for pruning later
  collection->NthElement(m_tableLimit);

  // This method is const, but registering the result needs a mutable object.
  PhraseDictionaryMultiModel *self = const_cast<PhraseDictionaryMultiModel*>(this);
  self->CacheForCleanup(collection);

  return collection;
}
Exemplo n.º 2
0
// Turns the collected count statistics for `src` into a scored
// TargetPhraseCollection.
//
// For every entry of `allStats` a 5-component score vector is computed:
//   [0] m_combineFunction(fst, ft, multimodelweights[0])
//   [1] weighted lexical translation using m_lexTable_e2f / multimodelweights[1]
//   [2] m_combineFunction(fst, fs, multimodelweights[2])
//   [3] weighted lexical translation using m_lexTable_f2e / multimodelweights[3]
//   [4] the constant 2.718
// each passed through TransformScore() and FloorScore(). The scores are
// assigned to the entry's target phrase, the phrase is evaluated, and a copy
// of it is added to the result.
//
// Entries for which an AlignmentException is raised are skipped.
// Throws util::Exception if a target phrase carries no alignment information.
//
// NOTE: this function takes ownership of `allStats`: all values in the map and
// the map itself are deleted before returning. The caller owns the returned
// collection.
TargetPhraseCollection* PhraseDictionaryMultiModelCounts::CreateTargetPhraseCollectionCounts(const Phrase &src, vector<float> &fs, map<string,multiModelCountsStatistics*>* allStats, vector<vector<float> > &multimodelweights) const
{
  typedef map<string, multiModelCountsStatistics*> StatsMap;
  typedef vector< set<size_t> > AlignmentSets;

  TargetPhraseCollection *collection = new TargetPhraseCollection();

  for (StatsMap::const_iterator entry = allStats->begin(); entry != allStats->end(); ++entry) {
    multiModelCountsStatistics *stats = entry->second;

    // Lexical weights cannot be computed without word alignments.
    if (stats->targetPhrase->GetAlignTerm().GetSize() == 0) {
      UTIL_THROW(util::Exception, " alignment information empty\ncount-tables need to include alignment information for computation of lexical weights.\nUse --phrase-word-alignment during training; for on-disk tables, also set -alignment-info when creating on-disk tables.");
    }

    // View of the target phrase as a plain Phrase, as the lexical helpers expect.
    const Phrase &target = static_cast<const Phrase&>(*stats->targetPhrase);

    try {
      // .first is used for the f2e direction, .second for the e2f direction.
      pair<AlignmentSets, AlignmentSets> alignment = GetAlignmentsForLexWeights(src, target, stats->targetPhrase->GetAlignTerm());

      double lexst = ComputeWeightedLexicalTranslation(target, src, alignment.second, m_lexTable_e2f, multimodelweights[1], false );
      double lexts = ComputeWeightedLexicalTranslation(src, target, alignment.first, m_lexTable_f2e, multimodelweights[3], true );

      Scores scoreVector(5);
      scoreVector[0] = FloorScore(TransformScore(m_combineFunction(stats->fst, stats->ft, multimodelweights[0])));
      scoreVector[1] = FloorScore(TransformScore(lexst));
      scoreVector[2] = FloorScore(TransformScore(m_combineFunction(stats->fst, fs, multimodelweights[2])));
      scoreVector[3] = FloorScore(TransformScore(lexts));
      scoreVector[4] = FloorScore(TransformScore(2.718));

      stats->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
      stats->targetPhrase->Evaluate(src, GetFeaturesToApply());
    } catch (AlignmentException& e) {
      // Phrase pair cannot be scored; leave it out of the collection.
      continue;
    }

    // The collection receives its own copy; the original is freed with allStats.
    collection->Add(new TargetPhrase(*stats->targetPhrase));
  }

  RemoveAllInMap(*allStats);
  delete allStats;
  return collection;
}
Exemplo n.º 3
0
// Optimizes the model weights on a set of (source, target) phrase pairs.
//
// phrase_pair_vector: phrase pairs as strings; duplicates are merged and
//                     their multiplicity is stored as the frequency `f` of
//                     the pair.
//
// For every distinct pair the sufficient statistics of the source phrase are
// collected. Pairs whose target string is not among the collected statistics,
// or for which an AlignmentException is raised while caching lexical
// statistics, are left out of the optimization.
//
// Returns a vector of m_numModels*4 weights laid out feature-major:
// ret[iFeature*m_numModels + i] is the weight of model i for feature iFeature.
// In "interpolate" mode each feature's weights are passed through
// normalizeWeights(); in "instance_weighting" mode they are divided by the
// first model's weight.
vector<float> PhraseDictionaryMultiModelCounts::MinimizePerplexity(vector<pair<string, string> > &phrase_pair_vector)
{
  typedef map<pair<string, string>, size_t> PairCounts;
  typedef map<string, multiModelCountsStatistics*> StatsMap;

  const StaticData &staticData = StaticData::Instance();
  const string& factorDelimiter = staticData.GetFactorDelimiter();

  // Collapse duplicates, remembering how often each pair occurred.
  PairCounts phrase_pair_map;
  for ( vector<pair<string, string> >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) {
    phrase_pair_map[*iter] += 1;
  }

  vector<multiModelCountsStatisticsOptimization*> optimizerStats;

  for ( PairCounts::iterator iter = phrase_pair_map.begin(); iter != phrase_pair_map.end(); ++iter ) {

    // Bind to the map's key instead of copying the pair and both strings.
    const string &source_string = iter->first.first;
    const string &target_string = iter->first.second;

    vector<float> fs(m_numModels);
    StatsMap *allStats = new StatsMap;

    Phrase sourcePhrase(0);
    sourcePhrase.CreateFromString(Input, m_input, source_string, factorDelimiter, NULL);

    CollectSufficientStatistics(sourcePhrase, fs, allStats); //optimization potential: only call this once per source phrase

    //phrase pair not found; leave cache empty
    StatsMap::const_iterator found = allStats->find(target_string);
    if (found == allStats->end()) {
      RemoveAllInMap(*allStats);
      delete allStats;
      continue;
    }

    // Reuse the iterator from find() rather than looking the key up again.
    const multiModelCountsStatistics *collected = found->second;

    multiModelCountsStatisticsOptimization * targetStatistics = new multiModelCountsStatisticsOptimization();
    targetStatistics->targetPhrase = new TargetPhrase(*collected->targetPhrase);
    targetStatistics->fs = fs;
    targetStatistics->fst = collected->fst;
    targetStatistics->ft = collected->ft;
    targetStatistics->f = iter->second;

    try {
      pair<vector< set<size_t> >, vector< set<size_t> > > alignment = GetAlignmentsForLexWeights(sourcePhrase, static_cast<const Phrase&>(*targetStatistics->targetPhrase), targetStatistics->targetPhrase->GetAlignTerm());
      targetStatistics->lexCachee2f = CacheLexicalStatistics(static_cast<const Phrase&>(*targetStatistics->targetPhrase), sourcePhrase, alignment.second, m_lexTable_e2f, false );
      targetStatistics->lexCachef2e = CacheLexicalStatistics(sourcePhrase, static_cast<const Phrase&>(*targetStatistics->targetPhrase), alignment.first, m_lexTable_f2e, true );

      optimizerStats.push_back(targetStatistics);
    } catch (const AlignmentException&) {
      // The pair is skipped, so nothing else will ever own targetStatistics:
      // free it here the same way RemoveAllInColl() frees the kept entries.
      // NOTE(review): assumes the statistics destructor releases targetPhrase,
      // as the final cleanup of optimizerStats already relies on -- confirm.
      delete targetStatistics;
    }

    RemoveAllInMap(*allStats);
    delete allStats;
  }

  Sentence sentence;
  CleanUpAfterSentenceProcessing(sentence); // free memory used by compact phrase tables

  vector<float> ret (m_numModels*4);
  for (size_t iFeature=0; iFeature < 4; iFeature++) {

    CrossEntropyCounts * ObjectiveFunction = new CrossEntropyCounts(optimizerStats, this, iFeature);

    vector<float> weight_vector = Optimize(ObjectiveFunction, m_numModels);

    if (m_mode == "interpolate") {
      weight_vector = normalizeWeights(weight_vector);
    } else if (m_mode == "instance_weighting") {
      // Express every weight relative to the first model's weight.
      float first_value = weight_vector[0];
      for (size_t i=0; i < m_numModels; i++) {
        weight_vector[i] = weight_vector[i]/first_value;
      }
    }
    cerr << "Weight vector for feature " << iFeature << ": ";
    for (size_t i=0; i < m_numModels; i++) {
      ret[(iFeature*m_numModels)+i] = weight_vector[i];
      cerr << weight_vector[i] << " ";
    }
    cerr << endl;
    delete ObjectiveFunction;
  }

  RemoveAllInColl(optimizerStats);
  return ret;

}
// Optimizes the model weights on a set of (source, target) phrase pairs.
//
// phrase_pair_vector: phrase pairs as strings; duplicates are merged and
//                     their multiplicity is stored as the frequency `f` of
//                     the pair.
//
// For every distinct pair the sufficient statistics of the source phrase are
// collected; pairs whose target string is not among them are skipped.
//
// Returns a vector of m_numModels*m_numScoreComponents weights laid out
// feature-major: ret[iFeature*m_numModels + i] is the weight of model i for
// feature iFeature. In "interpolate" mode each feature's weights are passed
// through normalizeWeights().
vector<float> PhraseDictionaryMultiModel::MinimizePerplexity(vector<pair<string, string> > &phrase_pair_vector)
{
  typedef map<pair<string, string>, size_t> PairCounts;
  typedef map<string, multiModelStatistics*> StatsMap;

  // Collapse duplicates, remembering how often each pair occurred.
  PairCounts phrase_pair_map;
  for ( vector<pair<string, string> >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) {
    phrase_pair_map[*iter] += 1;
  }

  vector<multiModelStatisticsOptimization*> optimizerStats;

  for ( PairCounts::iterator iter = phrase_pair_map.begin(); iter != phrase_pair_map.end(); ++iter ) {

    // Bind to the map's key instead of copying the pair and both strings.
    const string &source_string = iter->first.first;
    const string &target_string = iter->first.second;

    StatsMap *allStats = new StatsMap;

    Phrase sourcePhrase(0);
    sourcePhrase.CreateFromString(Input, m_input, source_string, NULL);

    CollectSufficientStatistics(sourcePhrase, allStats); //optimization potential: only call this once per source phrase

    //phrase pair not found; leave cache empty
    StatsMap::const_iterator found = allStats->find(target_string);
    if (found == allStats->end()) {
      RemoveAllInMap(*allStats);
      delete allStats;
      continue;
    }

    // Reuse the iterator from find() rather than looking the key up again.
    const multiModelStatistics *collected = found->second;

    // Deep-copy what the optimizer needs; the collected statistics are freed
    // at the end of this iteration.
    multiModelStatisticsOptimization* targetStatistics = new multiModelStatisticsOptimization();
    targetStatistics->targetPhrase = new TargetPhrase(*collected->targetPhrase);
    targetStatistics->p = collected->p;
    targetStatistics->f = iter->second;
    optimizerStats.push_back(targetStatistics);

    RemoveAllInMap(*allStats);
    delete allStats;
  }

  Sentence sentence;
  CleanUpAfterSentenceProcessing(sentence); // free memory used by compact phrase tables

  size_t numWeights = m_numScoreComponents;

  vector<float> ret (m_numModels*numWeights);
  for (size_t iFeature=0; iFeature < numWeights; iFeature++) {

    CrossEntropy * ObjectiveFunction = new CrossEntropy(optimizerStats, this, iFeature);

    vector<float> weight_vector = Optimize(ObjectiveFunction, m_numModels);

    if (m_mode == "interpolate") {
      weight_vector = normalizeWeights(weight_vector);
    }

    cerr << "Weight vector for feature " << iFeature << ": ";
    for (size_t i=0; i < m_numModels; i++) {
      ret[(iFeature*m_numModels)+i] = weight_vector[i];
      cerr << weight_vector[i] << " ";
    }
    cerr << endl;
    delete ObjectiveFunction;
  }

  RemoveAllInColl(optimizerStats);
  return ret;

}