const TargetPhraseCollection *PhraseDictionaryMultiModel::GetTargetPhraseCollectionLEGACY(const Phrase& src) const
{

  std::vector<std::vector<float> > multimodelweights = getWeights(m_numScoreComponents, true);
  // collect the component models' statistics, interpolate, then free them
  std::map<std::string,multiModelStatistics*>* allStats = new std::map<std::string,multiModelStatistics*>();
  CollectSufficientStatistics(src, allStats);
  TargetPhraseCollection *ret = CreateTargetPhraseCollectionLinearInterpolation(src, allStats, multimodelweights);
  RemoveAllInMap(*allStats);
  delete allStats;

  ret->NthElement(m_tableLimit); // sort the phrases for pruning later
  const_cast<PhraseDictionaryMultiModel*>(this)->CacheForCleanup(ret);

  return ret;
}
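
// --- Illustration (not part of Moses): what the linear interpolation above
// computes per feature. Assuming each model's scores are stored as plain
// probabilities, the combined score is p(e|f) = sum_i lambda_i * p_i(e|f),
// with one weight vector per feature. A minimal, self-contained sketch
// (the function name is hypothetical):
#include <cstddef>
#include <vector>

std::vector<float> InterpolateScoresSketch(
  const std::vector<std::vector<float> > &modelScores,       // [model][feature]
  const std::vector<std::vector<float> > &multimodelweights) // [feature][model]
{
  std::size_t numFeatures = multimodelweights.size();
  std::vector<float> combined(numFeatures, 0.0f);
  for (std::size_t f = 0; f < numFeatures; ++f) {
    for (std::size_t m = 0; m < modelScores.size(); ++m) {
      combined[f] += multimodelweights[f][m] * modelScores[m][f];
    }
  }
  return combined;
}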
const TargetPhraseCollection *PhraseDictionaryMultiModelCounts::GetTargetPhraseCollection(const Phrase& src) const
{
  // weights are normalized in "interpolate" mode; the counts model always has
  // four features: the two phrase translation probabilities and the two lexical weights
  bool normalize = (m_mode == "interpolate");
  vector<vector<float> > multimodelweights = getWeights(4, normalize);

  //source phrase frequency is shared among all phrase pairs
  vector<float> fs(m_numModels);

  map<string,multiModelCountsStatistics*>* allStats = new map<string,multiModelCountsStatistics*>();

  CollectSufficientStatistics(src, fs, allStats);

  TargetPhraseCollection *ret = CreateTargetPhraseCollectionCounts(src, fs, allStats, multimodelweights);

  ret->NthElement(m_tableLimit); // sort the phrases for pruning later
  const_cast<PhraseDictionaryMultiModelCounts*>(this)->CacheForCleanup(ret);
  return ret;
}
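
// --- Illustration (not part of Moses): the counts-based combination estimates
// translation probabilities from weighted raw counts rather than interpolating
// the models' probabilities. Roughly, given per-model joint counts c_i(e,f)
// and source counts c_i(f), the instance-weighted estimate is
// p(e|f) = sum_i lambda_i*c_i(e,f) / sum_i lambda_i*c_i(f). A minimal sketch
// (the function name is hypothetical):
#include <cstddef>
#include <vector>

float CombineCountsSketch(const std::vector<float> &fst,     // c_i(e,f) per model
                          const std::vector<float> &fs,      // c_i(f) per model
                          const std::vector<float> &lambda)  // one weight per model
{
  float joint = 0.0f, marginal = 0.0f;
  for (std::size_t i = 0; i < fst.size(); ++i) {
    joint    += lambda[i] * fst[i];
    marginal += lambda[i] * fs[i];
  }
  return (marginal > 0.0f) ? (joint / marginal) : 0.0f;
}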
vector<float> PhraseDictionaryMultiModelCounts::MinimizePerplexity(vector<pair<string, string> > &phrase_pair_vector)
{

  const StaticData &staticData = StaticData::Instance();
  const string& factorDelimiter = staticData.GetFactorDelimiter();

  map<pair<string, string>, size_t> phrase_pair_map;

  for ( vector<pair<string, string> >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) {
    phrase_pair_map[*iter] += 1;
  }

  vector<multiModelCountsStatisticsOptimization*> optimizerStats;

  for ( map<pair<string, string>, size_t>::iterator iter = phrase_pair_map.begin(); iter != phrase_pair_map.end(); ++iter ) {

    pair<string, string> phrase_pair = iter->first;
    string source_string = phrase_pair.first;
    string target_string = phrase_pair.second;

    vector<float> fs(m_numModels);
    map<string,multiModelCountsStatistics*>* allStats = new map<string,multiModelCountsStatistics*>();

    Phrase sourcePhrase(0);
    sourcePhrase.CreateFromString(Input, m_input, source_string, factorDelimiter, NULL);

    CollectSufficientStatistics(sourcePhrase, fs, allStats); //optimization potential: only call this once per source phrase

    //phrase pair not found; leave cache empty
    if (allStats->find(target_string) == allStats->end()) {
      RemoveAllInMap(*allStats);
      delete allStats;
      continue;
    }

    multiModelCountsStatisticsOptimization * targetStatistics = new multiModelCountsStatisticsOptimization();
    targetStatistics->targetPhrase = new TargetPhrase(*(*allStats)[target_string]->targetPhrase);
    targetStatistics->fs = fs;
    targetStatistics->fst = (*allStats)[target_string]->fst;
    targetStatistics->ft = (*allStats)[target_string]->ft;
    targetStatistics->f = iter->second;

    try {
      pair<vector< set<size_t> >, vector< set<size_t> > > alignment = GetAlignmentsForLexWeights(sourcePhrase, static_cast<const Phrase&>(*targetStatistics->targetPhrase), targetStatistics->targetPhrase->GetAlignTerm());
      targetStatistics->lexCachee2f = CacheLexicalStatistics(static_cast<const Phrase&>(*targetStatistics->targetPhrase), sourcePhrase, alignment.second, m_lexTable_e2f, false );
      targetStatistics->lexCachef2e = CacheLexicalStatistics(sourcePhrase, static_cast<const Phrase&>(*targetStatistics->targetPhrase), alignment.first, m_lexTable_f2e, true );

      optimizerStats.push_back(targetStatistics);
    } catch (AlignmentException &) {
      // inconsistent alignment: skip this pair, but don't leak its statistics
      delete targetStatistics->targetPhrase;
      delete targetStatistics;
    }

    RemoveAllInMap(*allStats);
    delete allStats;
  }

  Sentence sentence;
  CleanUpAfterSentenceProcessing(sentence); // free memory used by compact phrase tables

  vector<float> ret (m_numModels*4); // one weight per model for each of the four features
  for (size_t iFeature=0; iFeature < 4; iFeature++) {

    CrossEntropyCounts * ObjectiveFunction = new CrossEntropyCounts(optimizerStats, this, iFeature);

    vector<float> weight_vector = Optimize(ObjectiveFunction, m_numModels);

    if (m_mode == "interpolate") {
      weight_vector = normalizeWeights(weight_vector);
    } else if (m_mode == "instance_weighting") {
      // remove one degree of freedom by scaling so the first model's weight is 1
      float first_value = weight_vector[0];
      for (size_t i=0; i < m_numModels; i++) {
        weight_vector[i] = weight_vector[i]/first_value;
      }
    }
    cerr << "Weight vector for feature " << iFeature << ": ";
    for (size_t i=0; i < m_numModels; i++) {
      ret[(iFeature*m_numModels)+i] = weight_vector[i];
      cerr << weight_vector[i] << " ";
    }
    cerr << endl;
    delete ObjectiveFunction;
  }

  RemoveAllInColl(optimizerStats);
  return ret;

}
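
// --- Illustration (not part of Moses): the objective behind MinimizePerplexity.
// For each feature, Optimize() searches for the weight vector that minimizes
// the cross-entropy of the combined model on the tuning phrase pairs, i.e.
// -sum_j f_j * log p(e_j|f_j), where f_j is how often pair j was observed.
// A minimal sketch with hypothetical names, assuming per-model probabilities
// have already been collected for each pair:
#include <cmath>
#include <cstddef>
#include <vector>

struct PairStatsSketch {
  float frequency;                // f_j: occurrences in the tuning set
  std::vector<float> modelProbs;  // p_i(e_j|f_j), one per model
};

double CrossEntropySketch(const std::vector<PairStatsSketch> &stats,
                          const std::vector<float> &lambda)
{
  double total = 0.0;
  for (std::size_t j = 0; j < stats.size(); ++j) {
    double p = 0.0;
    for (std::size_t i = 0; i < lambda.size(); ++i) {
      p += lambda[i] * stats[j].modelProbs[i]; // linear interpolation
    }
    if (p > 0.0) {
      total -= stats[j].frequency * std::log(p);
    }
  }
  return total;
}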
vector<float> PhraseDictionaryMultiModel::MinimizePerplexity(vector<pair<string, string> > &phrase_pair_vector)
{

  map<pair<string, string>, size_t> phrase_pair_map;

  for ( vector<pair<string, string> >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) {
    phrase_pair_map[*iter] += 1;
  }

  vector<multiModelStatisticsOptimization*> optimizerStats;

  for ( map<pair<string, string>, size_t>::iterator iter = phrase_pair_map.begin(); iter != phrase_pair_map.end(); ++iter ) {

    pair<string, string> phrase_pair = iter->first;
    string source_string = phrase_pair.first;
    string target_string = phrase_pair.second;

    map<string,multiModelStatistics*>* allStats = new map<string,multiModelStatistics*>();

    Phrase sourcePhrase(0);
    sourcePhrase.CreateFromString(Input, m_input, source_string, NULL);

    CollectSufficientStatistics(sourcePhrase, allStats); //optimization potential: only call this once per source phrase

    //phrase pair not found; leave cache empty
    if (allStats->find(target_string) == allStats->end()) {
      RemoveAllInMap(*allStats);
      delete allStats;
      continue;
    }

    multiModelStatisticsOptimization* targetStatistics = new multiModelStatisticsOptimization();
    targetStatistics->targetPhrase = new TargetPhrase(*(*allStats)[target_string]->targetPhrase);
    targetStatistics->p = (*allStats)[target_string]->p;
    targetStatistics->f = iter->second;
    optimizerStats.push_back(targetStatistics);

    RemoveAllInMap(*allStats);
    delete allStats;
  }

  Sentence sentence;
  CleanUpAfterSentenceProcessing(sentence); // free memory used by compact phrase tables

  size_t numWeights = m_numScoreComponents;

  vector<float> ret (m_numModels*numWeights);
  for (size_t iFeature=0; iFeature < numWeights; iFeature++) {

    CrossEntropy * ObjectiveFunction = new CrossEntropy(optimizerStats, this, iFeature);

    vector<float> weight_vector = Optimize(ObjectiveFunction, m_numModels);

    if (m_mode == "interpolate") {
      weight_vector = normalizeWeights(weight_vector);
    }

    cerr << "Weight vector for feature " << iFeature << ": ";
    for (size_t i=0; i < m_numModels; i++) {
      ret[(iFeature*m_numModels)+i] = weight_vector[i];
      cerr << weight_vector[i] << " ";
    }
    cerr << endl;
    delete ObjectiveFunction;
  }

  RemoveAllInColl(optimizerStats);
  return ret;

}
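
// --- Illustration (not part of Moses): in "interpolate" mode the optimized
// weights are rescaled to sum to 1 before being stored. A minimal standalone
// sketch of such a normalization (a stand-in for normalizeWeights, assuming
// non-negative weights):
#include <cstddef>
#include <vector>

std::vector<float> NormalizeWeightsSketch(std::vector<float> weights)
{
  float total = 0.0f;
  for (std::size_t i = 0; i < weights.size(); ++i) total += weights[i];
  if (total > 0.0f) {
    for (std::size_t i = 0; i < weights.size(); ++i) weights[i] /= total;
  }
  return weights;
}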