std::vector<TargetPhrase*> PhraseDictionaryTransliteration::CreateTargetPhrases(const Phrase &sourcePhrase, const string &outDir) const
{
	std::vector<TargetPhrase*> ret;

	string outPath = outDir + "/out.txt";
	ifstream outStream(outPath.c_str());

	string line;
	while (getline(outStream, line)) {
		vector<string> toks;
		Tokenize(toks, line, "\t");
		UTIL_THROW_IF2(toks.size() != 2, "Error in transliteration output file. Expecting word\tscore");

	  TargetPhrase *tp = new TargetPhrase();
	  Word &word = tp->AddWord();
	  word.CreateFromString(Output, m_output, toks[0], false);

	  float score = Scan<float>(toks[1]);
	  tp->GetScoreBreakdown().PlusEquals(this, score);

	  // score of all other ff when this rule is being loaded
	  tp->Evaluate(sourcePhrase, GetFeaturesToApply());

	  ret.push_back(tp);
	}

	outStream.close();

  return ret;
}
Example #2
0
void PhraseDictionaryMultiModelCounts::CollectSufficientStatistics(const Phrase& src, vector<float> &fs, map<string,multiModelCountsStatistics*>* allStats) const
//fill fs and allStats with statistics from models
{
  for(size_t i = 0; i < m_numModels; ++i) {
    const PhraseDictionary &pd = *m_pd[i];

    TargetPhraseCollection *ret_raw = (TargetPhraseCollection*)  pd.GetTargetPhraseCollection( src);
    if (ret_raw != NULL) {

      TargetPhraseCollection::iterator iterTargetPhrase;
      for (iterTargetPhrase = ret_raw->begin(); iterTargetPhrase != ret_raw->end();  ++iterTargetPhrase) {

        TargetPhrase * targetPhrase = *iterTargetPhrase;
        vector<float> raw_scores = targetPhrase->GetScoreBreakdown().GetScoresForProducer(&pd);

        string targetString = targetPhrase->GetStringRep(m_output);
        if (allStats->find(targetString) == allStats->end()) {

          multiModelCountsStatistics * statistics = new multiModelCountsStatistics;
          statistics->targetPhrase = new TargetPhrase(*targetPhrase); //make a copy so that we don't overwrite the original phrase table info

          // zero out scores from original phrase table
          statistics->targetPhrase->GetScoreBreakdown().ZeroDenseFeatures(&pd);

          statistics->fst.resize(m_numModels);
          statistics->ft.resize(m_numModels);
          Scores scoreVector(5);
          scoreVector[0] = -raw_scores[0];
          scoreVector[1] = -raw_scores[1];
          scoreVector[2] = -raw_scores[2];
          statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); // set scores to 0
          statistics->targetPhrase->Evaluate(src, GetFeaturesToApply());

          (*allStats)[targetString] = statistics;

        }
        multiModelCountsStatistics * statistics = (*allStats)[targetString];

        statistics->fst[i] = UntransformScore(raw_scores[0]);
        statistics->ft[i] = UntransformScore(raw_scores[1]);
        fs[i] = UntransformScore(raw_scores[2]);
        (*allStats)[targetString] = statistics;
      }
    }
  }

  // get target phrase frequency for models which have not seen the phrase pair
  for ( map< string, multiModelCountsStatistics*>::const_iterator iter = allStats->begin(); iter != allStats->end(); ++iter ) {
    multiModelCountsStatistics * statistics = iter->second;

    for (size_t i = 0; i < m_numModels; ++i) {
      if (!statistics->ft[i]) {
        statistics->ft[i] = GetTargetCount(static_cast<const Phrase&>(*statistics->targetPhrase), i);
      }
    }
  }
}
Example #3
0
// Builds a one-word target phrase whose word is factor 0 of the first source
// word, prefixed with "SkeletonPT:".
//
// Requires a non-empty source phrase and exactly one output factor (both
// enforced with CHECK). Every score component of this phrase table is set to
// 1.3, then the other feature functions are evaluated for the new phrase.
//
// Returns a phrase allocated with new; this function does not keep it.
TargetPhrase *SkeletonPT::CreateTargetPhrase(const Phrase &sourcePhrase) const
{
  CHECK(sourcePhrase.GetSize());
  CHECK(m_output.size() == 1);

  // Surface string for the single target word.
  string firstWord = sourcePhrase.GetWord(0).GetFactor(0)->GetString().as_string();
  string label = "SkeletonPT:" + firstWord;

  TargetPhrase *result = new TargetPhrase();
  result->AddWord().CreateFromString(Output, m_output, label, false);

  // Score contributed by this phrase table.
  vector<float> ownScores(m_numScoreComponents, 1.3);
  result->GetScoreBreakdown().PlusEquals(this, ownScores);

  // Scores of all other feature functions, computed while the rule is loaded.
  result->Evaluate(sourcePhrase, GetFeaturesToApply());

  return result;
}
Example #4
0
// Converts the collected statistics into a scored TargetPhraseCollection.
//
// For each entry of allStats five scores are computed and assigned to the
// entry's target phrase:
//   [0] m_combineFunction(fst, ft, multimodelweights[0])
//   [1] weighted lexical translation using m_lexTable_e2f, multimodelweights[1]
//   [2] m_combineFunction(fst, fs, multimodelweights[2])
//   [3] weighted lexical translation using m_lexTable_f2e, multimodelweights[3]
//   [4] the constant 2.718
// each passed through TransformScore and FloorScore. A copy of the scored
// phrase is added to the result. Entries whose processing raises an
// AlignmentException are skipped.
//
// This function consumes allStats: its contents are released with
// RemoveAllInMap and the map itself is deleted before returning.
//
// Throws util::Exception if a target phrase carries no terminal alignment.
// NOTE(review): on that path neither the result collection nor allStats is
// released here - confirm how callers deal with it.
//
// Returns a collection allocated with new.
TargetPhraseCollection* PhraseDictionaryMultiModelCounts::CreateTargetPhraseCollectionCounts(const Phrase &src, vector<float> &fs, map<string,multiModelCountsStatistics*>* allStats, vector<vector<float> > &multimodelweights) const
{
  typedef map<string, multiModelCountsStatistics*> StatsMap;

  TargetPhraseCollection *collection = new TargetPhraseCollection();

  for (StatsMap::const_iterator entry = allStats->begin(); entry != allStats->end(); ++entry) {
    multiModelCountsStatistics *stats = entry->second;
    TargetPhrase *candidate = stats->targetPhrase;

    if (candidate->GetAlignTerm().GetSize() == 0) {
      UTIL_THROW(util::Exception, " alignment information empty\ncount-tables need to include alignment information for computation of lexical weights.\nUse --phrase-word-alignment during training; for on-disk tables, also set -alignment-info when creating on-disk tables.");
    }

    try {
      const Phrase &candidateAsPhrase = static_cast<const Phrase&>(*candidate);

      // first: per-position alignment sets passed to the f2e computation,
      // second: those passed to the e2f computation.
      pair<vector< set<size_t> >, vector< set<size_t> > > alignment = GetAlignmentsForLexWeights(src, candidateAsPhrase, candidate->GetAlignTerm());

      double lexst = ComputeWeightedLexicalTranslation(candidateAsPhrase, src, alignment.second, m_lexTable_e2f, multimodelweights[1], false );
      double lexts = ComputeWeightedLexicalTranslation(src, candidateAsPhrase, alignment.first, m_lexTable_f2e, multimodelweights[3], true );

      Scores combined(5);
      combined[0] = FloorScore(TransformScore(m_combineFunction(stats->fst, stats->ft, multimodelweights[0])));
      combined[1] = FloorScore(TransformScore(lexst));
      combined[2] = FloorScore(TransformScore(m_combineFunction(stats->fst, fs, multimodelweights[2])));
      combined[3] = FloorScore(TransformScore(lexts));
      combined[4] = FloorScore(TransformScore(2.718));

      candidate->GetScoreBreakdown().Assign(this, combined);
      candidate->Evaluate(src, GetFeaturesToApply());
    } catch (AlignmentException&) {
      // Leave this phrase out of the result and move on to the next one.
      continue;
    }

    // The statistics record is released below, so store an independent copy.
    collection->Add(new TargetPhrase(*candidate));
  }

  RemoveAllInMap(*allStats);
  delete allStats;
  return collection;
}
// Builds the rule table used for one input sentence.
//
// 1. Creates a fresh temporary directory from the template /tmp/moses.XXXXXX.
// 2. Writes the input words, except the first and the last position, to
//    "<tmpdir>/in" on a single line.
// 3. Calls m_FuzzyMatchWrapper->Extract, which returns the name of a phrase
//    table file for this translation id.
// 4. Parses that file ("|||"-separated fields: source, target, scores,
//    alignment, optionally a fifth field) and stores each entry under
//    m_collection[translationId].
// 5. Sorts and prunes the resulting collections.
//
// Throws (UTIL_THROW_IF2) if the temporary directory cannot be created.
// Calls abort() on a line with the wrong number of fields or the wrong number
// of scores. Line numbers in diagnostics are 1-based and count every line
// read, including skipped ones.
//
// NOTE(review): the temporary directory is not removed afterwards (the
// cleanup call at the end is commented out).
void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSentence)
{
  char dirName[] = "/tmp/moses.XXXXXX";
  char *temp = mkdtemp(dirName);
  UTIL_THROW_IF2(temp == NULL,
                 "Couldn't create temporary directory " << dirName);

  string dirNameStr(dirName);

  string inFileName(dirNameStr + "/in");

  ofstream inFile(inFileName.c_str());

  // Positions 0 and size-1 are left out (presumably sentence boundary
  // markers - confirm). The bound is written as i + 1 < size because
  // size - 1 wraps around to a huge unsigned value when size is 0.
  for (size_t i = 1; i + 1 < inputSentence.GetSize(); ++i) {
    inFile << inputSentence.GetWord(i);
  }
  inFile << endl;
  inFile.close();

  long translationId = inputSentence.GetTranslationId();
  string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr);

  // populate with rules for this sentence
  PhraseDictionaryNodeMemory &rootNode = m_collection[translationId];

  // data from file
  InputFileStream inStream(ptFileName);

  // copied from class LoaderStandard
  PrintUserTime("Start loading fuzzy-match phrase model");

  const StaticData &staticData = StaticData::Instance();
  const std::string& factorDelimiter = staticData.GetFactorDelimiter();

  // Lines are used exactly as read; the format was fixed to MosesFormat here,
  // so no Hiero-style reformatting step exists.
  string line;
  size_t lineNum = 0; // 1-based number of the line currently being parsed

  while(getline(inStream, line)) {
    ++lineNum;

    vector<string> tokens;
    vector<float> scoreVector;

    TokenizeMultiCharSeparator(tokens, line , "|||" );

    if (tokens.size() != 4 && tokens.size() != 5) {
      stringstream strme;
      strme << "Syntax error at " << ptFileName << ":" << lineNum;
      UserMessage::Add(strme.str());
      abort();
    }

    const string &sourcePhraseString = tokens[0];
    const string &targetPhraseString = tokens[1];
    const string &scoreString        = tokens[2];
    const string &alignString        = tokens[3];

    // NOTE(review): the test looks at the source side while the message says
    // "empty target" - confirm which of the two is intended.
    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( ptFileName << ":" << lineNum << ": pt entry contains empty target, skipping\n");
      continue;
    }

    Tokenize<float>(scoreVector, scoreString);
    const size_t numScoreComponents = GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      stringstream strme;
      strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
            << numScoreComponents << ") of score components on line " << lineNum;
      UserMessage::Add(strme.str());
      abort();
    }

    // parse source & find pt node

    // constituent labels, filled in by the CreateFromString calls below
    Word *sourceLHS = NULL;
    Word *targetLHS = NULL;

    // source
    Phrase sourcePhrase( 0);
    sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, factorDelimiter, &sourceLHS);

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase();
    targetPhrase->CreateFromString(Output, m_output, targetPhraseString, factorDelimiter, &targetLHS);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString);
    targetPhrase->SetTargetLHS(targetLHS);
    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

    // component score, for n-best output
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);

    targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
    targetPhrase->Evaluate(sourcePhrase, GetFeaturesToApply());

    // The phrase is handed to the collection stored under rootNode.
    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, sourceLHS);
    phraseColl.Add(targetPhrase);
  }

  // sort and prune each target phrase collection
  SortAndPrune(rootNode);

  //removedirectoryrecursively(dirName);
}