const TargetPhraseCollection*
PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) const {
  
  // There is no souch source phrase if source phrase is longer than longest
  // observed source phrase during compilation 
  if(sourcePhrase.GetSize() > m_phraseDecoder->GetMaxSourcePhraseLength())
    return NULL;

  // Retrieve target phrase collection from phrase table
  TargetPhraseVectorPtr decodedPhraseColl
    = m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);
  
  if(decodedPhraseColl != NULL && decodedPhraseColl->size()) {
    TargetPhraseVectorPtr tpv(new TargetPhraseVector(*decodedPhraseColl));
    TargetPhraseCollection* phraseColl = new TargetPhraseCollection();
    
    // Score phrases and if possible apply ttable_limit
    TargetPhraseVector::iterator nth =
      (m_tableLimit == 0 || tpv->size() < m_tableLimit) ?
      tpv->end() : tpv->begin() + m_tableLimit;
    std::nth_element(tpv->begin(), nth, tpv->end(), CompareTargetPhrase());
    for(TargetPhraseVector::iterator it = tpv->begin(); it != nth; it++)
      phraseColl->Add(new TargetPhrase(*it));
    
    // Cache phrase pair for for clean-up or retrieval with PREnc
    const_cast<PhraseDictionaryCompact*>(this)->CacheForCleanup(sourcePhrase, phraseColl);
    
    return phraseColl;
  }
  else
    return NULL;
  
}
void ChartTranslationOptionList::AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range) {
  TargetPhraseCollection *tpc = new TargetPhraseCollection();
  tpc->Add(&phrase);
  waste_memory.push_back(tpc);
  StackVec empty;
  Add(*tpc, empty, range);
}
TargetPhraseCollection* PhraseDictionaryMultiModel::CreateTargetPhraseCollectionLinearInterpolation(const Phrase& src, std::map<std::string,multiModelStatistics*>* allStats, std::vector<std::vector<float> > &multimodelweights) const
{
  TargetPhraseCollection *ret = new TargetPhraseCollection();
  for ( std::map< std::string, multiModelStatistics*>::const_iterator iter = allStats->begin(); iter != allStats->end(); ++iter ) {

    multiModelStatistics * statistics = iter->second;

    Scores scoreVector(m_numScoreComponents);

    for(size_t i = 0; i < m_numScoreComponents; ++i) {
      scoreVector[i] = TransformScore(std::inner_product(statistics->p[i].begin(), statistics->p[i].end(), multimodelweights[i].begin(), 0.0));
    }

    statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);

    //correct future cost estimates and total score
    vector<FeatureFunction*> pd_feature;
    pd_feature.push_back(const_cast<PhraseDictionaryMultiModel*>(this));
    const vector<FeatureFunction*> pd_feature_const(pd_feature);
    statistics->targetPhrase->EvaluateInIsolation(src, pd_feature_const);

    ret->Add(new TargetPhrase(*statistics->targetPhrase));
  }
  return ret;
}
void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &inputPath) const
{
    const Phrase &sourcePhrase = inputPath.GetPhrase();
    size_t hash = hash_value(sourcePhrase);

    CacheColl &cache = GetCache();

    std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >::iterator iter;
    iter = cache.find(hash);

    if (iter != cache.end()) {
    	// already in cache
    	const TargetPhraseCollection *tpColl = iter->second.first;
    	inputPath.SetTargetPhrases(*this, tpColl, NULL);
    }
    else {
        // TRANSLITERATE
    	char *ptr = tmpnam(NULL);
    	string inFile(ptr);
    	ptr = tmpnam(NULL);
    	string outDir(ptr);

    	ofstream inStream(inFile.c_str());
    	inStream << sourcePhrase.ToString() << endl;
    	inStream.close();

    	string cmd = m_scriptDir + "/Transliteration/prepare-transliteration-phrase-table.pl" +
    			" --transliteration-model-dir " + m_filePath +
    			" --moses-src-dir " + m_mosesDir +
    			" --external-bin-dir " + m_externalDir +
    			" --input-extension " + m_inputLang +
    			" --output-extension " + m_outputLang +
    			" --oov-file " + inFile +
    			" --out-dir " + outDir;

    	int ret = system(cmd.c_str());
    	UTIL_THROW_IF2(ret != 0, "Transliteration script error");

    	TargetPhraseCollection *tpColl = new TargetPhraseCollection();
    	vector<TargetPhrase*> targetPhrases = CreateTargetPhrases(sourcePhrase, outDir);
    	vector<TargetPhrase*>::const_iterator iter;
    	for (iter = targetPhrases.begin(); iter != targetPhrases.end(); ++iter) {
    		TargetPhrase *tp = *iter;
    		tpColl->Add(tp);
    	}

    	std::pair<const TargetPhraseCollection*, clock_t> value(tpColl, clock());
    	cache[hash] = value;

    	inputPath.SetTargetPhrases(*this, tpColl, NULL);

    	// clean up temporary files
    	remove(inFile.c_str());

    	cmd = "rm -rf " + outDir;
    	system(cmd.c_str());
    }
}
Exemplo n.º 5
0
void SkeletonPT::GetTargetPhraseCollectionBatch(const InputPathList &phraseDictionaryQueue) const
{
  InputPathList::const_iterator iter;
  for (iter = phraseDictionaryQueue.begin(); iter != phraseDictionaryQueue.end(); ++iter) {
    InputPath &inputPath = **iter;

    TargetPhrase *tp = CreateTargetPhrase(inputPath.GetPhrase());
    TargetPhraseCollection *tpColl = new TargetPhraseCollection();
    tpColl->Add(tp);

    m_allTPColl.push_back(tpColl);
    inputPath.SetTargetPhrases(*this, tpColl, NULL);
  }
}
const TargetPhraseCollection*
PhraseDictionaryDynSuffixArray::
GetTargetPhraseCollectionLEGACY(const Phrase& src) const
{
  typedef map<SAPhrase, vector<float> >::value_type pstat_entry;
  map<SAPhrase, vector<float> > pstats; // phrase (pair) statistics
  m_biSA->GatherCands(src,pstats);

  TargetPhraseCollection *ret = new TargetPhraseCollection();
  BOOST_FOREACH(pstat_entry & e, pstats) {
    TargetPhrase* tp = m_biSA->GetMosesFactorIDs(e.first, src, this);
    tp->GetScoreBreakdown().Assign(this,e.second);
    tp->EvaluateInIsolation(src);
    ret->Add(tp);
  }
Exemplo n.º 7
0
std::vector <ChartTranslationOptions*> Sentence::GetXmlChartTranslationOptions() const
{
  const StaticData &staticData = StaticData::Instance();
  std::vector <ChartTranslationOptions*> ret;

  // XML Options
  // this code is a copy of the 1 in Sentence.

  //only fill the vector if we are parsing XML
  if (staticData.GetXmlInputType() != XmlPassThrough ) {
    //TODO: needed to handle exclusive
    //for (size_t i=0; i<GetSize(); i++) {
    //  m_xmlCoverageMap.push_back(false);
    //}

    //iterXMLOpts will be empty for XmlIgnore
    //look at each column
    for(std::vector<XmlOption*>::const_iterator iterXmlOpts = m_xmlOptions.begin();
        iterXmlOpts != m_xmlOptions.end(); iterXmlOpts++) {

      const XmlOption &xmlOption = **iterXmlOpts;
      TargetPhrase *targetPhrase = new TargetPhrase(xmlOption.targetPhrase);

      WordsRange *range = new WordsRange(xmlOption.range);
      StackVec emptyStackVec; // hmmm... maybe dangerous, but it is never consulted

      TargetPhraseCollection *tpc = new TargetPhraseCollection;
      tpc->Add(targetPhrase);

      ChartTranslationOptions *transOpt = new ChartTranslationOptions(*tpc, emptyStackVec, *range, 0.0f);
      ret.push_back(transOpt);

      //TODO: needed to handle exclusive
      //for(size_t j=transOpt->GetSourceWordsRange().GetStartPos(); j<=transOpt->GetSourceWordsRange().GetEndPos(); j++) {
      //  m_xmlCoverageMap[j]=true;
      //}
    }
  }

  return ret;
}
Exemplo n.º 8
0
TargetPhraseCollection* PhraseDictionaryMultiModelCounts::CreateTargetPhraseCollectionCounts(const Phrase &src, vector<float> &fs, map<string,multiModelCountsStatistics*>* allStats, vector<vector<float> > &multimodelweights) const
{
  TargetPhraseCollection *ret = new TargetPhraseCollection();
  for ( map< string, multiModelCountsStatistics*>::const_iterator iter = allStats->begin(); iter != allStats->end(); ++iter ) {

    multiModelCountsStatistics * statistics = iter->second;

    if (statistics->targetPhrase->GetAlignTerm().GetSize() == 0) {
      UTIL_THROW(util::Exception, " alignment information empty\ncount-tables need to include alignment information for computation of lexical weights.\nUse --phrase-word-alignment during training; for on-disk tables, also set -alignment-info when creating on-disk tables.");
    }

    try {
      pair<vector< set<size_t> >, vector< set<size_t> > > alignment = GetAlignmentsForLexWeights(src, static_cast<const Phrase&>(*statistics->targetPhrase), statistics->targetPhrase->GetAlignTerm());
      vector< set<size_t> > alignedToT = alignment.first;
      vector< set<size_t> > alignedToS = alignment.second;
      double lexst = ComputeWeightedLexicalTranslation(static_cast<const Phrase&>(*statistics->targetPhrase), src, alignedToS, m_lexTable_e2f, multimodelweights[1], false );
      double lexts = ComputeWeightedLexicalTranslation(src, static_cast<const Phrase&>(*statistics->targetPhrase), alignedToT, m_lexTable_f2e, multimodelweights[3], true );

      Scores scoreVector(5);
      scoreVector[0] = FloorScore(TransformScore(m_combineFunction(statistics->fst, statistics->ft, multimodelweights[0])));
      scoreVector[1] = FloorScore(TransformScore(lexst));
      scoreVector[2] = FloorScore(TransformScore(m_combineFunction(statistics->fst, fs, multimodelweights[2])));
      scoreVector[3] = FloorScore(TransformScore(lexts));
      scoreVector[4] = FloorScore(TransformScore(2.718));

      statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
      statistics->targetPhrase->Evaluate(src, GetFeaturesToApply());
    } catch (AlignmentException& e) {
      continue;
    }

    ret->Add(new TargetPhrase(*statistics->targetPhrase));
  }

  RemoveAllInMap(*allStats);
  delete allStats;
  return ret;
}
Exemplo n.º 9
0
//! populate this InputType with data from in stream
int TreeInput::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
{
  const StaticData &staticData = StaticData::Instance();

  string line;
  if (getline(in, line, '\n').eof())
    return 0;
  // remove extra spaces
  //line = Trim(line);

  std::vector<XMLParseOutput> sourceLabels;
  std::vector<XmlOption*> xmlOptionsList;
  ProcessAndStripXMLTags(line, sourceLabels, xmlOptionsList);

  // do words 1st - hack
  stringstream strme;
  strme << line << endl;

  Sentence::Read(strme, factorOrder);

  // size input chart
  size_t sourceSize = GetSize();
  m_sourceChart.resize(sourceSize);

  for (size_t pos = 0; pos < sourceSize; ++pos) {
    m_sourceChart[pos].resize(sourceSize - pos);
  }

  // do source labels
  vector<XMLParseOutput>::const_iterator iterLabel;
  for (iterLabel = sourceLabels.begin(); iterLabel != sourceLabels.end(); ++iterLabel) {
    const XMLParseOutput &labelItem = *iterLabel;
    const WordsRange &range = labelItem.m_range;
    const string &label = labelItem.m_label;
    AddChartLabel(range.GetStartPos() + 1, range.GetEndPos() + 1, label, factorOrder);
  }

  // default label
  for (size_t startPos = 0; startPos < sourceSize; ++startPos) {
    for (size_t endPos = startPos; endPos < sourceSize; ++endPos) {
      AddChartLabel(startPos, endPos, staticData.GetInputDefaultNonTerminal(), factorOrder);
    }
  }

  // XML Options

  //only fill the vector if we are parsing XML
  if (staticData.GetXmlInputType() != XmlPassThrough ) {
    //TODO: needed to handle exclusive
    //for (size_t i=0; i<GetSize(); i++) {
    //  m_xmlCoverageMap.push_back(false);
    //}

    //iterXMLOpts will be empty for XmlIgnore
    //look at each column
    for(std::vector<XmlOption*>::const_iterator iterXmlOpts = xmlOptionsList.begin();
        iterXmlOpts != xmlOptionsList.end(); iterXmlOpts++) {

      const XmlOption *xmlOption = *iterXmlOpts;
      TargetPhrase *targetPhrase = new TargetPhrase(xmlOption->targetPhrase);
      *targetPhrase = xmlOption->targetPhrase; // copy everything
      WordsRange *range = new WordsRange(xmlOption->range);
      const StackVec emptyStackVec; // hmmm... maybe dangerous, but it is never consulted

      TargetPhraseCollection *tpc = new TargetPhraseCollection;
      tpc->Add(targetPhrase);

      ChartTranslationOptions *transOpt = new ChartTranslationOptions(*tpc, emptyStackVec, *range, 0.0f);
      m_xmlChartOptionsList.push_back(transOpt);

      //TODO: needed to handle exclusive
      //for(size_t j=transOpt->GetSourceWordsRange().GetStartPos(); j<=transOpt->GetSourceWordsRange().GetEndPos(); j++) {
      //  m_xmlCoverageMap[j]=true;
      //}

      delete xmlOption;
    }

  }

  return 1;
}
bool PhraseDictionaryMemory::Load(const std::vector<FactorType> &input
                                  , const std::vector<FactorType> &output
                                  , const string &filePath
                                  , const vector<float> &weight
                                  , size_t tableLimit
                                  , const LMList &languageModels
                                  , float weightWP)
{
    const_cast<LMList&>(languageModels).InitializeBeforeSentenceProcessing();

    const StaticData &staticData = StaticData::Instance();

    m_tableLimit = tableLimit;

    util::FilePiece inFile(filePath.c_str(), staticData.GetVerboseLevel() >= 1 ? &std::cerr : NULL);

    size_t line_num = 0;
    size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info
    const std::string& factorDelimiter = staticData.GetFactorDelimiter();

    Phrase sourcePhrase(0);
    std::vector<float> scv;
    scv.reserve(m_numScoreComponent);

    TargetPhraseCollection *preSourceNode = NULL;
    std::string preSourceString;

    while(true) {
        ++line_num;
        StringPiece line;
        try {
            line = inFile.ReadLine();
        } catch (util::EndOfFileException &e) {
            break;
        }

        util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter("|||"));
        StringPiece sourcePhraseString(GrabOrDie(pipes, filePath, line_num));
        StringPiece targetPhraseString(GrabOrDie(pipes, filePath, line_num));
        StringPiece scoreString(GrabOrDie(pipes, filePath, line_num));

        bool isLHSEmpty = !util::TokenIter<util::AnyCharacter, true>(sourcePhraseString, util::AnyCharacter(" \t"));
        if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
            TRACE_ERR( filePath << ":" << line_num << ": pt entry contains empty source, skipping\n");
            continue;
        }

        //target
        std::auto_ptr<TargetPhrase> targetPhrase(new TargetPhrase());
        targetPhrase->CreateFromString(output, targetPhraseString, factorDelimiter);

        scv.clear();
        for (util::TokenIter<util::AnyCharacter, true> token(scoreString, util::AnyCharacter(" \t")); token; ++token) {
            char *err_ind;
            // Token is always delimited by some form of space.  Also, apparently strtod is portable but strtof isn't.
            scv.push_back(FloorScore(TransformScore(static_cast<float>(strtod(token->data(), &err_ind)))));
            if (err_ind == token->data()) {
                stringstream strme;
                strme << "Bad number " << token << " on line " << line_num;
                UserMessage::Add(strme.str());
                abort();
            }
        }
        if (scv.size() != m_numScoreComponent) {
            stringstream strme;
            strme << "Size of scoreVector != number (" <<scv.size() << "!=" <<m_numScoreComponent<<") of score components on line " << line_num;
            UserMessage::Add(strme.str());
            abort();
        }



        size_t consumed = 3;
        if (pipes) {
            targetPhrase->SetAlignmentInfo(*pipes++);
            ++consumed;
        }

        ScoreComponentCollection sparse;
        if (pipes) pipes++; //counts
        if (pipes) {
            //sparse features
            SparsePhraseDictionaryFeature* spdf =
                GetFeature()->GetSparsePhraseDictionaryFeature();
            if (spdf) {
                sparse.Assign(spdf,(pipes++)->as_string());
            }
        }


        // scv good to go sir!
        targetPhrase->SetScore(m_feature, scv, sparse, weight, weightWP, languageModels);

        // Check number of entries delimited by ||| agrees across all lines.
        for (; pipes; ++pipes, ++consumed) {}
        if (numElement != consumed) {
            if (numElement == NOT_FOUND) {
                numElement = consumed;
            } else {
                stringstream strme;
                strme << "Syntax error at " << filePath << ":" << line_num;
                UserMessage::Add(strme.str());
                abort();
            }
        }

        //TODO: Would be better to reuse source phrases, but ownership has to be
        //consistent across phrase table implementations
        sourcePhrase.Clear();
        sourcePhrase.CreateFromString(input, sourcePhraseString, factorDelimiter);
        //Now that the source phrase is ready, we give the target phrase a copy
        targetPhrase->SetSourcePhrase(sourcePhrase);
        if (preSourceString == sourcePhraseString && preSourceNode) {
            preSourceNode->Add(targetPhrase.release());
        } else {
            preSourceNode = CreateTargetPhraseCollection(sourcePhrase);
            preSourceNode->Add(targetPhrase.release());
            preSourceString.assign(sourcePhraseString.data(), sourcePhraseString.size());
        }
    }

    // sort each target phrase collection
    m_collection.Sort(m_tableLimit);

    /* // TODO ASK OLIVER WHY THIS IS NEEDED
    const_cast<LMList&>(languageModels).CleanUpAfterSentenceProcessing();
    */

    return true;
}
void PhraseDictionaryNewFormat::AddEquivPhrase(TargetPhraseCollection	&targetPhraseColl, TargetPhrase *targetPhrase)
{
	targetPhraseColl.Add(targetPhrase);
}