TargetPhraseCollection* PhraseDictionaryMultiModel::CreateTargetPhraseCollectionLinearInterpolation(const Phrase& src, std::map<std::string,multiModelStatistics*>* allStats, std::vector<std::vector<float> > &multimodelweights) const
{
  TargetPhraseCollection *ret = new TargetPhraseCollection();
  for ( std::map< std::string, multiModelStatistics*>::const_iterator iter = allStats->begin(); iter != allStats->end(); ++iter ) {

    multiModelStatistics * statistics = iter->second;

    Scores scoreVector(m_numScoreComponents);

    for(size_t i = 0; i < m_numScoreComponents; ++i) {
      scoreVector[i] = TransformScore(std::inner_product(statistics->p[i].begin(), statistics->p[i].end(), multimodelweights[i].begin(), 0.0));
    }

    statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);

    //correct future cost estimates and total score
    vector<FeatureFunction*> pd_feature;
    pd_feature.push_back(const_cast<PhraseDictionaryMultiModel*>(this));
    const vector<FeatureFunction*> pd_feature_const(pd_feature);
    statistics->targetPhrase->EvaluateInIsolation(src, pd_feature_const);

    ret->Add(new TargetPhrase(*statistics->targetPhrase));
  }
  return ret;
}
void ChartTranslationOptionList::AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &waste_memory, const WordsRange &range) {
  TargetPhraseCollection *tpc = new TargetPhraseCollection();
  tpc->Add(&phrase);
  waste_memory.push_back(tpc);
  StackVec empty;
  Add(*tpc, empty, range);
}
const TargetPhraseCollection*
PhraseDictionaryCompact::GetTargetPhraseCollection(const Phrase &sourcePhrase) const {
  
  // There is no souch source phrase if source phrase is longer than longest
  // observed source phrase during compilation 
  if(sourcePhrase.GetSize() > m_phraseDecoder->GetMaxSourcePhraseLength())
    return NULL;

  // Retrieve target phrase collection from phrase table
  TargetPhraseVectorPtr decodedPhraseColl
    = m_phraseDecoder->CreateTargetPhraseCollection(sourcePhrase, true);
  
  if(decodedPhraseColl != NULL && decodedPhraseColl->size()) {
    TargetPhraseVectorPtr tpv(new TargetPhraseVector(*decodedPhraseColl));
    TargetPhraseCollection* phraseColl = new TargetPhraseCollection();
    
    // Score phrases and if possible apply ttable_limit
    TargetPhraseVector::iterator nth =
      (m_tableLimit == 0 || tpv->size() < m_tableLimit) ?
      tpv->end() : tpv->begin() + m_tableLimit;
    std::nth_element(tpv->begin(), nth, tpv->end(), CompareTargetPhrase());
    for(TargetPhraseVector::iterator it = tpv->begin(); it != nth; it++)
      phraseColl->Add(new TargetPhrase(*it));
    
    // Cache phrase pair for for clean-up or retrieval with PREnc
    const_cast<PhraseDictionaryCompact*>(this)->CacheForCleanup(sourcePhrase, phraseColl);
    
    return phraseColl;
  }
  else
    return NULL;
  
}
TargetPhraseCollection::TargetPhraseCollection(const TargetPhraseCollection &copy)
{
  for (const_iterator iter = copy.begin(); iter != copy.end(); ++iter) {
    const TargetPhrase &origTP = **iter;
    TargetPhrase *newTP = new TargetPhrase(origTP);
    Add(newTP);
  }
}
void PhraseDictionaryTransliteration::GetTargetPhraseCollection(InputPath &inputPath) const
{
    const Phrase &sourcePhrase = inputPath.GetPhrase();
    size_t hash = hash_value(sourcePhrase);

    CacheColl &cache = GetCache();

    std::map<size_t, std::pair<const TargetPhraseCollection*, clock_t> >::iterator iter;
    iter = cache.find(hash);

    if (iter != cache.end()) {
    	// already in cache
    	const TargetPhraseCollection *tpColl = iter->second.first;
    	inputPath.SetTargetPhrases(*this, tpColl, NULL);
    }
    else {
        // TRANSLITERATE
    	char *ptr = tmpnam(NULL);
    	string inFile(ptr);
    	ptr = tmpnam(NULL);
    	string outDir(ptr);

    	ofstream inStream(inFile.c_str());
    	inStream << sourcePhrase.ToString() << endl;
    	inStream.close();

    	string cmd = m_scriptDir + "/Transliteration/prepare-transliteration-phrase-table.pl" +
    			" --transliteration-model-dir " + m_filePath +
    			" --moses-src-dir " + m_mosesDir +
    			" --external-bin-dir " + m_externalDir +
    			" --input-extension " + m_inputLang +
    			" --output-extension " + m_outputLang +
    			" --oov-file " + inFile +
    			" --out-dir " + outDir;

    	int ret = system(cmd.c_str());
    	UTIL_THROW_IF2(ret != 0, "Transliteration script error");

    	TargetPhraseCollection *tpColl = new TargetPhraseCollection();
    	vector<TargetPhrase*> targetPhrases = CreateTargetPhrases(sourcePhrase, outDir);
    	vector<TargetPhrase*>::const_iterator iter;
    	for (iter = targetPhrases.begin(); iter != targetPhrases.end(); ++iter) {
    		TargetPhrase *tp = *iter;
    		tpColl->Add(tp);
    	}

    	std::pair<const TargetPhraseCollection*, clock_t> value(tpColl, clock());
    	cache[hash] = value;

    	inputPath.SetTargetPhrases(*this, tpColl, NULL);

    	// clean up temporary files
    	remove(inFile.c_str());

    	cmd = "rm -rf " + outDir;
    	system(cmd.c_str());
    }
}
const TargetPhraseCollection *PhraseNode::GetTargetPhraseCollection(size_t tableLimit, OnDiskWrapper &onDiskWrapper) const
{
  TargetPhraseCollection *ret = new TargetPhraseCollection();

  if (m_value > 0)
    ret->ReadFromFile(tableLimit, m_value, onDiskWrapper);
  else {

  }

  return ret;
}
Example #7
0
void SkeletonPT::GetTargetPhraseCollectionBatch(const InputPathList &phraseDictionaryQueue) const
{
  InputPathList::const_iterator iter;
  for (iter = phraseDictionaryQueue.begin(); iter != phraseDictionaryQueue.end(); ++iter) {
    InputPath &inputPath = **iter;

    TargetPhrase *tp = CreateTargetPhrase(inputPath.GetPhrase());
    TargetPhraseCollection *tpColl = new TargetPhraseCollection();
    tpColl->Add(tp);

    m_allTPColl.push_back(tpColl);
    inputPath.SetTargetPhrases(*this, tpColl, NULL);
  }
}
const TargetPhraseCollection*
PhraseDictionaryDynSuffixArray::
GetTargetPhraseCollectionLEGACY(const Phrase& src) const
{
  typedef map<SAPhrase, vector<float> >::value_type pstat_entry;
  map<SAPhrase, vector<float> > pstats; // phrase (pair) statistics
  m_biSA->GatherCands(src,pstats);

  TargetPhraseCollection *ret = new TargetPhraseCollection();
  BOOST_FOREACH(pstat_entry & e, pstats) {
    TargetPhrase* tp = m_biSA->GetMosesFactorIDs(e.first, src, this);
    tp->GetScoreBreakdown().Assign(this,e.second);
    tp->EvaluateInIsolation(src);
    ret->Add(tp);
  }
ChartTranslationOptions::ChartTranslationOptions(const TargetPhraseCollection &targetPhraseColl,
    const StackVec &stackVec,
    const WordsRange &wordsRange,
    float score)
  : m_stackVec(stackVec)
  , m_wordsRange(&wordsRange)
  , m_estimateOfBestScore(score)
{
  TargetPhraseCollection::const_iterator iter;
  for (iter = targetPhraseColl.begin(); iter != targetPhraseColl.end(); ++iter) {
    const TargetPhrase *origTP = *iter;

    boost::shared_ptr<ChartTranslationOption> ptr(new ChartTranslationOption(*origTP));
    m_collection.push_back(ptr);
  }
}
const TargetPhraseCollection *PhraseDictionaryMultiModel::GetTargetPhraseCollectionLEGACY(const Phrase& src) const
{

  std::vector<std::vector<float> > multimodelweights = getWeights(m_numScoreComponents, true);
  TargetPhraseCollection *ret = NULL;

  std::map<std::string,multiModelStatistics*>* allStats = new(std::map<std::string,multiModelStatistics*>);
  CollectSufficientStatistics(src, allStats);
  ret = CreateTargetPhraseCollectionLinearInterpolation(src, allStats, multimodelweights);
  RemoveAllInMap(*allStats);
  delete allStats;

  ret->NthElement(m_tableLimit); // sort the phrases for pruning later
  const_cast<PhraseDictionaryMultiModel*>(this)->CacheForCleanup(ret);

  return ret;
}
void ChartRuleCollection::Add(const TargetPhraseCollection &targetPhraseCollection
															, const WordConsumed &wordConsumed
															, bool adhereTableLimit
															, size_t ruleLimit)
{
	TargetPhraseCollection::const_iterator iter, iterEnd;
	iterEnd = (!adhereTableLimit || ruleLimit == 0 || targetPhraseCollection.GetSize() < ruleLimit)
								? targetPhraseCollection.end() : targetPhraseCollection.begin() + ruleLimit;

	for (iter = targetPhraseCollection.begin(); iter != iterEnd; ++iter)
	{
		const TargetPhrase &targetPhrase = **iter;
		float score = targetPhrase.GetFutureScore();

		if (m_collection.size() < ruleLimit)
		{ // not yet filled out quota. add everything
			m_collection.push_back(new ChartRule(targetPhrase, wordConsumed));
			m_scoreThreshold = (score < m_scoreThreshold) ? score : m_scoreThreshold;
		}
		else if (score > m_scoreThreshold)
		{ // full but not bursting. add if better than worst score
			m_collection.push_back(new ChartRule(targetPhrase, wordConsumed));
		}

		// prune if bursting
		if (m_collection.size() > ruleLimit * 2)
		{
			std::nth_element(m_collection.begin()
										, m_collection.begin() + ruleLimit
										, m_collection.end()
										, ChartRuleOrderer());
			// delete the bottom half
			for (size_t ind = ruleLimit; ind < m_collection.size(); ++ind)
			{
				// make the best score of bottom half the score threshold
				const TargetPhrase &targetPhrase = m_collection[ind]->GetTargetPhrase();
				float score = targetPhrase.GetFutureScore();
				m_scoreThreshold = (score > m_scoreThreshold) ? score : m_scoreThreshold;
				delete m_collection[ind];
			}
			m_collection.resize(ruleLimit);
		}

	}
}
Example #12
0
const TargetPhraseCollection *PhraseDictionaryMultiModelCounts::GetTargetPhraseCollection(const Phrase& src) const
{
  vector<vector<float> > multimodelweights;
  bool normalize;
  normalize = (m_mode == "interpolate") ? true : false;
  multimodelweights = getWeights(4,normalize);

  //source phrase frequency is shared among all phrase pairs
  vector<float> fs(m_numModels);

  map<string,multiModelCountsStatistics*>* allStats = new(map<string,multiModelCountsStatistics*>);

  CollectSufficientStatistics(src, fs, allStats);

  TargetPhraseCollection *ret = CreateTargetPhraseCollectionCounts(src, fs, allStats, multimodelweights);

  ret->NthElement(m_tableLimit); // sort the phrases for pruning later
  const_cast<PhraseDictionaryMultiModelCounts*>(this)->CacheForCleanup(ret);
  return ret;
}
Example #13
0
std::vector <ChartTranslationOptions*> Sentence::GetXmlChartTranslationOptions() const
{
  const StaticData &staticData = StaticData::Instance();
  std::vector <ChartTranslationOptions*> ret;

  // XML Options
  // this code is a copy of the 1 in Sentence.

  //only fill the vector if we are parsing XML
  if (staticData.GetXmlInputType() != XmlPassThrough ) {
    //TODO: needed to handle exclusive
    //for (size_t i=0; i<GetSize(); i++) {
    //  m_xmlCoverageMap.push_back(false);
    //}

    //iterXMLOpts will be empty for XmlIgnore
    //look at each column
    for(std::vector<XmlOption*>::const_iterator iterXmlOpts = m_xmlOptions.begin();
        iterXmlOpts != m_xmlOptions.end(); iterXmlOpts++) {

      const XmlOption &xmlOption = **iterXmlOpts;
      TargetPhrase *targetPhrase = new TargetPhrase(xmlOption.targetPhrase);

      WordsRange *range = new WordsRange(xmlOption.range);
      StackVec emptyStackVec; // hmmm... maybe dangerous, but it is never consulted

      TargetPhraseCollection *tpc = new TargetPhraseCollection;
      tpc->Add(targetPhrase);

      ChartTranslationOptions *transOpt = new ChartTranslationOptions(*tpc, emptyStackVec, *range, 0.0f);
      ret.push_back(transOpt);

      //TODO: needed to handle exclusive
      //for(size_t j=transOpt->GetSourceWordsRange().GetStartPos(); j<=transOpt->GetSourceWordsRange().GetEndPos(); j++) {
      //  m_xmlCoverageMap[j]=true;
      //}
    }
  }

  return ret;
}
Example #14
0
template <class Model> void Fill<Model>::Add(const TargetPhraseCollection &targets, const StackVec &nts, const WordsRange &)
{
  std::vector<search::PartialVertex> vertices;
  vertices.reserve(nts.size());
  float below_score = 0.0;
  for (StackVec::const_iterator i(nts.begin()); i != nts.end(); ++i) {
    vertices.push_back((*i)->GetStack().incr->RootAlternate());
    if (vertices.back().Empty()) return;
    below_score += vertices.back().Bound();
  }

  std::vector<lm::WordIndex> words;
  for (TargetPhraseCollection::const_iterator p(targets.begin()); p != targets.end(); ++p) {
    words.clear();
    const TargetPhrase &phrase = **p;
    const AlignmentInfo::NonTermIndexMap &align = phrase.GetAlignNonTerm().GetNonTermIndexMap();
    search::PartialEdge edge(edges_.AllocateEdge(nts.size()));

    search::PartialVertex *nt = edge.NT();
    for (size_t i = 0; i < phrase.GetSize(); ++i) {
      const Word &word = phrase.GetWord(i);
      if (word.IsNonTerminal()) {
        *(nt++) = vertices[align[i]];
        words.push_back(search::kNonTerminal);
      } else {
        words.push_back(Convert(word));
      }
    }

    edge.SetScore(phrase.GetFutureScore() + below_score);
    // prob and oov were already accounted for.
    search::ScoreRule(context_.LanguageModel(), words, edge.Between());

    search::Note note;
    note.vp = &phrase;
    edge.SetNote(note);

    edges_.AddEdge(edge);
  }
}
void ChartTranslationOptionList::Add(const TargetPhraseCollection &tpc,
                                     const StackVec &stackVec,
                                     const WordsRange &range)
{
  if (tpc.IsEmpty()) {
    return;
  }

  for (size_t i = 0; i < stackVec.size(); ++i) {
    const ChartCellLabel &chartCellLabel = *stackVec[i];
    size_t numHypos = chartCellLabel.GetStack().cube->size();
    if (numHypos == 0) {
      return; // empty stack. These rules can't be used
    }
  }

  float score = ChartTranslationOptions::CalcEstimateOfBestScore(tpc, stackVec);

  // If the rule limit has already been reached then don't add the option
  // unless it is better than at least one existing option.
  if (m_size > m_ruleLimit && score < m_scoreThreshold) {
    return;
  }

  // Add the option to the list.
  if (m_size == m_collection.size()) {
    // m_collection has reached capacity: create a new object.
    m_collection.push_back(new ChartTranslationOptions(tpc, stackVec,
                           range, score));
  } else {
    // Overwrite an unused object.
    *(m_collection[m_size]) = ChartTranslationOptions(tpc, stackVec,
                              range, score);
  }
  ++m_size;

  // If the rule limit hasn't been exceeded then update the threshold.
  if (m_size <= m_ruleLimit) {
    m_scoreThreshold = (score < m_scoreThreshold) ? score : m_scoreThreshold;
  }

  // Prune if bursting
  if (m_size == m_ruleLimit * 2) {
	NTH_ELEMENT4(m_collection.begin(),
                     m_collection.begin() + m_ruleLimit - 1,
                     m_collection.begin() + m_size,
                     ChartTranslationOptionOrderer());
    m_scoreThreshold = m_collection[m_ruleLimit-1]->GetEstimateOfBestScore();
    m_size = m_ruleLimit;
  }
}
Example #16
0
TargetPhraseCollection* PhraseDictionaryMultiModelCounts::CreateTargetPhraseCollectionCounts(const Phrase &src, vector<float> &fs, map<string,multiModelCountsStatistics*>* allStats, vector<vector<float> > &multimodelweights) const
{
  TargetPhraseCollection *ret = new TargetPhraseCollection();
  for ( map< string, multiModelCountsStatistics*>::const_iterator iter = allStats->begin(); iter != allStats->end(); ++iter ) {

    multiModelCountsStatistics * statistics = iter->second;

    if (statistics->targetPhrase->GetAlignTerm().GetSize() == 0) {
      UTIL_THROW(util::Exception, " alignment information empty\ncount-tables need to include alignment information for computation of lexical weights.\nUse --phrase-word-alignment during training; for on-disk tables, also set -alignment-info when creating on-disk tables.");
    }

    try {
      pair<vector< set<size_t> >, vector< set<size_t> > > alignment = GetAlignmentsForLexWeights(src, static_cast<const Phrase&>(*statistics->targetPhrase), statistics->targetPhrase->GetAlignTerm());
      vector< set<size_t> > alignedToT = alignment.first;
      vector< set<size_t> > alignedToS = alignment.second;
      double lexst = ComputeWeightedLexicalTranslation(static_cast<const Phrase&>(*statistics->targetPhrase), src, alignedToS, m_lexTable_e2f, multimodelweights[1], false );
      double lexts = ComputeWeightedLexicalTranslation(src, static_cast<const Phrase&>(*statistics->targetPhrase), alignedToT, m_lexTable_f2e, multimodelweights[3], true );

      Scores scoreVector(5);
      scoreVector[0] = FloorScore(TransformScore(m_combineFunction(statistics->fst, statistics->ft, multimodelweights[0])));
      scoreVector[1] = FloorScore(TransformScore(lexst));
      scoreVector[2] = FloorScore(TransformScore(m_combineFunction(statistics->fst, fs, multimodelweights[2])));
      scoreVector[3] = FloorScore(TransformScore(lexts));
      scoreVector[4] = FloorScore(TransformScore(2.718));

      statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
      statistics->targetPhrase->Evaluate(src, GetFeaturesToApply());
    } catch (AlignmentException& e) {
      continue;
    }

    ret->Add(new TargetPhrase(*statistics->targetPhrase));
  }

  RemoveAllInMap(*allStats);
  delete allStats;
  return ret;
}
float ChartTranslationOptions::CalcEstimateOfBestScore(
  const TargetPhraseCollection &tpc,
  const StackVec &stackVec)
{
  const TargetPhrase &targetPhrase = **(tpc.begin());
  float estimateOfBestScore = targetPhrase.GetFutureScore();
  for (StackVec::const_iterator p = stackVec.begin(); p != stackVec.end();
       ++p) {
    const HypoList *stack = (*p)->GetStack().cube;
    assert(stack);
    assert(!stack->empty());
    const ChartHypothesis &bestHypo = **(stack->begin());
    estimateOfBestScore += bestHypo.GetTotalScore();
  }
  return estimateOfBestScore;
}
void ChartTranslationOptionList::Add(const TargetPhraseCollection &tpc,
                                     const StackVec &stackVec,
                                     const WordsRange &range)
{
  if (tpc.IsEmpty()) {
    return;
  }

  float score = ChartTranslationOptions::CalcEstimateOfBestScore(tpc, stackVec);

  // If the rule limit has already been reached then don't add the option
  // unless it is better than at least one existing option.
  if (m_size > m_ruleLimit && score < m_scoreThreshold) {
    return;
  }

  // Add the option to the list.
  if (m_size == m_collection.size()) {
    // m_collection has reached capacity: create a new object.
    m_collection.push_back(new ChartTranslationOptions(tpc, stackVec,
                                                      range, score));
  } else {
    // Overwrite an unused object.
    *(m_collection[m_size]) = ChartTranslationOptions(tpc, stackVec,
                                                     range, score);
  }
  ++m_size;

  // If the rule limit hasn't been exceeded then update the threshold.
  if (m_size <= m_ruleLimit) {
    m_scoreThreshold = (score < m_scoreThreshold) ? score : m_scoreThreshold;
  }

  // Prune if bursting
  if (m_size == m_ruleLimit * 2) {
    std::nth_element(m_collection.begin(),
                     m_collection.begin() + m_ruleLimit - 1,
                     m_collection.begin() + m_size,
                     ChartTranslationOptionOrderer());
    m_scoreThreshold = m_collection[m_ruleLimit-1]->GetEstimateOfBestScore();
    m_size = m_ruleLimit;
  }
}
void PhraseDictionaryNewFormat::AddEquivPhrase(TargetPhraseCollection	&targetPhraseColl, TargetPhrase *targetPhrase)
{
	targetPhraseColl.Add(targetPhrase);
}
bool PhraseDictionaryMemory::Load(const std::vector<FactorType> &input
                                  , const std::vector<FactorType> &output
                                  , const string &filePath
                                  , const vector<float> &weight
                                  , size_t tableLimit
                                  , const LMList &languageModels
                                  , float weightWP)
{
    const_cast<LMList&>(languageModels).InitializeBeforeSentenceProcessing();

    const StaticData &staticData = StaticData::Instance();

    m_tableLimit = tableLimit;

    util::FilePiece inFile(filePath.c_str(), staticData.GetVerboseLevel() >= 1 ? &std::cerr : NULL);

    size_t line_num = 0;
    size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info
    const std::string& factorDelimiter = staticData.GetFactorDelimiter();

    Phrase sourcePhrase(0);
    std::vector<float> scv;
    scv.reserve(m_numScoreComponent);

    TargetPhraseCollection *preSourceNode = NULL;
    std::string preSourceString;

    while(true) {
        ++line_num;
        StringPiece line;
        try {
            line = inFile.ReadLine();
        } catch (util::EndOfFileException &e) {
            break;
        }

        util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter("|||"));
        StringPiece sourcePhraseString(GrabOrDie(pipes, filePath, line_num));
        StringPiece targetPhraseString(GrabOrDie(pipes, filePath, line_num));
        StringPiece scoreString(GrabOrDie(pipes, filePath, line_num));

        bool isLHSEmpty = !util::TokenIter<util::AnyCharacter, true>(sourcePhraseString, util::AnyCharacter(" \t"));
        if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
            TRACE_ERR( filePath << ":" << line_num << ": pt entry contains empty source, skipping\n");
            continue;
        }

        //target
        std::auto_ptr<TargetPhrase> targetPhrase(new TargetPhrase());
        targetPhrase->CreateFromString(output, targetPhraseString, factorDelimiter);

        scv.clear();
        for (util::TokenIter<util::AnyCharacter, true> token(scoreString, util::AnyCharacter(" \t")); token; ++token) {
            char *err_ind;
            // Token is always delimited by some form of space.  Also, apparently strtod is portable but strtof isn't.
            scv.push_back(FloorScore(TransformScore(static_cast<float>(strtod(token->data(), &err_ind)))));
            if (err_ind == token->data()) {
                stringstream strme;
                strme << "Bad number " << token << " on line " << line_num;
                UserMessage::Add(strme.str());
                abort();
            }
        }
        if (scv.size() != m_numScoreComponent) {
            stringstream strme;
            strme << "Size of scoreVector != number (" <<scv.size() << "!=" <<m_numScoreComponent<<") of score components on line " << line_num;
            UserMessage::Add(strme.str());
            abort();
        }



        size_t consumed = 3;
        if (pipes) {
            targetPhrase->SetAlignmentInfo(*pipes++);
            ++consumed;
        }

        ScoreComponentCollection sparse;
        if (pipes) pipes++; //counts
        if (pipes) {
            //sparse features
            SparsePhraseDictionaryFeature* spdf =
                GetFeature()->GetSparsePhraseDictionaryFeature();
            if (spdf) {
                sparse.Assign(spdf,(pipes++)->as_string());
            }
        }


        // scv good to go sir!
        targetPhrase->SetScore(m_feature, scv, sparse, weight, weightWP, languageModels);

        // Check number of entries delimited by ||| agrees across all lines.
        for (; pipes; ++pipes, ++consumed) {}
        if (numElement != consumed) {
            if (numElement == NOT_FOUND) {
                numElement = consumed;
            } else {
                stringstream strme;
                strme << "Syntax error at " << filePath << ":" << line_num;
                UserMessage::Add(strme.str());
                abort();
            }
        }

        //TODO: Would be better to reuse source phrases, but ownership has to be
        //consistent across phrase table implementations
        sourcePhrase.Clear();
        sourcePhrase.CreateFromString(input, sourcePhraseString, factorDelimiter);
        //Now that the source phrase is ready, we give the target phrase a copy
        targetPhrase->SetSourcePhrase(sourcePhrase);
        if (preSourceString == sourcePhraseString && preSourceNode) {
            preSourceNode->Add(targetPhrase.release());
        } else {
            preSourceNode = CreateTargetPhraseCollection(sourcePhrase);
            preSourceNode->Add(targetPhrase.release());
            preSourceString.assign(sourcePhraseString.data(), sourcePhraseString.size());
        }
    }

    // sort each target phrase collection
    m_collection.Sort(m_tableLimit);

    /* // TODO ASK OLIVER WHY THIS IS NEEDED
    const_cast<LMList&>(languageModels).CleanUpAfterSentenceProcessing();
    */

    return true;
}
Example #21
0
//! populate this InputType with data from in stream
int TreeInput::Read(std::istream& in,const std::vector<FactorType>& factorOrder)
{
  const StaticData &staticData = StaticData::Instance();

  string line;
  if (getline(in, line, '\n').eof())
    return 0;
  // remove extra spaces
  //line = Trim(line);

  std::vector<XMLParseOutput> sourceLabels;
  std::vector<XmlOption*> xmlOptionsList;
  ProcessAndStripXMLTags(line, sourceLabels, xmlOptionsList);

  // do words 1st - hack
  stringstream strme;
  strme << line << endl;

  Sentence::Read(strme, factorOrder);

  // size input chart
  size_t sourceSize = GetSize();
  m_sourceChart.resize(sourceSize);

  for (size_t pos = 0; pos < sourceSize; ++pos) {
    m_sourceChart[pos].resize(sourceSize - pos);
  }

  // do source labels
  vector<XMLParseOutput>::const_iterator iterLabel;
  for (iterLabel = sourceLabels.begin(); iterLabel != sourceLabels.end(); ++iterLabel) {
    const XMLParseOutput &labelItem = *iterLabel;
    const WordsRange &range = labelItem.m_range;
    const string &label = labelItem.m_label;
    AddChartLabel(range.GetStartPos() + 1, range.GetEndPos() + 1, label, factorOrder);
  }

  // default label
  for (size_t startPos = 0; startPos < sourceSize; ++startPos) {
    for (size_t endPos = startPos; endPos < sourceSize; ++endPos) {
      AddChartLabel(startPos, endPos, staticData.GetInputDefaultNonTerminal(), factorOrder);
    }
  }

  // XML Options

  //only fill the vector if we are parsing XML
  if (staticData.GetXmlInputType() != XmlPassThrough ) {
    //TODO: needed to handle exclusive
    //for (size_t i=0; i<GetSize(); i++) {
    //  m_xmlCoverageMap.push_back(false);
    //}

    //iterXMLOpts will be empty for XmlIgnore
    //look at each column
    for(std::vector<XmlOption*>::const_iterator iterXmlOpts = xmlOptionsList.begin();
        iterXmlOpts != xmlOptionsList.end(); iterXmlOpts++) {

      const XmlOption *xmlOption = *iterXmlOpts;
      TargetPhrase *targetPhrase = new TargetPhrase(xmlOption->targetPhrase);
      *targetPhrase = xmlOption->targetPhrase; // copy everything
      WordsRange *range = new WordsRange(xmlOption->range);
      const StackVec emptyStackVec; // hmmm... maybe dangerous, but it is never consulted

      TargetPhraseCollection *tpc = new TargetPhraseCollection;
      tpc->Add(targetPhrase);

      ChartTranslationOptions *transOpt = new ChartTranslationOptions(*tpc, emptyStackVec, *range, 0.0f);
      m_xmlChartOptionsList.push_back(transOpt);

      //TODO: needed to handle exclusive
      //for(size_t j=transOpt->GetSourceWordsRange().GetStartPos(); j<=transOpt->GetSourceWordsRange().GetEndPos(); j++) {
      //  m_xmlCoverageMap[j]=true;
      //}

      delete xmlOption;
    }

  }

  return 1;
}