Ejemplo n.º 1
0
float GlobalLexicalModel::ScorePhrase( const TargetPhrase& targetPhrase ) const
{
  const Sentence& input = *(m_local->input);
  float score = 0;
  for(size_t targetIndex = 0; targetIndex < targetPhrase.GetSize(); targetIndex++ ) {
    float sum = 0;
    const Word& targetWord = targetPhrase.GetWord( targetIndex );
    VERBOSE(2,"glm " << targetWord << ": ");
    const DoubleHash::const_iterator targetWordHash = m_hash.find( &targetWord );
    if( targetWordHash != m_hash.end() ) {
      SingleHash::const_iterator inputWordHash = targetWordHash->second.find( m_bias );
      if( inputWordHash != targetWordHash->second.end() ) {
        VERBOSE(2,"*BIAS* " << inputWordHash->second);
        sum += inputWordHash->second;
      }

      set< const Word*, WordComparer > alreadyScored; // do not score a word twice
      for(size_t inputIndex = 0; inputIndex < input.GetSize(); inputIndex++ ) {
        const Word& inputWord = input.GetWord( inputIndex );
        if ( alreadyScored.find( &inputWord ) == alreadyScored.end() ) {
          SingleHash::const_iterator inputWordHash = targetWordHash->second.find( &inputWord );
          if( inputWordHash != targetWordHash->second.end() ) {
            VERBOSE(2," " << inputWord << " " << inputWordHash->second);
            sum += inputWordHash->second;
          }
          alreadyScored.insert( &inputWord );
        }
      }
    }
    // Hal Daume says: 1/( 1 + exp [ - sum_i w_i * f_i ] )
    VERBOSE(2," p=" << FloorScore( log(1/(1+exp(-sum))) ) << endl);
    score += FloorScore( log(1/(1+exp(-sum))) );
  }
  return score;
}
Ejemplo n.º 2
0
double CrossEntropyCounts::operator() ( const dlib::matrix<double,0,1>& arg) const
{
  double total = 0.0;
  double n = 0.0;
  std::vector<float> weight_vector (m_model->m_numModels);

  for (int i=0; i < arg.nr(); i++) {
    weight_vector[i] = arg(i);
  }
  if (m_model->m_mode == "interpolate") {
    weight_vector = m_model->normalizeWeights(weight_vector);
  }

  for ( std::vector<multiModelCountsStatisticsOptimization*>::const_iterator iter = m_optimizerStats.begin(); iter != m_optimizerStats.end(); ++iter ) {
    multiModelCountsStatisticsOptimization* statistics = *iter;
    size_t f = statistics->f;

    double score;
    if (m_iFeature == 0) {
      score = m_model->m_combineFunction(statistics->fst, statistics->ft, weight_vector);
    } else if (m_iFeature == 1) {
      score = m_model->ComputeWeightedLexicalTranslationFromCache(statistics->lexCachee2f, weight_vector);
    } else if (m_iFeature == 2) {
      score = m_model->m_combineFunction(statistics->fst, statistics->fs, weight_vector);
    } else if (m_iFeature == 3) {
      score = m_model->ComputeWeightedLexicalTranslationFromCache(statistics->lexCachef2e, weight_vector);
    } else {
      score = 0;
      UTIL_THROW(util::Exception, "Trying to optimize feature that I don't know. Aborting");
    }
    total -= (FloorScore(TransformScore(score))/TransformScore(2))*f;
    n += f;
  }
  return total/n;
}
Ejemplo n.º 3
0
//TODO this should be a factory function!
TranslationOption::TranslationOption(const WordsRange &wordsRange
																		 , const TargetPhrase &targetPhrase
																		 , const InputType &inputType
																		 , int /*whatever*/)
: m_targetPhrase(targetPhrase)
, m_sourceWordsRange	(wordsRange)
, m_futureScore(0)
{
	const UnknownWordPenaltyProducer *up = StaticData::Instance().GetUnknownWordPenaltyProducer();
  if (up) {
		const ScoreProducer *scoreProducer = (const ScoreProducer *)up; // not sure why none of the c++ cast works
		vector<float> score(1);
		score[0] = FloorScore(-numeric_limits<float>::infinity());
		m_scoreBreakdown.Assign(scoreProducer, score);
	}

	if (inputType.GetType() == SentenceInput)
	{
		Phrase phrase = inputType.GetSubString(wordsRange);
		m_sourcePhrase = new Phrase(phrase);
	}
	else
	{ // TODO lex reordering with confusion network
		m_sourcePhrase = new Phrase(*targetPhrase.GetSourcePhrase());
		//the target phrase from a confusion network/lattice has input scores that we want to keep
		m_scoreBreakdown.PlusEquals(targetPhrase.GetScoreBreakdown());

	}
}
Ejemplo n.º 4
0
LMResult LanguageModelSRI::GetValue(VocabIndex wordId, VocabIndex *context) const
{
  LMResult ret;
  ret.score = FloorScore(TransformLMScore(m_srilmModel->wordProb( wordId, context)));
  ret.unknown = (wordId == m_unknownId);
  return ret;
}
LMResult LanguageModelParallelBackoff::GetValueForgotState(const std::vector<const Word*> &contextFactor, FFState & /*outState */) const
{

  static WidMatrix widMatrix;

  for (int i=0; i<contextFactor.size(); i++)
    ::memset(widMatrix[i],0,(m_factorTypesOrdered.size() + 1)*sizeof(VocabIndex));


  for (size_t i = 0; i < contextFactor.size(); i++) {
    const Word &word = *contextFactor[i];

    for (size_t j = 0; j < m_factorTypesOrdered.size(); j++) {
      const Factor *factor = word[ m_factorTypesOrdered[j] ];

      if (factor == NULL)
        widMatrix[i][j + 1] = 0;
      else
        widMatrix[i][j + 1] = GetLmID(factor, j);
    }

    if (widMatrix[i][1] == GetLmID(m_sentenceStartArray[0], 0) ) {
      widMatrix[i][0] = m_wtbid;
    } else if (widMatrix[i][1] == GetLmID(m_sentenceEndArray[0], 0 )) {
      widMatrix[i][0] = m_wteid;
    } else {
      widMatrix[i][0] = m_wtid;
    }
  }


  LMResult ret;
  ret.score = m_srilmModel->wordProb( widMatrix, contextFactor.size() - 1, contextFactor.size() );
  ret.score = FloorScore(TransformLMScore(ret.score));
  ret.unknown = !contextFactor.empty() && (widMatrix[contextFactor.size() - 1][0] == m_unknownId);
  return ret;

  /*if (contextFactor.size() == 0)
  {
  	return 0;
  }

  for (size_t currPos = 0 ; currPos < m_nGramOrder ; ++currPos )
  {
  	const Word &word = *contextFactor[currPos];

  	for (size_t index = 0 ; index < m_factorTypesOrdered.size() ; ++index)
  	{
  		FactorType factorType = m_factorTypesOrdered[index];
  		const Factor *factor = word[factorType];

  		(*widMatrix)[currPos][index] = GetLmID(factor, index);

  	}

  }

  float p = m_srilmModel->wordProb( (*widMatrix), m_nGramOrder - 1, m_nGramOrder );
  return FloorScore(TransformLMScore(p)); */
}
double CrossEntropy::operator() ( const dlib::matrix<double,0,1>& arg) const
{
  double total = 0.0;
  double n = 0.0;
  std::vector<float> weight_vector (m_model->m_numModels);

  for (int i=0; i < arg.nr(); i++) {
    weight_vector[i] = arg(i);
  }
  if (m_model->m_mode == "interpolate") {
    weight_vector = m_model->normalizeWeights(weight_vector);
  }

  for ( std::vector<multiModelStatisticsOptimization*>::const_iterator iter = m_optimizerStats.begin(); iter != m_optimizerStats.end(); ++iter ) {
    multiModelStatisticsOptimization* statistics = *iter;
    size_t f = statistics->f;

    double score;
    score = std::inner_product(statistics->p[m_iFeature].begin(), statistics->p[m_iFeature].end(), weight_vector.begin(), 0.0);

    total -= (FloorScore(TransformScore(score))/TransformScore(2))*f;
    n += f;
  }
  return total/n;
}
Ejemplo n.º 7
0
void PhraseTableCreator::EncodeScores(std::vector<float>& scores, std::ostream& os)
{
  size_t c = 0;
  float score;

  while(c < scores.size()) {
    score = scores[c];
    score = FloorScore(TransformScore(score));
    os.write((char*)&score, sizeof(score));
    m_scoreCounters[m_multipleScoreTrees ? c : 0]->Increase(score);
    c++;
  }
}
Ejemplo n.º 8
0
TargetPhraseCollection* PhraseDictionaryMultiModelCounts::CreateTargetPhraseCollectionCounts(const Phrase &src, vector<float> &fs, map<string,multiModelCountsStatistics*>* allStats, vector<vector<float> > &multimodelweights) const
{
  TargetPhraseCollection *ret = new TargetPhraseCollection();
  for ( map< string, multiModelCountsStatistics*>::const_iterator iter = allStats->begin(); iter != allStats->end(); ++iter ) {

    multiModelCountsStatistics * statistics = iter->second;

    if (statistics->targetPhrase->GetAlignTerm().GetSize() == 0) {
      UTIL_THROW(util::Exception, " alignment information empty\ncount-tables need to include alignment information for computation of lexical weights.\nUse --phrase-word-alignment during training; for on-disk tables, also set -alignment-info when creating on-disk tables.");
    }

    try {
      pair<vector< set<size_t> >, vector< set<size_t> > > alignment = GetAlignmentsForLexWeights(src, static_cast<const Phrase&>(*statistics->targetPhrase), statistics->targetPhrase->GetAlignTerm());
      vector< set<size_t> > alignedToT = alignment.first;
      vector< set<size_t> > alignedToS = alignment.second;
      double lexst = ComputeWeightedLexicalTranslation(static_cast<const Phrase&>(*statistics->targetPhrase), src, alignedToS, m_lexTable_e2f, multimodelweights[1], false );
      double lexts = ComputeWeightedLexicalTranslation(src, static_cast<const Phrase&>(*statistics->targetPhrase), alignedToT, m_lexTable_f2e, multimodelweights[3], true );

      Scores scoreVector(5);
      scoreVector[0] = FloorScore(TransformScore(m_combineFunction(statistics->fst, statistics->ft, multimodelweights[0])));
      scoreVector[1] = FloorScore(TransformScore(lexst));
      scoreVector[2] = FloorScore(TransformScore(m_combineFunction(statistics->fst, fs, multimodelweights[2])));
      scoreVector[3] = FloorScore(TransformScore(lexts));
      scoreVector[4] = FloorScore(TransformScore(2.718));

      statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
      statistics->targetPhrase->Evaluate(src, GetFeaturesToApply());
    } catch (AlignmentException& e) {
      continue;
    }

    ret->Add(new TargetPhrase(*statistics->targetPhrase));
  }

  RemoveAllInMap(*allStats);
  delete allStats;
  return ret;
}
//TODO this should be a factory function!
TranslationOption::TranslationOption(const WordsRange &wordsRange
                                     , const TargetPhrase &targetPhrase
                                     , const InputType &inputType
                                     , const UnknownWordPenaltyProducer* up)
  : m_targetPhrase(targetPhrase)
  , m_sourceWordsRange	(wordsRange)
  , m_futureScore(0)
{
  if (up) {
		const ScoreProducer *scoreProducer = (const ScoreProducer *)up; // not sure why none of the c++ cast works
		vector<float> score(1);
		score[0] = FloorScore(-numeric_limits<float>::infinity());
		m_scoreBreakdown.Assign(scoreProducer, score);
	}
}
Ejemplo n.º 10
0
float LanguageModelInternal::GetValue(const Factor *factor0, State* finalState) const
{
	float prob;
	const NGramNode *nGram		= GetLmID(factor0);
	if (nGram == NULL)
	{
		if (finalState != NULL)
			*finalState = NULL;
		prob = -numeric_limits<float>::infinity();
	}
	else
	{
		if (finalState != NULL)
			*finalState = static_cast<const void*>(nGram);
		prob = nGram->GetScore();
	}
	return FloorScore(prob);
}
Ejemplo n.º 11
0
float LanguageModelInternal::GetValue(const Factor *factor0, const Factor *factor1, State* finalState) const
{
	float score;
	const NGramNode *nGram[2];

	nGram[1]		= GetLmID(factor1);
	if (nGram[1] == NULL)
	{
		if (finalState != NULL)
			*finalState = NULL;
		score = -numeric_limits<float>::infinity();
	}
	else
	{
		nGram[0] = nGram[1]->GetNGram(factor0);
		if (nGram[0] == NULL)
		{ // something unigram
			if (finalState != NULL)
				*finalState = static_cast<const void*>(nGram[1]);
			
			nGram[0]	= GetLmID(factor0);
			if (nGram[0] == NULL)
			{ // stops at unigram
				score = nGram[1]->GetScore();
			}
			else
			{	// unigram unigram
				score = nGram[1]->GetScore() + nGram[0]->GetLogBackOff();
			}
		}
		else
		{ // bigram
			if (finalState != NULL)
				*finalState = static_cast<const void*>(nGram[0]);
			score			= nGram[0]->GetScore();
		}
	}

	return FloorScore(score);

}
Ejemplo n.º 12
0
LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,
                                       State* finalState) const
{
    FactorType factorType = GetFactorType();
    // set up context
    randlm::WordID ngram[MAX_NGRAM_SIZE];
    int count = contextFactor.size();
    for (int i = 0 ; i < count ; i++) {
        ngram[i] = GetLmID((*contextFactor[i])[factorType]);
        //std::cerr << m_lm->getWord(ngram[i]) << " ";
    }
    int found = 0;
    LMResult ret;
    ret.score = FloorScore(TransformLMScore(m_lm->getProb(&ngram[0], count, &found, finalState)));
    ret.unknown = count && (ngram[count - 1] == m_oov_id);
    //if (finalState)
    //  std::cerr << " = " << logprob << "(" << *finalState << ", " <<")"<< std::endl;
    //else
    //  std::cerr << " = " << logprob << std::endl;
    return ret;
}
Ejemplo n.º 13
0
std::string LexicalReorderingTableCreator::EncodeLine(std::vector<std::string>& tokens)
{
  std::string scoresString = tokens.back();
  std::stringstream scoresStream;
  
  std::vector<float> scores;
  Tokenize<float>(scores, scoresString);
  
  if(!m_numScoreComponent) {
    m_numScoreComponent = scores.size();
    m_scoreCounters.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
    for(std::vector<ScoreCounter*>::iterator it = m_scoreCounters.begin();
        it != m_scoreCounters.end(); it++)
        *it = new ScoreCounter();
    m_scoreTrees.resize(m_multipleScoreTrees ? m_numScoreComponent : 1);
  }
  
  if(m_numScoreComponent != scores.size()) {
    std::cerr << "Error: Wrong number of scores detected ("
      << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl;
    std::cerr << "Line: " << tokens[0] << " ||| ... ||| " << scoresString << std::endl;
	LOGE("[mgjang] before abort\n");
    abort();  
  }
  
  size_t c = 0;
  float score;
  while(c < m_numScoreComponent)
  {
    score = scores[c];
    score = FloorScore(TransformScore(score));
    scoresStream.write((char*)&score, sizeof(score));
    
    m_scoreCounters[m_multipleScoreTrees ? c : 0]->Increase(score);
    c++;
  }
  
  return scoresStream.str();
}
Ejemplo n.º 14
0
bool RuleTableLoaderStandard::Load(FormatType format
                                   , const std::vector<FactorType> &input
                                   , const std::vector<FactorType> &output
                                   , const std::string &inFile
                                   , size_t /* tableLimit */
                                   , RuleTableTrie &ruleTable)
{
  PrintUserTime(string("Start loading text SCFG phrase table. ") + (format==MosesFormat?"Moses ":"Hiero ") + " format");

  const StaticData &staticData = StaticData::Instance();
  const std::string& factorDelimiter = staticData.GetFactorDelimiter();

  string lineOrig;
  size_t count = 0;

  std::ostream *progress = NULL;
  IFVERBOSE(1) progress = &std::cerr;
  util::FilePiece in(inFile.c_str(), progress);

  // reused variables
  vector<float> scoreVector;
  StringPiece line;
  std::string hiero_before, hiero_after;

  double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");

  while(true) {
    try {
      line = in.ReadLine();
    } catch (const util::EndOfFileException &e) {
      break;
    }

    if (format == HieroFormat) { // inefficiently reformat line
      hiero_before.assign(line.data(), line.size());
      ReformatHieroRule(hiero_before, hiero_after);
      line = hiero_after;
    }

    util::TokenIter<util::MultiCharacter> pipes(line, "|||");
    StringPiece sourcePhraseString(*pipes);
    StringPiece targetPhraseString(*++pipes);
    StringPiece scoreString(*++pipes);

    StringPiece alignString;
    if (++pipes) {
      StringPiece temp(*pipes);
      alignString = temp;
    }

    if (++pipes) {
      StringPiece str(*pipes); //counts
    }

    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }

    scoreVector.clear();
    for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
      int processed;
      float score = converter.StringToFloat(s->data(), s->length(), &processed);
      UTIL_THROW_IF(isnan(score), util::Exception, "Bad score " << *s << " on line " << count);
      scoreVector.push_back(FloorScore(TransformScore(score)));
    }
    const size_t numScoreComponents = ruleTable.GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      stringstream strme;
      strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
            << numScoreComponents << ") of score components on line " << count;
      UserMessage::Add(strme.str());
      abort();
    }

    // parse source & find pt node

    // constituent labels
    Word *sourceLHS;
    Word *targetLHS;

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase();
    targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS);

    // source
    Phrase sourcePhrase;
    sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString);
    targetPhrase->SetTargetLHS(targetLHS);

    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

    if (++pipes) {
      StringPiece sparseString(*pipes);
      targetPhrase->SetSparseScore(&ruleTable, sparseString);
    }

    if (++pipes) {
      StringPiece propertiesString(*pipes);
      targetPhrase->SetProperties(propertiesString);
    }

    targetPhrase->GetScoreBreakdown().Assign(&ruleTable, scoreVector);
    targetPhrase->Evaluate(sourcePhrase, ruleTable.GetFeaturesToApply());

    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS);
    phraseColl.Add(targetPhrase);

    count++;
  }

  // sort and prune each target phrase collection
  SortAndPrune(ruleTable);

  return true;
}
bool PhraseDictionaryMemory::Load(const std::vector<FactorType> &input
                                  , const std::vector<FactorType> &output
                                  , const string &filePath
                                  , const vector<float> &weight
                                  , size_t tableLimit
                                  , const LMList &languageModels
                                  , float weightWP)
{
    const_cast<LMList&>(languageModels).InitializeBeforeSentenceProcessing();

    const StaticData &staticData = StaticData::Instance();

    m_tableLimit = tableLimit;

    util::FilePiece inFile(filePath.c_str(), staticData.GetVerboseLevel() >= 1 ? &std::cerr : NULL);

    size_t line_num = 0;
    size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info
    const std::string& factorDelimiter = staticData.GetFactorDelimiter();

    Phrase sourcePhrase(0);
    std::vector<float> scv;
    scv.reserve(m_numScoreComponent);

    TargetPhraseCollection *preSourceNode = NULL;
    std::string preSourceString;

    while(true) {
        ++line_num;
        StringPiece line;
        try {
            line = inFile.ReadLine();
        } catch (util::EndOfFileException &e) {
            break;
        }

        util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter("|||"));
        StringPiece sourcePhraseString(GrabOrDie(pipes, filePath, line_num));
        StringPiece targetPhraseString(GrabOrDie(pipes, filePath, line_num));
        StringPiece scoreString(GrabOrDie(pipes, filePath, line_num));

        bool isLHSEmpty = !util::TokenIter<util::AnyCharacter, true>(sourcePhraseString, util::AnyCharacter(" \t"));
        if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
            TRACE_ERR( filePath << ":" << line_num << ": pt entry contains empty source, skipping\n");
            continue;
        }

        //target
        std::auto_ptr<TargetPhrase> targetPhrase(new TargetPhrase());
        targetPhrase->CreateFromString(output, targetPhraseString, factorDelimiter);

        scv.clear();
        for (util::TokenIter<util::AnyCharacter, true> token(scoreString, util::AnyCharacter(" \t")); token; ++token) {
            char *err_ind;
            // Token is always delimited by some form of space.  Also, apparently strtod is portable but strtof isn't.
            scv.push_back(FloorScore(TransformScore(static_cast<float>(strtod(token->data(), &err_ind)))));
            if (err_ind == token->data()) {
                stringstream strme;
                strme << "Bad number " << token << " on line " << line_num;
                UserMessage::Add(strme.str());
                abort();
            }
        }
        if (scv.size() != m_numScoreComponent) {
            stringstream strme;
            strme << "Size of scoreVector != number (" <<scv.size() << "!=" <<m_numScoreComponent<<") of score components on line " << line_num;
            UserMessage::Add(strme.str());
            abort();
        }



        size_t consumed = 3;
        if (pipes) {
            targetPhrase->SetAlignmentInfo(*pipes++);
            ++consumed;
        }

        ScoreComponentCollection sparse;
        if (pipes) pipes++; //counts
        if (pipes) {
            //sparse features
            SparsePhraseDictionaryFeature* spdf =
                GetFeature()->GetSparsePhraseDictionaryFeature();
            if (spdf) {
                sparse.Assign(spdf,(pipes++)->as_string());
            }
        }


        // scv good to go sir!
        targetPhrase->SetScore(m_feature, scv, sparse, weight, weightWP, languageModels);

        // Check number of entries delimited by ||| agrees across all lines.
        for (; pipes; ++pipes, ++consumed) {}
        if (numElement != consumed) {
            if (numElement == NOT_FOUND) {
                numElement = consumed;
            } else {
                stringstream strme;
                strme << "Syntax error at " << filePath << ":" << line_num;
                UserMessage::Add(strme.str());
                abort();
            }
        }

        //TODO: Would be better to reuse source phrases, but ownership has to be
        //consistent across phrase table implementations
        sourcePhrase.Clear();
        sourcePhrase.CreateFromString(input, sourcePhraseString, factorDelimiter);
        //Now that the source phrase is ready, we give the target phrase a copy
        targetPhrase->SetSourcePhrase(sourcePhrase);
        if (preSourceString == sourcePhraseString && preSourceNode) {
            preSourceNode->Add(targetPhrase.release());
        } else {
            preSourceNode = CreateTargetPhraseCollection(sourcePhrase);
            preSourceNode->Add(targetPhrase.release());
            preSourceString.assign(sourcePhraseString.data(), sourcePhraseString.size());
        }
    }

    // sort each target phrase collection
    m_collection.Sort(m_tableLimit);

    /* // TODO ASK OLIVER WHY THIS IS NEEDED
    const_cast<LMList&>(languageModels).CleanUpAfterSentenceProcessing();
    */

    return true;
}
Ejemplo n.º 16
0
bool HyperTreeLoader::Load(AllOptions const& opts,
                           const std::vector<FactorType> &input,
                           const std::vector<FactorType> &output,
                           const std::string &inFile,
                           const RuleTableFF &ff,
                           HyperTree &trie,
                           boost::unordered_set<std::size_t> &sourceTermSet)
{
  PrintUserTime(std::string("Start loading HyperTree"));

  sourceTermSet.clear();

  std::size_t count = 0;

  std::ostream *progress = NULL;
  IFVERBOSE(1) progress = &std::cerr;
  util::FilePiece in(inFile.c_str(), progress);

  // reused variables
  std::vector<float> scoreVector;
  StringPiece line;

  double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");

  HyperPathLoader hyperPathLoader;

  Phrase dummySourcePhrase;
  {
    Word *lhs = NULL;
    dummySourcePhrase.CreateFromString(Input, input, "hello", &lhs);
    delete lhs;
  }

  while(true) {
    try {
      line = in.ReadLine();
    } catch (const util::EndOfFileException &e) {
      break;
    }

    util::TokenIter<util::MultiCharacter> pipes(line, "|||");
    StringPiece sourceString(*pipes);
    StringPiece targetString(*++pipes);
    StringPiece scoreString(*++pipes);

    StringPiece alignString;
    if (++pipes) {
      StringPiece temp(*pipes);
      alignString = temp;
    }

    ++pipes;  // counts

    scoreVector.clear();
    for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
      int processed;
      float score = converter.StringToFloat(s->data(), s->length(), &processed);
      UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count);
      scoreVector.push_back(FloorScore(TransformScore(score)));
    }
    const std::size_t numScoreComponents = ff.GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
                  << numScoreComponents << ") of score components on line " << count);
    }

    // Source-side
    HyperPath sourceFragment;
    hyperPathLoader.Load(sourceString, sourceFragment);
    ExtractSourceTerminalSetFromHyperPath(sourceFragment, sourceTermSet);

    // Target-side
    TargetPhrase *targetPhrase = new TargetPhrase(&ff);
    Word *targetLHS = NULL;
    targetPhrase->CreateFromString(Output, output, targetString, &targetLHS);
    targetPhrase->SetTargetLHS(targetLHS);
    targetPhrase->SetAlignmentInfo(alignString);

    if (++pipes) {
      StringPiece sparseString(*pipes);
      targetPhrase->SetSparseScore(&ff, sparseString);
    }

    if (++pipes) {
      StringPiece propertiesString(*pipes);
      targetPhrase->SetProperties(propertiesString);
    }

    targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector);
    targetPhrase->EvaluateInIsolation(dummySourcePhrase,
                                      ff.GetFeaturesToApply());

    // Add rule to trie.
    TargetPhraseCollection::shared_ptr phraseColl
    = GetOrCreateTargetPhraseCollection(trie, sourceFragment);
    phraseColl->Add(targetPhrase);

    count++;
  }

  // sort and prune each target phrase collection
  if (ff.GetTableLimit()) {
    SortAndPrune(trie, ff.GetTableLimit());
  }

  return true;
}
Ejemplo n.º 17
0
void GenerationDictionary::Load()
{
  FactorCollection &factorCollection = FactorCollection::Instance();

  const size_t numFeatureValuesInConfig = this->GetNumScoreComponents();


  // data from file
  InputFileStream inFile(m_filePath);
  UTIL_THROW_IF2(!inFile.good(), "Couldn't read " << m_filePath);

  string line;
  size_t lineNum = 0;
  while(getline(inFile, line)) {
    ++lineNum;
    vector<string> token = Tokenize( line );

    // add each line in generation file into class
    Word *inputWord = new Word();  // deleted in destructor
    Word outputWord;

    // create word with certain factors filled out

    // inputs
    vector<string> factorString = Tokenize( token[0], "|" );
    for (size_t i = 0 ; i < GetInput().size() ; i++) {
      FactorType factorType = GetInput()[i];
      const Factor *factor = factorCollection.AddFactor( Output, factorType, factorString[i]);
      inputWord->SetFactor(factorType, factor);
    }

    factorString = Tokenize( token[1], "|" );
    for (size_t i = 0 ; i < GetOutput().size() ; i++) {
      FactorType factorType = GetOutput()[i];

      const Factor *factor = factorCollection.AddFactor( Output, factorType, factorString[i]);
      outputWord.SetFactor(factorType, factor);
    }

    size_t numFeaturesInFile = token.size() - 2;
    if (numFeaturesInFile < numFeatureValuesInConfig) {
      stringstream strme;
      strme << m_filePath << ":" << lineNum << ": expected " << numFeatureValuesInConfig
            << " feature values, but found " << numFeaturesInFile << std::endl;
      throw strme.str();
    }
    std::vector<float> scores(numFeatureValuesInConfig, 0.0f);
    for (size_t i = 0; i < numFeatureValuesInConfig; i++)
      scores[i] = FloorScore(TransformScore(Scan<float>(token[2+i])));

    Collection::iterator iterWord = m_collection.find(inputWord);
    if (iterWord == m_collection.end()) {
      m_collection[inputWord][outputWord].Assign(this, scores);
    } else {
      // source word already in there. delete input word to avoid mem leak
      (iterWord->second)[outputWord].Assign(this, scores);
      delete inputWord;
    }
  }

  inFile.Close();
}
bool RuleTrieLoader::Load(const std::vector<FactorType> &input,
                          const std::vector<FactorType> &output,
                          const std::string &inFile,
                          const RuleTableFF &ff,
                          RuleTrie &trie)
{
    PrintUserTime(std::string("Start loading text phrase table. Moses format"));

    const StaticData &staticData = StaticData::Instance();
    // const std::string &factorDelimiter = staticData.GetFactorDelimiter();

    std::size_t count = 0;

    std::ostream *progress = NULL;
    IFVERBOSE(1) progress = &std::cerr;
    util::FilePiece in(inFile.c_str(), progress);

    // reused variables
    std::vector<float> scoreVector;
    StringPiece line;

    double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");

    while(true) {
        try {
            line = in.ReadLine();
        } catch (const util::EndOfFileException &e) {
            break;
        }

        util::TokenIter<util::MultiCharacter> pipes(line, "|||");
        StringPiece sourcePhraseString(*pipes);
        StringPiece targetPhraseString(*++pipes);
        StringPiece scoreString(*++pipes);

        StringPiece alignString;
        if (++pipes) {
            StringPiece temp(*pipes);
            alignString = temp;
        }

        if (++pipes) {
            StringPiece str(*pipes); //counts
        }

        bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == std::string::npos);
        if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
            TRACE_ERR( ff.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
            continue;
        }

        scoreVector.clear();
        for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
            int processed;
            float score = converter.StringToFloat(s->data(), s->length(), &processed);
            UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count);
            scoreVector.push_back(FloorScore(TransformScore(score)));
        }
        const std::size_t numScoreComponents = ff.GetNumScoreComponents();
        if (scoreVector.size() != numScoreComponents) {
            UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
                        << numScoreComponents << ") of score components on line " << count);
        }

        // parse source & find pt node

        // constituent labels
        Word *sourceLHS = NULL;
        Word *targetLHS;

        // create target phrase obj
        TargetPhrase *targetPhrase = new TargetPhrase(&ff);
        // targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS);
        targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS);
        // source
        Phrase sourcePhrase;
        // sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS);
        sourcePhrase.CreateFromString(Input, input, sourcePhraseString, &sourceLHS);

        // rest of target phrase
        targetPhrase->SetAlignmentInfo(alignString);
        targetPhrase->SetTargetLHS(targetLHS);

        //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

        if (++pipes) {
            StringPiece sparseString(*pipes);
            targetPhrase->SetSparseScore(&ff, sparseString);
        }

        if (++pipes) {
            StringPiece propertiesString(*pipes);
            targetPhrase->SetProperties(propertiesString);
        }

        targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector);
        targetPhrase->EvaluateInIsolation(sourcePhrase, ff.GetFeaturesToApply());

        TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(
                trie, *sourceLHS, sourcePhrase);
        phraseColl.Add(targetPhrase);

        // not implemented correctly in memory pt. just delete it for now
        delete sourceLHS;

        count++;
    }

    // sort and prune each target phrase collection
    if (ff.GetTableLimit()) {
        SortAndPrune(trie, ff.GetTableLimit());
    }

    return true;
}
Ejemplo n.º 19
0
LMResult LanguageModelRemote::GetValue(const std::vector<const Word*> &contextFactor, State* finalState) const
{
  LMResult ret;
  ret.unknown = false;
  size_t count = contextFactor.size();
  if (count == 0) {
    if (finalState) *finalState = NULL;
    ret.score = 0.0;
    return ret;
  }
  //std::cerr << "contextFactor.size() = " << count << "\n";
  size_t max = m_nGramOrder;
  const FactorType factor = GetFactorType();
  if (max > count) max = count;

  Cache* cur = &m_cache;
  int pc = static_cast<int>(count) - 1;
  for (int i = 0; i < pc; ++i) {
    const Factor* f = contextFactor[i]->GetFactor(factor);
    cur = &cur->tree[f ? f : BOS];
  }
  const Factor* event_word = contextFactor[pc]->GetFactor(factor);
  cur = &cur->tree[event_word ? event_word : EOS];
  if (cur->prob) {
    if (finalState) *finalState = cur->boState;
    ret.score = cur->prob;
    return ret;
  }
  cur->boState = *reinterpret_cast<const State*>(&m_curId);
  ++m_curId;

  std::ostringstream os;
  os << "prob ";
  if (event_word == NULL) {
    os << "</s>";
  } else {
    os << event_word->GetString();
  }
  for (size_t i=1; i<max; i++) {
    const Factor* f = contextFactor[count-1-i]->GetFactor(factor);
    if (f == NULL) {
      os << " <s>";
    } else {
      os << ' ' << f->GetString();
    }
  }
  os << std::endl;
  std::string out = os.str();
  write(sock, out.c_str(), out.size());
  char res[6];
  int r = read(sock, res, 6);
  int errors = 0;
  int cnt = 0;
  while (1) {
    if (r < 0) {
      errors++;
      sleep(1);
      //std::cerr << "Error: read()\n";
      if (errors > 5) exit(1);
    } else if (r==0 || res[cnt] == '\n') {
      break;
    } else {
      cnt += r;
      if (cnt==6) break;
      read(sock, &res[cnt], 6-cnt);
    }
  }
  cur->prob = FloorScore(TransformLMScore(*reinterpret_cast<float*>(res)));
  if (finalState) {
    *finalState = cur->boState;
  }
  ret.score = cur->prob;
  return ret;
}
Ejemplo n.º 20
0
bool RuleTableLoaderCompact::LoadRuleSection(
  LineReader &reader,
  const std::vector<Word> &vocab,
  const std::vector<Phrase> &sourcePhrases,
  const std::vector<Phrase> &targetPhrases,
  const std::vector<size_t> &targetLhsIds,
  const std::vector<const AlignmentInfo *> &alignmentSets,
  RuleTableTrie &ruleTable)
{
  // Read rule count.
  reader.ReadLine();
  const size_t ruleCount = std::atoi(reader.m_line.c_str());

  // Read rules and add to table.
  const size_t numScoreComponents = ruleTable.GetNumScoreComponents();
  std::vector<float> scoreVector(numScoreComponents);
  std::vector<size_t> tokenPositions;
  for (size_t i = 0; i < ruleCount; ++i) {
    reader.ReadLine();

    tokenPositions.clear();
    FindTokens(tokenPositions, reader.m_line);

    const char *charLine = reader.m_line.c_str();

    // The first three tokens are IDs for the source phrase, target phrase,
    // and alignment set.
    const int sourcePhraseId = std::atoi(charLine+tokenPositions[0]);
    const int targetPhraseId = std::atoi(charLine+tokenPositions[1]);
    const int alignmentSetId = std::atoi(charLine+tokenPositions[2]);

    const Phrase &sourcePhrase = sourcePhrases[sourcePhraseId];
    const Phrase &targetPhrasePhrase = targetPhrases[targetPhraseId];
    const Word *targetLhs = new Word(vocab[targetLhsIds[targetPhraseId]]);
    Word sourceLHS("X"); // TODO not implemented for compact
    const AlignmentInfo *alignNonTerm = alignmentSets[alignmentSetId];

    // Then there should be one score for each score component.
    for (size_t j = 0; j < numScoreComponents; ++j) {
      float score = std::atof(charLine+tokenPositions[3+j]);
      scoreVector[j] = FloorScore(TransformScore(score));
    }
    if (reader.m_line[tokenPositions[3+numScoreComponents]] != ':') {
      std::stringstream msg;
      msg << "Size of scoreVector != number ("
          << scoreVector.size() << "!=" << numScoreComponents
          << ") of score components on line " << reader.m_lineNum;
      UserMessage::Add(msg.str());
      return false;
    }

    // The remaining columns are currently ignored.

    // Create and score target phrase.
    TargetPhrase *targetPhrase = new TargetPhrase(targetPhrasePhrase);
    targetPhrase->SetAlignNonTerm(alignNonTerm);
    targetPhrase->SetTargetLHS(targetLhs);
    targetPhrase->SetSourcePhrase(sourcePhrase);

    targetPhrase->Evaluate(sourcePhrase, ruleTable.GetFeaturesToApply());

    // Insert rule into table.
    TargetPhraseCollection &coll = GetOrCreateTargetPhraseCollection(
                                     ruleTable, sourcePhrase, *targetPhrase, &sourceLHS);
    coll.Add(targetPhrase);
  }

  return true;
}
Ejemplo n.º 21
0
/**
 * Process a sentence with xml annotation
 * Xml tags may specifiy additional/replacing translation options
 * and reordering constraints
 *
 * \param line in: sentence, out: sentence without the xml
 * \param res vector with translation options specified by xml
 * \param reorderingConstraint reordering constraint zones specified by xml
 * \param walls reordering constraint walls specified by xml
 * \param lbrackStr xml tag's left bracket string, typically "<"
 * \param rbrackStr xml tag's right bracket string, typically ">"
 */
bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingConstraint &reorderingConstraint, vector< size_t > &walls,
                            std::vector< std::pair<size_t, std::string> > &placeholders,
                            int offset,
                            const std::string& lbrackStr, const std::string& rbrackStr)
{
  //parse XML markup in translation line

  const StaticData &staticData = StaticData::Instance();

  // hack. What pt should XML trans opt be assigned to?
  PhraseDictionary *firstPt = NULL;
  if (PhraseDictionary::GetColl().size() == 0) {
    firstPt = PhraseDictionary::GetColl()[0];
  }

  // no xml tag? we're done.
//if (line.find_first_of('<') == string::npos) {
  if (line.find(lbrackStr) == string::npos) {
    return true;
  }

  // break up input into a vector of xml tags and text
  // example: (this), (<b>), (is a), (</b>), (test .)
  vector<string> xmlTokens = TokenizeXml(line, lbrackStr, rbrackStr);

  // we need to store opened tags, until they are closed
  // tags are stored as tripled (tagname, startpos, contents)
  typedef pair< string, pair< size_t, string > > OpenedTag;
  vector< OpenedTag > tagStack; // stack that contains active opened tags

  string cleanLine; // return string (text without xml)
  size_t wordPos = 0; // position in sentence (in terms of number of words)

  const vector<FactorType> &outputFactorOrder = staticData.GetOutputFactorOrder();
  // const string &factorDelimiter = staticData.GetFactorDelimiter();

  // loop through the tokens
  for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
    // not a xml tag, but regular text (may contain many words)
    if(!isXmlTag(xmlTokens[xmlTokenPos], lbrackStr, rbrackStr)) {
      // add a space at boundary, if necessary
      if (cleanLine.size()>0 &&
          cleanLine[cleanLine.size() - 1] != ' ' &&
          xmlTokens[xmlTokenPos][0] != ' ') {
        cleanLine += " ";
      }
      cleanLine += xmlTokens[xmlTokenPos]; // add to output
      wordPos = Tokenize(cleanLine).size(); // count all the words
    }

    // process xml tag
    else {
      // *** get essential information about tag ***

      // strip extra boundary spaces and "<" and ">"
      string tag =  Trim(TrimXml(xmlTokens[xmlTokenPos], lbrackStr, rbrackStr));
      VERBOSE(3,"XML TAG IS: " << tag << std::endl);

      if (tag.size() == 0) {
        TRACE_ERR("ERROR: empty tag name: " << line << endl);
        return false;
      }

      // check if unary (e.g., "<wall/>")
      bool isUnary = ( tag[tag.size() - 1] == '/' );

      // check if opening tag (e.g. "<a>", not "</a>")g
      bool isClosed = ( tag[0] == '/' );
      bool isOpen = !isClosed;

      if (isClosed && isUnary) {
        TRACE_ERR("ERROR: can't have both closed and unary tag " << lbrackStr << tag << rbrackStr << ": " << line << endl);
        return false;
      }

      if (isClosed)
        tag = tag.substr(1); // remove "/" at the beginning
      if (isUnary)
        tag = tag.substr(0,tag.size()-1); // remove "/" at the end

      // find the tag name and contents
      string::size_type endOfName = tag.find_first_of(' ');
      string tagName = tag;
      string tagContent = "";
      if (endOfName != string::npos) {
        tagName = tag.substr(0,endOfName);
        tagContent = tag.substr(endOfName+1);
      }

      // *** process new tag ***

      if (isOpen || isUnary) {
        // put the tag on the tag stack
        OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
        tagStack.push_back( openedTag );
        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl);
      }

      // *** process completed tag ***

      if (isClosed || isUnary) {
        // pop last opened tag from stack;
        if (tagStack.size() == 0) {
          TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl);
          return false;
        }
        OpenedTag openedTag = tagStack.back();
        tagStack.pop_back();

        // tag names have to match
        if (openedTag.first != tagName) {
          TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl );
          return false;
        }

        // assemble remaining information about tag
        size_t startPos = openedTag.second.first;
        string tagContent = openedTag.second.second;
        size_t endPos = wordPos;

        // span attribute overwrites position
        string span = ParseXmlTagAttribute(tagContent,"span");
        if (! span.empty()) {
          vector<string> ij = Tokenize(span, "-");
          if (ij.size() != 1 && ij.size() != 2) {
            TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl);
            return false;
          }
          startPos = atoi(ij[0].c_str());
          if (ij.size() == 1) endPos = startPos + 1;
          else endPos = atoi(ij[1].c_str()) + 1;
        }

        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl);

        // special tag: wall
        if (tagName == "wall") {
          size_t start = (startPos == 0) ? 0 : startPos-1;
          for(size_t pos = start; pos < endPos; pos++)
            walls.push_back( pos );
        }

        // special tag: zone
        else if (tagName == "zone") {
          if (startPos >= endPos) {
            TRACE_ERR("ERROR: zone must span at least one word: " << line << endl);
            return false;
          }
          reorderingConstraint.SetZone( startPos, endPos-1 );
        }

        // name-entity placeholder
        else if (tagName == "ne") {
          if (startPos != (endPos - 1)) {
            TRACE_ERR("ERROR: Placeholder must only span 1 word: " << line << endl);
            return false;
          }
          string entity = ParseXmlTagAttribute(tagContent,"entity");
          placeholders.push_back(std::pair<size_t, std::string>(startPos, entity));
        }

        // update: add new aligned sentence pair to Mmsapt identified by name
        else if (tagName == "update") {
#if PT_UG
          // get model name and aligned sentence pair
          string pdName = ParseXmlTagAttribute(tagContent,"name");
          string source = ParseXmlTagAttribute(tagContent,"source");
          string target = ParseXmlTagAttribute(tagContent,"target");
          string alignment = ParseXmlTagAttribute(tagContent,"alignment");
          // find PhraseDictionary by name
          const vector<PhraseDictionary*> &pds = PhraseDictionary::GetColl();
          PhraseDictionary* pd = NULL;
          for (vector<PhraseDictionary*>::const_iterator i = pds.begin(); i != pds.end(); ++i) {
            PhraseDictionary* curPd = *i;
            if (curPd->GetScoreProducerDescription() == pdName) {
              pd = curPd;
              break;
            }
          }
          if (pd == NULL) {
            TRACE_ERR("ERROR: No PhraseDictionary with name " << pdName << ", no update" << endl);
            return false;
          }
          // update model
          VERBOSE(3,"Updating " << pdName << " ||| " << source << " ||| " << target << " ||| " << alignment << endl);
          Mmsapt* pdsa = reinterpret_cast<Mmsapt*>(pd);
          pdsa->add(source, target, alignment);
#else
          TRACE_ERR("ERROR: recompile with --with-mm to update PhraseDictionary at runtime" << endl);
          return false;
#endif
        }

        // weight-overwrite: update feature weights, unspecified weights remain unchanged
        // IMPORTANT: translation models that cache phrases or apply table-limit during load
        // based on initial weights need to be reset.  Sending an empty update will do this
        // for PhraseDictionaryBitextSampling (Mmsapt) models:
        // <update name="TranslationModelName" source=" " target=" " alignment=" " />
        else if (tagName == "weight-overwrite") {

          // is a name->ff map stored anywhere so we don't have to build it every time?
          const vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions();
          boost::unordered_map<string, FeatureFunction*> map;
          BOOST_FOREACH(FeatureFunction* const& ff, ffs) {
            map[ff->GetScoreProducerDescription()] = ff;
          }

          // update each weight listed
          ScoreComponentCollection allWeights = StaticData::Instance().GetAllWeights();
          boost::unordered_map<string, FeatureFunction*>::iterator ffi;
          string ffName("");
          vector<float> ffWeights;
          vector<string> toks = Tokenize(ParseXmlTagAttribute(tagContent,"weights"));
          BOOST_FOREACH(string const& tok, toks) {
            if (tok.substr(tok.size() - 1, 1) == "=") {
              // start new feature
              if (ffName != "") {
                // set previous feature weights
                if (ffi != map.end()) {
                  allWeights.Assign(ffi->second, ffWeights);
                }
                ffWeights.clear();
              }
              ffName = tok.substr(0, tok.size() - 1);
              ffi = map.find(ffName);
              if (ffi == map.end()) {
                TRACE_ERR("ERROR: No FeatureFunction with name " << ffName << ", no weight update" << endl);
              }
            } else {
              // weight for current feature
              ffWeights.push_back(Scan<float>(tok));
            }
          }
          if (ffi != map.end()) {
            allWeights.Assign(ffi->second, ffWeights);
          }
          StaticData::InstanceNonConst().SetAllWeights(allWeights);
        }

        // default: opening tag that specifies translation options
        else {
          if (startPos > endPos) {
            TRACE_ERR("ERROR: tag " << tagName << " startPos > endPos: " << line << endl);
            return false;
          } else if (startPos == endPos) {
            TRACE_ERR("WARNING: tag " << tagName << " 0 span: " << line << endl);
            continue;
          }

          // specified translations -> vector of phrases
          // multiple translations may be specified, separated by "||"
          vector<string> altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"translation"), "||");
          if( altTexts.size() == 1 && altTexts[0] == "" )
            altTexts.pop_back(); // happens when nothing specified
          // deal with legacy annotations: "translation" was called "english"
          vector<string> moreAltTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"english"), "||");
          if (moreAltTexts.size()>1 || moreAltTexts[0] != "") {
            for(vector<string>::iterator translation=moreAltTexts.begin();
                translation != moreAltTexts.end();
                translation++) {
              string t = *translation;
              altTexts.push_back( t );
            }
          }

          // specified probabilities for the translations -> vector of probs
          vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||");
          if( altProbs.size() == 1 && altProbs[0] == "" )
            altProbs.pop_back(); // happens when nothing specified

          // report what we have processed so far
          VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl);
          VERBOSE(3,"XML TAG TRANSLATION IS: '" << altTexts[0] << "'" << endl);
          VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl);
          VERBOSE(3,"XML TAG SPAN IS: " << startPos << "-" << (endPos-1) << endl);
          if (altProbs.size() > 0 && altTexts.size() != altProbs.size()) {
            TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl);
            return false;
          }

          // store translation options into members
          if (staticData.GetXmlInputType() != XmlIgnore) {
            // only store options if we aren't ignoring them
            for (size_t i=0; i<altTexts.size(); ++i) {
              Phrase sourcePhrase; // TODO don't know what the source phrase is

              // set default probability
              float probValue = 1;
              if (altProbs.size() > 0) probValue = Scan<float>(altProbs[i]);
              // convert from prob to log-prob
              float scoreValue = FloorScore(TransformScore(probValue));

              WordsRange range(startPos + offset,endPos-1 + offset); // span covered by phrase
              TargetPhrase targetPhrase(firstPt);
              // targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i],factorDelimiter, NULL);
              targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i], NULL);

              // lhs
              const UnknownLHSList &lhsList = staticData.GetUnknownLHS();
              if (!lhsList.empty()) {
                const Factor *factor = FactorCollection::Instance().AddFactor(lhsList[0].first, true);
                Word *targetLHS = new Word(true);
                targetLHS->SetFactor(0, factor); // TODO - other factors too?
                targetPhrase.SetTargetLHS(targetLHS);
              }

              targetPhrase.SetXMLScore(scoreValue);
              targetPhrase.EvaluateInIsolation(sourcePhrase);

              XmlOption *option = new XmlOption(range,targetPhrase);
              assert(option);

              res.push_back(option);
            }
            altTexts.clear();
            altProbs.clear();
          }
        }
      }
    }
  }
Ejemplo n.º 22
0
/**
 * Process a sentence with xml annotation
 * Xml tags may specifiy additional/replacing translation options
 * and reordering constraints
 *
 * \param line in: sentence, out: sentence without the xml
 * \param res vector with translation options specified by xml
 * \param reorderingConstraint reordering constraint zones specified by xml
 * \param walls reordering constraint walls specified by xml
 */
bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput> &sourceLabels, std::vector<XmlOption*> &xmlOptions)
{
  //parse XML markup in translation line

  // no xml tag? we're done.
  if (line.find_first_of('<') == string::npos) {
    return true;
  }

  // break up input into a vector of xml tags and text
  // example: (this), (<b>), (is a), (</b>), (test .)
  vector<string> xmlTokens = TokenizeXml(line);

  // we need to store opened tags, until they are closed
  // tags are stored as tripled (tagname, startpos, contents)
  typedef pair< string, pair< size_t, string > > OpenedTag;
  vector< OpenedTag > tagStack; // stack that contains active opened tags

  string cleanLine; // return string (text without xml)
  size_t wordPos = 0; // position in sentence (in terms of number of words)

  // keep this handy for later
  const vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
  const string &factorDelimiter = StaticData::Instance().GetFactorDelimiter();

  // loop through the tokens
  for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
    // not a xml tag, but regular text (may contain many words)
    if(!isXmlTag(xmlTokens[xmlTokenPos])) {
      // add a space at boundary, if necessary
      if (cleanLine.size()>0 &&
          cleanLine[cleanLine.size() - 1] != ' ' &&
          xmlTokens[xmlTokenPos][0] != ' ') {
        cleanLine += " ";
      }
      cleanLine += xmlTokens[xmlTokenPos]; // add to output
      wordPos = Tokenize(cleanLine).size(); // count all the words
    }

    // process xml tag
    else {
      // *** get essential information about tag ***

      // strip extra boundary spaces and "<" and ">"
      string tag =  Trim(TrimXml(xmlTokens[xmlTokenPos]));
      VERBOSE(3,"XML TAG IS: " << tag << std::endl);

      if (tag.size() == 0) {
        TRACE_ERR("ERROR: empty tag name: " << line << endl);
        return false;
      }

      // check if unary (e.g., "<wall/>")
      bool isUnary = ( tag[tag.size() - 1] == '/' );

      // check if opening tag (e.g. "<a>", not "</a>")g
      bool isClosed = ( tag[0] == '/' );
      bool isOpen = !isClosed;

      if (isClosed && isUnary) {
        TRACE_ERR("ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl);
        return false;
      }

      if (isClosed)
        tag = tag.substr(1); // remove "/" at the beginning
      if (isUnary)
        tag = tag.substr(0,tag.size()-1); // remove "/" at the end

      // find the tag name and contents
      string::size_type endOfName = tag.find_first_of(' ');
      string tagName = tag;
      string tagContent = "";
      if (endOfName != string::npos) {
        tagName = tag.substr(0,endOfName);
        tagContent = tag.substr(endOfName+1);
      }

      // *** process new tag ***

      if (isOpen || isUnary) {
        // put the tag on the tag stack
        OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
        tagStack.push_back( openedTag );
        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl);
      }

      // *** process completed tag ***

      if (isClosed || isUnary) {
        // pop last opened tag from stack;
        if (tagStack.size() == 0) {
          TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl);
          return false;
        }
        OpenedTag openedTag = tagStack.back();
        tagStack.pop_back();

        // tag names have to match
        if (openedTag.first != tagName) {
          TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl );
          return false;
        }

        // assemble remaining information about tag
        size_t startPos = openedTag.second.first;
        string tagContent = openedTag.second.second;
        size_t endPos = wordPos;

        // span attribute overwrites position
        string span = ParseXmlTagAttribute(tagContent,"span");
        if (! span.empty()) {
          vector<string> ij = Tokenize(span, "-");
          if (ij.size() != 1 && ij.size() != 2) {
            TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl);
            return false;
          }
          startPos = atoi(ij[0].c_str());
          if (ij.size() == 1) endPos = startPos + 1;
          else endPos = atoi(ij[1].c_str()) + 1;
        }

        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl);

        if (startPos >= endPos) {
          TRACE_ERR("ERROR: tag " << tagName << " must span at least one word: " << line << endl);
          return false;
        }

	// may be either a input span label ("label"), or a specified output translation "translation"
        string label = ParseXmlTagAttribute(tagContent,"label");
        string translation = ParseXmlTagAttribute(tagContent,"translation");

        // specified label
        if (translation.length() == 0 && label.length() > 0) {
          WordsRange range(startPos,endPos-1); // really?
          XMLParseOutput item(label, range);
          sourceLabels.push_back(item);
        }

        // specified translations -> vector of phrases, separated by "||"
        if (translation.length() > 0 && StaticData::Instance().GetXmlInputType() != XmlIgnore) {
          vector<string> altTexts = TokenizeMultiCharSeparator(translation, "||");
          vector<string> altLabel = TokenizeMultiCharSeparator(label, "||");
          vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||");
	  //TRACE_ERR("number of translations: " << altTexts.size() << endl);
          for (size_t i=0; i<altTexts.size(); ++i) {
            // set target phrase
            TargetPhrase targetPhrase(Output);
            targetPhrase.CreateFromString(outputFactorOrder,altTexts[i],factorDelimiter);

            // set constituent label
	    string targetLHSstr;
            if (altLabel.size() > i && altLabel[i].size() > 0) {
              targetLHSstr = altLabel[i];
            }
            else {
              const UnknownLHSList &lhsList = StaticData::Instance().GetUnknownLHS();
              UnknownLHSList::const_iterator iterLHS = lhsList.begin();
              targetLHSstr = iterLHS->first;
            }
            Word targetLHS(true);
            targetLHS.CreateFromString(Output, outputFactorOrder, targetLHSstr, true);
            CHECK(targetLHS.GetFactor(0) != NULL);
            targetPhrase.SetTargetLHS(targetLHS);

            // get probability
            float probValue = 1;
            if (altProbs.size() > i && altProbs[i].size() > 0) {
              probValue = Scan<float>(altProbs[i]);
            }
            // convert from prob to log-prob
            float scoreValue = FloorScore(TransformScore(probValue));
            targetPhrase.SetScore(scoreValue);

            // set span and create XmlOption
            WordsRange range(startPos+1,endPos);
            XmlOption *option = new XmlOption(range,targetPhrase);
            CHECK(option);
            xmlOptions.push_back(option);

            VERBOSE(2,"xml translation = [" << range << "] " << targetLHSstr << " -> " << altTexts[i] << " prob: " << probValue << endl);
          }
          altTexts.clear();
          altProbs.clear();
        }
      }
    }
  }
  // we are done. check if there are tags that are still open
  if (tagStack.size() > 0) {
    TRACE_ERR("ERROR: some opened tags were never closed: " << line << endl);
    return false;
  }

  // return de-xml'ed sentence in line
  line = cleanLine;
  return true;
}
Ejemplo n.º 23
0
void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to)
{
  // unknown word, add as trans opt
  const StaticData &staticData = StaticData::Instance();
  const UnknownWordPenaltyProducer &unknownWordPenaltyProducer = UnknownWordPenaltyProducer::Instance();

  size_t isDigit = 0;
  if (staticData.GetDropUnknown()) {
    const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface
    const StringPiece s = f->GetString();
    isDigit = s.find_first_of("0123456789");
    if (isDigit == string::npos)
      isDigit = 0;
    else
      isDigit = 1;
    // modify the starting bitmap
  }

  Phrase* unksrc = new Phrase(1);
  unksrc->AddWord() = sourceWord;
  Word &newWord = unksrc->GetWord(0);
  newWord.SetIsOOV(true);

  m_unksrcs.push_back(unksrc);

  //TranslationOption *transOpt;
  if (! staticData.GetDropUnknown() || isDigit) {
    // loop
    const UnknownLHSList &lhsList = staticData.GetUnknownLHS();
    UnknownLHSList::const_iterator iterLHS;
    for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) {
      const string &targetLHSStr = iterLHS->first;
      float prob = iterLHS->second;

      // lhs
      //const Word &sourceLHS = staticData.GetInputDefaultNonTerminal();
      Word *targetLHS = new Word(true);

      targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true);
      UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS");

      // add to dictionary
      TargetPhrase *targetPhrase = new TargetPhrase();
      Word &targetWord = targetPhrase->AddWord();
      targetWord.CreateUnknownWord(sourceWord);

      // scores
      float unknownScore = FloorScore(TransformScore(prob));

      targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore);
      targetPhrase->Evaluate(*unksrc);

      targetPhrase->SetTargetLHS(targetLHS);
      targetPhrase->SetAlignmentInfo("0-0");
      if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) {
        targetPhrase->SetProperty("Tree","[ " + (*targetLHS)[0]->GetString().as_string() + " "+sourceWord[0]->GetString().as_string()+" ]");
      }

      // chart rule
      to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range);
    } // for (iterLHS
  } else {
    // drop source word. create blank trans opt
    float unknownScore = FloorScore(-numeric_limits<float>::infinity());

    TargetPhrase *targetPhrase = new TargetPhrase();
    // loop
    const UnknownLHSList &lhsList = staticData.GetUnknownLHS();
    UnknownLHSList::const_iterator iterLHS;
    for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) {
      const string &targetLHSStr = iterLHS->first;
      //float prob = iterLHS->second;

      Word *targetLHS = new Word(true);
      targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true);
      UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS");

      targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore);
      targetPhrase->Evaluate(*unksrc);

      targetPhrase->SetTargetLHS(targetLHS);

      // chart rule
      to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range);
    }
  }
}
Ejemplo n.º 24
0
float LanguageModelInternal::GetValue(const Factor *factor0, const Factor *factor1, const Factor *factor2, State* finalState) const
{
	float score;
	const NGramNode *nGram[3];

	nGram[2]		= GetLmID(factor2);
	if (nGram[2] == NULL)
	{
		if (finalState != NULL)
			*finalState = NULL;
		score = -numeric_limits<float>::infinity();
	}
	else
	{
		nGram[1] = nGram[2]->GetNGram(factor1);
		if (nGram[1] == NULL)
		{ // something unigram
			if (finalState != NULL)
				*finalState = static_cast<const void*>(nGram[2]);
			
			nGram[1]	= GetLmID(factor1);
			if (nGram[1] == NULL)
			{ // stops at unigram
				score = nGram[2]->GetScore();
			}
			else
			{
				nGram[0] = nGram[1]->GetNGram(factor0);
				if (nGram[0] == NULL)
				{ // unigram unigram
					score = nGram[2]->GetScore() + nGram[1]->GetLogBackOff();
				}
				else
				{ // unigram bigram
					score = nGram[2]->GetScore() + nGram[1]->GetLogBackOff() + nGram[0]->GetLogBackOff();
				}	
			}			
		}
		else
		{ // trigram, or something bigram
			nGram[0] = nGram[1]->GetNGram(factor0);
			if (nGram[0] != NULL)
			{ // trigram
				if (finalState != NULL)
					*finalState = static_cast<const void*>(nGram[0]);
				score = nGram[0]->GetScore();
			}
			else
			{
				if (finalState != NULL)
					*finalState = static_cast<const void*>(nGram[1]);
				
				score			= nGram[1]->GetScore();
				nGram[1]	= nGram[1]->GetRootNGram();
				nGram[0]	= nGram[1]->GetNGram(factor0);
				if (nGram[0] == NULL)
				{ // just bigram
					// do nothing
				}
				else
				{
					score	+= nGram[0]->GetLogBackOff();
				}

			}
			// else do nothing. just use 1st bigram
		}
	}

	return FloorScore(score);

}