Ejemplo n.º 1
0
//Find top n translations of source, and send them to output
static void outputTopN(const StringPiece& sourcePhraseString, PhraseDictionary* phraseTable, const std::vector<FactorType> &input,  ostream& out) {
  //get list of target phrases
  Phrase sourcePhrase;
  sourcePhrase.CreateFromString(Input,input,sourcePhraseString,NULL);
  InputPath inputPath(sourcePhrase, NonTerminalSet(), WordsRange(0,sourcePhrase.GetSize()-1),NULL,NULL);
  InputPathList inputPaths;
  inputPaths.push_back(&inputPath);
  phraseTable->GetTargetPhraseCollectionBatch(inputPaths);
  const TargetPhraseCollection* targetPhrases = inputPath.GetTargetPhrases(*phraseTable);




  //print phrases
  const std::vector<FactorType>& output = StaticData::Instance().GetOutputFactorOrder();
  if (targetPhrases) {
    //if (targetPhrases->GetSize() > 10) cerr << "src " << sourcePhrase << " tgt count " << targetPhrases->GetSize() << endl;
    for (TargetPhraseCollection::const_iterator i = targetPhrases->begin(); i != targetPhrases->end(); ++i) {
      const TargetPhrase* targetPhrase = *i;
      out << sourcePhrase.GetStringRep(input);
      out << " ||| ";
      out << targetPhrase->GetStringRep(output);
      out << " ||| ";
      const ScoreComponentCollection scores = targetPhrase->GetScoreBreakdown();
      vector<float> phraseScores = scores.GetScoresForProducer(phraseTable);
      for (size_t j = 0; j < phraseScores.size(); ++j) {
        out << exp(phraseScores[j]) << " ";
      }
      out << "||| ";
      const AlignmentInfo& align = targetPhrase->GetAlignTerm();
      for (AlignmentInfo::const_iterator j = align.begin(); j != align.end(); ++j) {
        out << j->first << "-" << j->second << " ";
      }
      out << endl;
    }
  }

}
Ejemplo n.º 2
0
/**
 * Load a text SCFG rule table into a RuleTableTrie.
 *
 * Each line is split on "|||" into source, target, scores and optionally
 * alignment, counts, sparse scores and properties. Hiero-format lines are
 * first rewritten with ReformatHieroRule(). After all lines are read, every
 * target phrase collection is sorted and pruned via SortAndPrune().
 *
 * \param format     MosesFormat or HieroFormat
 * \param input      factor order used to parse the source side
 * \param output     factor order used to parse the target side
 * \param inFile     path of the rule table file
 * \param ruleTable  trie that receives the rules; also supplies the expected
 *                   number of score components and the features to apply
 * \return always true; a malformed score throws util::Exception and a wrong
 *         number of scores ends the process with abort()
 */
bool RuleTableLoaderStandard::Load(FormatType format
                                   , const std::vector<FactorType> &input
                                   , const std::vector<FactorType> &output
                                   , const std::string &inFile
                                   , size_t /* tableLimit */
                                   , RuleTableTrie &ruleTable)
{
  PrintUserTime(string("Start loading text SCFG phrase table. ") + (format==MosesFormat?"Moses ":"Hiero ") + " format");

  const StaticData &staticData = StaticData::Instance();
  const std::string& factorDelimiter = staticData.GetFactorDelimiter();

  string lineOrig;
  // Zero-based index of the line being processed; used in diagnostics.
  size_t count = 0;

  std::ostream *progress = NULL;
  IFVERBOSE(1) progress = &std::cerr;
  util::FilePiece in(inFile.c_str(), progress);

  // reused variables
  vector<float> scoreVector;
  StringPiece line;
  std::string hiero_before, hiero_after;

  // Unparseable or empty score tokens convert to NaN, which is rejected below.
  double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");

  while(true) {
    try {
      line = in.ReadLine();
    } catch (const util::EndOfFileException &e) {
      break;
    }

    if (format == HieroFormat) { // inefficiently reformat line
      hiero_before.assign(line.data(), line.size());
      ReformatHieroRule(hiero_before, hiero_after);
      line = hiero_after;
    }

    util::TokenIter<util::MultiCharacter> pipes(line, "|||");
    StringPiece sourcePhraseString(*pipes);
    StringPiece targetPhraseString(*++pipes);
    StringPiece scoreString(*++pipes);

    StringPiece alignString;
    if (++pipes) {
      alignString = *pipes;
    }

    ++pipes;  // counts field: present in the format but not used here

    // NOTE(review): the test looks at the source side while the message says
    // "empty target"; message kept as is pending confirmation of the intent.
    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
      // A skipped line is still a line of the file: count it so that the line
      // numbers in later diagnostics do not drift.
      ++count;
      continue;
    }

    scoreVector.clear();
    for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
      int processed;
      float score = converter.StringToFloat(s->data(), s->length(), &processed);
      UTIL_THROW_IF(isnan(score), util::Exception, "Bad score " << *s << " on line " << count);
      scoreVector.push_back(FloorScore(TransformScore(score)));
    }
    const size_t numScoreComponents = ruleTable.GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      stringstream strme;
      strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
            << numScoreComponents << ") of score components on line " << count;
      UserMessage::Add(strme.str());
      abort();
    }

    // parse source & find pt node

    // constituent labels; start as NULL so they never hold an indeterminate
    // value if CreateFromString() leaves them untouched
    Word *sourceLHS = NULL;
    Word *targetLHS = NULL;

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase();
    targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS);

    // source
    Phrase sourcePhrase;
    sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString);
    targetPhrase->SetTargetLHS(targetLHS);

    // optional sparse-score field
    if (++pipes) {
      StringPiece sparseString(*pipes);
      targetPhrase->SetSparseScore(&ruleTable, sparseString);
    }

    // optional properties field
    if (++pipes) {
      StringPiece propertiesString(*pipes);
      targetPhrase->SetProperties(propertiesString);
    }

    targetPhrase->GetScoreBreakdown().Assign(&ruleTable, scoreVector);
    targetPhrase->Evaluate(sourcePhrase, ruleTable.GetFeaturesToApply());

    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS);
    phraseColl.Add(targetPhrase);

    count++;
  }

  // sort and prune each target phrase collection
  SortAndPrune(ruleTable);

  return true;
}
Ejemplo n.º 3
0
/**
 * Read a rule table file and insert its rules into a HyperTree.
 *
 * Every line is split on "|||" into source fragment, target phrase, scores and
 * optionally alignment, counts, sparse scores and properties. The source
 * terminals seen in the fragments are collected into sourceTermSet, which is
 * cleared first. When the feature function reports a non-zero table limit the
 * collections are sorted and pruned at the end.
 *
 * \param opts           not referenced by this function
 * \param input          factor order used for the dummy source phrase
 * \param output         factor order used to parse target phrases
 * \param inFile         path of the rule table file
 * \param ff             owning feature function (score count, features, limit)
 * \param trie           tree that receives the rules
 * \param sourceTermSet  out: terminals extracted from the source fragments
 * \return always true; malformed input is reported by throwing
 */
bool HyperTreeLoader::Load(AllOptions const& opts,
                           const std::vector<FactorType> &input,
                           const std::vector<FactorType> &output,
                           const std::string &inFile,
                           const RuleTableFF &ff,
                           HyperTree &trie,
                           boost::unordered_set<std::size_t> &sourceTermSet)
{
  PrintUserTime(std::string("Start loading HyperTree"));

  sourceTermSet.clear();

  // Number of rules consumed so far; doubles as the line index in messages.
  std::size_t rulesRead = 0;

  std::ostream *progressStream = NULL;
  IFVERBOSE(1) progressStream = &std::cerr;
  util::FilePiece reader(inFile.c_str(), progressStream);

  // Buffers kept across iterations.
  std::vector<float> scores;
  StringPiece rawLine;

  double_conversion::StringToDoubleConverter floatParser(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");

  HyperPathLoader pathLoader;

  // Placeholder source phrase handed to EvaluateInIsolation() for every rule.
  Phrase placeholderSource;
  {
    Word *unusedLhs = NULL;
    placeholderSource.CreateFromString(Input, input, "hello", &unusedLhs);
    delete unusedLhs;
  }

  for (;;) {
    try {
      rawLine = reader.ReadLine();
    } catch (const util::EndOfFileException &) {
      break;
    }

    // Mandatory fields: source ||| target ||| scores
    util::TokenIter<util::MultiCharacter> field(rawLine, "|||");
    StringPiece sourceField(*field);
    StringPiece targetField(*++field);
    StringPiece scoreField(*++field);

    // Optional alignment field.
    StringPiece alignField;
    if (++field) {
      alignField = *field;
    }

    // Step over the counts field.
    ++field;

    // Convert and validate the dense scores.
    scores.clear();
    for (util::TokenIter<util::AnyCharacter, true> tok(scoreField, " \t"); tok; ++tok) {
      int consumed;
      float value = floatParser.StringToFloat(tok->data(), tok->length(), &consumed);
      UTIL_THROW_IF2(std::isnan(value), "Bad score " << *tok << " on line " << rulesRead);
      scores.push_back(FloorScore(TransformScore(value)));
    }
    const std::size_t expectedScores = ff.GetNumScoreComponents();
    if (scores.size() != expectedScores) {
      UTIL_THROW2("Size of scoreVector != number (" << scores.size() << "!="
                  << expectedScores << ") of score components on line " << rulesRead);
    }

    // Source side: build the fragment and record its terminals.
    HyperPath fragment;
    pathLoader.Load(sourceField, fragment);
    ExtractSourceTerminalSetFromHyperPath(fragment, sourceTermSet);

    // Target side.
    TargetPhrase *rule = new TargetPhrase(&ff);
    Word *ruleLhs = NULL;
    rule->CreateFromString(Output, output, targetField, &ruleLhs);
    rule->SetTargetLHS(ruleLhs);
    rule->SetAlignmentInfo(alignField);

    // Optional sparse scores.
    if (++field) {
      StringPiece sparseField(*field);
      rule->SetSparseScore(&ff, sparseField);
    }

    // Optional properties.
    if (++field) {
      StringPiece propertyField(*field);
      rule->SetProperties(propertyField);
    }

    rule->GetScoreBreakdown().Assign(&ff, scores);
    rule->EvaluateInIsolation(placeholderSource,
                              ff.GetFeaturesToApply());

    // Hand the rule to the collection stored for this source fragment.
    GetOrCreateTargetPhraseCollection(trie, fragment)->Add(rule);

    ++rulesRead;
  }

  // Sort and prune only when a table limit is configured.
  if (ff.GetTableLimit()) {
    SortAndPrune(trie, ff.GetTableLimit());
  }

  return true;
}
bool RuleTrieLoader::Load(const std::vector<FactorType> &input,
                          const std::vector<FactorType> &output,
                          const std::string &inFile,
                          const RuleTableFF &ff,
                          RuleTrie &trie)
{
    PrintUserTime(std::string("Start loading text phrase table. Moses format"));

    const StaticData &staticData = StaticData::Instance();
    // const std::string &factorDelimiter = staticData.GetFactorDelimiter();

    std::size_t count = 0;

    std::ostream *progress = NULL;
    IFVERBOSE(1) progress = &std::cerr;
    util::FilePiece in(inFile.c_str(), progress);

    // reused variables
    std::vector<float> scoreVector;
    StringPiece line;

    double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");

    while(true) {
        try {
            line = in.ReadLine();
        } catch (const util::EndOfFileException &e) {
            break;
        }

        util::TokenIter<util::MultiCharacter> pipes(line, "|||");
        StringPiece sourcePhraseString(*pipes);
        StringPiece targetPhraseString(*++pipes);
        StringPiece scoreString(*++pipes);

        StringPiece alignString;
        if (++pipes) {
            StringPiece temp(*pipes);
            alignString = temp;
        }

        if (++pipes) {
            StringPiece str(*pipes); //counts
        }

        bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == std::string::npos);
        if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
            TRACE_ERR( ff.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
            continue;
        }

        scoreVector.clear();
        for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
            int processed;
            float score = converter.StringToFloat(s->data(), s->length(), &processed);
            UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count);
            scoreVector.push_back(FloorScore(TransformScore(score)));
        }
        const std::size_t numScoreComponents = ff.GetNumScoreComponents();
        if (scoreVector.size() != numScoreComponents) {
            UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
                        << numScoreComponents << ") of score components on line " << count);
        }

        // parse source & find pt node

        // constituent labels
        Word *sourceLHS = NULL;
        Word *targetLHS;

        // create target phrase obj
        TargetPhrase *targetPhrase = new TargetPhrase(&ff);
        // targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS);
        targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS);
        // source
        Phrase sourcePhrase;
        // sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS);
        sourcePhrase.CreateFromString(Input, input, sourcePhraseString, &sourceLHS);

        // rest of target phrase
        targetPhrase->SetAlignmentInfo(alignString);
        targetPhrase->SetTargetLHS(targetLHS);

        //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

        if (++pipes) {
            StringPiece sparseString(*pipes);
            targetPhrase->SetSparseScore(&ff, sparseString);
        }

        if (++pipes) {
            StringPiece propertiesString(*pipes);
            targetPhrase->SetProperties(propertiesString);
        }

        targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector);
        targetPhrase->EvaluateInIsolation(sourcePhrase, ff.GetFeaturesToApply());

        TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(
                trie, *sourceLHS, sourcePhrase);
        phraseColl.Add(targetPhrase);

        // not implemented correctly in memory pt. just delete it for now
        delete sourceLHS;

        count++;
    }

    // sort and prune each target phrase collection
    if (ff.GetTableLimit()) {
        SortAndPrune(trie, ff.GetTableLimit());
    }

    return true;
}
/**
 * Command-line query tool for a compact phrase table.
 *
 * Options:
 *   -n <int>   number of scores per entry (default 5)
 *   -t <path>  phrase table to load (required)
 *   -a         load and print alignments
 *   -c         print only the number of translations per source phrase
 *
 * Source phrases are read from standard input, one per line. For each one the
 * matching entries (or the count, with -c) are written to standard output,
 * which is flushed after every query.
 *
 * \return 0 on normal termination, 1 if the phrase table could not be loaded
 */
int main(int argc, char **argv)
{
  int nscores = 5;
  std::string ttable = "";
  bool useAlignments = false;
  bool reportCounts = false;

  for(int i = 1; i < argc; i++) {
    if(!strcmp(argv[i], "-n")) {
      if(i + 1 == argc)
        usage();
      nscores = atoi(argv[++i]);
    } else if(!strcmp(argv[i], "-t")) {
      if(i + 1 == argc)
        usage();
      ttable = argv[++i];
    } else if(!strcmp(argv[i], "-a")) {
      useAlignments = true;
    } else if (!strcmp(argv[i], "-c")) {
      reportCounts = true;
    }
    else
      usage();
  }

  if(ttable == "")
    usage();

  std::vector<FactorType> input(1, 0);
  std::vector<FactorType> output(1, 0);
  std::vector<float> weight(nscores, 0);

  LMList lmList;

  // Minimal parameter set handed to StaticData::LoadData() below.
  Parameter *parameter = new Parameter();
  const_cast<std::vector<std::string>&>(parameter->GetParam("factor-delimiter")).resize(1, "||dummy_string||");
  const_cast<std::vector<std::string>&>(parameter->GetParam("input-factors")).resize(1, "0");
  const_cast<std::vector<std::string>&>(parameter->GetParam("verbose")).resize(1, "0");
  const_cast<std::vector<std::string>&>(parameter->GetParam("weight-w")).resize(1, "0");
  const_cast<std::vector<std::string>&>(parameter->GetParam("weight-d")).resize(1, "0");

  StaticData::InstanceNonConst().LoadData(parameter);

  SparsePhraseDictionaryFeature *spdf = NULL;
  PhraseDictionaryFeature pdf(Compact, spdf, nscores, nscores, input, output, ttable, weight, 0, 0, "", "");
  PhraseDictionaryCompact pdc(nscores, Compact, &pdf, false, useAlignments);

  // Checked explicitly rather than with assert(): assert() compiles away under
  // NDEBUG, which would let the tool continue with an unloaded table.
  const bool loaded = pdc.Load(input, output, ttable, weight, 0, lmList, 0);
  if(!loaded) {
    std::cerr << "Failed to load phrase table: " << ttable << std::endl;
    return 1;
  }

  std::string line;
  while(getline(std::cin, line)) {
    Phrase sourcePhrase;
    sourcePhrase.CreateFromString(input, line, "||dummy_string||");

    TargetPhraseVectorPtr decodedPhraseColl
      = pdc.GetTargetPhraseCollectionRaw(sourcePhrase);

    // Lines end with '\n' instead of std::endl: the stream is flushed once per
    // query at the bottom of the loop, so flushing after every line is redundant.
    if(decodedPhraseColl != NULL) {
      if(reportCounts)
        std::cout << sourcePhrase << decodedPhraseColl->size() << '\n';
      else
        for(TargetPhraseVector::iterator it = decodedPhraseColl->begin(); it != decodedPhraseColl->end(); ++it) {
          TargetPhrase &tp = *it;
          std::cout << sourcePhrase << "||| ";
          std::cout << static_cast<const Phrase&>(tp) << "|||";

          if(useAlignments)
            std::cout << " " << tp.GetAlignTerm() << "|||";

          // Scores are passed through exp() before printing.
          std::vector<float> scores = tp.GetScoreBreakdown().GetScoresForProducer(&pdf);
          for(size_t i = 0; i < scores.size(); i++)
            std::cout << " " << exp(scores[i]);
          std::cout << '\n';
        }
    }
    else if(reportCounts)
      std::cout << sourcePhrase << 0 << '\n';

    std::cout.flush();
  }

  return 0;
}
Ejemplo n.º 6
0
    void testCalcScore() {

      double p_the      = -1.383059;
      double p_licenses = -2.360783;
      double p_for      = -1.661813;
      double p_most     = -2.360783;
      //      double p_software = -1.62042;

      double p_the_licenses  = -0.9625873;
      double p_licenses_for  = -1.661557;
      double p_for_most      = -0.4526253;
      //      double p_most_software = -1.70295; 

      double p_the_licenses_for  = p_the_licenses + p_licenses_for;
      //      double p_licenses_for_most = p_licenses_for + p_for_most;
 
      // the
      {
	Phrase phrase;
	BOOST_CHECK( phrase.GetSize() == 0 );

	std::vector<FactorType> outputFactorOrder;
	outputFactorOrder.push_back(0);

	phrase.CreateFromString(
				outputFactorOrder,
				"the", 
				StaticData::Instance().GetFactorDelimiter());

	BOOST_CHECK( phrase.GetSize() == 1 );
      
	float fullScore;
	float ngramScore;
	size_t oovCount;
	backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);

	BOOST_CHECK( oovCount == 0 );
	SLOPPY_CHECK_CLOSE( TransformLMScore(p_the), fullScore, 0.01);
	SLOPPY_CHECK_CLOSE( TransformLMScore( 0.0 ), ngramScore, 0.01);
      }

      // the licenses
      {
	Phrase phrase;
	BOOST_CHECK( phrase.GetSize() == 0 );

	std::vector<FactorType> outputFactorOrder;
	outputFactorOrder.push_back(0);

	phrase.CreateFromString(
				outputFactorOrder,
				"the licenses", 
				StaticData::Instance().GetFactorDelimiter());

	BOOST_CHECK( phrase.GetSize() == 2 );
      
	float fullScore;
	float ngramScore;
	size_t oovCount;
	backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);

	BOOST_CHECK( oovCount == 0 );
	SLOPPY_CHECK_CLOSE( TransformLMScore(p_licenses + p_the_licenses), fullScore, 0.01);
	SLOPPY_CHECK_CLOSE( TransformLMScore( 0.0 ), ngramScore, 0.01);
      }
      
      // the licenses for
      {
	Phrase phrase;
	BOOST_CHECK( phrase.GetSize() == 0 );

	std::vector<FactorType> outputFactorOrder;
	outputFactorOrder.push_back(0);

	phrase.CreateFromString(
				outputFactorOrder,
				"the licenses for", 
				StaticData::Instance().GetFactorDelimiter());

	BOOST_CHECK( phrase.GetSize() == 3 );
      
	float fullScore;
	float ngramScore;
	size_t oovCount;
	backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);

	BOOST_CHECK( oovCount == 0 );
	SLOPPY_CHECK_CLOSE( TransformLMScore( p_the_licenses_for ), ngramScore, 0.01);
	SLOPPY_CHECK_CLOSE( TransformLMScore(p_for + p_licenses_for + p_the_licenses), fullScore, 0.01);
      }
     
      // the licenses for most
      {
	Phrase phrase;
	BOOST_CHECK( phrase.GetSize() == 0 );

	std::vector<FactorType> outputFactorOrder;
	outputFactorOrder.push_back(0);

	phrase.CreateFromString(
				outputFactorOrder,
				"the licenses for most", 
				StaticData::Instance().GetFactorDelimiter());

	BOOST_CHECK( phrase.GetSize() == 4 );
      
	float fullScore;
	float ngramScore;
	size_t oovCount;
	backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);

	BOOST_CHECK( oovCount == 0 );
	SLOPPY_CHECK_CLOSE( TransformLMScore( p_the_licenses + p_licenses_for ), ngramScore, 0.01);
	SLOPPY_CHECK_CLOSE( TransformLMScore(p_most + p_for_most + p_licenses_for + p_the_licenses), fullScore, 0.01);
      }
 
    }
Ejemplo n.º 7
0
  void testEvaluate() {

    FFState *nextState;
    FFState *prevState = const_cast< FFState * >(backwardLM->EmptyHypothesisState( *dummyInput ));

    double p_most     = -2.360783;
    double p_for      = -1.661813;
    double p_licenses = -2.360783;
    double p_the      = -1.383059;
    double p_eos      = -1.457693;

    double p_most_for      = -0.4526253;
    double p_for_licenses  = -1.661557;
    double p_licenses_the = -0.9625873;
    double p_the_eos = -1.940311;
    

      // the
      {
	Phrase phrase;
	BOOST_CHECK( phrase.GetSize() == 0 );

	std::vector<FactorType> outputFactorOrder;
	outputFactorOrder.push_back(0);

	phrase.CreateFromString(
				outputFactorOrder,
				"the", 
				StaticData::Instance().GetFactorDelimiter());

	BOOST_CHECK( phrase.GetSize() == 1 );
      
	float score;
	nextState = backwardLM->Evaluate(phrase, prevState, score);

	// p(the) * p(</s> | the) / p(</s>)
	SLOPPY_CHECK_CLOSE( (p_the + p_the_eos - p_eos), score, 0.01);
	
	delete prevState;
	prevState = nextState;

      }

      // the licenses
      {
	Phrase phrase;
	BOOST_CHECK( phrase.GetSize() == 0 );

	std::vector<FactorType> outputFactorOrder;
	outputFactorOrder.push_back(0);

	phrase.CreateFromString(
				outputFactorOrder,
				"licenses", 
				StaticData::Instance().GetFactorDelimiter());

	BOOST_CHECK( phrase.GetSize() == 1 );
      
	float score;
	nextState = backwardLM->Evaluate(phrase, prevState, score);

	// p(licenses) * p(licenses | the) / p(the)
	SLOPPY_CHECK_CLOSE( (p_licenses + p_licenses_the - p_the), score, 0.01);
	
	delete prevState;
	prevState = nextState;

      }

      // the licenses for
      {
	Phrase phrase;
	BOOST_CHECK( phrase.GetSize() == 0 );

	std::vector<FactorType> outputFactorOrder;
	outputFactorOrder.push_back(0);

	phrase.CreateFromString(
				outputFactorOrder,
				"for", 
				StaticData::Instance().GetFactorDelimiter());

	BOOST_CHECK( phrase.GetSize() == 1 );
      
	float score;
	nextState = backwardLM->Evaluate(phrase, prevState, score);

	// p(for) * p(for | licenses) / p(licenses)
	SLOPPY_CHECK_CLOSE( (p_for + p_for_licenses - p_licenses), score, 0.01);
	
	delete prevState;
	prevState = nextState;

      }

      // the licenses for most
      {
	Phrase phrase;
	BOOST_CHECK( phrase.GetSize() == 0 );

	std::vector<FactorType> outputFactorOrder;
	outputFactorOrder.push_back(0);

	phrase.CreateFromString(
				outputFactorOrder,
				"most", 
				StaticData::Instance().GetFactorDelimiter());

	BOOST_CHECK( phrase.GetSize() == 1 );
      
	float score;
	nextState = backwardLM->Evaluate(phrase, prevState, score);

	// p(most) * p(most | for) / p(for)
	SLOPPY_CHECK_CLOSE( (p_most + p_most_for - p_for), score, 0.01);
	
	delete prevState;
	prevState = nextState;

      }

      delete prevState;
  }