Exemplo n.º 1
0
bool HyperTreeLoader::Load(AllOptions const& opts,
                           const std::vector<FactorType> &input,
                           const std::vector<FactorType> &output,
                           const std::string &inFile,
                           const RuleTableFF &ff,
                           HyperTree &trie,
                           boost::unordered_set<std::size_t> &sourceTermSet)
{
  PrintUserTime(std::string("Start loading HyperTree"));

  sourceTermSet.clear();

  std::size_t count = 0;

  std::ostream *progress = NULL;
  IFVERBOSE(1) progress = &std::cerr;
  util::FilePiece in(inFile.c_str(), progress);

  // reused variables
  std::vector<float> scoreVector;
  StringPiece line;

  double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");

  HyperPathLoader hyperPathLoader;

  Phrase dummySourcePhrase;
  {
    Word *lhs = NULL;
    dummySourcePhrase.CreateFromString(Input, input, "hello", &lhs);
    delete lhs;
  }

  while(true) {
    try {
      line = in.ReadLine();
    } catch (const util::EndOfFileException &e) {
      break;
    }

    util::TokenIter<util::MultiCharacter> pipes(line, "|||");
    StringPiece sourceString(*pipes);
    StringPiece targetString(*++pipes);
    StringPiece scoreString(*++pipes);

    StringPiece alignString;
    if (++pipes) {
      StringPiece temp(*pipes);
      alignString = temp;
    }

    ++pipes;  // counts

    scoreVector.clear();
    for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
      int processed;
      float score = converter.StringToFloat(s->data(), s->length(), &processed);
      UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count);
      scoreVector.push_back(FloorScore(TransformScore(score)));
    }
    const std::size_t numScoreComponents = ff.GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
                  << numScoreComponents << ") of score components on line " << count);
    }

    // Source-side
    HyperPath sourceFragment;
    hyperPathLoader.Load(sourceString, sourceFragment);
    ExtractSourceTerminalSetFromHyperPath(sourceFragment, sourceTermSet);

    // Target-side
    TargetPhrase *targetPhrase = new TargetPhrase(&ff);
    Word *targetLHS = NULL;
    targetPhrase->CreateFromString(Output, output, targetString, &targetLHS);
    targetPhrase->SetTargetLHS(targetLHS);
    targetPhrase->SetAlignmentInfo(alignString);

    if (++pipes) {
      StringPiece sparseString(*pipes);
      targetPhrase->SetSparseScore(&ff, sparseString);
    }

    if (++pipes) {
      StringPiece propertiesString(*pipes);
      targetPhrase->SetProperties(propertiesString);
    }

    targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector);
    targetPhrase->EvaluateInIsolation(dummySourcePhrase,
                                      ff.GetFeaturesToApply());

    // Add rule to trie.
    TargetPhraseCollection::shared_ptr phraseColl
    = GetOrCreateTargetPhraseCollection(trie, sourceFragment);
    phraseColl->Add(targetPhrase);

    count++;
  }

  // sort and prune each target phrase collection
  if (ff.GetTableLimit()) {
    SortAndPrune(trie, ff.GetTableLimit());
  }

  return true;
}
bool RuleTrieLoader::Load(const std::vector<FactorType> &input,
                          const std::vector<FactorType> &output,
                          const std::string &inFile,
                          const RuleTableFF &ff,
                          RuleTrie &trie)
{
    PrintUserTime(std::string("Start loading text phrase table. Moses format"));

    const StaticData &staticData = StaticData::Instance();
    // const std::string &factorDelimiter = staticData.GetFactorDelimiter();

    std::size_t count = 0;

    std::ostream *progress = NULL;
    IFVERBOSE(1) progress = &std::cerr;
    util::FilePiece in(inFile.c_str(), progress);

    // reused variables
    std::vector<float> scoreVector;
    StringPiece line;

    double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");

    while(true) {
        try {
            line = in.ReadLine();
        } catch (const util::EndOfFileException &e) {
            break;
        }

        util::TokenIter<util::MultiCharacter> pipes(line, "|||");
        StringPiece sourcePhraseString(*pipes);
        StringPiece targetPhraseString(*++pipes);
        StringPiece scoreString(*++pipes);

        StringPiece alignString;
        if (++pipes) {
            StringPiece temp(*pipes);
            alignString = temp;
        }

        if (++pipes) {
            StringPiece str(*pipes); //counts
        }

        bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == std::string::npos);
        if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
            TRACE_ERR( ff.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
            continue;
        }

        scoreVector.clear();
        for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
            int processed;
            float score = converter.StringToFloat(s->data(), s->length(), &processed);
            UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count);
            scoreVector.push_back(FloorScore(TransformScore(score)));
        }
        const std::size_t numScoreComponents = ff.GetNumScoreComponents();
        if (scoreVector.size() != numScoreComponents) {
            UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
                        << numScoreComponents << ") of score components on line " << count);
        }

        // parse source & find pt node

        // constituent labels
        Word *sourceLHS = NULL;
        Word *targetLHS;

        // create target phrase obj
        TargetPhrase *targetPhrase = new TargetPhrase(&ff);
        // targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS);
        targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS);
        // source
        Phrase sourcePhrase;
        // sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS);
        sourcePhrase.CreateFromString(Input, input, sourcePhraseString, &sourceLHS);

        // rest of target phrase
        targetPhrase->SetAlignmentInfo(alignString);
        targetPhrase->SetTargetLHS(targetLHS);

        //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

        if (++pipes) {
            StringPiece sparseString(*pipes);
            targetPhrase->SetSparseScore(&ff, sparseString);
        }

        if (++pipes) {
            StringPiece propertiesString(*pipes);
            targetPhrase->SetProperties(propertiesString);
        }

        targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector);
        targetPhrase->EvaluateInIsolation(sourcePhrase, ff.GetFeaturesToApply());

        TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(
                trie, *sourceLHS, sourcePhrase);
        phraseColl.Add(targetPhrase);

        // not implemented correctly in memory pt. just delete it for now
        delete sourceLHS;

        count++;
    }

    // sort and prune each target phrase collection
    if (ff.GetTableLimit()) {
        SortAndPrune(trie, ff.GetTableLimit());
    }

    return true;
}