Exemple #1
0
bool RuleTableLoaderStandard::Load(FormatType format
                                , const std::vector<FactorType> &input
                                , const std::vector<FactorType> &output
                                , const std::string &inFile
                                , const std::vector<float> &weight
                                , size_t /* tableLimit */
                                , const LMList &languageModels
                                , const WordPenaltyProducer* wpProducer
                                , RuleTableTrie &ruleTable)
{
  PrintUserTime(string("Start loading text SCFG phrase table. ") + (format==MosesFormat?"Moses ":"Hiero ") + " format");

  const StaticData &staticData = StaticData::Instance();
  const std::string& factorDelimiter = staticData.GetFactorDelimiter();

  string lineOrig;
  size_t count = 0;

  std::ostream *progress = NULL;
  IFVERBOSE(1) progress = &std::cerr;
  util::FilePiece in(inFile.c_str(), progress);

  // reused variables
  vector<float> scoreVector;
  StringPiece line;
  std::string hiero_before, hiero_after;

  while(true) {
    try {
      line = in.ReadLine();
    } catch (const util::EndOfFileException &e) { break; }

    if (format == HieroFormat) { // inefficiently reformat line
      hiero_before.assign(line.data(), line.size());
      ReformatHieroRule(hiero_before, hiero_after);
      line = hiero_after;
    }

    util::TokenIter<util::MultiCharacter> pipes(line, "|||");
    StringPiece sourcePhraseString(*pipes);
    StringPiece targetPhraseString(*++pipes);
    StringPiece scoreString(*++pipes);
    StringPiece alignString(*++pipes);
    // TODO(bhaddow) efficiently handle default instead of parsing this string every time.  
    StringPiece ruleCountString = ++pipes ? *pipes : StringPiece("1 1");
    
    if (++pipes) {
      stringstream strme;
      strme << "Syntax error at " << ruleTable.GetFilePath() << ":" << count;
      UserMessage::Add(strme.str());
      abort();
    }

    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }

    scoreVector.clear();
    for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
      char *err_ind;
      scoreVector.push_back(strtod(s->data(), &err_ind));
      UTIL_THROW_IF(err_ind == s->data(), util::Exception, "Bad score " << *s << " on line " << count);
    }
    const size_t numScoreComponents = ruleTable.GetFeature()->GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      stringstream strme;
      strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
            << numScoreComponents << ") of score components on line " << count;
      UserMessage::Add(strme.str());
      abort();
    }

    // parse source & find pt node

    // constituent labels
    Word sourceLHS, targetLHS;

    // source
    Phrase sourcePhrase( 0);
    sourcePhrase.CreateFromStringNewFormat(Input, input, sourcePhraseString, factorDelimiter, sourceLHS);

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase(Output);
    targetPhrase->CreateFromStringNewFormat(Output, output, targetPhraseString, factorDelimiter, targetLHS);
    targetPhrase->SetSourcePhrase(sourcePhrase);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString, sourcePhrase);
    targetPhrase->SetTargetLHS(targetLHS);
    
    targetPhrase->SetRuleCount(ruleCountString, scoreVector[0]);
    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);
    
    // component score, for n-best output
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);

    targetPhrase->SetScoreChart(ruleTable.GetFeature(), scoreVector, weight, languageModels,wpProducer);

    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS);
    phraseColl.Add(targetPhrase);

    count++;
  }

  // sort and prune each target phrase collection
  SortAndPrune(ruleTable);

  return true;
}
bool RuleTrieLoader::Load(const std::vector<FactorType> &input,
                          const std::vector<FactorType> &output,
                          const std::string &inFile,
                          const RuleTableFF &ff,
                          RuleTrie &trie)
{
    PrintUserTime(std::string("Start loading text phrase table. Moses format"));

    const StaticData &staticData = StaticData::Instance();
    // const std::string &factorDelimiter = staticData.GetFactorDelimiter();

    std::size_t count = 0;

    std::ostream *progress = NULL;
    IFVERBOSE(1) progress = &std::cerr;
    util::FilePiece in(inFile.c_str(), progress);

    // reused variables
    std::vector<float> scoreVector;
    StringPiece line;

    double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");

    while(true) {
        try {
            line = in.ReadLine();
        } catch (const util::EndOfFileException &e) {
            break;
        }

        util::TokenIter<util::MultiCharacter> pipes(line, "|||");
        StringPiece sourcePhraseString(*pipes);
        StringPiece targetPhraseString(*++pipes);
        StringPiece scoreString(*++pipes);

        StringPiece alignString;
        if (++pipes) {
            StringPiece temp(*pipes);
            alignString = temp;
        }

        if (++pipes) {
            StringPiece str(*pipes); //counts
        }

        bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == std::string::npos);
        if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
            TRACE_ERR( ff.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
            continue;
        }

        scoreVector.clear();
        for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
            int processed;
            float score = converter.StringToFloat(s->data(), s->length(), &processed);
            UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count);
            scoreVector.push_back(FloorScore(TransformScore(score)));
        }
        const std::size_t numScoreComponents = ff.GetNumScoreComponents();
        if (scoreVector.size() != numScoreComponents) {
            UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!="
                        << numScoreComponents << ") of score components on line " << count);
        }

        // parse source & find pt node

        // constituent labels
        Word *sourceLHS = NULL;
        Word *targetLHS;

        // create target phrase obj
        TargetPhrase *targetPhrase = new TargetPhrase(&ff);
        // targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS);
        targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS);
        // source
        Phrase sourcePhrase;
        // sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS);
        sourcePhrase.CreateFromString(Input, input, sourcePhraseString, &sourceLHS);

        // rest of target phrase
        targetPhrase->SetAlignmentInfo(alignString);
        targetPhrase->SetTargetLHS(targetLHS);

        //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

        if (++pipes) {
            StringPiece sparseString(*pipes);
            targetPhrase->SetSparseScore(&ff, sparseString);
        }

        if (++pipes) {
            StringPiece propertiesString(*pipes);
            targetPhrase->SetProperties(propertiesString);
        }

        targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector);
        targetPhrase->EvaluateInIsolation(sourcePhrase, ff.GetFeaturesToApply());

        TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(
                trie, *sourceLHS, sourcePhrase);
        phraseColl.Add(targetPhrase);

        // not implemented correctly in memory pt. just delete it for now
        delete sourceLHS;

        count++;
    }

    // sort and prune each target phrase collection
    if (ff.GetTableLimit()) {
        SortAndPrune(trie, ff.GetTableLimit());
    }

    return true;
}
bool RuleTableLoaderStandard::Load(FormatType format
                                   , const std::vector<FactorType> &input
                                   , const std::vector<FactorType> &output
                                   , const std::string &inFile
                                   , size_t /* tableLimit */
                                   , RuleTableTrie &ruleTable)
{
  PrintUserTime(string("Start loading text SCFG phrase table. ") + (format==MosesFormat?"Moses ":"Hiero ") + " format");

  const StaticData &staticData = StaticData::Instance();
  const std::string& factorDelimiter = staticData.GetFactorDelimiter();

  string lineOrig;
  size_t count = 0;

  std::ostream *progress = NULL;
  IFVERBOSE(1) progress = &std::cerr;
  util::FilePiece in(inFile.c_str(), progress);

  // reused variables
  vector<float> scoreVector;
  StringPiece line;
  std::string hiero_before, hiero_after;

  double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan");

  while(true) {
    try {
      line = in.ReadLine();
    } catch (const util::EndOfFileException &e) {
      break;
    }

    if (format == HieroFormat) { // inefficiently reformat line
      hiero_before.assign(line.data(), line.size());
      ReformatHieroRule(hiero_before, hiero_after);
      line = hiero_after;
    }

    util::TokenIter<util::MultiCharacter> pipes(line, "|||");
    StringPiece sourcePhraseString(*pipes);
    StringPiece targetPhraseString(*++pipes);
    StringPiece scoreString(*++pipes);

    StringPiece alignString;
    if (++pipes) {
      StringPiece temp(*pipes);
      alignString = temp;
    }

    if (++pipes) {
      StringPiece str(*pipes); //counts
    }

    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }

    scoreVector.clear();
    for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) {
      int processed;
      float score = converter.StringToFloat(s->data(), s->length(), &processed);
      UTIL_THROW_IF(isnan(score), util::Exception, "Bad score " << *s << " on line " << count);
      scoreVector.push_back(FloorScore(TransformScore(score)));
    }
    const size_t numScoreComponents = ruleTable.GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      stringstream strme;
      strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
            << numScoreComponents << ") of score components on line " << count;
      UserMessage::Add(strme.str());
      abort();
    }

    // parse source & find pt node

    // constituent labels
    Word *sourceLHS;
    Word *targetLHS;

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase();
    targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS);

    // source
    Phrase sourcePhrase;
    sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString);
    targetPhrase->SetTargetLHS(targetLHS);

    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

    if (++pipes) {
      StringPiece sparseString(*pipes);
      targetPhrase->SetSparseScore(&ruleTable, sparseString);
    }

    if (++pipes) {
      StringPiece propertiesString(*pipes);
      targetPhrase->SetProperties(propertiesString);
    }

    targetPhrase->GetScoreBreakdown().Assign(&ruleTable, scoreVector);
    targetPhrase->Evaluate(sourcePhrase, ruleTable.GetFeaturesToApply());

    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS);
    phraseColl.Add(targetPhrase);

    count++;
  }

  // sort and prune each target phrase collection
  SortAndPrune(ruleTable);

  return true;
}
bool PhraseDictionaryMemory::Load(const std::vector<FactorType> &input
                                  , const std::vector<FactorType> &output
                                  , const string &filePath
                                  , const vector<float> &weight
                                  , size_t tableLimit
                                  , const LMList &languageModels
                                  , float weightWP)
{
    const_cast<LMList&>(languageModels).InitializeBeforeSentenceProcessing();

    const StaticData &staticData = StaticData::Instance();

    m_tableLimit = tableLimit;

    util::FilePiece inFile(filePath.c_str(), staticData.GetVerboseLevel() >= 1 ? &std::cerr : NULL);

    size_t line_num = 0;
    size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info
    const std::string& factorDelimiter = staticData.GetFactorDelimiter();

    Phrase sourcePhrase(0);
    std::vector<float> scv;
    scv.reserve(m_numScoreComponent);

    TargetPhraseCollection *preSourceNode = NULL;
    std::string preSourceString;

    while(true) {
        ++line_num;
        StringPiece line;
        try {
            line = inFile.ReadLine();
        } catch (util::EndOfFileException &e) {
            break;
        }

        util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter("|||"));
        StringPiece sourcePhraseString(GrabOrDie(pipes, filePath, line_num));
        StringPiece targetPhraseString(GrabOrDie(pipes, filePath, line_num));
        StringPiece scoreString(GrabOrDie(pipes, filePath, line_num));

        bool isLHSEmpty = !util::TokenIter<util::AnyCharacter, true>(sourcePhraseString, util::AnyCharacter(" \t"));
        if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
            TRACE_ERR( filePath << ":" << line_num << ": pt entry contains empty source, skipping\n");
            continue;
        }

        //target
        std::auto_ptr<TargetPhrase> targetPhrase(new TargetPhrase());
        targetPhrase->CreateFromString(output, targetPhraseString, factorDelimiter);

        scv.clear();
        for (util::TokenIter<util::AnyCharacter, true> token(scoreString, util::AnyCharacter(" \t")); token; ++token) {
            char *err_ind;
            // Token is always delimited by some form of space.  Also, apparently strtod is portable but strtof isn't.
            scv.push_back(FloorScore(TransformScore(static_cast<float>(strtod(token->data(), &err_ind)))));
            if (err_ind == token->data()) {
                stringstream strme;
                strme << "Bad number " << token << " on line " << line_num;
                UserMessage::Add(strme.str());
                abort();
            }
        }
        if (scv.size() != m_numScoreComponent) {
            stringstream strme;
            strme << "Size of scoreVector != number (" <<scv.size() << "!=" <<m_numScoreComponent<<") of score components on line " << line_num;
            UserMessage::Add(strme.str());
            abort();
        }



        size_t consumed = 3;
        if (pipes) {
            targetPhrase->SetAlignmentInfo(*pipes++);
            ++consumed;
        }

        ScoreComponentCollection sparse;
        if (pipes) pipes++; //counts
        if (pipes) {
            //sparse features
            SparsePhraseDictionaryFeature* spdf =
                GetFeature()->GetSparsePhraseDictionaryFeature();
            if (spdf) {
                sparse.Assign(spdf,(pipes++)->as_string());
            }
        }


        // scv good to go sir!
        targetPhrase->SetScore(m_feature, scv, sparse, weight, weightWP, languageModels);

        // Check number of entries delimited by ||| agrees across all lines.
        for (; pipes; ++pipes, ++consumed) {}
        if (numElement != consumed) {
            if (numElement == NOT_FOUND) {
                numElement = consumed;
            } else {
                stringstream strme;
                strme << "Syntax error at " << filePath << ":" << line_num;
                UserMessage::Add(strme.str());
                abort();
            }
        }

        //TODO: Would be better to reuse source phrases, but ownership has to be
        //consistent across phrase table implementations
        sourcePhrase.Clear();
        sourcePhrase.CreateFromString(input, sourcePhraseString, factorDelimiter);
        //Now that the source phrase is ready, we give the target phrase a copy
        targetPhrase->SetSourcePhrase(sourcePhrase);
        if (preSourceString == sourcePhraseString && preSourceNode) {
            preSourceNode->Add(targetPhrase.release());
        } else {
            preSourceNode = CreateTargetPhraseCollection(sourcePhrase);
            preSourceNode->Add(targetPhrase.release());
            preSourceString.assign(sourcePhraseString.data(), sourcePhraseString.size());
        }
    }

    // sort each target phrase collection
    m_collection.Sort(m_tableLimit);

    /* // TODO ASK OLIVER WHY THIS IS NEEDED
    const_cast<LMList&>(languageModels).CleanUpAfterSentenceProcessing();
    */

    return true;
}