コード例 #1
0
ファイル: Main.cpp プロジェクト: Deseaus/mosesdecoder
OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, const std::string &lineStr, OnDiskWrapper &onDiskWrapper, int numScores, vector<float> &misc)
{
  char line[lineStr.size() + 1];
  strcpy(line, lineStr.c_str());

  stringstream sparseFeatures, property;

  size_t scoreInd = 0;

  // MAIN LOOP
  size_t stage = 0;
  /*	0 = source phrase
   1 = target phrase
   2 = scores
   3 = align
   4 = count
   7 = properties
   */
  char *tok = strtok (line," ");
  OnDiskPt::PhrasePtr out(new Phrase());
  while (tok != NULL) {
    if (0 == strcmp(tok, "|||")) {
      ++stage;
    } else {
      switch (stage) {
      case 0: {
        WordPtr w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper, 1);
        if (w != NULL)
          out->AddWord(w);

        break;
      }
      case 1: {
        Tokenize(targetPhrase, tok, false, true, onDiskWrapper, 0);
        break;
      }
      case 2: {
        float score = Moses::Scan<float>(tok);
        targetPhrase.SetScore(score, scoreInd);
        ++scoreInd;
        break;
      }
      case 3: {
        //targetPhrase.Create1AlignFromString(tok);
        targetPhrase.CreateAlignFromString(tok);
        break;
      }
      case 4: {
        // store only the 3rd one (rule count)
        float val = Moses::Scan<float>(tok);
        misc[0] = val;
        break;
      }
      case 5: {
        // sparse features
        sparseFeatures << tok << " ";
        break;
      }
      case 6: {
        property << tok << " ";
        break;
      }
      default:
        cerr << "ERROR in line " << line << endl;
        assert(false);
        break;
      }
    }

    tok = strtok (NULL, " ");
  } // while (tok != NULL)

  assert(scoreInd == numScores);
  targetPhrase.SetSparseFeatures(Moses::Trim(sparseFeatures.str()));
  targetPhrase.SetProperty(Moses::Trim(property.str()));
  targetPhrase.SortAlign();
  return out;
} // Tokenize()
コード例 #2
0
void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to)
{
  // unknown word, add as trans opt
  const StaticData &staticData = StaticData::Instance();
  const UnknownWordPenaltyProducer &unknownWordPenaltyProducer = UnknownWordPenaltyProducer::Instance();

  size_t isDigit = 0;
  if (staticData.GetDropUnknown()) {
    const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface
    const StringPiece s = f->GetString();
    isDigit = s.find_first_of("0123456789");
    if (isDigit == string::npos)
      isDigit = 0;
    else
      isDigit = 1;
    // modify the starting bitmap
  }

  Phrase* unksrc = new Phrase(1);
  unksrc->AddWord() = sourceWord;
  Word &newWord = unksrc->GetWord(0);
  newWord.SetIsOOV(true);

  m_unksrcs.push_back(unksrc);

  //TranslationOption *transOpt;
  if (! staticData.GetDropUnknown() || isDigit) {
    // loop
    const UnknownLHSList &lhsList = staticData.GetUnknownLHS();
    UnknownLHSList::const_iterator iterLHS;
    for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) {
      const string &targetLHSStr = iterLHS->first;
      float prob = iterLHS->second;

      // lhs
      //const Word &sourceLHS = staticData.GetInputDefaultNonTerminal();
      Word *targetLHS = new Word(true);

      targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true);
      UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS");

      // add to dictionary
      TargetPhrase *targetPhrase = new TargetPhrase();
      Word &targetWord = targetPhrase->AddWord();
      targetWord.CreateUnknownWord(sourceWord);

      // scores
      float unknownScore = FloorScore(TransformScore(prob));

      targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore);
      targetPhrase->Evaluate(*unksrc);

      targetPhrase->SetTargetLHS(targetLHS);
      targetPhrase->SetAlignmentInfo("0-0");
      if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) {
        targetPhrase->SetProperty("Tree","[ " + (*targetLHS)[0]->GetString().as_string() + " "+sourceWord[0]->GetString().as_string()+" ]");
      }

      // chart rule
      to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range);
    } // for (iterLHS
  } else {
    // drop source word. create blank trans opt
    float unknownScore = FloorScore(-numeric_limits<float>::infinity());

    TargetPhrase *targetPhrase = new TargetPhrase();
    // loop
    const UnknownLHSList &lhsList = staticData.GetUnknownLHS();
    UnknownLHSList::const_iterator iterLHS;
    for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) {
      const string &targetLHSStr = iterLHS->first;
      //float prob = iterLHS->second;

      Word *targetLHS = new Word(true);
      targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true);
      UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS");

      targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore);
      targetPhrase->Evaluate(*unksrc);

      targetPhrase->SetTargetLHS(targetLHS);

      // chart rule
      to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range);
    }
  }
}