OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, const std::string &lineStr, OnDiskWrapper &onDiskWrapper, int numScores, vector<float> &misc) { char line[lineStr.size() + 1]; strcpy(line, lineStr.c_str()); stringstream sparseFeatures, property; size_t scoreInd = 0; // MAIN LOOP size_t stage = 0; /* 0 = source phrase 1 = target phrase 2 = scores 3 = align 4 = count 7 = properties */ char *tok = strtok (line," "); OnDiskPt::PhrasePtr out(new Phrase()); while (tok != NULL) { if (0 == strcmp(tok, "|||")) { ++stage; } else { switch (stage) { case 0: { WordPtr w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper, 1); if (w != NULL) out->AddWord(w); break; } case 1: { Tokenize(targetPhrase, tok, false, true, onDiskWrapper, 0); break; } case 2: { float score = Moses::Scan<float>(tok); targetPhrase.SetScore(score, scoreInd); ++scoreInd; break; } case 3: { //targetPhrase.Create1AlignFromString(tok); targetPhrase.CreateAlignFromString(tok); break; } case 4: { // store only the 3rd one (rule count) float val = Moses::Scan<float>(tok); misc[0] = val; break; } case 5: { // sparse features sparseFeatures << tok << " "; break; } case 6: { property << tok << " "; break; } default: cerr << "ERROR in line " << line << endl; assert(false); break; } } tok = strtok (NULL, " "); } // while (tok != NULL) assert(scoreInd == numScores); targetPhrase.SetSparseFeatures(Moses::Trim(sparseFeatures.str())); targetPhrase.SetProperty(Moses::Trim(property.str())); targetPhrase.SortAlign(); return out; } // Tokenize()
void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to) { // unknown word, add as trans opt const StaticData &staticData = StaticData::Instance(); const UnknownWordPenaltyProducer &unknownWordPenaltyProducer = UnknownWordPenaltyProducer::Instance(); size_t isDigit = 0; if (staticData.GetDropUnknown()) { const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface const StringPiece s = f->GetString(); isDigit = s.find_first_of("0123456789"); if (isDigit == string::npos) isDigit = 0; else isDigit = 1; // modify the starting bitmap } Phrase* unksrc = new Phrase(1); unksrc->AddWord() = sourceWord; Word &newWord = unksrc->GetWord(0); newWord.SetIsOOV(true); m_unksrcs.push_back(unksrc); //TranslationOption *transOpt; if (! staticData.GetDropUnknown() || isDigit) { // loop const UnknownLHSList &lhsList = staticData.GetUnknownLHS(); UnknownLHSList::const_iterator iterLHS; for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) { const string &targetLHSStr = iterLHS->first; float prob = iterLHS->second; // lhs //const Word &sourceLHS = staticData.GetInputDefaultNonTerminal(); Word *targetLHS = new Word(true); targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true); UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS"); // add to dictionary TargetPhrase *targetPhrase = new TargetPhrase(); Word &targetWord = targetPhrase->AddWord(); targetWord.CreateUnknownWord(sourceWord); // scores float unknownScore = FloorScore(TransformScore(prob)); targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore); targetPhrase->Evaluate(*unksrc); targetPhrase->SetTargetLHS(targetLHS); targetPhrase->SetAlignmentInfo("0-0"); if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) { targetPhrase->SetProperty("Tree","[ " + (*targetLHS)[0]->GetString().as_string() + " "+sourceWord[0]->GetString().as_string()+" ]"); } // chart rule to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range); } // for (iterLHS } else { // drop source word. create blank trans opt float unknownScore = FloorScore(-numeric_limits<float>::infinity()); TargetPhrase *targetPhrase = new TargetPhrase(); // loop const UnknownLHSList &lhsList = staticData.GetUnknownLHS(); UnknownLHSList::const_iterator iterLHS; for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) { const string &targetLHSStr = iterLHS->first; //float prob = iterLHS->second; Word *targetLHS = new Word(true); targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true); UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS"); targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore); targetPhrase->Evaluate(*unksrc); targetPhrase->SetTargetLHS(targetLHS); // chart rule to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range); } } }