/**
 * Parse one phrase-table line held in a std::string.
 *
 * The line is split on single spaces; the token "|||" advances to the next
 * field. Fields are handled by position:
 *   0 = source phrase    (each token passed to the word-level Tokenize();
 *                         non-null results are appended to the returned phrase)
 *   1 = target phrase    (each token passed to the word-level Tokenize())
 *   2 = scores           (each token stored via targetPhrase.SetScore())
 *   3 = alignment        (each token passed to CreateAlignFromString())
 *   4 = counts           (each token overwrites misc[0], so the last one wins)
 *   5 = sparse features  (tokens re-joined with spaces)
 *   6 = properties       (tokens re-joined with spaces)
 * A non-separator token in any later field is reported on cerr and asserts.
 *
 * @param sourcePhrase   source phrase handed to the word-level Tokenize().
 * @param targetPhrase   receives target words, scores, alignment, sparse
 *                       features and properties; its alignment is sorted
 *                       before returning.
 * @param lineStr        the raw line; it is not modified.
 * @param onDiskWrapper  forwarded to the word-level Tokenize().
 * @param numScores      number of score tokens expected (checked by assert).
 * @param misc           misc[0] is written when the counts field has tokens,
 *                       so it must then hold at least one element.
 * @return phrase built from the non-null words of the source field.
 */
OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase,
                             const std::string &lineStr, OnDiskWrapper &onDiskWrapper,
                             int numScores, vector<float> &misc)
{
  // strtok() writes NUL terminators into its input, so tokenize a private
  // copy. The copy lives on the heap: a variable-length stack array is a
  // compiler extension rather than standard C++, and an arbitrarily long
  // input line could exhaust the stack.
  std::vector<char> buffer(lineStr.begin(), lineStr.end());
  buffer.push_back('\0');
  char *line = &buffer[0];

  stringstream sparseFeatures, property;

  size_t scoreInd = 0;

  // MAIN LOOP
  size_t stage = 0;
  /*  0 = source phrase
      1 = target phrase
      2 = scores
      3 = align
      4 = count
      5 = sparse features
      6 = properties
  */
  char *tok = strtok(line, " ");
  OnDiskPt::PhrasePtr out(new Phrase());
  while (tok != NULL) {
    if (0 == strcmp(tok, "|||")) {
      ++stage;
    } else {
      switch (stage) {
      case 0: {
        WordPtr w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper, 1);
        if (w != NULL)
          out->AddWord(w);
        break;
      }
      case 1: {
        Tokenize(targetPhrase, tok, false, true, onDiskWrapper, 0);
        break;
      }
      case 2: {
        float score = Moses::Scan<float>(tok);
        targetPhrase.SetScore(score, scoreInd);
        ++scoreInd;
        break;
      }
      case 3: {
        targetPhrase.CreateAlignFromString(tok);
        break;
      }
      case 4: {
        // Every count token overwrites misc[0]; the value that survives is
        // the last token of the field.
        float val = Moses::Scan<float>(tok);
        misc[0] = val;
        break;
      }
      case 5: {
        // sparse features
        sparseFeatures << tok << " ";
        break;
      }
      case 6: {
        property << tok << " ";
        break;
      }
      default:
        // Report the untouched input: the working buffer has already been
        // cut into tokens by strtok() and would print only its first token.
        cerr << "ERROR in line " << lineStr << endl;
        assert(false);
        break;
      }
    }

    tok = strtok(NULL, " ");
  } // while (tok != NULL)

  // Explicit conversion keeps the comparison well-defined for the int argument.
  assert(numScores >= 0 && scoreInd == static_cast<size_t>(numScores));

  targetPhrase.SetSparseFeatures(Moses::Trim(sparseFeatures.str()));
  targetPhrase.SetProperty(Moses::Trim(property.str()));
  targetPhrase.SortAlign();
  return out;
} // Tokenize()
/**
 * Parse one phrase-table line, tokenizing the caller's buffer in place.
 *
 * `line` is split on spaces with strtok(), which overwrites delimiters in the
 * buffer. The token "|||" moves on to the next field; every other token is
 * handled according to the current field number:
 *   0 = source phrase  (non-null words are appended to the returned phrase)
 *   1 = target phrase
 *   2 = scores         (stored with targetPhrase.SetScore())
 *   3 = alignment      (passed to CreateAlignFromString())
 *   4, 5               (token skipped; the field number is advanced)
 *   6                  (token parsed as float into misc[0]; number advanced)
 * Because fields 4-6 advance on every token, the third token after the
 * alignment separator is the one stored. A token arriving at any other field
 * number is reported on cerr and asserts.
 *
 * @param numScores  number of score tokens expected (checked by assert).
 * @param misc       misc[0] receives the stored count value.
 * @return phrase built from the non-null words of the source field.
 */
OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase,
                             char *line, OnDiskWrapper &onDiskWrapper,
                             int numScores, vector<float> &misc)
{
  size_t scoresSeen = 0;
  size_t field = 0;

  char *token = strtok(line, " ");
  OnDiskPt::PhrasePtr result(new Phrase());

  for (; token != NULL; token = strtok(NULL, " ")) {
    // Field separator: switch to the next field and fetch the next token.
    if (strcmp(token, "|||") == 0) {
      ++field;
      continue;
    }

    switch (field) {
    case 0: {
      WordPtr word = Tokenize(sourcePhrase, token, true, true, onDiskWrapper, 1);
      if (word != NULL) {
        result->AddWord(word);
      }
      break;
    }
    case 1:
      Tokenize(targetPhrase, token, false, true, onDiskWrapper, 0);
      break;
    case 2:
      targetPhrase.SetScore(Moses::Scan<float>(token), scoresSeen);
      ++scoresSeen;
      break;
    case 3:
      targetPhrase.CreateAlignFromString(token);
      break;
    case 4:
    case 5:
      // Tokens seen here are not stored; just step to the next slot.
      ++field;
      break;
    case 6:
      // store only the 3rd one (rule count)
      misc[0] = Moses::Scan<float>(token);
      ++field;
      break;
    default:
      // `line` has already been cut up by strtok(), so this prints the
      // buffer only up to its first terminator.
      cerr << "ERROR in line " << line << endl;
      assert(false);
      break;
    }
  }

  assert(scoresSeen == numScores);

  targetPhrase.SortAlign();
  return result;
} // Tokenize()
void Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, char *line, OnDiskWrapper &onDiskWrapper, int numScores, vector<float> &misc) { size_t scoreInd = 0; // MAIN LOOP size_t stage = 0; /* 0 = source phrase 1 = target phrase 2 = scores 3 = align 4 = count */ char *tok = strtok (line," "); while (tok != NULL) { if (0 == strcmp(tok, "|||")) { ++stage; } else { switch (stage) { case 0: { Tokenize(sourcePhrase, tok, true, true, onDiskWrapper); break; } case 1: { Tokenize(targetPhrase, tok, false, true, onDiskWrapper); break; } case 2: { float score = Moses::Scan<float>(tok); targetPhrase.SetScore(score, scoreInd); ++scoreInd; break; } case 3: { targetPhrase.Create1AlignFromString(tok); break; } case 4: ++stage; break; case 5: { // count info. Only store the 2nd one float val = Moses::Scan<float>(tok); misc[0] = val; ++stage; break; } default: assert(false); break; } } tok = strtok (NULL, " "); } // while (tok != NULL) assert(scoreInd == numScores); targetPhrase.SortAlign(); } // Tokenize()