/**
 * Converts one textual token into one or two words and appends them to
 * @p phrase.
 *
 * A token counts as a non-terminal when it begins with '[' and ends with ']'.
 * Such a token is searched for a second '[' starting at index 2:
 *   - none found: the whole token is added as a single word (the "lhs" case);
 *   - found:      the text before it is the source part and the text from it
 *                 onwards is the target part; each part is added only when
 *                 the matching add* flag is set.
 * Any other token is added unchanged as a single word.
 *
 * @param phrase            phrase that receives the new word(s)
 * @param token             text of the token
 * @param addSourceNonTerm  add the source part of a two-part non-terminal
 * @param addTargetNonTerm  add the target part of a two-part non-terminal
 * @param onDiskWrapper     supplies the vocab passed to Word::CreateFromString
 * @param retSourceTarget   for two-part non-terminals: 0 = don't return
 *                          anything, 1 = source, 2 = target
 * @return for an ordinary token, the word that was added (retSourceTarget is
 *         not consulted); for a two-part non-terminal, the word selected by
 *         @p retSourceTarget provided it was actually added; otherwise an
 *         empty pointer (this includes the single-word non-terminal case).
 */
OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
                           , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
                           , OnDiskPt::OnDiskWrapper &onDiskWrapper, int retSourceTarget)
{
  const size_t length = token.size();
  const bool bracketed = length > 0 && token[0] == '[' && token[length - 1] == ']';

  if (!bracketed) {
    // ordinary token: always added, always returned
    WordPtr terminal(new Word());
    terminal->CreateFromString(token, onDiskWrapper.GetVocab());
    phrase.AddWord(terminal);
    return terminal;
  }

  // Start at index 2 so the opening bracket of the first part is skipped.
  const size_t targetStart = token.find('[', 2);

  if (targetStart == std::string::npos) {
    // only one bracketed word: added, but never returned
    const std::string lhsText = token.substr(0, targetStart);
    WordPtr lhs(new Word());
    lhs->CreateFromString(lhsText, onDiskWrapper.GetVocab());
    phrase.AddWord(lhs);
    return OnDiskPt::WordPtr();
  }

  // two bracketed words back to back: source part, then target part
  OnDiskPt::WordPtr selected;

  if (addSourceNonTerm) {
    const std::string sourceText = token.substr(0, targetStart);
    WordPtr source(new Word());
    source->CreateFromString(sourceText, onDiskWrapper.GetVocab());
    phrase.AddWord(source);
    if (retSourceTarget == 1) {
      selected = source;
    }
  }

  if (addTargetNonTerm) {
    const std::string targetText = token.substr(targetStart);
    WordPtr target(new Word());
    target->CreateFromString(targetText, onDiskWrapper.GetVocab());
    phrase.AddWord(target);
    if (retSourceTarget == 2) {
      selected = target;
    }
  }

  return selected;
}
void Tokenize(OnDiskPt::Phrase &phrase , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm , OnDiskPt::OnDiskWrapper &onDiskWrapper) { bool nonTerm = false; size_t tokSize = token.size(); int comStr =token.compare(0, 1, "["); if (comStr == 0) { comStr = token.compare(tokSize - 1, 1, "]"); nonTerm = comStr == 0; } if (nonTerm) { // non-term size_t splitPos = token.find_first_of("[", 2); string wordStr = token.substr(0, splitPos); if (splitPos == string::npos) { // lhs - only 1 word Word *word = new Word(); word->CreateFromString(wordStr, onDiskWrapper.GetVocab()); phrase.AddWord(word); } else { // source & target non-terms if (addSourceNonTerm) { Word *word = new Word(); word->CreateFromString(wordStr, onDiskWrapper.GetVocab()); phrase.AddWord(word); } wordStr = token.substr(splitPos, tokSize - splitPos); if (addTargetNonTerm) { Word *word = new Word(); word->CreateFromString(wordStr, onDiskWrapper.GetVocab()); phrase.AddWord(word); } } } else { // term Word *word = new Word(); word->CreateFromString(token, onDiskWrapper.GetVocab()); phrase.AddWord(word); } }