/**
 * Convert one token into Word object(s), add them to phrase, and optionally
 * hand one of them back to the caller.
 *
 * A token is treated as a non-terminal when its first character is '[' and
 * its last character is ']'. A non-terminal containing a further '[' at
 * index 2 or later is split at that position into a source part and a
 * target part; otherwise it is a single (lhs) non-terminal.
 *
 * @param phrase            phrase receiving the created word(s) via AddWord()
 * @param token             text of the token to convert
 * @param addSourceNonTerm  add the source part of a source/target non-terminal
 * @param addTargetNonTerm  add the target part of a source/target non-terminal
 * @param onDiskWrapper     supplies the vocab handed to Word::CreateFromString()
 * @param retSourceTarget   for source/target non-terminals only:
 *                          1 = return the source word, 2 = return the target
 *                          word, anything else = return neither
 * @return the word created for a terminal token (regardless of
 *         retSourceTarget); for a source/target non-terminal, the word
 *         selected by retSourceTarget if that part was added; otherwise a
 *         default-constructed WordPtr (this includes the lhs-only case).
 */
OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
                           , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
                           , OnDiskPt::OnDiskWrapper &onDiskWrapper, int retSourceTarget)
{
  OnDiskPt::WordPtr ret;
  const size_t length = token.size();

  // The closing bracket is only examined once the opening one has matched,
  // so length - 1 is never evaluated for an empty token.
  const bool isNonTerm = token.compare(0, 1, "[") == 0
                         && token.compare(length - 1, 1, "]") == 0;

  if (!isNonTerm) {
    // terminal: always handed back to the caller
    WordPtr terminal(new Word());
    terminal->CreateFromString(token, onDiskWrapper.GetVocab());
    phrase.AddWord(terminal);
    ret = terminal;
    return ret;
  }

  // Look for the bracket that opens the second label.
  const size_t secondBracket = token.find_first_of("[", 2);

  if (secondBracket == string::npos) {
    // lhs - a single label; nothing is returned for it
    const string label = token.substr(0, secondBracket);
    WordPtr lhs(new Word());
    lhs->CreateFromString(label, onDiskWrapper.GetVocab());
    phrase.AddWord(lhs);
    return ret;
  }

  // source & target labels packed into one token
  if (addSourceNonTerm) {
    const string sourceLabel = token.substr(0, secondBracket);
    WordPtr sourceWord(new Word());
    sourceWord->CreateFromString(sourceLabel, onDiskWrapper.GetVocab());
    phrase.AddWord(sourceWord);
    if (retSourceTarget == 1) {
      ret = sourceWord;
    }
  }

  if (addTargetNonTerm) {
    const string targetLabel = token.substr(secondBracket, length - secondBracket);
    WordPtr targetWord(new Word());
    targetWord->CreateFromString(targetLabel, onDiskWrapper.GetVocab());
    phrase.AddWord(targetWord);
    if (retSourceTarget == 2) {
      ret = targetWord;
    }
  }

  return ret;
}
/**
 * Open the on-disk phrase table at m_filePath, verify that its recorded
 * metadata matches this feature's configuration, and install the loaded
 * wrapper in m_implementation.
 *
 * The checks compare the table's "Version", "NumSourceFactors",
 * "NumTargetFactors" and "NumScores" entries against
 * OnDiskWrapper::VERSION_NUM, m_input.size(), m_output.size() and
 * m_numScoreComponents respectively. A mismatch is reported through
 * UTIL_THROW_IF2.
 *
 * @param source  the input being initialised for (not used by this function)
 */
void PhraseDictionaryOnDisk::InitializeForInput(InputType const& source)
{
  ReduceCache();

  OnDiskPt::OnDiskWrapper *obj = new OnDiskPt::OnDiskWrapper();

  // NOTE(review): BeginLoad is deliberately left outside the guard below.
  // This block cannot confirm that destroying a wrapper whose load did not
  // complete is safe, so a failure here behaves exactly as before.
  obj->BeginLoad(m_filePath);

  // Until reset() below, the raw pointer is the only handle on the wrapper.
  // Free it if validation fails instead of leaking it, then let the original
  // exception continue to the caller unchanged.
  try {
    UTIL_THROW_IF2(obj->GetMisc("Version") != OnDiskPt::OnDiskWrapper::VERSION_NUM,
                   "On-disk phrase table is version " << obj->GetMisc("Version")
                   << ". It is not compatible with version " << OnDiskPt::OnDiskWrapper::VERSION_NUM);

    UTIL_THROW_IF2(obj->GetMisc("NumSourceFactors") != m_input.size(),
                   "On-disk phrase table has " << obj->GetMisc("NumSourceFactors") << " source factors."
                   << ". The ini file specified " << m_input.size() << " source factors");

    UTIL_THROW_IF2(obj->GetMisc("NumTargetFactors") != m_output.size(),
                   "On-disk phrase table has " << obj->GetMisc("NumTargetFactors") << " target factors."
                   << ". The ini file specified " << m_output.size() << " target factors");

    UTIL_THROW_IF2(obj->GetMisc("NumScores") != m_numScoreComponents,
                   "On-disk phrase table has " << obj->GetMisc("NumScores") << " scores."
                   << ". The ini file specified " << m_numScoreComponents << " scores");
  } catch (...) {
    delete obj;
    throw;
  }

  m_implementation.reset(obj);
}
void Tokenize(OnDiskPt::Phrase &phrase , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm , OnDiskPt::OnDiskWrapper &onDiskWrapper) { bool nonTerm = false; size_t tokSize = token.size(); int comStr =token.compare(0, 1, "["); if (comStr == 0) { comStr = token.compare(tokSize - 1, 1, "]"); nonTerm = comStr == 0; } if (nonTerm) { // non-term size_t splitPos = token.find_first_of("[", 2); string wordStr = token.substr(0, splitPos); if (splitPos == string::npos) { // lhs - only 1 word Word *word = new Word(); word->CreateFromString(wordStr, onDiskWrapper.GetVocab()); phrase.AddWord(word); } else { // source & target non-terms if (addSourceNonTerm) { Word *word = new Word(); word->CreateFromString(wordStr, onDiskWrapper.GetVocab()); phrase.AddWord(word); } wordStr = token.substr(splitPos, tokSize - splitPos); if (addTargetNonTerm) { Word *word = new Word(); word->CreateFromString(wordStr, onDiskWrapper.GetVocab()); phrase.AddWord(word); } } } else { // term Word *word = new Word(); word->CreateFromString(token, onDiskWrapper.GetVocab()); phrase.AddWord(word); } }
/**
 * Open the on-disk phrase table at m_filePath, check that its recorded
 * metadata matches this feature's configuration, and install the loaded
 * wrapper in m_implementation.
 *
 * If BeginLoad() reports failure the function returns without touching
 * m_implementation. Otherwise the table's "Version", "NumSourceFactors",
 * "NumTargetFactors" and "NumScores" entries are CHECKed against
 * OnDiskWrapper::VERSION_NUM, m_input.size(), m_output.size() and
 * m_numScoreComponents respectively.
 *
 * @param source  the input being initialised for (not used by this function)
 */
void PhraseDictionaryOnDisk::InitializeForInput(InputType const& source)
{
  OnDiskPt::OnDiskWrapper *obj = new OnDiskPt::OnDiskWrapper();

  if (!obj->BeginLoad(m_filePath)) {
    // NOTE(review): on this path the wrapper is never freed and
    // m_implementation keeps whatever it held before. It is not deleted here
    // because this block cannot confirm that ~OnDiskWrapper is safe for a
    // wrapper whose load failed - confirm against OnDiskWrapper, then free
    // it (or report the failure) here.
    return;
  }

  // The table on disk must agree with what this feature was configured with.
  CHECK(obj->GetMisc("Version") == OnDiskPt::OnDiskWrapper::VERSION_NUM);
  CHECK(obj->GetMisc("NumSourceFactors") == m_input.size());
  CHECK(obj->GetMisc("NumTargetFactors") == m_output.size());
  CHECK(obj->GetMisc("NumScores") == m_numScoreComponents);

  m_implementation.reset(obj);
}