Exemple #1
0
OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
                           , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
                           , OnDiskPt::OnDiskWrapper &onDiskWrapper, int retSourceTarget)
{
  // retSourceTarget: 0 = don't return anything. 1 = source, 2 = target

  bool nonTerm = false;
  size_t tokSize = token.size();
  int comStr =token.compare(0, 1, "[");

  if (comStr == 0) {
    comStr = token.compare(tokSize - 1, 1, "]");
    nonTerm = comStr == 0;
  }

  OnDiskPt::WordPtr out;
  if (nonTerm) {
    // non-term
    size_t splitPos		= token.find_first_of("[", 2);
    string wordStr	= token.substr(0, splitPos);

    if (splitPos == string::npos) {
      // lhs - only 1 word
      WordPtr word(new Word());
      word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
      phrase.AddWord(word);
    } else {
      // source & target non-terms
      if (addSourceNonTerm) {
        WordPtr word(new Word());
        word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
        phrase.AddWord(word);

        if (retSourceTarget == 1) {
            out = word;
        }
      }

      wordStr = token.substr(splitPos, tokSize - splitPos);
      if (addTargetNonTerm) {
        WordPtr word(new Word());
        word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
        phrase.AddWord(word);

        if (retSourceTarget == 2) {
            out = word;
        }
      }

    }
  } else {
    // term
    WordPtr word(new Word());
    word->CreateFromString(token, onDiskWrapper.GetVocab());
    phrase.AddWord(word);
    out = word;
  }

  return out;
}
void PhraseDictionaryOnDisk::InitializeForInput(InputType const& source)
{
  const StaticData &staticData = StaticData::Instance();

  ReduceCache();

  OnDiskPt::OnDiskWrapper *obj = new OnDiskPt::OnDiskWrapper();
  obj->BeginLoad(m_filePath);

  UTIL_THROW_IF2(obj->GetMisc("Version") != OnDiskPt::OnDiskWrapper::VERSION_NUM,
		  "On-disk phrase table is version " <<  obj->GetMisc("Version")
		  << ". It is not compatible with version " << OnDiskPt::OnDiskWrapper::VERSION_NUM);

  UTIL_THROW_IF2(obj->GetMisc("NumSourceFactors") != m_input.size(),
		  "On-disk phrase table has " <<  obj->GetMisc("NumSourceFactors") << " source factors."
		  		  << ". The ini file specified " << m_input.size() << " source factors");

  UTIL_THROW_IF2(obj->GetMisc("NumTargetFactors") != m_output.size(),
		  "On-disk phrase table has " <<  obj->GetMisc("NumTargetFactors") << " target factors."
		  		  << ". The ini file specified " << m_output.size() << " target factors");

  UTIL_THROW_IF2(obj->GetMisc("NumScores") != m_numScoreComponents,
		  "On-disk phrase table has " <<  obj->GetMisc("NumScores") << " scores."
		  		  << ". The ini file specified " << m_numScoreComponents << " scores");

  m_implementation.reset(obj);
}
void Tokenize(OnDiskPt::Phrase &phrase
							, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
							, OnDiskPt::OnDiskWrapper &onDiskWrapper)
{
	
	bool nonTerm = false;
	size_t tokSize = token.size();
	int comStr =token.compare(0, 1, "[");
	
	if (comStr == 0)
	{
		comStr = token.compare(tokSize - 1, 1, "]");
		nonTerm = comStr == 0;
	}
	
	if (nonTerm)
	{ // non-term
		size_t splitPos		= token.find_first_of("[", 2);
		string wordStr	= token.substr(0, splitPos);

		if (splitPos == string::npos)
		{ // lhs - only 1 word
			Word *word = new Word();
			word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
			phrase.AddWord(word);
		}
		else
		{ // source & target non-terms
			if (addSourceNonTerm)
			{
				Word *word = new Word();
				word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
				phrase.AddWord(word);
			}
			
			wordStr = token.substr(splitPos, tokSize - splitPos);
			if (addTargetNonTerm)
			{
				Word *word = new Word();
				word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
				phrase.AddWord(word);
			}
			
		}
	}
	else
	{ // term
		Word *word = new Word();
		word->CreateFromString(token, onDiskWrapper.GetVocab());
		phrase.AddWord(word);
	}	
}
void PhraseDictionaryOnDisk::InitializeForInput(InputType const& source)
{
  const StaticData &staticData = StaticData::Instance();

  OnDiskPt::OnDiskWrapper *obj = new OnDiskPt::OnDiskWrapper();
  if (!obj->BeginLoad(m_filePath))
    return;

  CHECK(obj->GetMisc("Version") == OnDiskPt::OnDiskWrapper::VERSION_NUM);
  CHECK(obj->GetMisc("NumSourceFactors") == m_input.size());
  CHECK(obj->GetMisc("NumTargetFactors") == m_output.size());
  CHECK(obj->GetMisc("NumScores") == m_numScoreComponents);

  m_implementation.reset(obj);
}