Beispiel #1
0
/**
 * Turns one textual token into vocabulary word(s) and appends them to a phrase.
 *
 * A token that starts with '[' and ends with ']' is treated as a non-terminal;
 * anything else is a terminal. A non-terminal token may carry two bracketed
 * labels back to back (the second one is located by searching for '[' from
 * index 2 onwards); the first is the source label and the second the target.
 *
 * @param phrase           phrase that receives the created word(s)
 * @param token            text of the token to convert
 * @param addSourceNonTerm for a two-label token, append the source label
 * @param addTargetNonTerm for a two-label token, append the target label
 * @param onDiskWrapper    supplies the vocabulary passed to CreateFromString
 * @param retSourceTarget  for a two-label token: 1 = return the source word,
 *                         2 = return the target word, anything else = neither
 * @return the terminal word for a terminal token; for a two-label token the
 *         word selected by retSourceTarget, provided it was actually added;
 *         otherwise a default-constructed WordPtr (this includes the
 *         single-label case)
 */
OnDiskPt::WordPtr Tokenize(OnDiskPt::Phrase &phrase
                           , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
                           , OnDiskPt::OnDiskWrapper &onDiskWrapper, int retSourceTarget)
{
  const size_t length = token.size();

  // The closing-bracket test is only evaluated once the opening bracket has
  // matched, so `length - 1` is never computed for an empty token.
  const bool bracketed = token.compare(0, 1, "[") == 0
                         && token.compare(length - 1, 1, "]") == 0;

  OnDiskPt::WordPtr result;

  if (!bracketed) {
    // terminal: always added and always returned
    WordPtr terminal(new Word());
    terminal->CreateFromString(token, onDiskWrapper.GetVocab());
    phrase.AddWord(terminal);
    result = terminal;
    return result;
  }

  // Start of the second bracketed label, if there is one.
  const size_t targetStart = token.find('[', 2);

  if (targetStart == std::string::npos) {
    // single label (lhs): added unconditionally, but not returned
    WordPtr lhs(new Word());
    lhs->CreateFromString(token, onDiskWrapper.GetVocab());
    phrase.AddWord(lhs);
    return result;
  }

  // two labels: source first, then target
  if (addSourceNonTerm) {
    const std::string sourceLabel = token.substr(0, targetStart);
    WordPtr source(new Word());
    source->CreateFromString(sourceLabel, onDiskWrapper.GetVocab());
    phrase.AddWord(source);

    if (retSourceTarget == 1) {
      result = source;
    }
  }

  if (addTargetNonTerm) {
    const std::string targetLabel = token.substr(targetStart);
    WordPtr target(new Word());
    target->CreateFromString(targetLabel, onDiskWrapper.GetVocab());
    phrase.AddWord(target);

    if (retSourceTarget == 2) {
      result = target;
    }
  }

  return result;
}
Beispiel #2
0
/**
 * Turns one textual token into vocabulary word(s) and appends them to a phrase.
 *
 * A token that starts with '[' and ends with ']' is treated as a non-terminal;
 * anything else is a terminal. A non-terminal token may carry two bracketed
 * labels back to back (the second one is located by searching for '[' from
 * index 2 onwards); the first is the source label and the second the target.
 *
 * Each word is allocated with `new` and handed to phrase.AddWord as a raw
 * pointer. NOTE(review): presumably the phrase takes ownership of it; confirm
 * against the Phrase class.
 *
 * @param phrase           phrase that receives the created word(s)
 * @param token            text of the token to convert
 * @param addSourceNonTerm for a two-label token, append the source label
 * @param addTargetNonTerm for a two-label token, append the target label
 * @param onDiskWrapper    supplies the vocabulary passed to CreateFromString
 */
void Tokenize(OnDiskPt::Phrase &phrase
							, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
							, OnDiskPt::OnDiskWrapper &onDiskWrapper)
{
	const size_t length = token.size();

	// The closing-bracket test is only evaluated once the opening bracket has
	// matched, so `length - 1` is never computed for an empty token.
	const bool bracketed = token.compare(0, 1, "[") == 0
		&& token.compare(length - 1, 1, "]") == 0;

	if (!bracketed)
	{ // terminal
		Word *terminal = new Word();
		terminal->CreateFromString(token, onDiskWrapper.GetVocab());
		phrase.AddWord(terminal);
		return;
	}

	// Start of the second bracketed label, if there is one.
	const size_t targetStart = token.find('[', 2);

	if (targetStart == std::string::npos)
	{ // single label (lhs): added unconditionally
		Word *lhs = new Word();
		lhs->CreateFromString(token, onDiskWrapper.GetVocab());
		phrase.AddWord(lhs);
		return;
	}

	// two labels: source first, then target
	if (addSourceNonTerm)
	{
		const std::string sourceLabel = token.substr(0, targetStart);
		Word *source = new Word();
		source->CreateFromString(sourceLabel, onDiskWrapper.GetVocab());
		phrase.AddWord(source);
	}

	if (addTargetNonTerm)
	{
		const std::string targetLabel = token.substr(targetStart);
		Word *target = new Word();
		target->CreateFromString(targetLabel, onDiskWrapper.GetVocab());
		phrase.AddWord(target);
	}
}