Example #1
double PhraseDictionaryMultiModelCounts::ComputeWeightedLexicalTranslation( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const vector<lexicalTable*> &tables, vector<float> &multimodelweights, bool is_input) const
{
  // lexical translation probability

  double lexScore = 1.0;
  Word null;
  if (is_input) {
    null.CreateFromString(Input, m_input, "NULL", false);
  } else {
    null.CreateFromString(Output, m_output, "NULL", false);
  }

  // all target words have to be explained
  for(size_t ti=0; ti<alignment.size(); ti++) {
    const set< size_t > & srcIndices = alignment[ ti ];
    Word t_word = phraseT.GetWord(ti);

    if (srcIndices.empty()) {
      // explain unaligned word by NULL
      lexScore *= GetLexicalProbability( null, t_word, tables, multimodelweights );
    } else {
      // go through all the aligned words to compute average
      double thisWordScore = 0;
      for (set< size_t >::const_iterator si(srcIndices.begin()); si != srcIndices.end(); ++si) {
        Word s_word = phraseS.GetWord(*si);
        thisWordScore += GetLexicalProbability( s_word, t_word, tables, multimodelweights );
      }
      lexScore *= thisWordScore / srcIndices.size();
    }
  }
  return lexScore;
}
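
The loop above implements the usual lexical weighting: every target word is explained either by NULL (if unaligned) or by the average translation probability over its aligned source words, and the per-word values are multiplied. A minimal standalone sketch of the same arithmetic, with plain containers standing in for Moses' Phrase, Word and lexicalTable types (all names here are illustrative):

#include <cstddef>
#include <set>
#include <vector>

// p(t|s) and p(t|NULL) are given as plain lookups; all names are illustrative.
double LexicalWeight(const std::vector<std::set<std::size_t> > &alignment,
                     const std::vector<double> &nullProb,
                     const std::vector<std::vector<double> > &prob)
{
  double score = 1.0;
  for (std::size_t t = 0; t < alignment.size(); ++t) {
    const std::set<std::size_t> &src = alignment[t];
    if (src.empty()) {
      score *= nullProb[t];                   // unaligned target word: explained by NULL
    } else {
      double sum = 0.0;
      for (std::set<std::size_t>::const_iterator s = src.begin(); s != src.end(); ++s)
        sum += prob[t][*s];
      score *= sum / src.size();              // average over aligned source words
    }
  }
  return score;
}
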
Example #2
Syntax::F2S::Forest::Vertex *ForestInput::ParseVertex(
  const StringPiece &s, const std::vector<FactorType>& factorOrder)
{
  using Syntax::F2S::Forest;

  Word symbol;
  std::size_t pos = s.rfind('[');
  if (pos == std::string::npos) {
    symbol.CreateFromString(Input, factorOrder, s, false);
    // Create vertex: caller will fill in span.
    Range span(0, 0);
    return new Forest::Vertex(Syntax::PVertex(span, symbol));
  }
  symbol.CreateFromString(Input, factorOrder, s.substr(0, pos), true);
  std::size_t begin = pos + 1;
  pos = s.find(',', begin+1);
  std::string tmp;
  s.substr(begin, pos-begin).CopyToString(&tmp);
  std::size_t start = std::atoi(tmp.c_str());
  s.substr(pos+1, s.size()-pos-2).CopyToString(&tmp);
  std::size_t end = std::atoi(tmp.c_str());
  // Create vertex: offset span by 1 to allow for <s> in first position.
  Range span(start+1, end+1);
  return new Forest::Vertex(Syntax::PVertex(span, symbol));
}
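
To make the index arithmetic concrete: a vertex token is either a plain word ("saw") or a symbol with a span suffix such as "NP[3,4]". For the latter, the slicing above yields the symbol "NP" and the bounds 3 and 4, and the stored span becomes [4,5] after the +1 offset for the leading <s>. A small hedged sketch of the same slicing on a plain std::string:

#include <cstdlib>
#include <iostream>
#include <string>

int main() {
  std::string s = "NP[3,4]";                        // hypothetical vertex token
  std::size_t pos = s.rfind('[');                   // 2
  std::string symbol = s.substr(0, pos);            // "NP"
  std::size_t begin = pos + 1;
  std::size_t comma = s.find(',', begin + 1);
  int start = std::atoi(s.substr(begin, comma - begin).c_str());            // 3
  int end = std::atoi(s.substr(comma + 1, s.size() - comma - 2).c_str());   // 4
  std::cout << symbol << " spans [" << start + 1 << "," << end + 1 << "]\n"; // offset for <s>
  return 0;
}
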
Example #3
void Tokenize(OnDiskPt::Phrase &phrase
							, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm
							, OnDiskPt::OnDiskWrapper &onDiskWrapper)
{
	
	bool nonTerm = false;
	size_t tokSize = token.size();
	int comStr = token.compare(0, 1, "[");
	
	if (comStr == 0)
	{
		comStr = token.compare(tokSize - 1, 1, "]");
		nonTerm = comStr == 0;
	}
	
	if (nonTerm)
	{ // non-term
		size_t splitPos		= token.find_first_of("[", 2);
		string wordStr	= token.substr(0, splitPos);

		if (splitPos == string::npos)
		{ // lhs - only 1 word
			Word *word = new Word();
			word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
			phrase.AddWord(word);
		}
		else
		{ // source & target non-terms
			if (addSourceNonTerm)
			{
				Word *word = new Word();
				word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
				phrase.AddWord(word);
			}
			
			wordStr = token.substr(splitPos, tokSize - splitPos);
			if (addTargetNonTerm)
			{
				Word *word = new Word();
				word->CreateFromString(wordStr, onDiskWrapper.GetVocab());
				phrase.AddWord(word);
			}
			
		}
	}
	else
	{ // term
		Word *word = new Word();
		word->CreateFromString(token, onDiskWrapper.GetVocab());
		phrase.AddWord(word);
	}	
}
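
The token forms distinguished above are, roughly: a plain terminal ("haus"), a lone left-hand-side non-terminal ("[X]", where find_first_of returns npos and the whole token is added as one word), and a joint source/target non-terminal ("[X][NP]") that is split at the second opening bracket. A small hedged sketch of that split, independent of OnDiskPt:

#include <cstddef>
#include <iostream>
#include <string>

int main() {
  // Hypothetical rule token carrying a source and a target non-terminal.
  std::string token = "[X][NP]";
  bool nonTerm = token.compare(0, 1, "[") == 0 &&
                 token.compare(token.size() - 1, 1, "]") == 0;   // true
  std::size_t splitPos = token.find_first_of("[", 2);            // 3
  std::string sourceNonTerm = token.substr(0, splitPos);         // "[X]"
  std::string targetNonTerm = token.substr(splitPos);            // "[NP]"
  std::cout << nonTerm << " " << sourceNonTerm << " " << targetNonTerm << "\n";
  return 0;
}
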
Example #4
void UnknownWordPenalty::Lookup(const std::vector<InputPath*> &inputPathQueue)
{
  Scores *estimatedFutureScore = new Scores();

  for (size_t i = 0; i < inputPathQueue.size(); ++i) {
    InputPath &path = *inputPathQueue[i];
    PhraseTableLookup &ptLookup = path.GetPtLookup(m_ptId);

    const Phrase &source = path.GetPhrase();
    if (source.GetSize() == 1) {
      const Word &sourceWord = source.GetWord(0);
      string str = sourceWord.ToString();
      str = "UNK:" + str + ":UNK";

      Word targetWord;
      targetWord.CreateFromString(str);

      TargetPhrase *tp = new TargetPhrase(1);
      tp->Set(0, targetWord);
      tp->GetScores().Add(*this, LOWEST_SCORE);

      FeatureFunction::Evaluate(source, *tp, *estimatedFutureScore);

      TargetPhrases *tpColl = new TargetPhrases();
      m_targetPhrases.push_back(tpColl);
      tpColl->Add(tp);

      ptLookup.Set(tpColl, NULL);
    } else {
      ptLookup.Set(NULL, NULL);
    }
  }
}
Example #5
bool Vocab::Load(FileHandler* vcbin, const FactorDirection& direction,
                 const FactorList& factors, bool closed)
{
  // load vocab id -> word mapping
  m_words2ids.clear();	// reset mapping
  m_ids2words.clear();
  std::string line, word_str;
  wordID_t id;

  std::istream &ret = getline(*vcbin, line);
  UTIL_THROW_IF2(!ret, "Couldn't read file");
  std::istringstream first(line.c_str());
  uint32_t vcbsize(0);
  first >> vcbsize;
  uint32_t loadedsize = 0;
  while (loadedsize++ < vcbsize && getline(*vcbin, line)) {
    std::istringstream entry(line.c_str());
    entry >> word_str;
    Word word;
    word.CreateFromString( direction, factors, word_str, false); // TODO set correctly isNonTerminal
    entry >> id;
    // there may be no id (i.e. the file may just be a word list)
    if (id == 0 && word != GetkOOVWord())
      id = m_ids2words.size() + 1;	// assign ids sequentially starting from 1
    UTIL_THROW_IF2(m_ids2words.count(id) != 0 || m_words2ids.count(word) != 0,
                   "Duplicate vocab entry: id or word already present");

    m_ids2words[id] = word;
    m_words2ids[word] = id;
  }
  m_closed = closed;	// once loaded fix vocab ?
  std::cerr << "Loaded vocab with " << m_ids2words.size() << " words." << std::endl;
  return true;
}
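
The file layout this loader appears to expect is: a first line holding the vocabulary size, then one entry per line of the form "word id", where the id column may be absent (ids are then assigned sequentially from 1, as the comment above notes). A purely hypothetical three-entry file:

3
haus 5
, 6
the 7
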
Example #6
wordID_t Vocab::GetWordID(const std::string& word_str)
{
  FactorList factors;
  factors.push_back(0);
  Word word;
  word.CreateFromString(Input, factors, word_str, false);
  return GetWordID(word);
}
Example #7
// get wordID_t index for word represented as string
wordID_t Vocab::GetWordID(const std::string& word_str,
                          const FactorDirection& direction, const FactorList& factors, bool isNonTerminal)
{
  // get id for factored string
  Word word;
  word.CreateFromString( direction, factors, word_str, isNonTerminal);
  return GetWordID( word);
}
Example #8
lexicalCache PhraseDictionaryMultiModelCounts::CacheLexicalStatistics( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const vector<lexicalTable*> &tables, bool is_input )
{
//do all the necessary lexical table lookups and get counts, but don't apply weights yet

  Word null;
  if (is_input) {
    null.CreateFromString(Input, m_input, "NULL", false);
  } else {
    null.CreateFromString(Output, m_output, "NULL", false);
  }

  lexicalCache ret;

  // all target words have to be explained
  for(size_t ti=0; ti<alignment.size(); ti++) {
    const set< size_t > & srcIndices = alignment[ ti ];
    Word t_word = phraseT.GetWord(ti);

    vector<lexicalPair> ti_vector;
    if (srcIndices.empty()) {
      // explain unaligned word by NULL
      vector<float> joint_count (m_numModels);
      vector<float> marginals (m_numModels);

      FillLexicalCountsJoint(null, t_word, joint_count, tables);
      FillLexicalCountsMarginal(null, marginals, tables);

      ti_vector.push_back(make_pair(joint_count, marginals));

    } else {
      for (set< size_t >::const_iterator si(srcIndices.begin()); si != srcIndices.end(); ++si) {
        Word s_word = phraseS.GetWord(*si);
        vector<float> joint_count (m_numModels);
        vector<float> marginals (m_numModels);

        FillLexicalCountsJoint(s_word, t_word, joint_count, tables);
        FillLexicalCountsMarginal(s_word, marginals, tables);

        ti_vector.push_back(make_pair(joint_count, marginals));
      }
    }
    ret.push_back(ti_vector);
  }
  return ret;
}
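
The shape of the returned cache, as used above, is a nested container: one entry per target position, each holding one (joint counts, marginals) pair per aligned source word, or a single NULL pair for unaligned words. A hedged reconstruction of the typedefs (the authoritative definitions live in PhraseDictionaryMultiModelCounts.h and may differ):

#include <utility>
#include <vector>

// One entry per model: joint count c(s,t) and marginal count c(s).
typedef std::pair<std::vector<float>, std::vector<float> > lexicalPair;
// ret[targetPos][k] holds the counts for the k-th source word aligned to that target word.
typedef std::vector<std::vector<lexicalPair> > lexicalCache;
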
Example #9
void Phrase::CreateFromStringNewFormat(FactorDirection direction
																			 , const std::vector<FactorType> &factorOrder
																			 , const std::string &phraseString
																			 , const std::string &factorDelimiter
																			 , Word &lhs)
{
	m_arity = 0;
	
	// parse
	vector<string> annotatedWordVector;
	Tokenize(annotatedWordVector, phraseString);
	// KOMMA|none ART|Def.Z NN|Neut.NotGen.Sg VVFIN|none 
	//		to
	// "KOMMA|none" "ART|Def.Z" "NN|Neut.NotGen.Sg" "VVFIN|none"
	
	for (size_t phrasePos = 0 ; phrasePos < annotatedWordVector.size() -  1 ; phrasePos++)
	{
		string &annotatedWord = annotatedWordVector[phrasePos];
		bool isNonTerminal;
		if (annotatedWord.substr(0, 1) == "[" && annotatedWord.substr(annotatedWord.size()-1, 1) == "]")
		{ // non-term
			isNonTerminal = true;
			
			size_t nextPos = annotatedWord.find("[", 1);
			assert(nextPos != string::npos);
			
			if (direction == Input)
				annotatedWord = annotatedWord.substr(1, nextPos - 2);
			else
				annotatedWord = annotatedWord.substr(nextPos + 1, annotatedWord.size() - nextPos - 2);
			
			m_arity++;
		}
		else
		{
			isNonTerminal = false;
		}
		
		Word &word = AddWord();
		word.CreateFromString(direction, factorOrder, annotatedWord, isNonTerminal);		
		
	}
	
	// lhs
	string &annotatedWord = annotatedWordVector.back();
	assert(annotatedWord.substr(0, 1) == "[" && annotatedWord.substr(annotatedWord.size()-1, 1) == "]");
	annotatedWord = annotatedWord.substr(1, annotatedWord.size() - 2);
	
	lhs.CreateFromString(direction, factorOrder, annotatedWord, true);		
	assert(lhs.IsNonTerminal());
}
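
For a joint non-terminal such as "[X][NP]", the branch above keeps only the source label on the Input side and only the target label on the Output side, with the brackets stripped. A small hedged check of that substring arithmetic:

#include <cassert>
#include <cstddef>
#include <string>

int main() {
  std::string annotatedWord = "[X][NP]";            // hypothetical joint non-terminal
  std::size_t nextPos = annotatedWord.find("[", 1); // 3
  // Input side keeps the source label, Output side keeps the target label.
  std::string inputLabel  = annotatedWord.substr(1, nextPos - 2);
  std::string outputLabel = annotatedWord.substr(nextPos + 1,
                                                 annotatedWord.size() - nextPos - 2);
  assert(inputLabel == "X" && outputLabel == "NP");
  return 0;
}
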
Example #10
const Word Vocab::InitSpecialWord( const std::string& word_str)
{
  FactorList factors;
  factors.push_back(0); // store the special word string as the first factor
  Word word;
  // define special word as Input word with one factor and isNonTerminal=false
  word.CreateFromString( Input, factors, word_str, false ); // Input is enum defined in ../typedef.h
  // TODO not sure if this will work properly:
  // 	- word comparison can fail because the last parameter (isNonTerminal)
  // 		in function CreateFromString may not match properly created words
  // 	- special word is Input word but what about Output words?
  // 		- currently Input/Output variable is not stored in class Word, but in the future???
  return word;
}
Example #11
TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
  TargetPhraseVectorPtr tpv, BitWrapper<> &encodedBitStream,
  const Phrase &sourcePhrase, bool topLevel)
{
  
  bool extending = tpv->size();
  size_t bitsLeft = encodedBitStream.TellFromEnd();
    
  typedef std::pair<size_t, size_t> AlignPointSizeT;
  
  std::vector<int> sourceWords;
  if(m_coding == REnc)
  {
    for(size_t i = 0; i < sourcePhrase.GetSize(); i++)
    {
      std::string sourceWord
        = sourcePhrase.GetWord(i).GetString(*m_input, false);
      unsigned idx = GetSourceSymbolId(sourceWord);
      sourceWords.push_back(idx);
    }
  }
  
  unsigned phraseStopSymbol = 0;
  AlignPoint alignStopSymbol(-1, -1);
  
  std::vector<float> scores;
  std::set<AlignPointSizeT> alignment;
  
  enum DecodeState { New, Symbol, Score, Alignment, Add } state = New;
  
  size_t srcSize = sourcePhrase.GetSize();
  
  TargetPhrase* targetPhrase = NULL;
  while(encodedBitStream.TellFromEnd())
  {
     
    if(state == New)
    {
      // Creating new TargetPhrase on the heap
      tpv->push_back(TargetPhrase(Output));
      targetPhrase = &tpv->back();
      
      targetPhrase->SetSourcePhrase(sourcePhrase);
      alignment.clear();
      scores.clear();
        
      state = Symbol;
    }
    
    if(state == Symbol)
    {
      unsigned symbol = m_symbolTree->Read(encodedBitStream);      
      if(symbol == phraseStopSymbol)
      {
        state = Score;
      }
      else
      {
        if(m_coding == REnc)
        {
          std::string wordString;
          size_t type = GetREncType(symbol);
          
          if(type == 1)
          {
            unsigned decodedSymbol = DecodeREncSymbol1(symbol);
            wordString = GetTargetSymbol(decodedSymbol);
          }
          else if (type == 2)
          {
            size_t rank = DecodeREncSymbol2Rank(symbol);
            size_t srcPos = DecodeREncSymbol2Position(symbol);
            
            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();  
            
            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
            if(m_phraseDictionary.m_useAlignmentInfo)
            {
              size_t trgPos = targetPhrase->GetSize();
              alignment.insert(AlignPoint(srcPos, trgPos));
            }
          }
          else if(type == 3)
          {
            size_t rank = DecodeREncSymbol3(symbol);
            size_t srcPos = targetPhrase->GetSize();
            
            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();  
                            
            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));   
            if(m_phraseDictionary.m_useAlignmentInfo)
            {
              size_t trgPos = srcPos;
              alignment.insert(AlignPoint(srcPos, trgPos));
            }
          }
          
          Word word;
          word.CreateFromString(Output, *m_output, wordString, false);
          targetPhrase->AddWord(word);
        }
        else if(m_coding == PREnc)
        {
          // if the symbol is just a word
          if(GetPREncType(symbol) == 1)
          {
            unsigned decodedSymbol = DecodePREncSymbol1(symbol);
     
            Word word;
            word.CreateFromString(Output, *m_output,
                                  GetTargetSymbol(decodedSymbol), false);
            targetPhrase->AddWord(word);
          }
          // if the symbol is a subphrase pointer
          else
          {
            int left = DecodePREncSymbol2Left(symbol);
            int right = DecodePREncSymbol2Right(symbol);
            unsigned rank = DecodePREncSymbol2Rank(symbol);
            
            int srcStart = left + targetPhrase->GetSize();
            int srcEnd   = srcSize - right - 1;
            
            // false positive consistency check
            if(0 > srcStart || srcStart > srcEnd || unsigned(srcEnd) >= srcSize)
              return TargetPhraseVectorPtr();
            
            // false positive consistency check
            if(m_maxRank && rank > m_maxRank)
                return TargetPhraseVectorPtr();
            
            // set subphrase by default to itself
            TargetPhraseVectorPtr subTpv = tpv;
            
            // if range smaller than source phrase retrieve subphrase
            if(unsigned(srcEnd - srcStart + 1) != srcSize)
            {
              Phrase subPhrase = sourcePhrase.GetSubString(WordsRange(srcStart, srcEnd));
              subTpv = CreateTargetPhraseCollection(subPhrase, false);
            }
            
            // false positive consistency check
            if(subTpv != NULL && rank < subTpv->size())
            {
              // insert the subphrase into the main target phrase
              TargetPhrase& subTp = subTpv->at(rank);
              if(m_phraseDictionary.m_useAlignmentInfo)
              {
                // reconstruct the alignment data based on the alignment of the subphrase
                for(AlignmentInfo::const_iterator it = subTp.GetAlignmentInfo().begin();
                    it != subTp.GetAlignmentInfo().end(); it++)
                {
                  alignment.insert(AlignPointSizeT(srcStart + it->first,
                                                   targetPhrase->GetSize() + it->second));
                }
              }
              targetPhrase->Append(subTp);
            }
            else 
              return TargetPhraseVectorPtr();
          }
        }
        else
        {
            Word word;
            word.CreateFromString(Output, *m_output,
                                  GetTargetSymbol(symbol), false);
            targetPhrase->AddWord(word);
        }
      }
    }
    else if(state == Score)
    {
      size_t idx = m_multipleScoreTrees ? scores.size() : 0;
      float score = m_scoreTrees[idx]->Read(encodedBitStream);
      scores.push_back(score);
      
      if(scores.size() == m_numScoreComponent)
      {
        targetPhrase->SetScore(m_feature, scores, ScoreComponentCollection() /*sparse*/,*m_weight, m_weightWP, *m_languageModels);
        
        if(m_containsAlignmentInfo)
          state = Alignment;
        else
          state = Add;
      }
    }
    else if(state == Alignment)
    {
      AlignPoint alignPoint = m_alignTree->Read(encodedBitStream);
      if(alignPoint == alignStopSymbol)
      {
        state = Add;
      }
      else
      {
        if(m_phraseDictionary.m_useAlignmentInfo)  
          alignment.insert(AlignPointSizeT(alignPoint));
      }
    }
    
    if(state == Add)
    {
      if(m_phraseDictionary.m_useAlignmentInfo)
        targetPhrase->SetAlignmentInfo(alignment);
      
      if(m_coding == PREnc)
      {
        if(!m_maxRank || tpv->size() <= m_maxRank)
          bitsLeft = encodedBitStream.TellFromEnd();
        
        if(!topLevel && m_maxRank && tpv->size() >= m_maxRank)
          break;
      }
      
      if(encodedBitStream.TellFromEnd() <= 8)
        break;
      
      state = New;
    }    
  }
  
  if(m_coding == PREnc && !extending)
  {
    bitsLeft = bitsLeft > 8 ? bitsLeft : 0;
    m_decodingCache.Cache(sourcePhrase, tpv, bitsLeft, m_maxRank);
  }
  
  return tpv;
}
Example #12
void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to)
{
  // unknown word, add as trans opt
  const StaticData &staticData = StaticData::Instance();
  const UnknownWordPenaltyProducer &unknownWordPenaltyProducer = UnknownWordPenaltyProducer::Instance();

  size_t isDigit = 0;
  if (staticData.GetDropUnknown()) {
    const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface
    const StringPiece s = f->GetString();
    isDigit = s.find_first_of("0123456789");
    if (isDigit == string::npos)
      isDigit = 0;
    else
      isDigit = 1;
    // modify the starting bitmap
  }

  Phrase* unksrc = new Phrase(1);
  unksrc->AddWord() = sourceWord;
  Word &newWord = unksrc->GetWord(0);
  newWord.SetIsOOV(true);

  m_unksrcs.push_back(unksrc);

  //TranslationOption *transOpt;
  if (! staticData.GetDropUnknown() || isDigit) {
    // loop
    const UnknownLHSList &lhsList = staticData.GetUnknownLHS();
    UnknownLHSList::const_iterator iterLHS;
    for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) {
      const string &targetLHSStr = iterLHS->first;
      float prob = iterLHS->second;

      // lhs
      //const Word &sourceLHS = staticData.GetInputDefaultNonTerminal();
      Word *targetLHS = new Word(true);

      targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true);
      UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS");

      // add to dictionary
      TargetPhrase *targetPhrase = new TargetPhrase();
      Word &targetWord = targetPhrase->AddWord();
      targetWord.CreateUnknownWord(sourceWord);

      // scores
      float unknownScore = FloorScore(TransformScore(prob));

      targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore);
      targetPhrase->Evaluate(*unksrc);

      targetPhrase->SetTargetLHS(targetLHS);
      targetPhrase->SetAlignmentInfo("0-0");
      if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) {
        targetPhrase->SetProperty("Tree","[ " + (*targetLHS)[0]->GetString().as_string() + " "+sourceWord[0]->GetString().as_string()+" ]");
      }

      // chart rule
      to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range);
    } // for (iterLHS
  } else {
    // drop source word. create blank trans opt
    float unknownScore = FloorScore(-numeric_limits<float>::infinity());

    TargetPhrase *targetPhrase = new TargetPhrase();
    // loop
    const UnknownLHSList &lhsList = staticData.GetUnknownLHS();
    UnknownLHSList::const_iterator iterLHS;
    for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) {
      const string &targetLHSStr = iterLHS->first;
      //float prob = iterLHS->second;

      Word *targetLHS = new Word(true);
      targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true);
      UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS");

      targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore);
      targetPhrase->Evaluate(*unksrc);

      targetPhrase->SetTargetLHS(targetLHS);

      // chart rule
      to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range);
    }
  }
}
Example #13
/**
 * Process a sentence with xml annotation
 * Xml tags may specify additional/replacing translation options
 * and source span labels
 *
 * \param line in: sentence, out: sentence without the xml
 * \param sourceLabels vector with source span labels specified by xml
 * \param xmlOptions vector with translation options specified by xml
 */
bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput> &sourceLabels, std::vector<XmlOption*> &xmlOptions)
{
  //parse XML markup in translation line

  // no xml tag? we're done.
  if (line.find_first_of('<') == string::npos) {
    return true;
  }

  // break up input into a vector of xml tags and text
  // example: (this), (<b>), (is a), (</b>), (test .)
  vector<string> xmlTokens = TokenizeXml(line);

  // we need to store opened tags, until they are closed
  // tags are stored as triples (tagname, startpos, contents)
  typedef pair< string, pair< size_t, string > > OpenedTag;
  vector< OpenedTag > tagStack; // stack that contains active opened tags

  string cleanLine; // return string (text without xml)
  size_t wordPos = 0; // position in sentence (in terms of number of words)

  // keep this handy for later
  const vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
  const string &factorDelimiter = StaticData::Instance().GetFactorDelimiter();

  // loop through the tokens
  for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
    // not an xml tag, but regular text (may contain many words)
    if(!isXmlTag(xmlTokens[xmlTokenPos])) {
      // add a space at boundary, if necessary
      if (cleanLine.size()>0 &&
          cleanLine[cleanLine.size() - 1] != ' ' &&
          xmlTokens[xmlTokenPos][0] != ' ') {
        cleanLine += " ";
      }
      cleanLine += xmlTokens[xmlTokenPos]; // add to output
      wordPos = Tokenize(cleanLine).size(); // count all the words
    }

    // process xml tag
    else {
      // *** get essential information about tag ***

      // strip extra boundary spaces and "<" and ">"
      string tag =  Trim(TrimXml(xmlTokens[xmlTokenPos]));
      VERBOSE(3,"XML TAG IS: " << tag << std::endl);

      if (tag.size() == 0) {
        TRACE_ERR("ERROR: empty tag name: " << line << endl);
        return false;
      }

      // check if unary (e.g., "<wall/>")
      bool isUnary = ( tag[tag.size() - 1] == '/' );

      // check if opening tag (e.g. "<a>", not "</a>")
      bool isClosed = ( tag[0] == '/' );
      bool isOpen = !isClosed;

      if (isClosed && isUnary) {
        TRACE_ERR("ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl);
        return false;
      }

      if (isClosed)
        tag = tag.substr(1); // remove "/" at the beginning
      if (isUnary)
        tag = tag.substr(0,tag.size()-1); // remove "/" at the end

      // find the tag name and contents
      string::size_type endOfName = tag.find_first_of(' ');
      string tagName = tag;
      string tagContent = "";
      if (endOfName != string::npos) {
        tagName = tag.substr(0,endOfName);
        tagContent = tag.substr(endOfName+1);
      }

      // *** process new tag ***

      if (isOpen || isUnary) {
        // put the tag on the tag stack
        OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
        tagStack.push_back( openedTag );
        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl);
      }

      // *** process completed tag ***

      if (isClosed || isUnary) {
        // pop last opened tag from stack;
        if (tagStack.size() == 0) {
          TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl);
          return false;
        }
        OpenedTag openedTag = tagStack.back();
        tagStack.pop_back();

        // tag names have to match
        if (openedTag.first != tagName) {
          TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl );
          return false;
        }

        // assemble remaining information about tag
        size_t startPos = openedTag.second.first;
        string tagContent = openedTag.second.second;
        size_t endPos = wordPos;

        // span attribute overwrites position
        string span = ParseXmlTagAttribute(tagContent,"span");
        if (! span.empty()) {
          vector<string> ij = Tokenize(span, "-");
          if (ij.size() != 1 && ij.size() != 2) {
            TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl);
            return false;
          }
          startPos = atoi(ij[0].c_str());
          if (ij.size() == 1) endPos = startPos + 1;
          else endPos = atoi(ij[1].c_str()) + 1;
        }

        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl);

        if (startPos >= endPos) {
          TRACE_ERR("ERROR: tag " << tagName << " must span at least one word: " << line << endl);
          return false;
        }

        // may be either an input span label ("label") or a specified output translation ("translation")
        string label = ParseXmlTagAttribute(tagContent,"label");
        string translation = ParseXmlTagAttribute(tagContent,"translation");

        // specified label
        if (translation.length() == 0 && label.length() > 0) {
          WordsRange range(startPos,endPos-1); // really?
          XMLParseOutput item(label, range);
          sourceLabels.push_back(item);
        }

        // specified translations -> vector of phrases, separated by "||"
        if (translation.length() > 0 && StaticData::Instance().GetXmlInputType() != XmlIgnore) {
          vector<string> altTexts = TokenizeMultiCharSeparator(translation, "||");
          vector<string> altLabel = TokenizeMultiCharSeparator(label, "||");
          vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||");
          //TRACE_ERR("number of translations: " << altTexts.size() << endl);
          for (size_t i=0; i<altTexts.size(); ++i) {
            // set target phrase
            TargetPhrase targetPhrase;
            targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i],factorDelimiter, NULL);

            // set constituent label
            string targetLHSstr;
            if (altLabel.size() > i && altLabel[i].size() > 0) {
              targetLHSstr = altLabel[i];
            } else {
              const UnknownLHSList &lhsList = StaticData::Instance().GetUnknownLHS();
              UnknownLHSList::const_iterator iterLHS = lhsList.begin();
              targetLHSstr = iterLHS->first;
            }
            Word *targetLHS = new Word(true);
            targetLHS->CreateFromString(Output, outputFactorOrder, targetLHSstr, true);
            CHECK(targetLHS->GetFactor(0) != NULL);
            targetPhrase.SetTargetLHS(targetLHS);

            // not tested
            Phrase sourcePhrase = this->GetSubString(WordsRange(startPos,endPos-1));

            // get probability
            float probValue = 1;
            if (altProbs.size() > i && altProbs[i].size() > 0) {
              probValue = Scan<float>(altProbs[i]);
            }
            // convert from prob to log-prob
            float scoreValue = FloorScore(TransformScore(probValue));
            targetPhrase.SetXMLScore(scoreValue);
            targetPhrase.Evaluate(sourcePhrase);

            // set span and create XmlOption
            WordsRange range(startPos+1,endPos);
            XmlOption *option = new XmlOption(range,targetPhrase);
            CHECK(option);
            xmlOptions.push_back(option);

            VERBOSE(2,"xml translation = [" << range << "] " << targetLHSstr << " -> " << altTexts[i] << " prob: " << probValue << endl);
          }
          altTexts.clear();
          altProbs.clear();
        }
      }
    }
  }
  // we are done. check if there are tags that are still open
  if (tagStack.size() > 0) {
    TRACE_ERR("ERROR: some opened tags were never closed: " << line << endl);
    return false;
  }

  // return de-xml'ed sentence in line
  line = cleanLine;
  return true;
}
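
As a concrete, purely hypothetical illustration of the markup consumed here, an input line such as

das <np translation="the house||a house" prob="0.8||0.2">haus</np> ist klein

comes back through line as the clean sentence "das haus ist klein", while the tag produces two XmlOption entries ("the house" and "a house") over the span covering "haus", each scored with the floored log of its probability. A tag that carries only a label attribute (no translation) instead adds an entry to sourceLabels.
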
Example #14
//! populate this InputType with data from in stream
int ForestInput::
Read(std::istream &in,
     std::vector<FactorType> const& factorOrder,
     AllOptions const& opts)
{
  using Syntax::F2S::Forest;

  m_forest = boost::make_shared<Forest>();
  m_rootVertex = NULL;
  m_vertexSet.clear();

  std::string line;
  if (std::getline(in, line, '\n').eof()) {
    return 0;
  }

  // The first line contains the sentence number.  We ignore this and skip
  // straight to the second line, which contains the sentence string.
  std::string sentence;
  std::getline(in, sentence);

  // If the next line is blank then there was a parse failure.  Otherwise,
  // the next line and any subsequent non-blank lines contain hyperedges.
  std::getline(in, line);
  if (line == "") {
    // Parse failure.  We treat this as an empty sentence.
    sentence = "";
    // The next line will be blank too.
    std::getline(in, line);
  } else {
    do {
      ParseHyperedgeLine(line, factorOrder);
      std::getline(in, line);
    } while (line != "");
  }

  // Do base class Read().
  // TODO Check if this is actually necessary.  TreeInput does it, but I'm
  // not sure ForestInput needs to.
  std::stringstream strme;
  strme << "<s> " << sentence << " </s>" << std::endl;
  Sentence::Read(strme, factorOrder, opts);

  // Find the maximum end position of any vertex (0 if forest is empty).
  std::size_t maxEnd = FindMaxEnd(*m_forest);

  // Determine which vertices are the top vertices.
  std::vector<Forest::Vertex *> topVertices;
  if (!m_forest->vertices.empty()) {
    FindTopVertices(*m_forest, topVertices);
    assert(topVertices.size() >= 1);
  }

  // Add <s> vertex.
  Forest::Vertex *startSymbol = NULL;
  {
    Word symbol;
    symbol.CreateFromString(Input, factorOrder, "<s>", false);
    Syntax::PVertex pvertex(Range(0, 0), symbol);
    startSymbol = new Forest::Vertex(pvertex);
    m_forest->vertices.push_back(startSymbol);
  }

  // Add </s> vertex.
  Forest::Vertex *endSymbol = NULL;
  {
    Word symbol;
    symbol.CreateFromString(Input, factorOrder, "</s>", false);
    Syntax::PVertex pvertex(Range(maxEnd+1, maxEnd+1), symbol);
    endSymbol = new Forest::Vertex(pvertex);
    m_forest->vertices.push_back(endSymbol);
  }

  // Add root vertex.
  {
    Word symbol;
    symbol.CreateFromString(Input, factorOrder, "Q", true);
    Syntax::PVertex pvertex(Range(0, maxEnd+1), symbol);
    m_rootVertex = new Forest::Vertex(pvertex);
    m_forest->vertices.push_back(m_rootVertex);
  }

  // Add root's incoming hyperedges.
  if (topVertices.empty()) {
    Forest::Hyperedge *e = new Forest::Hyperedge();
    e->head = m_rootVertex;
    e->tail.push_back(startSymbol);
    e->tail.push_back(endSymbol);
    m_rootVertex->incoming.push_back(e);
  } else {
    // Add a hyperedge between [Q] and each top vertex.
    for (std::vector<Forest::Vertex *>::const_iterator
         p = topVertices.begin(); p != topVertices.end(); ++p) {
      Forest::Hyperedge *e = new Forest::Hyperedge();
      e->head = m_rootVertex;
      e->tail.push_back(startSymbol);
      e->tail.push_back(*p);
      e->tail.push_back(endSymbol);
      m_rootVertex->incoming.push_back(e);
    }
  }

  return 1;
}