コード例 #1
0
/**
 * Rank-encodes one target phrase and writes it to `os`.
 *
 * One `unsigned` symbol is written (as raw bytes) per target token:
 *  - EncodeREncSymbol3(rank) when the chosen aligned source token is at the
 *    same position as the target token;
 *  - EncodeREncSymbol2(srcPos, rank) when the chosen aligned source token is
 *    at a different position;
 *  - EncodeREncSymbol1(targetId) when no aligned source token yields a rank
 *    (GetRank() returning m_lexicalTable.size() is treated as "no rank").
 * Among the source tokens aligned to a target token, the one with the lowest
 * rank is chosen; ties are broken by the smallest distance |srcPos - trgPos|.
 * After all tokens, the encoded phrase stop symbol is written. Every written
 * symbol is also counted in m_symbolCounter.
 *
 * @param s  source phrase tokens
 * @param t  target phrase tokens
 * @param a  alignment points (source position, target position); points that
 *           were consumed by a type 2/3 symbol are erased from this set
 * @param os output stream receiving the encoded symbols
 */
void PhraseTableCreator::EncodeTargetPhraseREnc(std::vector<std::string>& s,
    std::vector<std::string>& t,
    std::set<AlignPoint>& a,
    std::ostream& os)
{
  // Group the aligned source positions by target position. Points that refer
  // to positions outside either phrase are skipped instead of being used as
  // out-of-range indices below; they stay in `a` untouched.
  std::vector<std::vector<size_t> > a2(t.size());
  for(std::set<AlignPoint>::iterator it = a.begin(); it != a.end(); ++it) {
    const size_t srcPos = it->first;
    const size_t trgPos = it->second;
    if(srcPos < s.size() && trgPos < t.size())
      a2[trgPos].push_back(srcPos);
  }

  for(size_t i = 0; i < t.size(); ++i) {
    unsigned idxTarget = GetOrAddTargetSymbolId(t[i]);

    // s.size() / m_lexicalTable.size() act as "nothing found yet" sentinels.
    unsigned bestSrcPos = s.size();
    unsigned bestDiff = s.size();
    unsigned bestRank = m_lexicalTable.size();
    const unsigned badRank = m_lexicalTable.size();

    for(std::vector<size_t>::const_iterator it = a2[i].begin();
        it != a2[i].end(); ++it) {
      const size_t srcPos = *it;
      unsigned idxSource = GetSourceSymbolId(s[srcPos]);
      size_t r = GetRank(idxSource, idxTarget);
      if(r == badRank)
        continue;

      // Absolute distance computed without subtracting unsigned values in
      // the wrong order (abs() on a wrapped size_t is not well defined).
      const size_t diff = srcPos > i ? srcPos - i : i - srcPos;

      // Prefer the lower rank; on equal rank prefer the closer source token.
      if(r < bestRank || (r == bestRank && diff < bestDiff)) {
        bestRank = static_cast<unsigned>(r);
        bestSrcPos = static_cast<unsigned>(srcPos);
        bestDiff = static_cast<unsigned>(diff);
      }
    }

    unsigned encodedSymbol;
    if(bestRank != badRank && bestSrcPos < s.size()) {
      if(bestSrcPos == i)
        encodedSymbol = EncodeREncSymbol3(bestRank);
      else
        encodedSymbol = EncodeREncSymbol2(bestSrcPos, bestRank);
      // This alignment point is now implied by the symbol itself.
      a.erase(AlignPoint(bestSrcPos, i));
    } else {
      encodedSymbol = EncodeREncSymbol1(idxTarget);
    }

    os.write(reinterpret_cast<const char*>(&encodedSymbol), sizeof(encodedSymbol));
    m_symbolCounter.Increase(encodedSymbol);
  }

  // Terminate the phrase with the encoded stop symbol.
  unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol);
  unsigned encodedSymbol = EncodeREncSymbol1(stopSymbolId);
  os.write(reinterpret_cast<const char*>(&encodedSymbol), sizeof(encodedSymbol));
  m_symbolCounter.Increase(encodedSymbol);
}
コード例 #2
0
ファイル: PhraseDecoder.cpp プロジェクト: Avmb/mosesdecoder
/**
 * Decodes target phrases for `sourcePhrase` from `encodedBitStream` and
 * appends them to `tpv`.
 *
 * Decoding is a small state machine that runs while bits remain in the stream:
 *   New       -> append an empty TargetPhrase to tpv
 *   Symbol    -> read symbols and add words until the phrase stop symbol (0)
 *   Score     -> read m_numScoreComponent scores
 *   Alignment -> (only if m_containsAlignmentInfo) read alignment points until
 *                the alignment stop symbol (-1, -1)
 *   Add       -> finalise the phrase and start over, or stop
 * The `New` and `Add` checks are plain `if`s, so they run in the same loop
 * iteration in which the state was switched to them.
 *
 * @param tpv               collection the decoded phrases are appended to
 * @param encodedBitStream  bit stream to read symbols, scores and alignment from
 * @param sourcePhrase      source phrase the target phrases belong to
 * @param topLevel          when false, PREnc decoding stops once tpv holds
 *                          m_maxRank phrases (if m_maxRank is non-zero)
 * @return `tpv`, or a default-constructed TargetPhraseVectorPtr when one of the
 *         consistency checks fails.
 *         NOTE(review): on such a failure the partially built phrase has
 *         already been pushed into `tpv` and is not removed here.
 */
TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
  TargetPhraseVectorPtr tpv, BitWrapper<> &encodedBitStream,
  const Phrase &sourcePhrase, bool topLevel)
{
  
  // True when tpv already contained phrases on entry; in that case the result
  // is not cached at the end of this function.
  bool extending = tpv->size();
  size_t bitsLeft = encodedBitStream.TellFromEnd();
    
  typedef std::pair<size_t, size_t> AlignPointSizeT;
  
  // Source symbol ids of the source phrase; only filled (and used) for REnc.
  std::vector<int> sourceWords;
  if(m_coding == REnc)
  {
    for(size_t i = 0; i < sourcePhrase.GetSize(); i++)
    {
      std::string sourceWord
        = sourcePhrase.GetWord(i).GetString(*m_input, false);
      unsigned idx = GetSourceSymbolId(sourceWord);
      sourceWords.push_back(idx);
    }
  }
  
  // Symbol value that terminates the word sequence of a phrase.
  unsigned phraseStopSymbol = 0;
  // Alignment point value that terminates the alignment list of a phrase.
  AlignPoint alignStopSymbol(-1, -1);
  
  // Per-phrase accumulators, reset in state New.
  std::vector<float> scores;
  std::set<AlignPointSizeT> alignment;
  
  enum DecodeState { New, Symbol, Score, Alignment, Add } state = New;
  
  size_t srcSize = sourcePhrase.GetSize();
  
  TargetPhrase* targetPhrase = NULL;
  while(encodedBitStream.TellFromEnd())
  {
     
    if(state == New)
    {
      // Append a fresh TargetPhrase to tpv and keep a pointer to that element.
      // The pointer is re-taken after every push_back, so it always refers to
      // the current last element.
      tpv->push_back(TargetPhrase(Output));
      targetPhrase = &tpv->back();
      
      targetPhrase->SetSourcePhrase(sourcePhrase);
      alignment.clear();
      scores.clear();
        
      state = Symbol;
    }
    
    if(state == Symbol)
    {
      unsigned symbol = m_symbolTree->Read(encodedBitStream);      
      if(symbol == phraseStopSymbol)
      {
        // All words of this phrase have been read.
        state = Score;
      }
      else
      {
        if(m_coding == REnc)
        {
          // NOTE(review): if GetREncType() returns anything other than 1, 2
          // or 3, wordString stays empty and a word is still created from it
          // below - confirm that this cannot happen.
          std::string wordString;
          size_t type = GetREncType(symbol);
          
          if(type == 1)
          {
            // Type 1: the symbol carries the target symbol id directly.
            unsigned decodedSymbol = DecodeREncSymbol1(symbol);
            wordString = GetTargetSymbol(decodedSymbol);
          }
          else if (type == 2)
          {
            // Type 2: the symbol carries a source position and a rank; the
            // word is looked up via GetTranslation(source word id, rank).
            size_t rank = DecodeREncSymbol2Rank(symbol);
            size_t srcPos = DecodeREncSymbol2Position(symbol);
            
            // Source position outside the source phrase: give up.
            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();  
            
            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
            if(m_phraseDictionary.m_useAlignmentInfo)
            {
              // NOTE(review): an AlignPoint is inserted into a set of
              // AlignPointSizeT, relying on an implicit pair conversion -
              // confirm AlignPoint's component type can hold these positions.
              size_t trgPos = targetPhrase->GetSize();
              alignment.insert(AlignPoint(srcPos, trgPos));
            }
          }
          else if(type == 3)
          {
            // Type 3: the symbol carries only a rank; the source position is
            // the current target position (number of words added so far).
            size_t rank = DecodeREncSymbol3(symbol);
            size_t srcPos = targetPhrase->GetSize();
            
            // Source position outside the source phrase: give up.
            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();  
                            
            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));   
            if(m_phraseDictionary.m_useAlignmentInfo)
            {
              size_t trgPos = srcPos;
              alignment.insert(AlignPoint(srcPos, trgPos));
            }
          }
          
          Word word;
          word.CreateFromString(Output, *m_output, wordString, false);
          targetPhrase->AddWord(word);
        }
        else if(m_coding == PREnc)
        {
          // if the symbol is just a word
          if(GetPREncType(symbol) == 1)
          {
            unsigned decodedSymbol = DecodePREncSymbol1(symbol);
     
            Word word;
            word.CreateFromString(Output, *m_output,
                                  GetTargetSymbol(decodedSymbol), false);
            targetPhrase->AddWord(word);
          }
          // if the symbol is a subphrase pointer
          else
          {
            int left = DecodePREncSymbol2Left(symbol);
            int right = DecodePREncSymbol2Right(symbol);
            unsigned rank = DecodePREncSymbol2Rank(symbol);
            
            // Source span of the sub-phrase: `left` is taken relative to the
            // current target length, `right` is counted from the end of the
            // source phrase.
            int srcStart = left + targetPhrase->GetSize();
            int srcEnd   = srcSize - right - 1;
            
            // false positive consistency check
            if(0 > srcStart || srcStart > srcEnd || unsigned(srcEnd) >= srcSize)
              return TargetPhraseVectorPtr();
            
            // false positive consistency check
            if(m_maxRank && rank > m_maxRank)
                return TargetPhraseVectorPtr();
            
            // set subphrase by default to itself
            TargetPhraseVectorPtr subTpv = tpv;
            
            // if range smaller than source phrase retrieve subphrase
            if(unsigned(srcEnd - srcStart + 1) != srcSize)
            {
              Phrase subPhrase = sourcePhrase.GetSubString(WordsRange(srcStart, srcEnd));
              subTpv = CreateTargetPhraseCollection(subPhrase, false);
            }
            
            // false positive consistency check
            if(subTpv != NULL && rank < subTpv->size())
            {
              // insert the subphrase into the main target phrase
              TargetPhrase& subTp = subTpv->at(rank);
              if(m_phraseDictionary.m_useAlignmentInfo)
              {
                // reconstruct the alignment data based on the alignment of the subphrase
                for(AlignmentInfo::const_iterator it = subTp.GetAlignmentInfo().begin();
                    it != subTp.GetAlignmentInfo().end(); it++)
                {
                  // Shift the sub-phrase alignment by the span start (source
                  // side) and the current target length (target side).
                  alignment.insert(AlignPointSizeT(srcStart + it->first,
                                                   targetPhrase->GetSize() + it->second));
                }
              }
              targetPhrase->Append(subTp);
            }
            else 
              return TargetPhraseVectorPtr();
          }
        }
        else
        {
            // Any other coding: the symbol is used directly as target symbol id.
            Word word;
            word.CreateFromString(Output, *m_output,
                                  GetTargetSymbol(symbol), false);
            targetPhrase->AddWord(word);
        }
      }
    }
    else if(state == Score)
    {
      // One score is read per loop iteration. With multiple score trees the
      // tree is selected by the index of the score being read, otherwise
      // tree 0 is used for all scores.
      size_t idx = m_multipleScoreTrees ? scores.size() : 0;
      float score = m_scoreTrees[idx]->Read(encodedBitStream);
      scores.push_back(score);
      
      if(scores.size() == m_numScoreComponent)
      {
        targetPhrase->SetScore(m_feature, scores, ScoreComponentCollection() /*sparse*/,*m_weight, m_weightWP, *m_languageModels);
        
        if(m_containsAlignmentInfo)
          state = Alignment;
        else
          state = Add;
      }
    }
    else if(state == Alignment)
    {
      // One alignment point is read per loop iteration until the stop symbol.
      AlignPoint alignPoint = m_alignTree->Read(encodedBitStream);
      if(alignPoint == alignStopSymbol)
      {
        state = Add;
      }
      else
      {
        // The point is always consumed from the stream, but only stored when
        // alignment info is requested.
        if(m_phraseDictionary.m_useAlignmentInfo)  
          alignment.insert(AlignPointSizeT(alignPoint));
      }
    }
    
    if(state == Add)
    {
      if(m_phraseDictionary.m_useAlignmentInfo)
        targetPhrase->SetAlignmentInfo(alignment);
      
      if(m_coding == PREnc)
      {
        // Remember how many bits remain after the last phrase that is still
        // within m_maxRank (or after every phrase when m_maxRank is 0).
        if(!m_maxRank || tpv->size() <= m_maxRank)
          bitsLeft = encodedBitStream.TellFromEnd();
        
        // Nested (non top-level) decoding stops at m_maxRank phrases.
        if(!topLevel && m_maxRank && tpv->size() >= m_maxRank)
          break;
      }
      
      // 8 or fewer remaining bits are not decoded as another phrase
      // (presumably trailing padding - TODO confirm).
      if(encodedBitStream.TellFromEnd() <= 8)
        break;
      
      state = New;
    }    
  }
  
  // Cache the result for PREnc, but only when this call started from an
  // empty tpv. A remainder of 8 bits or fewer is stored as 0.
  if(m_coding == PREnc && !extending)
  {
    bitsLeft = bitsLeft > 8 ? bitsLeft : 0;
    m_decodingCache.Cache(sourcePhrase, tpv, bitsLeft, m_maxRank);
  }
  
  return tpv;
}
コード例 #3
0
/**
 * Loads the lexical translation table used for rank encoding.
 *
 * The file is read as whitespace-separated records "target source prob";
 * reading stops at the first record that cannot be parsed. Every source and
 * target word is registered via AddSourceSymbolId / AddTargetSymbolId. The
 * records are then sorted with SrcTrgProbSorter, source symbol ids are
 * re-assigned in lexicographical order of the source words, and the pairs
 * (source id, target id) are appended to m_lexicalTable. For each source id,
 * m_lexicalTableIndex stores the position of its first entry in
 * m_lexicalTable.
 *
 * If the file cannot be opened a warning is printed to std::cerr and the
 * function continues with an empty lexicon (no exception is raised).
 *
 * @param filePath path of the lexical table file
 */
void PhraseTableCreator::LoadLexicalTable(std::string filePath)
{
  std::vector<SrcTrgProb> t_lexTable;

  std::cerr << "Reading in lexical table for Rank Encoding" << std::endl;
  std::ifstream lexIn(filePath.c_str(), std::ifstream::in);
  std::string src, trg;
  float prob;

  // Reading in the translation probability lexicon

  std::cerr << "\tLoading from " << filePath << std::endl;
  // Previously an unreadable file was indistinguishable from an empty one.
  if(!lexIn)
    std::cerr << "\tWarning: could not open lexical table file "
              << filePath << std::endl;

  while(lexIn >> trg >> src >> prob) {
    t_lexTable.push_back(SrcTrgProb(SrcTrgString(src, trg), prob));
    AddSourceSymbolId(src);
    AddTargetSymbolId(trg);
  }

  // Sorting lexicon by source words by lexicographical order, corresponding
  // target words by decreasing probability (as implemented by
  // SrcTrgProbSorter).

  std::cerr << "\tSorting according to translation rank" << std::endl;
  std::sort(t_lexTable.begin(), t_lexTable.end(), SrcTrgProbSorter());

  // Re-assigning source word ids in lexicographical order.
  // NOTE(review): this assumes the ids currently stored in
  // m_sourceSymbolsMap are exactly 0 .. size-1 - confirm against
  // AddSourceSymbolId.

  std::vector<std::string> temp1;
  temp1.resize(m_sourceSymbolsMap.size());
  for(boost::unordered_map<std::string, unsigned>::iterator it
      = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); ++it)
    temp1[it->second] = it->first;

  std::sort(temp1.begin(), temp1.end());

  for(size_t i = 0; i < temp1.size(); i++)
    m_sourceSymbolsMap[temp1[i]] = i;

  // Building the lexicon based on source and target word ids

  std::string srcWord = "";
  size_t srcIdx = 0;
  for(std::vector<SrcTrgProb>::iterator it = t_lexTable.begin();
      it != t_lexTable.end(); ++it) {
    // If we encounter a new source word
    if(it->first.first != srcWord) {
      // Only copy the word when it actually changes.
      srcWord = it->first.first;
      srcIdx = GetSourceSymbolId(srcWord);

      // Store position of first translation
      if(srcIdx >= m_lexicalTableIndex.size())
        m_lexicalTableIndex.resize(srcIdx + 1);
      m_lexicalTableIndex[srcIdx] = m_lexicalTable.size();
    }

    // Store pair of source word and target word
    size_t trgIdx = GetTargetSymbolId(it->first.second);
    m_lexicalTable.push_back(SrcTrg(srcIdx, trgIdx));
  }
  std::cerr << "\tLoaded " << m_lexicalTable.size() << " lexical pairs" << std::endl;
  std::cerr << std::endl;
}