pair<float, float>
BilingualDynSuffixArray::
GetLexicalWeight(const PhrasePair& pp) const
{
  // sp,tp: sum of link probabilities
  // sc,tc: count of links
  int src_size = pp.GetSourceSize();
  int trg_size = pp.GetTargetSize();
  vector<float> sp(src_size, 0), tp(trg_size, 0);
  vector<int>   sc(src_size, 0), tc(trg_size, 0);
  wordID_t const* sw = &(m_srcCorpus->at(m_srcSntBreaks.at(pp.m_sntIndex)));
  wordID_t const* tw = &(m_trgCorpus->at(m_trgSntBreaks.at(pp.m_sntIndex)));
  vector<short> const& a = m_rawAlignments.at(pp.m_sntIndex);

  for (size_t i = 0; i < a.size(); i += 2) {
    int s = a[i], t = a.at(i+1), sx, tx;
    // sx, tx: local positions within phrase pair
    if (s < pp.m_startSource || t < pp.m_startTarget) continue;
    if ((sx = s - pp.m_startSource) >= src_size) continue;
    if ((tx = t - pp.m_startTarget) >= trg_size) continue;
    sp[sx] += m_wrd_cooc.pfwd(sw[s], tw[t]);
    tp[tx] += m_wrd_cooc.pbwd(sw[s], tw[t]);
    ++sc[sx];
    ++tc[tx];
#if 0
    cout << m_srcVocab->GetWord(sw[s]) << " -> "
         << m_trgVocab->GetWord(tw[t]) << " "
         << m_wrd_cooc.pfwd(sw[s], tw[t]) << " "
         << m_wrd_cooc.pbwd(sw[s], tw[t]) << " "
         << sp[sx] << " (" << sc[sx] << ") "
         << tp[tx] << " (" << tc[tx] << ") " << endl;
#endif
  }

  pair<float, float> ret(1, 1);
  wordID_t null_trg = m_trgVocab->GetkOOVWordID();
  wordID_t null_src = m_srcVocab->GetkOOVWordID();
  size_t soff = pp.m_startSource;
  for (size_t i = 0; i < sp.size(); ++i) {
    if (sc[i]) ret.first *= sp[i] / sc[i];
    else       ret.first *= m_wrd_cooc.pfwd(sw[soff + i], null_trg);
  }
  size_t toff = pp.m_startTarget;
  for (size_t i = 0; i < tp.size(); ++i) {
    if (tc[i]) ret.second *= tp[i] / tc[i];
    else       ret.second *= m_wrd_cooc.pbwd(null_src, tw[toff + i]);
  }
  return ret;
}
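// Note on GetLexicalWeight above: each phrase position contributes the average
// probability of its alignment links, unaligned positions back off to the
// NULL/OOV word, and the per-position values are multiplied together.
// Illustrative example with hypothetical numbers: if source position 0 has two
// links with pfwd 0.6 and 0.2, and source position 1 is unaligned, the forward
// weight is ((0.6 + 0.2) / 2) * pfwd(sw[soff+1], null_trg)
//         = 0.4 * pfwd(sw[soff+1], null_trg).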
SAPhrase
BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const
{
  // takes sentence indexes and looks up vocab IDs
  SAPhrase phraseIds(phrasepair.GetTargetSize());
  int sntIndex = phrasepair.m_sntIndex;
  int id(-1), pos(0);
  for (int i = phrasepair.m_startTarget; i <= phrasepair.m_endTarget; ++i) {
    // look up trg words
    id = m_trgCorpus->at(m_trgSntBreaks[sntIndex] + i);
    phraseIds.SetId(pos++, id);
  }
  return phraseIds;
}