pair<float, float>
BilingualDynSuffixArray::
GetLexicalWeight(const PhrasePair& pp) const
{
  // sp,tp: sum of link probabilities
  // sc,tc: count of links
  int src_size = pp.GetSourceSize();
  int trg_size = pp.GetTargetSize();
  vector<float> sp(src_size, 0), tp(trg_size, 0);
  vector<int>   sc(src_size, 0), tc(trg_size, 0);
  wordID_t const* sw = &(m_srcCorpus->at(m_srcSntBreaks.at(pp.m_sntIndex)));
  wordID_t const* tw = &(m_trgCorpus->at(m_trgSntBreaks.at(pp.m_sntIndex)));
  vector<short> const& a = m_rawAlignments.at(pp.m_sntIndex);

  for (size_t i = 0; i < a.size(); i += 2) {
    int s = a[i], t = a.at(i+1), sx, tx;
    // sx, tx: local positions within phrase pair
    if (s < pp.m_startSource || t < pp.m_startTarget) continue;
    if ((sx = s - pp.m_startSource) >= src_size) continue;
    if ((tx = t - pp.m_startTarget) >= trg_size) continue;
    sp[sx] += m_wrd_cooc.pfwd(sw[s], tw[t]);
    tp[tx] += m_wrd_cooc.pbwd(sw[s], tw[t]);
    ++sc[sx];
    ++tc[tx];
#if 0
    cout << m_srcVocab->GetWord(sw[s]) << " -> "
         << m_trgVocab->GetWord(tw[t]) << " "
         << m_wrd_cooc.pfwd(sw[s], tw[t]) << " "
         << m_wrd_cooc.pbwd(sw[s], tw[t]) << " "
         << sp[sx] << " (" << sc[sx] << ") "
         << tp[tx] << " (" << tc[tx] << ") " << endl;
#endif
  }

  pair<float, float> ret(1, 1);
  wordID_t null_trg = m_trgVocab->GetkOOVWordID();
  wordID_t null_src = m_srcVocab->GetkOOVWordID();
  size_t soff = pp.m_startSource;
  for (size_t i = 0; i < sp.size(); ++i) {
    if (sc[i]) ret.first *= sp[i] / sc[i];
    else       ret.first *= m_wrd_cooc.pfwd(sw[soff + i], null_trg);
  }
  size_t toff = pp.m_startTarget;
  for (size_t i = 0; i < tp.size(); ++i) {
    if (tc[i]) ret.second *= tp[i] / tc[i];
    else       ret.second *= m_wrd_cooc.pbwd(null_src, tw[toff + i]);
  }
  return ret;
}
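// Note on GetLexicalWeight above: each phrase position contributes the average
// probability of its alignment links, unaligned positions back off to the
// NULL/OOV word, and the per-position values are multiplied together.
// Illustrative example with hypothetical numbers: if source position 0 has two
// links with pfwd 0.6 and 0.2, and source position 1 is unaligned, the forward
// weight is ((0.6 + 0.2) / 2) * pfwd(sw[soff+1], null_trg)
//         = 0.4 * pfwd(sw[soff+1], null_trg).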
SAPhrase
BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const
{
  // takes sentence indexes and looks up vocab IDs
  SAPhrase phraseIds(phrasepair.GetTargetSize());
  int sntIndex = phrasepair.m_sntIndex;
  int id(-1), pos(0);
  for (int i = phrasepair.m_startTarget; i <= phrasepair.m_endTarget; ++i) {
    // look up trg words
    id = m_trgCorpus->at(m_trgSntBreaks[sntIndex] + i);
    phraseIds.SetId(pos++, id);
  }
  return phraseIds;
}