void ChartRuleLookupManagerMemory::GetChartRuleCollection( const WordsRange &range, ChartTranslationOptionList &outColl) { size_t relEndPos = range.GetEndPos() - range.GetStartPos(); size_t absEndPos = range.GetEndPos(); // MAIN LOOP. create list of nodes of target phrases // get list of all rules that apply to spans at same starting position DottedRuleColl &dottedRuleCol = *m_dottedRuleColls[range.GetStartPos()]; const DottedRuleList &expandableDottedRuleList = dottedRuleCol.GetExpandableDottedRuleList(); const ChartCellLabel &sourceWordLabel = GetCellCollection().Get(WordsRange(absEndPos, absEndPos)).GetSourceWordLabel(); // loop through the rules // (note that expandableDottedRuleList can be expanded as the loop runs // through calls to ExtendPartialRuleApplication()) for (size_t ind = 0; ind < expandableDottedRuleList.size(); ++ind) { // rule we are about to extend const DottedRuleInMemory &prevDottedRule = *expandableDottedRuleList[ind]; // we will now try to extend it, starting after where it ended size_t startPos = prevDottedRule.IsRoot() ? range.GetStartPos() : prevDottedRule.GetWordsRange().GetEndPos() + 1; // search for terminal symbol // (if only one more word position needs to be covered) if (startPos == absEndPos) { // look up in rule dictionary, if the current rule can be extended // with the source word in the last position const Word &sourceWord = sourceWordLabel.GetLabel(); const PhraseDictionaryNodeSCFG *node = prevDottedRule.GetLastNode().GetChild(sourceWord); // if we found a new rule -> create it and add it to the list if (node != NULL) { // create the rule #ifdef USE_BOOST_POOL DottedRuleInMemory *dottedRule = m_dottedRulePool.malloc(); new (dottedRule) DottedRuleInMemory(*node, sourceWordLabel, prevDottedRule); #else DottedRuleInMemory *dottedRule = new DottedRuleInMemory(*node, sourceWordLabel, prevDottedRule); #endif dottedRuleCol.Add(relEndPos+1, dottedRule); } } // search for non-terminals size_t endPos, stackInd; // span is already complete covered? nothing can be done if (startPos > absEndPos) continue; else if (startPos == range.GetStartPos() && range.GetEndPos() > range.GetStartPos()) { // We're at the root of the prefix tree so won't try to cover the full // span (i.e. we don't allow non-lexical unary rules). However, we need // to match non-unary rules that begin with a non-terminal child, so we // do that in two steps: during this iteration we search for non-terminals // that cover all but the last source word in the span (there won't // already be running nodes for these because that would have required a // non-lexical unary rule match for an earlier span). Any matches will // result in running nodes being appended to the list and on subsequent // iterations (for this same span), we'll extend them to cover the final // word. endPos = absEndPos - 1; stackInd = relEndPos; } else { endPos = absEndPos; stackInd = relEndPos + 1; } ExtendPartialRuleApplication(prevDottedRule, startPos, endPos, stackInd, dottedRuleCol); } // list of rules that that cover the entire span DottedRuleList &rules = dottedRuleCol.Get(relEndPos + 1); // look up target sides for the rules DottedRuleList::const_iterator iterRule; for (iterRule = rules.begin(); iterRule != rules.end(); ++iterRule) { const DottedRuleInMemory &dottedRule = **iterRule; const PhraseDictionaryNodeSCFG &node = dottedRule.GetLastNode(); // look up target sides const TargetPhraseCollection *tpc = node.GetTargetPhraseCollection(); // add the fully expanded rule (with lexical target side) if (tpc != NULL) { AddCompletedRule(dottedRule, *tpc, range, outColl); } } dottedRuleCol.Clear(relEndPos+1); outColl.ShrinkToLimit(); }
void ChartRuleLookupManagerMemory::GetChartRuleCollection( const WordsRange &range, bool adhereTableLimit, ChartTranslationOptionList &outColl) { size_t relEndPos = range.GetEndPos() - range.GetStartPos(); size_t absEndPos = range.GetEndPos(); // MAIN LOOP. create list of nodes of target phrases ProcessedRuleColl &processedRuleCol = *m_processedRuleColls[range.GetStartPos()]; const ProcessedRuleList &runningNodes = processedRuleCol.GetRunningNodes(); // Note that runningNodes can be expanded as the loop runs (through calls to // ExtendPartialRuleApplication()). for (size_t ind = 0; ind < runningNodes.size(); ++ind) { const ProcessedRule &prevProcessedRule = *runningNodes[ind]; const PhraseDictionaryNodeSCFG &prevNode = prevProcessedRule.GetLastNode(); const WordConsumed *prevWordConsumed = prevProcessedRule.GetLastWordConsumed(); size_t startPos = (prevWordConsumed == NULL) ? range.GetStartPos() : prevWordConsumed->GetWordsRange().GetEndPos() + 1; // search for terminal symbol if (startPos == absEndPos) { const Word &sourceWord = GetSentence().GetWord(absEndPos); const PhraseDictionaryNodeSCFG *node = prevNode.GetChild(sourceWord); if (node != NULL) { WordConsumed *newWordConsumed = new WordConsumed(absEndPos, absEndPos , sourceWord , prevWordConsumed); ProcessedRule *processedRule = new ProcessedRule(*node, newWordConsumed); processedRuleCol.Add(relEndPos+1, processedRule); } } // search for non-terminals size_t endPos, stackInd; if (startPos > absEndPos) continue; else if (startPos == range.GetStartPos() && range.GetEndPos() > range.GetStartPos()) { // start. endPos = absEndPos - 1; stackInd = relEndPos; } else { endPos = absEndPos; stackInd = relEndPos + 1; } const NonTerminalSet &sourceNonTerms = GetSentence().GetLabelSet(startPos, endPos); const NonTerminalSet &targetNonTerms = GetCellCollection().GetHeadwords(WordsRange(startPos, endPos)); ExtendPartialRuleApplication(prevNode, prevWordConsumed, startPos, endPos, stackInd, sourceNonTerms, targetNonTerms, processedRuleCol); } // return list of target phrases ProcessedRuleList &nodes = processedRuleCol.Get(relEndPos + 1); size_t rulesLimit = StaticData::Instance().GetRuleLimit(); ProcessedRuleList::const_iterator iterNode; for (iterNode = nodes.begin(); iterNode != nodes.end(); ++iterNode) { const ProcessedRule &processedRule = **iterNode; const PhraseDictionaryNodeSCFG &node = processedRule.GetLastNode(); const WordConsumed *wordConsumed = processedRule.GetLastWordConsumed(); assert(wordConsumed); const TargetPhraseCollection *targetPhraseCollection = node.GetTargetPhraseCollection(); if (targetPhraseCollection != NULL) { outColl.Add(*targetPhraseCollection, *wordConsumed, adhereTableLimit, rulesLimit); } } outColl.CreateChartRules(rulesLimit); }
// Given a partial rule application ending at startPos-1 and given the sets of // source and target non-terminals covering the span [startPos, endPos], // determines the full or partial rule applications that can be produced through // extending the current rule application by a single non-terminal. void ChartRuleLookupManagerMemory::ExtendPartialRuleApplication( const DottedRuleInMemory &prevDottedRule, size_t startPos, size_t endPos, size_t stackInd, DottedRuleColl & dottedRuleColl) { // source non-terminal labels for the remainder const NonTerminalSet &sourceNonTerms = GetSentence().GetLabelSet(startPos, endPos); // target non-terminal labels for the remainder const ChartCellLabelSet &targetNonTerms = GetCellCollection().Get(WordsRange(startPos, endPos)).GetTargetLabelSet(); // note where it was found in the prefix tree of the rule dictionary const PhraseDictionaryNodeSCFG &node = prevDottedRule.GetLastNode(); const PhraseDictionaryNodeSCFG::NonTerminalMap & nonTermMap = node.GetNonTerminalMap(); const size_t numChildren = nonTermMap.size(); if (numChildren == 0) { return; } const size_t numSourceNonTerms = sourceNonTerms.size(); const size_t numTargetNonTerms = targetNonTerms.GetSize(); const size_t numCombinations = numSourceNonTerms * numTargetNonTerms; // We can search by either: // 1. Enumerating all possible source-target NT pairs that are valid for // the span and then searching for matching children in the node, // or // 2. Iterating over all the NT children in the node, searching // for each source and target NT in the span's sets. // We'll do whichever minimises the number of lookups: if (numCombinations <= numChildren*2) { // loop over possible source non-terminal labels (as found in input tree) NonTerminalSet::const_iterator p = sourceNonTerms.begin(); NonTerminalSet::const_iterator sEnd = sourceNonTerms.end(); for (; p != sEnd; ++p) { const Word & sourceNonTerm = *p; // loop over possible target non-terminal labels (as found in chart) ChartCellLabelSet::const_iterator q = targetNonTerms.begin(); ChartCellLabelSet::const_iterator tEnd = targetNonTerms.end(); for (; q != tEnd; ++q) { const ChartCellLabel &cellLabel = q->second; // try to match both source and target non-terminal const PhraseDictionaryNodeSCFG * child = node.GetChild(sourceNonTerm, cellLabel.GetLabel()); // nothing found? then we are done if (child == NULL) { continue; } // create new rule #ifdef USE_BOOST_POOL DottedRuleInMemory *rule = m_dottedRulePool.malloc(); new (rule) DottedRuleInMemory(*child, cellLabel, prevDottedRule); #else DottedRuleInMemory *rule = new DottedRuleInMemory(*child, cellLabel, prevDottedRule); #endif dottedRuleColl.Add(stackInd, rule); } } } else { // loop over possible expansions of the rule PhraseDictionaryNodeSCFG::NonTerminalMap::const_iterator p; PhraseDictionaryNodeSCFG::NonTerminalMap::const_iterator end = nonTermMap.end(); for (p = nonTermMap.begin(); p != end; ++p) { // does it match possible source and target non-terminals? const PhraseDictionaryNodeSCFG::NonTerminalMapKey &key = p->first; const Word &sourceNonTerm = key.first; if (sourceNonTerms.find(sourceNonTerm) == sourceNonTerms.end()) { continue; } const Word &targetNonTerm = key.second; const ChartCellLabel *cellLabel = targetNonTerms.Find(targetNonTerm); if (!cellLabel) { continue; } // create new rule const PhraseDictionaryNodeSCFG &child = p->second; #ifdef USE_BOOST_POOL DottedRuleInMemory *rule = m_dottedRulePool.malloc(); new (rule) DottedRuleInMemory(child, *cellLabel, prevDottedRule); #else DottedRuleInMemory *rule = new DottedRuleInMemory(child, *cellLabel, prevDottedRule); #endif dottedRuleColl.Add(stackInd, rule); } } }
void ChartRuleLookupManagerOnDisk::GetChartRuleCollection( const WordsRange &range, bool adhereTableLimit, ChartTranslationOptionList &outColl) { const StaticData &staticData = StaticData::Instance(); size_t rulesLimit = staticData.GetRuleLimit(); size_t relEndPos = range.GetEndPos() - range.GetStartPos(); size_t absEndPos = range.GetEndPos(); // MAIN LOOP. create list of nodes of target phrases DottedRuleStackOnDisk &expandableDottedRuleList = *m_expandableDottedRuleListVec[range.GetStartPos()]; // sort save nodes so only do nodes with most counts expandableDottedRuleList.SortSavedNodes(); const DottedRuleStackOnDisk::SavedNodeColl &savedNodeColl = expandableDottedRuleList.GetSavedNodeColl(); //cerr << "savedNodeColl=" << savedNodeColl.size() << " "; const ChartCellLabel &sourceWordLabel = GetCellCollection().Get(WordsRange(absEndPos, absEndPos)).GetSourceWordLabel(); for (size_t ind = 0; ind < (savedNodeColl.size()) ; ++ind) { const SavedNodeOnDisk &savedNode = *savedNodeColl[ind]; const DottedRuleOnDisk &prevDottedRule = savedNode.GetDottedRule(); const OnDiskPt::PhraseNode &prevNode = prevDottedRule.GetLastNode(); size_t startPos = prevDottedRule.IsRoot() ? range.GetStartPos() : prevDottedRule.GetWordsRange().GetEndPos() + 1; // search for terminal symbol if (startPos == absEndPos) { OnDiskPt::Word *sourceWordBerkeleyDb = m_dbWrapper.ConvertFromMoses(Input, m_inputFactorsVec, sourceWordLabel.GetLabel()); if (sourceWordBerkeleyDb != NULL) { const OnDiskPt::PhraseNode *node = prevNode.GetChild(*sourceWordBerkeleyDb, m_dbWrapper); if (node != NULL) { // TODO figure out why source word is needed from node, not from sentence // prob to do with factors or non-term //const Word &sourceWord = node->GetSourceWord(); DottedRuleOnDisk *dottedRule = new DottedRuleOnDisk(*node, sourceWordLabel, prevDottedRule); expandableDottedRuleList.Add(relEndPos+1, dottedRule); // cache for cleanup m_sourcePhraseNode.push_back(node); } delete sourceWordBerkeleyDb; } } // search for non-terminals size_t endPos, stackInd; if (startPos > absEndPos) continue; else if (startPos == range.GetStartPos() && range.GetEndPos() > range.GetStartPos()) { // start. endPos = absEndPos - 1; stackInd = relEndPos; } else { endPos = absEndPos; stackInd = relEndPos + 1; } // size_t nonTermNumWordsCovered = endPos - startPos + 1; // get target nonterminals in this span from chart const ChartCellLabelSet &chartNonTermSet = GetCellCollection().Get(WordsRange(startPos, endPos)).GetTargetLabelSet(); //const Word &defaultSourceNonTerm = staticData.GetInputDefaultNonTerminal() // ,&defaultTargetNonTerm = staticData.GetOutputDefaultNonTerminal(); // go through each SOURCE lhs const NonTerminalSet &sourceLHSSet = GetSentence().GetLabelSet(startPos, endPos); NonTerminalSet::const_iterator iterSourceLHS; for (iterSourceLHS = sourceLHSSet.begin(); iterSourceLHS != sourceLHSSet.end(); ++iterSourceLHS) { const Word &sourceLHS = *iterSourceLHS; OnDiskPt::Word *sourceLHSBerkeleyDb = m_dbWrapper.ConvertFromMoses(Input, m_inputFactorsVec, sourceLHS); if (sourceLHSBerkeleyDb == NULL) { delete sourceLHSBerkeleyDb; continue; // vocab not in pt. node definately won't be in there } const OnDiskPt::PhraseNode *sourceNode = prevNode.GetChild(*sourceLHSBerkeleyDb, m_dbWrapper); delete sourceLHSBerkeleyDb; if (sourceNode == NULL) continue; // didn't find source node // go through each TARGET lhs ChartCellLabelSet::const_iterator iterChartNonTerm; for (iterChartNonTerm = chartNonTermSet.begin(); iterChartNonTerm != chartNonTermSet.end(); ++iterChartNonTerm) { const ChartCellLabel &cellLabel = *iterChartNonTerm; //cerr << sourceLHS << " " << defaultSourceNonTerm << " " << chartNonTerm << " " << defaultTargetNonTerm << endl; //bool isSyntaxNonTerm = (sourceLHS != defaultSourceNonTerm) || (chartNonTerm != defaultTargetNonTerm); bool doSearch = true; //isSyntaxNonTerm ? nonTermNumWordsCovered <= maxSyntaxSpan : // nonTermNumWordsCovered <= maxDefaultSpan; if (doSearch) { OnDiskPt::Word *chartNonTermBerkeleyDb = m_dbWrapper.ConvertFromMoses(Output, m_outputFactorsVec, cellLabel.GetLabel()); if (chartNonTermBerkeleyDb == NULL) continue; const OnDiskPt::PhraseNode *node = sourceNode->GetChild(*chartNonTermBerkeleyDb, m_dbWrapper); delete chartNonTermBerkeleyDb; if (node == NULL) continue; // found matching entry //const Word &sourceWord = node->GetSourceWord(); DottedRuleOnDisk *dottedRule = new DottedRuleOnDisk(*node, cellLabel, prevDottedRule); expandableDottedRuleList.Add(stackInd, dottedRule); m_sourcePhraseNode.push_back(node); } } // for (iterChartNonTerm delete sourceNode; } // for (iterLabelListf // return list of target phrases DottedRuleCollOnDisk &nodes = expandableDottedRuleList.Get(relEndPos + 1); // source LHS DottedRuleCollOnDisk::const_iterator iterDottedRuleColl; for (iterDottedRuleColl = nodes.begin(); iterDottedRuleColl != nodes.end(); ++iterDottedRuleColl) { // node of last source word const DottedRuleOnDisk &prevDottedRule = **iterDottedRuleColl; if (prevDottedRule.Done()) continue; prevDottedRule.Done(true); const OnDiskPt::PhraseNode &prevNode = prevDottedRule.GetLastNode(); //get node for each source LHS const NonTerminalSet &lhsSet = GetSentence().GetLabelSet(range.GetStartPos(), range.GetEndPos()); NonTerminalSet::const_iterator iterLabelSet; for (iterLabelSet = lhsSet.begin(); iterLabelSet != lhsSet.end(); ++iterLabelSet) { const Word &sourceLHS = *iterLabelSet; OnDiskPt::Word *sourceLHSBerkeleyDb = m_dbWrapper.ConvertFromMoses(Input, m_inputFactorsVec, sourceLHS); if (sourceLHSBerkeleyDb == NULL) continue; const TargetPhraseCollection *targetPhraseCollection = NULL; const OnDiskPt::PhraseNode *node = prevNode.GetChild(*sourceLHSBerkeleyDb, m_dbWrapper); if (node) { UINT64 tpCollFilePos = node->GetValue(); std::map<UINT64, const TargetPhraseCollection*>::const_iterator iterCache = m_cache.find(tpCollFilePos); if (iterCache == m_cache.end()) { const OnDiskPt::TargetPhraseCollection *tpcollBerkeleyDb = node->GetTargetPhraseCollection(m_dictionary.GetTableLimit(), m_dbWrapper); targetPhraseCollection = tpcollBerkeleyDb->ConvertToMoses(m_inputFactorsVec ,m_outputFactorsVec ,m_dictionary ,m_weight ,m_wpProducer ,*m_languageModels ,m_filePath , m_dbWrapper.GetVocab()); delete tpcollBerkeleyDb; m_cache[tpCollFilePos] = targetPhraseCollection; } else { // just get out of cache targetPhraseCollection = iterCache->second; } assert(targetPhraseCollection); if (!targetPhraseCollection->IsEmpty()) { outColl.Add(*targetPhraseCollection, prevDottedRule, GetCellCollection(), adhereTableLimit, rulesLimit); } } // if (node) delete node; delete sourceLHSBerkeleyDb; } } } // for (size_t ind = 0; ind < savedNodeColl.size(); ++ind) outColl.CreateChartRules(rulesLimit); //cerr << numDerivations << " "; }