/**
 * Build the sentence-specific rule collection for one input sentence.
 *
 * Steps performed:
 *  1. create a fresh temporary directory and write the inner words of the
 *     sentence (positions 1 .. size-2) to "<dir>/in";
 *  2. ask m_FuzzyMatchWrapper to extract a phrase table for this sentence;
 *     it returns the name of the file to load;
 *  3. parse that file line by line ("src ||| tgt ||| scores ||| align [||| ...]")
 *     and add every entry below m_collection[translationId];
 *  4. sort and prune the resulting target phrase collections.
 *
 * A malformed line (wrong number of fields or wrong number of scores) is
 * reported through UserMessage and the process is aborted.
 *
 * @param inputSentence sentence the rules are extracted for; its translation
 *                      id is used as the key into m_collection.
 * @throws util::Exception (via UTIL_THROW_IF2) if the temporary directory
 *         cannot be created.
 */
void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSentence)
{
  // mkdtemp() replaces the XXXXXX suffix in place, so dirName holds the real
  // directory name afterwards.
  char dirName[] = "/tmp/moses.XXXXXX";
  char *temp = mkdtemp(dirName);
  UTIL_THROW_IF2(temp == NULL,
                 "Couldn't create temporary directory " << dirName);

  string dirNameStr(dirName);
  string inFileName(dirNameStr + "/in");

  ofstream inFile(inFileName.c_str());

  // The first and last positions are skipped (presumably the sentence boundary
  // markers -- TODO confirm). The bound is written as "i + 1 < size" because
  // "size - 1" wraps around for an empty input when computed on size_t.
  for (size_t i = 1; i + 1 < inputSentence.GetSize(); ++i) {
    inFile << inputSentence.GetWord(i);
  }
  inFile << endl;
  inFile.close();

  long translationId = inputSentence.GetTranslationId();
  string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr);

  // populate with rules for this sentence
  PhraseDictionaryNodeMemory &rootNode = m_collection[translationId];

  // data from file
  InputFileStream inStream(ptFileName);

  // copied from class LoaderStandard
  PrintUserTime("Start loading fuzzy-match phrase model");

  const StaticData &staticData = StaticData::Instance();
  const std::string& factorDelimiter = staticData.GetFactorDelimiter();
  const size_t numScoreComponents = GetNumScoreComponents();

  string lineOrig;
  // count is the zero-based index of the current line. It advances in the
  // for-increment so that skipped entries (continue) are counted as well and
  // the positions quoted in diagnostics stay in sync with the file.
  for (size_t count = 0; getline(inStream, lineOrig); ++count) {
    vector<string> tokens;
    vector<float> scoreVector;

    TokenizeMultiCharSeparator(tokens, lineOrig, "|||");

    if (tokens.size() != 4 && tokens.size() != 5) {
      stringstream strme;
      strme << "Syntax error at " << ptFileName << ":" << count;
      UserMessage::Add(strme.str());
      abort();
    }

    const string &sourcePhraseString = tokens[0]
                                       , &targetPhraseString = tokens[1]
                                           , &scoreString = tokens[2]
                                               , &alignString = tokens[3];

    // NOTE(review): the check inspects the *source* field while the message
    // talks about an empty *target*; left as is, confirm which one is meant.
    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }

    Tokenize<float>(scoreVector, scoreString);
    if (scoreVector.size() != numScoreComponents) {
      stringstream strme;
      strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
            << numScoreComponents << ") of score components on line " << count;
      UserMessage::Add(strme.str());
      abort();
    }

    // parse source & find pt node

    // constituent labels, filled in by CreateFromString()
    Word *sourceLHS = NULL;
    Word *targetLHS = NULL;

    // source
    Phrase sourcePhrase( 0);
    sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, factorDelimiter, &sourceLHS);

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase();
    targetPhrase->CreateFromString(Output, m_output, targetPhraseString, factorDelimiter, &targetLHS);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString);
    targetPhrase->SetTargetLHS(targetLHS);
    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

    // component score, for n-best output
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);

    targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
    targetPhrase->Evaluate(sourcePhrase, GetFeaturesToApply());

    // the raw pointer is handed to the collection through Add()
    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, sourceLHS);
    phraseColl.Add(targetPhrase);
  }

  // sort and prune each target phrase collection
  SortAndPrune(rootNode);

  // The temporary directory is not removed here; the cleanup call is disabled.
  //removedirectoryrecursively(dirName);
}
void WordTranslationFeature::EvaluateWithSourceContext(const InputType &input , const InputPath &inputPath , const TargetPhrase &targetPhrase , const StackVec *stackVec , ScoreComponentCollection &scoreBreakdown , ScoreComponentCollection *estimatedScores) const { const Sentence& sentence = static_cast<const Sentence&>(input); const AlignmentInfo &alignment = targetPhrase.GetAlignTerm(); // process aligned words for (AlignmentInfo::const_iterator alignmentPoint = alignment.begin(); alignmentPoint != alignment.end(); alignmentPoint++) { const Phrase& sourcePhrase = inputPath.GetPhrase(); int sourceIndex = alignmentPoint->first; int targetIndex = alignmentPoint->second; Word ws = sourcePhrase.GetWord(sourceIndex); if (m_factorTypeSource == 0 && ws.IsNonTerminal()) continue; Word wt = targetPhrase.GetWord(targetIndex); if (m_factorTypeSource == 0 && wt.IsNonTerminal()) continue; StringPiece sourceWord = ws.GetFactor(m_factorTypeSource)->GetString(); StringPiece targetWord = wt.GetFactor(m_factorTypeTarget)->GetString(); if (m_ignorePunctuation) { // check if source or target are punctuation char firstChar = sourceWord[0]; CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); if(charIterator != m_punctuationHash.end()) continue; firstChar = targetWord[0]; charIterator = m_punctuationHash.find( firstChar ); if(charIterator != m_punctuationHash.end()) continue; } if (!m_unrestricted) { if (FindStringPiece(m_vocabSource, sourceWord) == m_vocabSource.end()) sourceWord = "OTHER"; if (FindStringPiece(m_vocabTarget, targetWord) == m_vocabTarget.end()) targetWord = "OTHER"; } if (m_simple) { // construct feature name util::StringStream featureName; featureName << m_description << "_"; featureName << sourceWord; featureName << "~"; featureName << targetWord; scoreBreakdown.SparsePlusEquals(featureName.str(), 1); } if (m_domainTrigger && !m_sourceContext) { const bool use_topicid = sentence.GetUseTopicId(); const bool use_topicid_prob = 
sentence.GetUseTopicIdAndProb(); if (use_topicid || use_topicid_prob) { if(use_topicid) { // use topicid as trigger const long topicid = sentence.GetTopicId(); util::StringStream feature; feature << m_description << "_"; if (topicid == -1) feature << "unk"; else feature << topicid; feature << "_"; feature << sourceWord; feature << "~"; feature << targetWord; scoreBreakdown.SparsePlusEquals(feature.str(), 1); } else { // use topic probabilities const vector<string> &topicid_prob = *(input.GetTopicIdAndProb()); if (atol(topicid_prob[0].c_str()) == -1) { util::StringStream feature; feature << m_description << "_unk_"; feature << sourceWord; feature << "~"; feature << targetWord; scoreBreakdown.SparsePlusEquals(feature.str(), 1); } else { for (size_t i=0; i+1 < topicid_prob.size(); i+=2) { util::StringStream feature; feature << m_description << "_"; feature << topicid_prob[i]; feature << "_"; feature << sourceWord; feature << "~"; feature << targetWord; scoreBreakdown.SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str())); } } } } else { // range over domain trigger words (keywords) const long docid = input.GetDocumentId(); for (boost::unordered_set<std::string>::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) { string sourceTrigger = *p; util::StringStream feature; feature << m_description << "_"; feature << sourceTrigger; feature << "_"; feature << sourceWord; feature << "~"; feature << targetWord; scoreBreakdown.SparsePlusEquals(feature.str(), 1); } } } if (m_sourceContext) { size_t globalSourceIndex = inputPath.GetWordsRange().GetStartPos() + sourceIndex; if (!m_domainTrigger && globalSourceIndex == 0) { // add <s> trigger feature for source util::StringStream feature; feature << m_description << "_"; feature << "<s>,"; feature << sourceWord; feature << "~"; feature << targetWord; scoreBreakdown.SparsePlusEquals(feature.str(), 1); } // range over source words to get context for(size_t contextIndex = 0; contextIndex 
< input.GetSize(); contextIndex++ ) { if (contextIndex == globalSourceIndex) continue; StringPiece sourceTrigger = input.GetWord(contextIndex).GetFactor(m_factorTypeSource)->GetString(); if (m_ignorePunctuation) { // check if trigger is punctuation char firstChar = sourceTrigger[0]; CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); if(charIterator != m_punctuationHash.end()) continue; } const long docid = input.GetDocumentId(); bool sourceTriggerExists = false; if (m_domainTrigger) sourceTriggerExists = FindStringPiece(m_vocabDomain[docid], sourceTrigger ) != m_vocabDomain[docid].end(); else if (!m_unrestricted) sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end(); if (m_domainTrigger) { if (sourceTriggerExists) { util::StringStream feature; feature << m_description << "_"; feature << sourceTrigger; feature << "_"; feature << sourceWord; feature << "~"; feature << targetWord; scoreBreakdown.SparsePlusEquals(feature.str(), 1); } } else if (m_unrestricted || sourceTriggerExists) { util::StringStream feature; feature << m_description << "_"; if (contextIndex < globalSourceIndex) { feature << sourceTrigger; feature << ","; feature << sourceWord; } else { feature << sourceWord; feature << ","; feature << sourceTrigger; } feature << "~"; feature << targetWord; scoreBreakdown.SparsePlusEquals(feature.str(), 1); } } } if (m_targetContext) { throw runtime_error("Can't use target words outside current translation option in a stateless feature"); /* size_t globalTargetIndex = cur_hypo.GetCurrTargetWordsRange().GetStartPos() + targetIndex; if (globalTargetIndex == 0) { // add <s> trigger feature for source stringstream feature; feature << "wt_"; feature << sourceWord; feature << "~"; feature << "<s>,"; feature << targetWord; accumulator->SparsePlusEquals(feature.str(), 1); } // range over target words (up to current position) to get context for(size_t contextIndex = 0; contextIndex < globalTargetIndex; 
contextIndex++ ) { string targetTrigger = cur_hypo.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString(); if (m_ignorePunctuation) { // check if trigger is punctuation char firstChar = targetTrigger.at(0); CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); if(charIterator != m_punctuationHash.end()) continue; } bool targetTriggerExists = false; if (!m_unrestricted) targetTriggerExists = m_vocabTarget.find( targetTrigger ) != m_vocabTarget.end(); if (m_unrestricted || targetTriggerExists) { stringstream feature; feature << "wt_"; feature << sourceWord; feature << "~"; feature << targetTrigger; feature << ","; feature << targetWord; accumulator->SparsePlusEquals(feature.str(), 1); } }*/ } } }