//Called when we start translating a new sentence void PhraseDictionaryFeature::InitDictionary(const TranslationSystem* system, const InputType& source) { PhraseDictionary* dict; if (m_useThreadSafePhraseDictionary) { //thread safe dictionary should already be loaded dict = m_threadSafePhraseDictionary.get(); } else { //thread-unsafe dictionary may need to be loaded if this is a new thread. if (!m_threadUnsafePhraseDictionary.get()) { m_threadUnsafePhraseDictionary.reset(LoadPhraseTable(system)); } dict = m_threadUnsafePhraseDictionary.get(); } CHECK(dict); dict->InitializeForInput(source); }
/**
 * Look up a phrase dictionary by its score producer description.
 *
 * \param ptName description string to match against GetScoreProducerDescription()
 * \return the first dictionary in PhraseDictionary::GetColl() whose description
 *         equals ptName, or NULL if none matches
 */
PhraseDictionary *FindPhraseDictionary(const string &ptName)
{
  typedef std::vector<PhraseDictionary*> PtColl;
  const PtColl &allTables = PhraseDictionary::GetColl();

  for (PtColl::const_iterator it = allTables.begin(); it != allTables.end(); ++it) {
    if ((*it)->GetScoreProducerDescription() == ptName) {
      return *it; // first match wins
    }
  }
  return NULL;
}
/**
 * Set up a chart parser for one input.
 *
 * Initializes the static data for the input, creates the input paths and
 * builds one rule lookup manager per phrase dictionary. The i-th dictionary
 * is paired with the i-th decode graph, whose maximum chart span is handed
 * to the lookup manager.
 *
 * \param source the input to parse
 * \param cells chart cells passed on to every rule lookup manager
 */
ChartParser::ChartParser(InputType const &source, ChartCellCollectionBase &cells)
  : m_decodeGraphList(StaticData::Instance().GetDecodeGraphs())
  , m_source(source)
{
  const StaticData &config = StaticData::Instance();
  config.InitializeForInput(source);

  CreateInputPaths(m_source);

  const std::vector<PhraseDictionary*> &phraseTables = PhraseDictionary::GetColl();
  // dictionaries and decode graphs are indexed in parallel below
  assert(phraseTables.size() == m_decodeGraphList.size());

  m_ruleLookupManagers.reserve(phraseTables.size());
  for (std::size_t idx = 0; idx < phraseTables.size(); ++idx) {
    PhraseDictionary *table = phraseTables[idx];
    std::size_t spanLimit = m_decodeGraphList[idx]->GetMaxChartSpan();
    ChartRuleLookupManager *manager = table->CreateRuleLookupManager(*this, cells, spanLimit);
    m_ruleLookupManagers.push_back(manager);
  }
}
void TranslationSystem::CleanUpAfterSentenceProcessing(const InputType& source) const { for(size_t i=0;i<m_phraseDictionaries.size();++i) { PhraseDictionaryFeature &phraseDictionaryFeature = *m_phraseDictionaries[i]; PhraseDictionary* phraseDictionary = const_cast<PhraseDictionary*>(phraseDictionaryFeature.GetDictionary()); phraseDictionary->CleanUp(source); } for(size_t i=0;i<m_generationDictionaries.size();++i) m_generationDictionaries[i]->CleanUp(source); //something LMs could do after each sentence LMList::const_iterator iterLM; for (iterLM = m_languageModels.begin() ; iterLM != m_languageModels.end() ; ++iterLM) { LanguageModel &languageModel = **iterLM; languageModel.CleanUpAfterSentenceProcessing(source); } }
/**
 * Set up a chart parser for one input.
 *
 * Initializes the static data for the input, creates the input paths and
 * builds one rule lookup manager per phrase dictionary in
 * PhraseDictionary::GetColl().
 *
 * \param source the input to parse
 * \param cells chart cells passed on to every rule lookup manager
 */
ChartParser::ChartParser(InputType const &source, ChartCellCollectionBase &cells)
  : m_decodeGraphList(StaticData::Instance().GetDecodeGraphs())
  , m_source(source)
{
  const StaticData &config = StaticData::Instance();
  config.InitializeForInput(source);

  CreateInputPaths(m_source);

  const std::vector<PhraseDictionary*> &phraseTables = PhraseDictionary::GetColl();
  m_ruleLookupManagers.reserve(phraseTables.size());

  std::vector<PhraseDictionary*>::const_iterator tableIt = phraseTables.begin();
  while (tableIt != phraseTables.end()) {
    PhraseDictionary *table = *tableIt;
    ChartRuleLookupManager *manager = table->CreateRuleLookupManager(*this, cells);
    m_ruleLookupManagers.push_back(manager);
    ++tableIt;
  }
}
/** * Process a sentence with xml annotation * Xml tags may specifiy additional/replacing translation options * and reordering constraints * * \param line in: sentence, out: sentence without the xml * \param res vector with translation options specified by xml * \param reorderingConstraint reordering constraint zones specified by xml * \param walls reordering constraint walls specified by xml * \param lbrackStr xml tag's left bracket string, typically "<" * \param rbrackStr xml tag's right bracket string, typically ">" */ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingConstraint &reorderingConstraint, vector< size_t > &walls, std::vector< std::pair<size_t, std::string> > &placeholders, int offset, const std::string& lbrackStr, const std::string& rbrackStr) { //parse XML markup in translation line const StaticData &staticData = StaticData::Instance(); // hack. What pt should XML trans opt be assigned to? PhraseDictionary *firstPt = NULL; if (PhraseDictionary::GetColl().size() == 0) { firstPt = PhraseDictionary::GetColl()[0]; } // no xml tag? we're done. //if (line.find_first_of('<') == string::npos) { if (line.find(lbrackStr) == string::npos) { return true; } // break up input into a vector of xml tags and text // example: (this), (<b>), (is a), (</b>), (test .) 
vector<string> xmlTokens = TokenizeXml(line, lbrackStr, rbrackStr); // we need to store opened tags, until they are closed // tags are stored as tripled (tagname, startpos, contents) typedef pair< string, pair< size_t, string > > OpenedTag; vector< OpenedTag > tagStack; // stack that contains active opened tags string cleanLine; // return string (text without xml) size_t wordPos = 0; // position in sentence (in terms of number of words) const vector<FactorType> &outputFactorOrder = staticData.GetOutputFactorOrder(); // const string &factorDelimiter = staticData.GetFactorDelimiter(); // loop through the tokens for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) { // not a xml tag, but regular text (may contain many words) if(!isXmlTag(xmlTokens[xmlTokenPos], lbrackStr, rbrackStr)) { // add a space at boundary, if necessary if (cleanLine.size()>0 && cleanLine[cleanLine.size() - 1] != ' ' && xmlTokens[xmlTokenPos][0] != ' ') { cleanLine += " "; } cleanLine += xmlTokens[xmlTokenPos]; // add to output wordPos = Tokenize(cleanLine).size(); // count all the words } // process xml tag else { // *** get essential information about tag *** // strip extra boundary spaces and "<" and ">" string tag = Trim(TrimXml(xmlTokens[xmlTokenPos], lbrackStr, rbrackStr)); VERBOSE(3,"XML TAG IS: " << tag << std::endl); if (tag.size() == 0) { TRACE_ERR("ERROR: empty tag name: " << line << endl); return false; } // check if unary (e.g., "<wall/>") bool isUnary = ( tag[tag.size() - 1] == '/' ); // check if opening tag (e.g. 
"<a>", not "</a>")g bool isClosed = ( tag[0] == '/' ); bool isOpen = !isClosed; if (isClosed && isUnary) { TRACE_ERR("ERROR: can't have both closed and unary tag " << lbrackStr << tag << rbrackStr << ": " << line << endl); return false; } if (isClosed) tag = tag.substr(1); // remove "/" at the beginning if (isUnary) tag = tag.substr(0,tag.size()-1); // remove "/" at the end // find the tag name and contents string::size_type endOfName = tag.find_first_of(' '); string tagName = tag; string tagContent = ""; if (endOfName != string::npos) { tagName = tag.substr(0,endOfName); tagContent = tag.substr(endOfName+1); } // *** process new tag *** if (isOpen || isUnary) { // put the tag on the tag stack OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) ); tagStack.push_back( openedTag ); VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl); } // *** process completed tag *** if (isClosed || isUnary) { // pop last opened tag from stack; if (tagStack.size() == 0) { TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl); return false; } OpenedTag openedTag = tagStack.back(); tagStack.pop_back(); // tag names have to match if (openedTag.first != tagName) { TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl ); return false; } // assemble remaining information about tag size_t startPos = openedTag.second.first; string tagContent = openedTag.second.second; size_t endPos = wordPos; // span attribute overwrites position string span = ParseXmlTagAttribute(tagContent,"span"); if (! 
span.empty()) { vector<string> ij = Tokenize(span, "-"); if (ij.size() != 1 && ij.size() != 2) { TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl); return false; } startPos = atoi(ij[0].c_str()); if (ij.size() == 1) endPos = startPos + 1; else endPos = atoi(ij[1].c_str()) + 1; } VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl); // special tag: wall if (tagName == "wall") { size_t start = (startPos == 0) ? 0 : startPos-1; for(size_t pos = start; pos < endPos; pos++) walls.push_back( pos ); } // special tag: zone else if (tagName == "zone") { if (startPos >= endPos) { TRACE_ERR("ERROR: zone must span at least one word: " << line << endl); return false; } reorderingConstraint.SetZone( startPos, endPos-1 ); } // name-entity placeholder else if (tagName == "ne") { if (startPos != (endPos - 1)) { TRACE_ERR("ERROR: Placeholder must only span 1 word: " << line << endl); return false; } string entity = ParseXmlTagAttribute(tagContent,"entity"); placeholders.push_back(std::pair<size_t, std::string>(startPos, entity)); } // update: add new aligned sentence pair to Mmsapt identified by name else if (tagName == "update") { #if PT_UG // get model name and aligned sentence pair string pdName = ParseXmlTagAttribute(tagContent,"name"); string source = ParseXmlTagAttribute(tagContent,"source"); string target = ParseXmlTagAttribute(tagContent,"target"); string alignment = ParseXmlTagAttribute(tagContent,"alignment"); // find PhraseDictionary by name const vector<PhraseDictionary*> &pds = PhraseDictionary::GetColl(); PhraseDictionary* pd = NULL; for (vector<PhraseDictionary*>::const_iterator i = pds.begin(); i != pds.end(); ++i) { PhraseDictionary* curPd = *i; if (curPd->GetScoreProducerDescription() == pdName) { pd = curPd; break; } } if (pd == NULL) { TRACE_ERR("ERROR: No PhraseDictionary with name " << pdName << ", no update" << endl); 
return false; } // update model VERBOSE(3,"Updating " << pdName << " ||| " << source << " ||| " << target << " ||| " << alignment << endl); Mmsapt* pdsa = reinterpret_cast<Mmsapt*>(pd); pdsa->add(source, target, alignment); #else TRACE_ERR("ERROR: recompile with --with-mm to update PhraseDictionary at runtime" << endl); return false; #endif } // weight-overwrite: update feature weights, unspecified weights remain unchanged // IMPORTANT: translation models that cache phrases or apply table-limit during load // based on initial weights need to be reset. Sending an empty update will do this // for PhraseDictionaryBitextSampling (Mmsapt) models: // <update name="TranslationModelName" source=" " target=" " alignment=" " /> else if (tagName == "weight-overwrite") { // is a name->ff map stored anywhere so we don't have to build it every time? const vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions(); boost::unordered_map<string, FeatureFunction*> map; BOOST_FOREACH(FeatureFunction* const& ff, ffs) { map[ff->GetScoreProducerDescription()] = ff; } // update each weight listed ScoreComponentCollection allWeights = StaticData::Instance().GetAllWeights(); boost::unordered_map<string, FeatureFunction*>::iterator ffi; string ffName(""); vector<float> ffWeights; vector<string> toks = Tokenize(ParseXmlTagAttribute(tagContent,"weights")); BOOST_FOREACH(string const& tok, toks) { if (tok.substr(tok.size() - 1, 1) == "=") { // start new feature if (ffName != "") { // set previous feature weights if (ffi != map.end()) { allWeights.Assign(ffi->second, ffWeights); } ffWeights.clear(); } ffName = tok.substr(0, tok.size() - 1); ffi = map.find(ffName); if (ffi == map.end()) { TRACE_ERR("ERROR: No FeatureFunction with name " << ffName << ", no weight update" << endl); } } else { // weight for current feature ffWeights.push_back(Scan<float>(tok)); } } if (ffi != map.end()) { allWeights.Assign(ffi->second, ffWeights); } 
StaticData::InstanceNonConst().SetAllWeights(allWeights); } // default: opening tag that specifies translation options else { if (startPos > endPos) { TRACE_ERR("ERROR: tag " << tagName << " startPos > endPos: " << line << endl); return false; } else if (startPos == endPos) { TRACE_ERR("WARNING: tag " << tagName << " 0 span: " << line << endl); continue; } // specified translations -> vector of phrases // multiple translations may be specified, separated by "||" vector<string> altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"translation"), "||"); if( altTexts.size() == 1 && altTexts[0] == "" ) altTexts.pop_back(); // happens when nothing specified // deal with legacy annotations: "translation" was called "english" vector<string> moreAltTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"english"), "||"); if (moreAltTexts.size()>1 || moreAltTexts[0] != "") { for(vector<string>::iterator translation=moreAltTexts.begin(); translation != moreAltTexts.end(); translation++) { string t = *translation; altTexts.push_back( t ); } } // specified probabilities for the translations -> vector of probs vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||"); if( altProbs.size() == 1 && altProbs[0] == "" ) altProbs.pop_back(); // happens when nothing specified // report what we have processed so far VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl); VERBOSE(3,"XML TAG TRANSLATION IS: '" << altTexts[0] << "'" << endl); VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl); VERBOSE(3,"XML TAG SPAN IS: " << startPos << "-" << (endPos-1) << endl); if (altProbs.size() > 0 && altTexts.size() != altProbs.size()) { TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl); return false; } // store translation options into members if (staticData.GetXmlInputType() != XmlIgnore) { // only store options if we aren't ignoring them for (size_t i=0; 
i<altTexts.size(); ++i) { Phrase sourcePhrase; // TODO don't know what the source phrase is // set default probability float probValue = 1; if (altProbs.size() > 0) probValue = Scan<float>(altProbs[i]); // convert from prob to log-prob float scoreValue = FloorScore(TransformScore(probValue)); WordsRange range(startPos + offset,endPos-1 + offset); // span covered by phrase TargetPhrase targetPhrase(firstPt); // targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i],factorDelimiter, NULL); targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i], NULL); // lhs const UnknownLHSList &lhsList = staticData.GetUnknownLHS(); if (!lhsList.empty()) { const Factor *factor = FactorCollection::Instance().AddFactor(lhsList[0].first, true); Word *targetLHS = new Word(true); targetLHS->SetFactor(0, factor); // TODO - other factors too? targetPhrase.SetTargetLHS(targetLHS); } targetPhrase.SetXMLScore(scoreValue); targetPhrase.EvaluateInIsolation(sourcePhrase); XmlOption *option = new XmlOption(range,targetPhrase); assert(option); res.push_back(option); } altTexts.clear(); altProbs.clear(); } } } } }
int main(int argc, char** argv) { bool help; string input_file; string config_file; po::options_description desc("Allowed options"); desc.add_options() ("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit") ("input-file,i", po::value<string>(&input_file), "Input file") ("config-file,f", po::value<string>(&config_file), "Config file") ; po::options_description cmdline_options; cmdline_options.add(desc); po::variables_map vm; po::parsed_options parsed = po::command_line_parser(argc,argv). options(cmdline_options).allow_unregistered().run(); po::store(parsed, vm); po::notify(vm); if (help) { usage(desc, argv); exit(0); } if (input_file.empty()) { cerr << "ERROR: Please specify an input file" << endl << endl; usage(desc, argv); exit(1); } if (config_file.empty()) { cerr << "ERROR: Please specify a config file" << endl << endl; usage(desc, argv); exit(1); } vector<string> mosesargs; mosesargs.push_back(argv[0]); mosesargs.push_back("-f"); mosesargs.push_back(config_file); for (size_t i = 0; i < parsed.options.size(); ++i) { if (parsed.options[i].position_key == -1 && !parsed.options[i].unregistered) continue; /* const string& key = parsed.options[i].string_key; if (!key.empty()) { mosesargs.push_back(key); } for (size_t j = 0; j < parsed.options[i].value.size(); ++j) { const string& value = parsed.options[i].value[j]; if (!value.empty()) { mosesargs.push_back(value); } }*/ for (size_t j = 0; j < parsed.options[i].original_tokens.size(); ++j) { mosesargs.push_back(parsed.options[i].original_tokens[j]); } } boost::scoped_ptr<Parameter> params(new Parameter()); char** mosesargv = new char*[mosesargs.size()]; for (size_t i = 0; i < mosesargs.size(); ++i) { mosesargv[i] = new char[mosesargs[i].length() + 1]; strcpy(mosesargv[i], mosesargs[i].c_str()); } if (!params->LoadParam(mosesargs.size(), mosesargv)) { params->Explain(); exit(1); } if (!StaticData::LoadDataStatic(params.get(),argv[0])) { exit(1); } const StaticData 
&staticData = StaticData::Instance(); const std::vector<FactorType> & input = staticData.GetInputFactorOrder(); //Find the phrase table to evaluate with PhraseDictionary* phraseTable = NULL; const vector<FeatureFunction*>& ffs = FeatureFunction::GetFeatureFunctions(); for (size_t i = 0; i < ffs.size(); ++i) { PhraseDictionary* maybePhraseTable = dynamic_cast< PhraseDictionary*>(ffs[i]); if (maybePhraseTable) { UTIL_THROW_IF(phraseTable,util::Exception,"Can only score translations with one phrase table"); phraseTable = maybePhraseTable; } } UTIL_THROW_IF(!phraseTable,util::Exception,"Unable to find scoring phrase table"); Sentence sentence; phraseTable->InitializeForInput(sentence); // //Load and prune the phrase table. This is taken (with mods) from moses/TranslationModel/RuleTable/LoaderStandard.cpp // string lineOrig; std::ostream *progress = NULL; IFVERBOSE(1) progress = &std::cerr; util::FilePiece in(input_file.c_str(), progress); // reused variables vector<float> scoreVector; StringPiece line; double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan"); StringPiece previous; while(true) { try { line = in.ReadLine(); } catch (const util::EndOfFileException &e) { break; } util::TokenIter<util::MultiCharacter> pipes(line, "|||"); StringPiece sourcePhraseString(*pipes); if (sourcePhraseString != previous) { outputTopN(previous, phraseTable, input, cout); previous = sourcePhraseString; } } outputTopN(previous, phraseTable, input, cout); return 0; }