void CacheBasedLanguageModel::Load(const std::string file) { //file format //age || n-gram //age || n-gram || n-gram || n-gram || ... //.... //each n-gram is a sequence of n words (no matter of n) // //there is no limit on the size of n // //entries can be repeated, but the last entry overwrites the previous VERBOSE(2,"Loading data from the cache file " << file << std::endl); InputFileStream cacheFile(file); std::string line; int age; std::vector<std::string> words; while (getline(cacheFile, line)) { std::vector<std::string> vecStr = TokenizeMultiCharSeparator( line , "||" ); if (vecStr.size() >= 2) { age = Scan<int>(vecStr[0]); vecStr.erase(vecStr.begin()); Update(vecStr,age); } else { TRACE_ERR("ERROR: The format of the loaded file is wrong: " << line << std::endl); CHECK(false); } } IFVERBOSE(2) Print(); }
void ReformatHieroRule(const string &lineOrig, string &out) { vector<string> tokens; vector<float> scoreVector; TokenizeMultiCharSeparator(tokens, lineOrig, "|||" ); string &sourcePhraseString = tokens[1] , &targetPhraseString = tokens[2] , &scoreString = tokens[3]; map<size_t, pair<size_t, size_t> > ntAlign; ReformatHieroRule(0, sourcePhraseString, ntAlign); ReformatHieroRule(1, targetPhraseString, ntAlign); ReformateHieroScore(scoreString); stringstream align; map<size_t, pair<size_t, size_t> >::const_iterator iterAlign; for (iterAlign = ntAlign.begin(); iterAlign != ntAlign.end(); ++iterAlign) { const pair<size_t, size_t> &alignPoint = iterAlign->second; align << alignPoint.first << "-" << alignPoint.second << " "; } stringstream ret; ret << sourcePhraseString << " ||| " << targetPhraseString << " ||| " << scoreString << " ||| " << align.str(); out = ret.str(); }
// Split an annotated phrase string into per-word factor vectors.
//
//   "KOMMA|none ART|Def.Z NN|Neut.NotGen.Sg VVFIN|none"
// becomes one entry per word, each split on the factor delimiter:
//   {"KOMMA","none"}, {"ART","Def.Z"}, ...
//
// Aborts (after reporting) when a word does not carry exactly
// factorOrder.size() factors.
vector< vector<string> > Phrase::Parse(const std::string &phraseString, const std::vector<FactorType> &factorOrder, const std::string& factorDelimiter)
{
  const bool multiCharDelim = factorDelimiter.size() > 1;

  // one token per annotated word
  vector<string> annotatedWords = Tokenize(phraseString);

  vector< vector<string> > phraseVector;
  for (size_t wordIdx = 0 ; wordIdx < annotatedWords.size() ; ++wordIdx) {
    string &annotatedWord = annotatedWords[wordIdx];

    // "KOMMA|none" -> {"KOMMA", "none"}
    vector<string> factorStrVector = multiCharDelim
                                     ? TokenizeMultiCharSeparator(annotatedWord, factorDelimiter)
                                     : Tokenize(annotatedWord, factorDelimiter);

    if (factorStrVector.size() != factorOrder.size()) {
      TRACE_ERR( "[ERROR] Malformed input at " << /*StaticData::Instance().GetCurrentInputPosition() <<*/ std::endl
                 << " Expected input to have words composed of " << factorOrder.size() << " factor(s) (form FAC1|FAC2|...)" << std::endl
                 << " but instead received input with " << factorStrVector.size() << " factor(s).\n");
      abort();
    }
    phraseVector.push_back(factorStrVector);
  }
  return phraseVector;
}
void LexicalReorderingTableMemory:: LoadFromFile(const std::string& filePath) { std::string fileName = filePath; if(!FileExists(fileName) && FileExists(fileName+".gz")) fileName += ".gz"; InputFileStream file(fileName); std::string line(""), key(""); int numScores = -1; std::cerr << "Loading table into memory..."; while(!getline(file, line).eof()) { std::vector<std::string> tokens = TokenizeMultiCharSeparator(line, "|||"); int t = 0 ; std::string f(""),e(""),c(""); if(!m_FactorsF.empty()) { //there should be something for f f = auxClearString(tokens.at(t)); ++t; } if(!m_FactorsE.empty()) { //there should be something for e e = auxClearString(tokens.at(t)); ++t; } if(!m_FactorsC.empty()) { //there should be something for c c = auxClearString(tokens.at(t)); ++t; } //last token are the probs std::vector<float> p = Scan<float>(Tokenize(tokens.at(t))); //sanity check: all lines must have equall number of probs if(-1 == numScores) { numScores = (int)p.size(); //set in first line } if((int)p.size() != numScores) { TRACE_ERR( "found inconsistent number of probabilities... found " << p.size() << " expected " << numScores << std::endl); exit(0); } std::transform(p.begin(),p.end(),p.begin(),TransformScore); std::transform(p.begin(),p.end(),p.begin(),FloorScore); //save it all into our map m_Table[MakeKey(f,e,c)] = p; } std::cerr << "done.\n"; }
void DeleteRules::Load() { std::vector<FactorType> factorOrder; factorOrder.push_back(0); // unfactored for now InputFileStream strme(m_path); string line; while (getline(strme, line)) { vector<string> toks = TokenizeMultiCharSeparator(line, "|||"); UTIL_THROW_IF2(toks.size() != 2, "Line must be source ||| target"); Phrase source, target; source.CreateFromString(Input, factorOrder, toks[0], NULL); target.CreateFromString(Output, factorOrder, toks[1], NULL); size_t hash = 0; boost::hash_combine(hash, source); boost::hash_combine(hash, target); m_ruleHashes.insert(hash); } }
// Build this Word from a delimiter-separated factor string, e.g.
// "surface|pos|lemma". Each piece is interned through the global
// FactorCollection and stored at the FactorType position given by
// factorOrder. Fewer pieces than factors is allowed (extra factor slots
// are left untouched); more pieces than factors is a fatal error.
void Word::CreateFromString(FactorDirection direction
                            , const std::vector<FactorType> &factorOrder
                            , const std::string &str
                            , bool isNonTerminal)
{
  const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();

  vector<string> pieces;
  TokenizeMultiCharSeparator(pieces, str, factorDelimiter);
  CHECK(pieces.size() <= factorOrder.size());

  FactorCollection &factorCollection = FactorCollection::Instance();
  for (size_t i = 0; i < pieces.size(); ++i) {
    const FactorType factorType = factorOrder[i];
    m_factorArray[factorType] =
      factorCollection.AddFactor(direction, factorType, pieces[i]);
  }

  // assume term/non-term same for all factors
  m_isNonTerminal = isNonTerminal;
}
void TargetPhrase::SetProperties(const StringPiece &str) { if (str.size() == 0) { return; } vector<string> toks; TokenizeMultiCharSeparator(toks, str.as_string(), "{{"); for (size_t i = 0; i < toks.size(); ++i) { string &tok = toks[i]; if (tok.empty()) { continue; } size_t endPos = tok.rfind("}"); tok = tok.substr(0, endPos - 1); vector<string> keyValue = TokenizeFirstOnly(tok, " "); UTIL_THROW_IF2(keyValue.size() != 2, "Incorrect format of property: " << str); SetProperty(keyValue[0], keyValue[1]); } }
/** * Process a sentence with xml annotation * Xml tags may specifiy additional/replacing translation options * and reordering constraints * * \param line in: sentence, out: sentence without the xml * \param res vector with translation options specified by xml * \param reorderingConstraint reordering constraint zones specified by xml * \param walls reordering constraint walls specified by xml * \param lbrackStr xml tag's left bracket string, typically "<" * \param rbrackStr xml tag's right bracket string, typically ">" */ bool ProcessAndStripXMLTags(string &line, vector<XmlOption*> &res, ReorderingConstraint &reorderingConstraint, vector< size_t > &walls, std::vector< std::pair<size_t, std::string> > &placeholders, int offset, const std::string& lbrackStr, const std::string& rbrackStr) { //parse XML markup in translation line const StaticData &staticData = StaticData::Instance(); // hack. What pt should XML trans opt be assigned to? PhraseDictionary *firstPt = NULL; if (PhraseDictionary::GetColl().size() == 0) { firstPt = PhraseDictionary::GetColl()[0]; } // no xml tag? we're done. //if (line.find_first_of('<') == string::npos) { if (line.find(lbrackStr) == string::npos) { return true; } // break up input into a vector of xml tags and text // example: (this), (<b>), (is a), (</b>), (test .) 
vector<string> xmlTokens = TokenizeXml(line, lbrackStr, rbrackStr); // we need to store opened tags, until they are closed // tags are stored as tripled (tagname, startpos, contents) typedef pair< string, pair< size_t, string > > OpenedTag; vector< OpenedTag > tagStack; // stack that contains active opened tags string cleanLine; // return string (text without xml) size_t wordPos = 0; // position in sentence (in terms of number of words) const vector<FactorType> &outputFactorOrder = staticData.GetOutputFactorOrder(); // const string &factorDelimiter = staticData.GetFactorDelimiter(); // loop through the tokens for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) { // not a xml tag, but regular text (may contain many words) if(!isXmlTag(xmlTokens[xmlTokenPos], lbrackStr, rbrackStr)) { // add a space at boundary, if necessary if (cleanLine.size()>0 && cleanLine[cleanLine.size() - 1] != ' ' && xmlTokens[xmlTokenPos][0] != ' ') { cleanLine += " "; } cleanLine += xmlTokens[xmlTokenPos]; // add to output wordPos = Tokenize(cleanLine).size(); // count all the words } // process xml tag else { // *** get essential information about tag *** // strip extra boundary spaces and "<" and ">" string tag = Trim(TrimXml(xmlTokens[xmlTokenPos], lbrackStr, rbrackStr)); VERBOSE(3,"XML TAG IS: " << tag << std::endl); if (tag.size() == 0) { TRACE_ERR("ERROR: empty tag name: " << line << endl); return false; } // check if unary (e.g., "<wall/>") bool isUnary = ( tag[tag.size() - 1] == '/' ); // check if opening tag (e.g. 
"<a>", not "</a>")g bool isClosed = ( tag[0] == '/' ); bool isOpen = !isClosed; if (isClosed && isUnary) { TRACE_ERR("ERROR: can't have both closed and unary tag " << lbrackStr << tag << rbrackStr << ": " << line << endl); return false; } if (isClosed) tag = tag.substr(1); // remove "/" at the beginning if (isUnary) tag = tag.substr(0,tag.size()-1); // remove "/" at the end // find the tag name and contents string::size_type endOfName = tag.find_first_of(' '); string tagName = tag; string tagContent = ""; if (endOfName != string::npos) { tagName = tag.substr(0,endOfName); tagContent = tag.substr(endOfName+1); } // *** process new tag *** if (isOpen || isUnary) { // put the tag on the tag stack OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) ); tagStack.push_back( openedTag ); VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl); } // *** process completed tag *** if (isClosed || isUnary) { // pop last opened tag from stack; if (tagStack.size() == 0) { TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl); return false; } OpenedTag openedTag = tagStack.back(); tagStack.pop_back(); // tag names have to match if (openedTag.first != tagName) { TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl ); return false; } // assemble remaining information about tag size_t startPos = openedTag.second.first; string tagContent = openedTag.second.second; size_t endPos = wordPos; // span attribute overwrites position string span = ParseXmlTagAttribute(tagContent,"span"); if (! 
span.empty()) { vector<string> ij = Tokenize(span, "-"); if (ij.size() != 1 && ij.size() != 2) { TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl); return false; } startPos = atoi(ij[0].c_str()); if (ij.size() == 1) endPos = startPos + 1; else endPos = atoi(ij[1].c_str()) + 1; } VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl); // special tag: wall if (tagName == "wall") { size_t start = (startPos == 0) ? 0 : startPos-1; for(size_t pos = start; pos < endPos; pos++) walls.push_back( pos ); } // special tag: zone else if (tagName == "zone") { if (startPos >= endPos) { TRACE_ERR("ERROR: zone must span at least one word: " << line << endl); return false; } reorderingConstraint.SetZone( startPos, endPos-1 ); } // name-entity placeholder else if (tagName == "ne") { if (startPos != (endPos - 1)) { TRACE_ERR("ERROR: Placeholder must only span 1 word: " << line << endl); return false; } string entity = ParseXmlTagAttribute(tagContent,"entity"); placeholders.push_back(std::pair<size_t, std::string>(startPos, entity)); } // update: add new aligned sentence pair to Mmsapt identified by name else if (tagName == "update") { #if PT_UG // get model name and aligned sentence pair string pdName = ParseXmlTagAttribute(tagContent,"name"); string source = ParseXmlTagAttribute(tagContent,"source"); string target = ParseXmlTagAttribute(tagContent,"target"); string alignment = ParseXmlTagAttribute(tagContent,"alignment"); // find PhraseDictionary by name const vector<PhraseDictionary*> &pds = PhraseDictionary::GetColl(); PhraseDictionary* pd = NULL; for (vector<PhraseDictionary*>::const_iterator i = pds.begin(); i != pds.end(); ++i) { PhraseDictionary* curPd = *i; if (curPd->GetScoreProducerDescription() == pdName) { pd = curPd; break; } } if (pd == NULL) { TRACE_ERR("ERROR: No PhraseDictionary with name " << pdName << ", no update" << endl); 
return false; } // update model VERBOSE(3,"Updating " << pdName << " ||| " << source << " ||| " << target << " ||| " << alignment << endl); Mmsapt* pdsa = reinterpret_cast<Mmsapt*>(pd); pdsa->add(source, target, alignment); #else TRACE_ERR("ERROR: recompile with --with-mm to update PhraseDictionary at runtime" << endl); return false; #endif } // weight-overwrite: update feature weights, unspecified weights remain unchanged // IMPORTANT: translation models that cache phrases or apply table-limit during load // based on initial weights need to be reset. Sending an empty update will do this // for PhraseDictionaryBitextSampling (Mmsapt) models: // <update name="TranslationModelName" source=" " target=" " alignment=" " /> else if (tagName == "weight-overwrite") { // is a name->ff map stored anywhere so we don't have to build it every time? const vector<FeatureFunction*> &ffs = FeatureFunction::GetFeatureFunctions(); boost::unordered_map<string, FeatureFunction*> map; BOOST_FOREACH(FeatureFunction* const& ff, ffs) { map[ff->GetScoreProducerDescription()] = ff; } // update each weight listed ScoreComponentCollection allWeights = StaticData::Instance().GetAllWeights(); boost::unordered_map<string, FeatureFunction*>::iterator ffi; string ffName(""); vector<float> ffWeights; vector<string> toks = Tokenize(ParseXmlTagAttribute(tagContent,"weights")); BOOST_FOREACH(string const& tok, toks) { if (tok.substr(tok.size() - 1, 1) == "=") { // start new feature if (ffName != "") { // set previous feature weights if (ffi != map.end()) { allWeights.Assign(ffi->second, ffWeights); } ffWeights.clear(); } ffName = tok.substr(0, tok.size() - 1); ffi = map.find(ffName); if (ffi == map.end()) { TRACE_ERR("ERROR: No FeatureFunction with name " << ffName << ", no weight update" << endl); } } else { // weight for current feature ffWeights.push_back(Scan<float>(tok)); } } if (ffi != map.end()) { allWeights.Assign(ffi->second, ffWeights); } 
StaticData::InstanceNonConst().SetAllWeights(allWeights); } // default: opening tag that specifies translation options else { if (startPos > endPos) { TRACE_ERR("ERROR: tag " << tagName << " startPos > endPos: " << line << endl); return false; } else if (startPos == endPos) { TRACE_ERR("WARNING: tag " << tagName << " 0 span: " << line << endl); continue; } // specified translations -> vector of phrases // multiple translations may be specified, separated by "||" vector<string> altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"translation"), "||"); if( altTexts.size() == 1 && altTexts[0] == "" ) altTexts.pop_back(); // happens when nothing specified // deal with legacy annotations: "translation" was called "english" vector<string> moreAltTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"english"), "||"); if (moreAltTexts.size()>1 || moreAltTexts[0] != "") { for(vector<string>::iterator translation=moreAltTexts.begin(); translation != moreAltTexts.end(); translation++) { string t = *translation; altTexts.push_back( t ); } } // specified probabilities for the translations -> vector of probs vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||"); if( altProbs.size() == 1 && altProbs[0] == "" ) altProbs.pop_back(); // happens when nothing specified // report what we have processed so far VERBOSE(3,"XML TAG NAME IS: '" << tagName << "'" << endl); VERBOSE(3,"XML TAG TRANSLATION IS: '" << altTexts[0] << "'" << endl); VERBOSE(3,"XML TAG PROB IS: '" << altProbs[0] << "'" << endl); VERBOSE(3,"XML TAG SPAN IS: " << startPos << "-" << (endPos-1) << endl); if (altProbs.size() > 0 && altTexts.size() != altProbs.size()) { TRACE_ERR("ERROR: Unequal number of probabilities and translation alternatives: " << line << endl); return false; } // store translation options into members if (staticData.GetXmlInputType() != XmlIgnore) { // only store options if we aren't ignoring them for (size_t i=0; 
i<altTexts.size(); ++i) { Phrase sourcePhrase; // TODO don't know what the source phrase is // set default probability float probValue = 1; if (altProbs.size() > 0) probValue = Scan<float>(altProbs[i]); // convert from prob to log-prob float scoreValue = FloorScore(TransformScore(probValue)); WordsRange range(startPos + offset,endPos-1 + offset); // span covered by phrase TargetPhrase targetPhrase(firstPt); // targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i],factorDelimiter, NULL); targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i], NULL); // lhs const UnknownLHSList &lhsList = staticData.GetUnknownLHS(); if (!lhsList.empty()) { const Factor *factor = FactorCollection::Instance().AddFactor(lhsList[0].first, true); Word *targetLHS = new Word(true); targetLHS->SetFactor(0, factor); // TODO - other factors too? targetPhrase.SetTargetLHS(targetLHS); } targetPhrase.SetXMLScore(scoreValue); targetPhrase.EvaluateInIsolation(sourcePhrase); XmlOption *option = new XmlOption(range,targetPhrase); assert(option); res.push_back(option); } altTexts.clear(); altProbs.clear(); } } } } }
// Load a textual phrase table ("source ||| target ||| scores [||| alignment
// [||| counts]]") into the in-memory collection, transforming each score
// into floored log-space and pruning each target collection to tableLimit.
// Every line must have the same number of fields as the first line; a
// mismatch or a bad score count aborts after reporting via UserMessage.
// Returns true on success.
bool PhraseDictionaryMemory::Load(const std::vector<FactorType> &input
                                  , const std::vector<FactorType> &output
                                  , const string &filePath
                                  , const vector<float> &weight
                                  , size_t tableLimit
                                  , const LMList &languageModels
                                  , float weightWP)
{
  const StaticData &staticData = StaticData::Instance();

  m_tableLimit = tableLimit;

  // data from file
  InputFileStream inFile(filePath);

  // create hash file if necessary
  ofstream tempFile;
  string tempFilePath;

  vector< vector<string> > phraseVector;
  string line, prevSourcePhrase = "";
  size_t count = 0;
  size_t line_num = 0;
  // field count is fixed by the first line and enforced thereafter
  size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info

  while(getline(inFile, line)) {
    ++line_num;
    vector<string> tokens = TokenizeMultiCharSeparator( line , "|||" );

    if (numElement == NOT_FOUND) {
      // init numElement
      numElement = tokens.size();
      assert(numElement >= 3);
      // extended style: source ||| target ||| scores ||| [alignment] ||| [counts]
    }

    if (tokens.size() != numElement) {
      stringstream strme;
      strme << "Syntax error at " << filePath << ":" << line_num;
      UserMessage::Add(strme.str());
      abort();
    }

    const string &sourcePhraseString=tokens[0]
                  ,&targetPhraseString=tokens[1]
                  ,&scoreString = tokens[2];

    // NOTE(review): this tests the SOURCE field for emptiness, yet the
    // message below says "empty target" — confirm the intended wording.
    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( filePath << ":" << line_num << ": pt entry contains empty target, skipping\n");
      continue;
    }

    const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter();
    // reparse the source side only when it changes (table is grouped by source)
    if (sourcePhraseString != prevSourcePhrase)
      phraseVector = Phrase::Parse(sourcePhraseString, input, factorDelimiter);

    vector<float> scoreVector = Tokenize<float>(scoreString);
    if (scoreVector.size() != m_numScoreComponent) {
      stringstream strme;
      strme << "Size of scoreVector != number (" <<scoreVector.size() << "!="
            <<m_numScoreComponent<<") of score components on line " << line_num;
      UserMessage::Add(strme.str());
      abort();
    }

    // source
    Phrase sourcePhrase(Input);
    sourcePhrase.CreateFromString( input, phraseVector);
    //target
    TargetPhrase targetPhrase(Output);
    targetPhrase.SetSourcePhrase(&sourcePhrase);
    targetPhrase.CreateFromString( output, targetPhraseString, factorDelimiter);

    // optional 4th field carries the word alignment
    if (tokens.size() > 3)
      targetPhrase.SetAlignmentInfo(tokens[3]);

    // component score, for n-best output
    std::vector<float> scv(scoreVector.size());
    // raw probabilities -> floored log scores
    std::transform(scoreVector.begin(),scoreVector.end(),scv.begin(),TransformScore);
    std::transform(scv.begin(),scv.end(),scv.begin(),FloorScore);
    targetPhrase.SetScore(m_feature, scv, weight, weightWP, languageModels);

    AddEquivPhrase(sourcePhrase, targetPhrase);

    count++;
  }

  // sort each target phrase collection
  m_collection.Sort(m_tableLimit);

  return true;
}
// Build the binary on-disk representation of a lexical reordering table
// from the textual stream inFile. Writes five files next to outFileName:
// .binlexr.srctree (prefix trees keyed by source), .binlexr.tgtdata
// (candidate records), .binlexr.idx (per-first-word tree offsets) and
// .binlexr.voc0/.voc1 (source / target vocabularies).
// The input must be grouped by source phrase; a repeated (non-adjacent)
// source phrase is a fatal error. Returns false on error or empty input.
bool LexicalReorderingTableTree::Create(std::istream& inFile,
                                        const std::string& outFileName)
{
  std::string line;
  //TRACE_ERR("Entering Create...\n");
  std::string ofn(outFileName+".binlexr.srctree"),
      oft(outFileName+".binlexr.tgtdata"),
      ofi(outFileName+".binlexr.idx"),
      ofsv(outFileName+".binlexr.voc0"),
      oftv(outFileName+".binlexr.voc1");

  FILE *os = fOpen(ofn.c_str(),"wb");
  FILE *ot = fOpen(oft.c_str(),"wb");
  //TRACE_ERR("opend files....\n");
  typedef PrefixTreeSA<LabelId,OFF_T> PSA;
  PSA *psa = new PSA;
  PSA::setDefault(InvalidOffT);
  // NOTE(review): voc[] is only assigned when the first line has 2, 3 or 4
  // tokens; any other count leaves these pointers uninitialized — confirm
  // upstream guarantees the format.
  WordVoc* voc[3];

  // state carried across lines: current tree-root word and current source key
  LabelId currFirstWord = InvalidLabelId;
  IPhrase currKey;

  Candidates cands;             // candidates accumulated for currKey
  std::vector<OFF_T> vo;        // per-first-word offsets of the prefix trees
  size_t lnc = 0;
  size_t numTokens    = 0;
  size_t numKeyTokens = 0;
  while(getline(inFile, line)) {
    ++lnc;
    if(0 == lnc % 10000) {
      TRACE_ERR(".");
    }
    IPhrase key;
    Scores score;

    std::vector<std::string> tokens = TokenizeMultiCharSeparator(line, "|||");
    std::string w;
    if(1 == lnc) {
      //do some init stuff in the first line
      numTokens = tokens.size();
      if(tokens.size() == 2) { //f ||| score
        numKeyTokens = 1;
        voc[0] = new WordVoc();
        voc[1] = 0;
      } else if(3 == tokens.size() || 4 == tokens.size()) { //either f ||| e ||| score or f ||| e ||| c ||| score
        numKeyTokens = 2;
        voc[0] = new WordVoc(); //f voc
        voc[1] = new WordVoc(); //e voc
        voc[2] = voc[1];        //c & e share voc
      }
    } else {
      //sanity check ALL lines must have same number of tokens
      CHECK(numTokens == tokens.size());
    }

    // encode the key fields (f, optionally e), separated by MagicWord
    size_t phrase = 0;
    for(; phrase < numKeyTokens; ++phrase) {
      //conditioned on more than just f... need |||
      if(phrase >=1) {
        key.push_back(PrefixTreeMap::MagicWord);
      }
      std::istringstream is(tokens[phrase]);
      while(is >> w) {
        key.push_back(voc[phrase]->add(w));
      }
    }
    //collect all non key phrases, i.e. c
    std::vector<IPhrase> tgt_phrases;
    tgt_phrases.resize(numTokens - numKeyTokens - 1);
    for(size_t j = 0; j < tgt_phrases.size(); ++j, ++phrase) {
      std::istringstream is(tokens[numKeyTokens + j]);
      while(is >> w) {
        tgt_phrases[j].push_back(voc[phrase]->add(w));
      }
    }
    //last token is score
    std::istringstream is(tokens[numTokens-1]);
    while(is >> w) {
      score.push_back(atof(w.c_str()));
    }
    //transform score now...
    std::transform(score.begin(),score.end(),score.begin(),TransformScore);
    std::transform(score.begin),score.end(),score.begin(),FloorScore);
    std::vector<Scores> scores;
    scores.push_back(score);

    if(key.empty()) {
      TRACE_ERR("WARNING: empty source phrase in line '"<<line<<"'\n");
      continue;
    }
    //first time inits
    if(currFirstWord == InvalidLabelId) {
      currFirstWord = key[0];
    }
    if(currKey.empty()) {
      currKey = key;
      //insert key into tree
      CHECK(psa);
      PSA::Data& d = psa->insert(key);
      if(d == InvalidOffT) {
        d = fTell(ot);
      } else {
        TRACE_ERR("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '" << line << "\n");
        return false;
      }
    }
    if(currKey != key) {
      //ok new key
      currKey = key;
      //a) write cands for old key
      cands.writeBin(ot);
      cands.clear();
      //b) check if we need to move on to new tree root
      if(key[0] != currFirstWord) {
        // write key prefix tree to file and clear
        PTF pf;
        if(currFirstWord >= vo.size()) {
          vo.resize(currFirstWord+1,InvalidOffT);
        }
        vo[currFirstWord] = fTell(os);
        pf.create(*psa, os);
        // clear
        delete psa;
        psa = new PSA;
        currFirstWord = key[0];
      }
      //c) insert key into tree
      CHECK(psa);
      PSA::Data& d = psa->insert(key);
      if(d == InvalidOffT) {
        d = fTell(ot);
      } else {
        TRACE_ERR("ERROR: source phrase already inserted (A)!\nline(" << lnc << "): '" << line << "\n");
        return false;
      }
    }
    cands.push_back(GenericCandidate(tgt_phrases, scores));
  }
  if (lnc == 0) {
    TRACE_ERR("ERROR: empty lexicalised reordering file\n" << std::endl);
    return false;
  }
  //flush remainders
  cands.writeBin(ot);
  cands.clear();

  //process last currFirstWord
  PTF pf;
  if(currFirstWord >= vo.size()) {
    vo.resize(currFirstWord+1,InvalidOffT);
  }
  vo[currFirstWord] = fTell(os);
  pf.create(*psa,os);
  delete psa;
  psa=0;

  fClose(os);
  fClose(ot);
  /*
  std::vector<size_t> inv;
  for(size_t i = 0; i < vo.size(); ++i){
    if(vo[i] == InvalidOffT){
      inv.push_back(i);
    }
  }
  if(inv.size()) {
    TRACE_ERR("WARNING: there are src voc entries with no phrase "
              "translation: count "<<inv.size()<<"\n"
              "There exists phrase translations for "<<vo.size()-inv.size()
              <<" entries\n");
  }
  */
  FILE *oi = fOpen(ofi.c_str(),"wb");
  fWriteVector(oi,vo);
  fClose(oi);

  if(voc[0]) {
    voc[0]->Write(ofsv);
    delete voc[0];
  }
  if(voc[1]) {
    // voc[2] aliases voc[1], so deleting voc[1] covers both
    voc[1]->Write(oftv);
    delete voc[1];
  }
  return true;
}
// Per-sentence initialization: writes the input sentence to a fresh
// temporary directory, asks the fuzzy-match wrapper to extract a
// sentence-specific phrase table, and loads that table into the node
// collection for this sentence's translation id.
void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSentence)
{
  char dirName[] = "/tmp/moses.XXXXXX";
  char *temp = mkdtemp(dirName);
  UTIL_THROW_IF2(temp == NULL,
                 "Couldn't create temporary directory " << dirName);

  string dirNameStr(dirName);

  string inFileName(dirNameStr + "/in");
  ofstream inFile(inFileName.c_str());

  // presumably positions 0 and GetSize()-1 are sentence-boundary markers
  // and are deliberately skipped — TODO confirm against InputType
  for (size_t i = 1; i < inputSentence.GetSize() - 1; ++i) {
    inFile << inputSentence.GetWord(i);
  }
  inFile << endl;
  inFile.close();

  long translationId = inputSentence.GetTranslationId();
  string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr);

  // populate with rules for this sentence
  PhraseDictionaryNodeMemory &rootNode = m_collection[translationId];
  FormatType format = MosesFormat;

  // data from file
  InputFileStream inStream(ptFileName);

  // copied from class LoaderStandard
  PrintUserTime("Start loading fuzzy-match phrase model");

  const StaticData &staticData = StaticData::Instance();
  const std::string& factorDelimiter = staticData.GetFactorDelimiter();

  string lineOrig;
  size_t count = 0;

  while(getline(inStream, lineOrig)) {
    const string *line;
    if (format == HieroFormat) { // reformat line
      // Hiero input is unsupported on this path
      UTIL_THROW(util::Exception, "Cannot be Hiero format");
      //line = ReformatHieroRule(lineOrig);
    } else {
      // do nothing to format of line
      line = &lineOrig;
    }

    vector<string> tokens;
    vector<float> scoreVector;

    TokenizeMultiCharSeparator(tokens, *line , "|||" );

    // expected: source ||| target ||| scores ||| alignment [||| counts]
    if (tokens.size() != 4 && tokens.size() != 5) {
      stringstream strme;
      strme << "Syntax error at " << ptFileName << ":" << count;
      UserMessage::Add(strme.str());
      abort();
    }

    const string &sourcePhraseString = tokens[0]
                  , &targetPhraseString = tokens[1]
                  , &scoreString        = tokens[2]
                  , &alignString        = tokens[3];

    // NOTE(review): tests the SOURCE field but the message says
    // "empty target" — confirm the intended wording.
    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }

    Tokenize<float>(scoreVector, scoreString);
    const size_t numScoreComponents = GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      stringstream strme;
      strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
            << numScoreComponents << ") of score components on line " << count;
      UserMessage::Add(strme.str());
      abort();
    }

    UTIL_THROW_IF2(scoreVector.size() != numScoreComponents,
                   "Number of scores incorrectly specified");

    // parse source & find pt node

    // constituent labels
    Word *sourceLHS;
    Word *targetLHS;

    // source
    Phrase sourcePhrase( 0);
    sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, factorDelimiter, &sourceLHS);

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase();
    targetPhrase->CreateFromString(Output, m_output, targetPhraseString, factorDelimiter, &targetLHS);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString);
    targetPhrase->SetTargetLHS(targetLHS);
    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

    // component score, for n-best output
    // raw probabilities -> floored log scores
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);

    targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
    targetPhrase->Evaluate(sourcePhrase, GetFeaturesToApply());

    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, sourceLHS);
    phraseColl.Add(targetPhrase);

    count++;

    if (format == HieroFormat) { // reformat line
      delete line;
    } else {
      // do nothing
    }
  }

  // sort and prune each target phrase collection
  SortAndPrune(rootNode);

  // NOTE(review): the temporary directory is never removed — the cleanup
  // call below is commented out, so /tmp/moses.* accumulates per sentence.
  //removedirectoryrecursively(dirName);
}
/**
 * Parse and remove XML markup from a syntax-tree input sentence.
 * Xml tags may specify span labels for the parse chart.
 *
 * \param line in: sentence, out: sentence without the xml
 * \param sourceLabels out: one XMLParseOutput per tag, carrying the tag's
 *        "label" attribute and the word span it covers
 * \return false on malformed markup (empty tag name, closed-and-unary tag,
 *         mismatched or unclosed tags, bad/empty span)
 */
bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput> &sourceLabels)
{
  //parse XML markup in translation line

  // no xml tag? we're done.
  if (line.find_first_of('<') == string::npos) {
    return true;
  }

  // break up input into a vector of xml tags and text
  // example: (this), (<b>), (is a), (</b>), (test .)
  vector<string> xmlTokens = TokenizeXml(line);

  // we need to store opened tags, until they are closed
  // tags are stored as triples (tagname, startpos, contents)
  typedef pair< string, pair< size_t, string > > OpenedTag;
  vector< OpenedTag > tagStack; // stack that contains active opened tags

  string cleanLine; // return string (text without xml)
  size_t wordPos = 0; // position in sentence (in terms of number of words)

  // loop through the tokens
  for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
    // not a xml tag, but regular text (may contain many words)
    if(!isXmlTag(xmlTokens[xmlTokenPos])) {
      // add a space at boundary, if necessary
      if (cleanLine.size()>0 &&
          cleanLine[cleanLine.size() - 1] != ' ' &&
          xmlTokens[xmlTokenPos][0] != ' ') {
        cleanLine += " ";
      }
      cleanLine += xmlTokens[xmlTokenPos]; // add to output
      // NOTE: re-tokenizes the whole accumulated string each time
      wordPos = Tokenize(cleanLine).size(); // count all the words
    }

    // process xml tag
    else {
      // *** get essential information about tag ***

      // strip extra boundary spaces and "<" and ">"
      string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
      VERBOSE(3,"XML TAG IS: " << tag << std::endl);

      if (tag.size() == 0) {
        TRACE_ERR("ERROR: empty tag name: " << line << endl);
        return false;
      }

      // check if unary (e.g., "<wall/>")
      bool isUnary = ( tag[tag.size() - 1] == '/' );

      // check if opening tag (e.g. "<a>", not "</a>")
      bool isClosed = ( tag[0] == '/' );
      bool isOpen = !isClosed;

      if (isClosed && isUnary) {
        TRACE_ERR("ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl);
        return false;
      }

      if (isClosed)
        tag = tag.substr(1); // remove "/" at the beginning
      if (isUnary)
        tag = tag.substr(0,tag.size()-1); // remove "/" at the end

      // find the tag name and contents
      string::size_type endOfName = tag.find_first_of(' ');
      string tagName = tag;
      string tagContent = "";
      if (endOfName != string::npos) {
        tagName = tag.substr(0,endOfName);
        tagContent = tag.substr(endOfName+1);
      }

      // *** process new tag ***

      if (isOpen || isUnary) {
        // put the tag on the tag stack
        OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
        tagStack.push_back( openedTag );
        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl);
      }

      // *** process completed tag ***

      if (isClosed || isUnary) {
        // pop last opened tag from stack;
        if (tagStack.size() == 0) {
          TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl);
          return false;
        }
        OpenedTag openedTag = tagStack.back();
        tagStack.pop_back();

        // tag names have to match
        if (openedTag.first != tagName) {
          TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl );
          return false;
        }

        // assemble remaining information about tag
        size_t startPos = openedTag.second.first;
        string tagContent = openedTag.second.second;
        size_t endPos = wordPos;

        // span attribute overwrites position
        string span = ParseXmlTagAttribute(tagContent,"span");
        if (! span.empty()) {
          vector<string> ij = Tokenize(span, "-");
          if (ij.size() != 1 && ij.size() != 2) {
            TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl);
            return false;
          }
          startPos = atoi(ij[0].c_str());
          if (ij.size() == 1) endPos = startPos + 1;
          else endPos = atoi(ij[1].c_str()) + 1; // endPos is exclusive below
        }

        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl);

        if (startPos >= endPos) {
          TRACE_ERR("ERROR: tag " << tagName << " must span at least one word: " << line << endl);
          return false;
        }

        // inclusive word range covered by this tag
        WordsRange range(startPos,endPos-1);

        // specified translations -> vector of phrases
        // multiple translations may be specified, separated by "||"
        vector<string> altTexts = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"label"), "||");
        // NOTE(review): only a single label per tag is supported here; CHECK
        // aborts the process if "||" alternatives are supplied -- confirm intended
        CHECK(altTexts.size() == 1);

        XMLParseOutput item(altTexts[0], range);
        sourceLabels.push_back(item);
      }
    }
  }
  // we are done. check if there are tags that are still open
  if (tagStack.size() > 0) {
    TRACE_ERR("ERROR: some opened tags were never closed: " << line << endl);
    return false;
  }

  // return de-xml'ed sentence in line
  line = cleanLine;
  return true;
}
/**
 * Parse and remove XML markup from a syntax-tree input sentence.
 * Xml tags may specify span labels and/or forced output translations.
 *
 * \param line in: sentence, out: sentence without the xml
 * \param sourceLabels out: span labels from the "label" attribute (when no
 *        "translation" attribute is given)
 * \param xmlOptions out: forced translation options built from the
 *        "translation"/"prob" attributes; heap-allocated, ownership passes
 *        to the caller
 * \return false on malformed markup (empty tag name, closed-and-unary tag,
 *         mismatched or unclosed tags, bad/empty span)
 */
bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput> &sourceLabels, std::vector<XmlOption*> &xmlOptions)
{
  //parse XML markup in translation line

  // no xml tag? we're done.
  if (line.find_first_of('<') == string::npos) {
    return true;
  }

  // break up input into a vector of xml tags and text
  // example: (this), (<b>), (is a), (</b>), (test .)
  vector<string> xmlTokens = TokenizeXml(line);

  // we need to store opened tags, until they are closed
  // tags are stored as triples (tagname, startpos, contents)
  typedef pair< string, pair< size_t, string > > OpenedTag;
  vector< OpenedTag > tagStack; // stack that contains active opened tags

  string cleanLine; // return string (text without xml)
  size_t wordPos = 0; // position in sentence (in terms of number of words)

  // keep this handy for later
  const vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
  const string &factorDelimiter = StaticData::Instance().GetFactorDelimiter();

  // loop through the tokens
  for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
    // not a xml tag, but regular text (may contain many words)
    if(!isXmlTag(xmlTokens[xmlTokenPos])) {
      // add a space at boundary, if necessary
      if (cleanLine.size()>0 &&
          cleanLine[cleanLine.size() - 1] != ' ' &&
          xmlTokens[xmlTokenPos][0] != ' ') {
        cleanLine += " ";
      }
      cleanLine += xmlTokens[xmlTokenPos]; // add to output
      // NOTE: re-tokenizes the whole accumulated string each time
      wordPos = Tokenize(cleanLine).size(); // count all the words
    }

    // process xml tag
    else {
      // *** get essential information about tag ***

      // strip extra boundary spaces and "<" and ">"
      string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
      VERBOSE(3,"XML TAG IS: " << tag << std::endl);

      if (tag.size() == 0) {
        TRACE_ERR("ERROR: empty tag name: " << line << endl);
        return false;
      }

      // check if unary (e.g., "<wall/>")
      bool isUnary = ( tag[tag.size() - 1] == '/' );

      // check if opening tag (e.g. "<a>", not "</a>")
      bool isClosed = ( tag[0] == '/' );
      bool isOpen = !isClosed;

      if (isClosed && isUnary) {
        TRACE_ERR("ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl);
        return false;
      }

      if (isClosed)
        tag = tag.substr(1); // remove "/" at the beginning
      if (isUnary)
        tag = tag.substr(0,tag.size()-1); // remove "/" at the end

      // find the tag name and contents
      string::size_type endOfName = tag.find_first_of(' ');
      string tagName = tag;
      string tagContent = "";
      if (endOfName != string::npos) {
        tagName = tag.substr(0,endOfName);
        tagContent = tag.substr(endOfName+1);
      }

      // *** process new tag ***

      if (isOpen || isUnary) {
        // put the tag on the tag stack
        OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
        tagStack.push_back( openedTag );
        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl);
      }

      // *** process completed tag ***

      if (isClosed || isUnary) {
        // pop last opened tag from stack;
        if (tagStack.size() == 0) {
          TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl);
          return false;
        }
        OpenedTag openedTag = tagStack.back();
        tagStack.pop_back();

        // tag names have to match
        if (openedTag.first != tagName) {
          TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl );
          return false;
        }

        // assemble remaining information about tag
        size_t startPos = openedTag.second.first;
        string tagContent = openedTag.second.second;
        size_t endPos = wordPos;

        // span attribute overwrites position
        string span = ParseXmlTagAttribute(tagContent,"span");
        if (! span.empty()) {
          vector<string> ij = Tokenize(span, "-");
          if (ij.size() != 1 && ij.size() != 2) {
            TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl);
            return false;
          }
          startPos = atoi(ij[0].c_str());
          if (ij.size() == 1) endPos = startPos + 1;
          else endPos = atoi(ij[1].c_str()) + 1; // endPos is exclusive below
        }

        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl);

        if (startPos >= endPos) {
          TRACE_ERR("ERROR: tag " << tagName << " must span at least one word: " << line << endl);
          return false;
        }

        // may be either a input span label ("label"), or a specified output translation "translation"
        string label = ParseXmlTagAttribute(tagContent,"label");
        string translation = ParseXmlTagAttribute(tagContent,"translation");

        // specified label
        if (translation.length() == 0 && label.length() > 0) {
          WordsRange range(startPos,endPos-1); // really?
          XMLParseOutput item(label, range);
          sourceLabels.push_back(item);
        }

        // specified translations -> vector of phrases, separated by "||"
        if (translation.length() > 0 && StaticData::Instance().GetXmlInputType() != XmlIgnore) {
          vector<string> altTexts = TokenizeMultiCharSeparator(translation, "||");
          vector<string> altLabel = TokenizeMultiCharSeparator(label, "||");
          vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||");
          //TRACE_ERR("number of translations: " << altTexts.size() << endl);
          for (size_t i=0; i<altTexts.size(); ++i) {
            // set target phrase
            TargetPhrase targetPhrase(Output);
            targetPhrase.CreateFromString(outputFactorOrder,altTexts[i],factorDelimiter);

            // set constituent label: use the i-th "label" alternative if given,
            // otherwise fall back to the first configured unknown-word LHS
            string targetLHSstr;
            if (altLabel.size() > i && altLabel[i].size() > 0) {
              targetLHSstr = altLabel[i];
            } else {
              // NOTE(review): dereferences lhsList.begin() -- crashes if the
              // unknown-LHS list is empty; confirm config guarantees non-empty
              const UnknownLHSList &lhsList = StaticData::Instance().GetUnknownLHS();
              UnknownLHSList::const_iterator iterLHS = lhsList.begin();
              targetLHSstr = iterLHS->first;
            }
            Word targetLHS(true);
            targetLHS.CreateFromString(Output, outputFactorOrder, targetLHSstr, true);
            CHECK(targetLHS.GetFactor(0) != NULL);
            targetPhrase.SetTargetLHS(targetLHS);

            // get probability
            float probValue = 1;
            if (altProbs.size() > i && altProbs[i].size() > 0) {
              probValue = Scan<float>(altProbs[i]);
            }
            // convert from prob to log-prob
            float scoreValue = FloorScore(TransformScore(probValue));
            targetPhrase.SetScore(scoreValue);

            // set span and create XmlOption
            // NOTE(review): range is shifted by one relative to the label case
            // above -- presumably to account for the <s> marker; confirm
            WordsRange range(startPos+1,endPos);
            XmlOption *option = new XmlOption(range,targetPhrase);
            CHECK(option);
            xmlOptions.push_back(option); // ownership passes to caller

            VERBOSE(2,"xml translation = [" << range << "] " << targetLHSstr << " -> " << altTexts[i] << " prob: " << probValue << endl);
          }
          altTexts.clear();
          altProbs.clear();
        }
      }
    }
  }
  // we are done. check if there are tags that are still open
  if (tagStack.size() > 0) {
    TRACE_ERR("ERROR: some opened tags were never closed: " << line << endl);
    return false;
  }

  // return de-xml'ed sentence in line
  line = cleanLine;
  return true;
}
/**
 * Load a "new format" phrase table from a stream into this dictionary.
 * Line layout here: source ||| target ||| alignment ||| scores [||| extra]
 * (note: alignment precedes scores in this format).
 *
 * \param input  factor order for the source side
 * \param output factor order for the target side
 * \param inStream text phrase table, one rule per line
 * \param weight  feature weights used when pre-computing chart scores
 * \param tableLimit NOTE(review): unused -- the sort below uses m_tableLimit
 * \param languageModels LMs used for score pre-computation
 * \param weightWP NOTE(review): unused in this body -- confirm intended
 * \return true (all failures abort() instead)
 */
bool PhraseDictionaryNewFormat::Load(const std::vector<FactorType> &input
                                     , const std::vector<FactorType> &output
                                     , std::istream &inStream
                                     , const std::vector<float> &weight
                                     , size_t tableLimit
                                     , const LMList &languageModels
                                     , float weightWP)
{
  PrintUserTime("Start loading new format pt model");

  const StaticData &staticData = StaticData::Instance();
  const std::string& factorDelimiter = staticData.GetFactorDelimiter();

  VERBOSE(2,"PhraseDictionaryNewFormat: input=" << m_inputFactors << " output=" << m_outputFactors << std::endl);

  string line;
  size_t count = 0;  // rules loaded so far; reported in error messages
  while(getline(inStream, line)) {
    vector<string> tokens;
    vector<float> scoreVector;

    TokenizeMultiCharSeparator(tokens, line , "|||" );

    if (tokens.size() != 4 && tokens.size() != 5) {
      stringstream strme;
      strme << "Syntax error at " << m_filePath << ":" << count;
      UserMessage::Add(strme.str());
      abort();
    }

    const string &sourcePhraseString = tokens[0]
                 , &targetPhraseString = tokens[1]
                 , &alignString = tokens[2]
                 , &scoreString = tokens[3];

    // NOTE(review): the message says "empty target" but the test is on the
    // SOURCE side of the rule -- confirm which side is intended
    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( m_filePath << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }

    Tokenize<float>(scoreVector, scoreString);
    if (scoreVector.size() != m_numScoreComponent) {
      stringstream strme;
      strme << "Size of scoreVector != number (" <<scoreVector.size() << "!=" <<m_numScoreComponent<<") of score components on line " << count;
      UserMessage::Add(strme.str());
      abort();
    }
    // redundant with the abort() above; kept as a debug-build safety net
    assert(scoreVector.size() == m_numScoreComponent);

    // parse source & find pt node

    // head word
    Word sourceLHS, targetLHS;

    // source
    Phrase sourcePhrase(Input);
    sourcePhrase.CreateFromStringNewFormat(Input, input, sourcePhraseString, factorDelimiter, sourceLHS);

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase(Output);
    targetPhrase->CreateFromStringNewFormat(Output, output, targetPhraseString, factorDelimiter, targetLHS);

    // alignment
    list<pair<size_t,size_t> > alignmentInfo;
    CreateAlignmentInfo(alignmentInfo, alignString);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignmentInfo);
    targetPhrase->SetTargetLHS(targetLHS);
    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

    // component score, for n-best output
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);

    targetPhrase->SetScoreChart(GetFeature(), scoreVector, weight, languageModels);

    // count info for backoff
    // NOTE(review): dead code -- tokens.size() is forced to 4 or 5 by the
    // abort() above, so this branch never runs; probably should test == 5
    // and read tokens[4] -- confirm against the file format
    if (tokens.size() >= 6)
      targetPhrase->CreateCountInfo(tokens[5]);

    // ownership of targetPhrase passes to the collection
    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(sourcePhrase, *targetPhrase);
    AddEquivPhrase(phraseColl, targetPhrase);

    count++;
  }

  // cleanup cache

  // sort each target phrase collection
  m_collection.Sort(m_tableLimit);

  return true;
}
/**
 * Load a text-format SCFG rule table into a trie.
 * Line layout here: source ||| target ||| scores ||| alignment [||| extra].
 * Hiero-format lines are first rewritten into Moses format.
 *
 * \param format MosesFormat or HieroFormat
 * \param input  factor order for the source side
 * \param output factor order for the target side
 * \param inStream text rule table, one rule per line
 * \param weight  feature weights used when pre-computing chart scores
 * \param languageModels LMs used for score pre-computation
 * \param wpProducer word-penalty feature used for score pre-computation
 * \param ruleTable trie to populate
 * \return true (all failures abort() instead)
 */
bool RuleTableLoaderStandard::Load(FormatType format
                                   , const std::vector<FactorType> &input
                                   , const std::vector<FactorType> &output
                                   , std::istream &inStream
                                   , const std::vector<float> &weight
                                   , size_t /* tableLimit */
                                   , const LMList &languageModels
                                   , const WordPenaltyProducer* wpProducer
                                   , RuleTableTrie &ruleTable)
{
  PrintUserTime(string("Start loading text SCFG phrase table. ") + (format==MosesFormat?"Moses ":"Hiero ") + " format");

  const StaticData &staticData = StaticData::Instance();
  const std::string& factorDelimiter = staticData.GetFactorDelimiter();

  string lineOrig;
  size_t count = 0;  // rules loaded so far; reported in error messages

  while(getline(inStream, lineOrig)) {
    const string *line;
    if (format == HieroFormat) { // reformat line
      // heap-allocated; released at the bottom of this loop iteration
      line = ReformatHieroRule(lineOrig);
    } else {
      // do nothing to format of line
      line = &lineOrig;
    }

    vector<string> tokens;
    vector<float> scoreVector;

    TokenizeMultiCharSeparator(tokens, *line , "|||" );

    if (tokens.size() != 4 && tokens.size() != 5) {
      stringstream strme;
      strme << "Syntax error at " << ruleTable.GetFilePath() << ":" << count;
      UserMessage::Add(strme.str());
      abort();
    }

    const string &sourcePhraseString = tokens[0]
                 , &targetPhraseString = tokens[1]
                 , &scoreString        = tokens[2]
                 , &alignString        = tokens[3];

    // NOTE(review): the message says "empty target" but the test is on the
    // SOURCE side of the rule -- confirm which side is intended
    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }

    Tokenize<float>(scoreVector, scoreString);
    const size_t numScoreComponents = ruleTable.GetFeature()->GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      stringstream strme;
      strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
            << numScoreComponents << ") of score components on line " << count;
      UserMessage::Add(strme.str());
      abort();
    }
    // redundant with the abort() above; kept as a belt-and-braces check
    CHECK(scoreVector.size() == numScoreComponents);

    // parse source & find pt node

    // constituent labels
    Word sourceLHS, targetLHS;

    // source
    Phrase sourcePhrase( 0);
    sourcePhrase.CreateFromStringNewFormat(Input, input, sourcePhraseString, factorDelimiter, sourceLHS);

    // create target phrase obj
    TargetPhrase *targetPhrase = new TargetPhrase(Output);
    targetPhrase->CreateFromStringNewFormat(Output, output, targetPhraseString, factorDelimiter, targetLHS);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString);
    targetPhrase->SetTargetLHS(targetLHS);
    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

    // component score, for n-best output
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);

    targetPhrase->SetScoreChart(ruleTable.GetFeature(), scoreVector, weight, languageModels,wpProducer);

    // ownership of targetPhrase passes to the collection
    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS);
    phraseColl.Add(targetPhrase);

    count++;

    if (format == HieroFormat) { // reformat line
      delete line;
    } else {
      // do nothing
    }
  }

  // sort and prune each target phrase collection
  SortAndPrune(ruleTable);

  return true;
}