double PhraseDictionaryMultiModelCounts::ComputeWeightedLexicalTranslation( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const vector<lexicalTable*> &tables, vector<float> &multimodelweights, bool is_input) const { // lexical translation probability double lexScore = 1.0; Word null; if (is_input) { null.CreateFromString(Input, m_input, "NULL", false); } else { null.CreateFromString(Output, m_output, "NULL", false); } // all target words have to be explained for(size_t ti=0; ti<alignment.size(); ti++) { const set< size_t > & srcIndices = alignment[ ti ]; Word t_word = phraseT.GetWord(ti); if (srcIndices.empty()) { // explain unaligned word by NULL lexScore *= GetLexicalProbability( null, t_word, tables, multimodelweights ); } else { // go through all the aligned words to compute average double thisWordScore = 0; for (set< size_t >::const_iterator si(srcIndices.begin()); si != srcIndices.end(); ++si) { Word s_word = phraseS.GetWord(*si); thisWordScore += GetLexicalProbability( s_word, t_word, tables, multimodelweights ); } lexScore *= thisWordScore / srcIndices.size(); } } return lexScore; }
/**
 * Builds a forest vertex from its textual description.
 *
 * If the string contains a '[', everything before the last '[' is the
 * (non-terminal) symbol and the remainder is read as "start,end" followed by
 * one closing character. Otherwise the whole string is a terminal symbol and
 * the vertex gets a placeholder span of (0,0).
 *
 * \param s            vertex description
 * \param factorOrder  factor order used to create the symbol
 * \return newly allocated vertex (allocated with new; not freed here)
 */
Syntax::F2S::Forest::Vertex *ForestInput::ParseVertex(
  const StringPiece &s, const std::vector<FactorType>& factorOrder)
{
  using Syntax::F2S::Forest;

  Word symbol;
  const std::size_t bracketPos = s.rfind('[');

  if (bracketPos != std::string::npos) {
    symbol.CreateFromString(Input, factorOrder, s.substr(0, bracketPos), true);

    // the comma is searched from the second character after the '['
    const std::size_t numberBegin = bracketPos + 1;
    const std::size_t commaPos = s.find(',', numberBegin + 1);

    std::string digits;
    s.substr(numberBegin, commaPos - numberBegin).CopyToString(&digits);
    const std::size_t start = std::atoi(digits.c_str());

    // skip the comma and drop the final character of s
    s.substr(commaPos + 1, s.size() - commaPos - 2).CopyToString(&digits);
    const std::size_t end = std::atoi(digits.c_str());

    // Create vertex: offset span by 1 to allow for <s> in first position.
    Range span(start + 1, end + 1);
    return new Forest::Vertex(Syntax::PVertex(span, symbol));
  }

  // No span information: caller will fill in span.
  symbol.CreateFromString(Input, factorOrder, s, false);
  Range span(0, 0);
  return new Forest::Vertex(Syntax::PVertex(span, symbol));
}
void Tokenize(OnDiskPt::Phrase &phrase , const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm , OnDiskPt::OnDiskWrapper &onDiskWrapper) { bool nonTerm = false; size_t tokSize = token.size(); int comStr =token.compare(0, 1, "["); if (comStr == 0) { comStr = token.compare(tokSize - 1, 1, "]"); nonTerm = comStr == 0; } if (nonTerm) { // non-term size_t splitPos = token.find_first_of("[", 2); string wordStr = token.substr(0, splitPos); if (splitPos == string::npos) { // lhs - only 1 word Word *word = new Word(); word->CreateFromString(wordStr, onDiskWrapper.GetVocab()); phrase.AddWord(word); } else { // source & target non-terms if (addSourceNonTerm) { Word *word = new Word(); word->CreateFromString(wordStr, onDiskWrapper.GetVocab()); phrase.AddWord(word); } wordStr = token.substr(splitPos, tokSize - splitPos); if (addTargetNonTerm) { Word *word = new Word(); word->CreateFromString(wordStr, onDiskWrapper.GetVocab()); phrase.AddWord(word); } } } else { // term Word *word = new Word(); word->CreateFromString(token, onDiskWrapper.GetVocab()); phrase.AddWord(word); } }
void UnknownWordPenalty::Lookup(const std::vector<InputPath*> &inputPathQueue) { Scores *estimatedFutureScore = new Scores(); for (size_t i = 0; i < inputPathQueue.size(); ++i) { InputPath &path = *inputPathQueue[i]; PhraseTableLookup &ptLookup = path.GetPtLookup(m_ptId); const Phrase &source = path.GetPhrase(); if (source.GetSize() == 1) { const Word &sourceWord = source.GetWord(0); string str = sourceWord.ToString(); str = "UNK:" + str + ":UNK"; Word targetWord; targetWord.CreateFromString(str); TargetPhrase *tp = new TargetPhrase(1); tp->Set(0, targetWord); tp->GetScores().Add(*this, LOWEST_SCORE); FeatureFunction::Evaluate(source, *tp, *estimatedFutureScore); TargetPhrases *tpColl = new TargetPhrases(); m_targetPhrases.push_back(tpColl); tpColl->Add(tp); ptLookup.Set(tpColl, NULL); } else { ptLookup.Set(NULL, NULL); } } }
/**
 * Loads the word <-> id mappings from an open vocabulary file.
 *
 * Expected layout: the first line holds the number of entries; each
 * following line holds a word, optionally followed by its numeric id. When
 * the id is missing (or 0) and the word is not the OOV word, the next
 * sequential id (current number of entries + 1) is assigned.
 *
 * Any previously loaded mapping is discarded first.
 *
 * \param vcbin      stream to read from
 * \param direction  factor direction used to create each Word
 * \param factors    factor list used to create each Word
 * \param closed     stored in m_closed after loading
 * \return always true; throws (UTIL_THROW_IF2) if the first line cannot be
 *         read or if an id or a word occurs twice
 */
bool Vocab::Load(FileHandler* vcbin, const FactorDirection& direction,
                 const FactorList& factors, bool closed)
{
  // load vocab id -> word mapping
  m_words2ids.clear();	// reset mapping
  m_ids2words.clear();
  std::string line, word_str;

  std::istream &ret = getline(*vcbin, line);
  UTIL_THROW_IF2(!ret, "Couldn't read file");
  std::istringstream first(line.c_str());
  uint32_t vcbsize(0);
  first >> vcbsize;
  uint32_t loadedsize = 0;
  while (loadedsize++ < vcbsize && getline(*vcbin, line)) {
    std::istringstream entry(line.c_str());
    entry >> word_str;
    Word word;
    word.CreateFromString( direction, factors, word_str, false); // TODO set correctly isNonTerminal

    // Start every entry from 0: when the line carries no id the extraction
    // below fails without storing anything (the stream is already at EOF), so
    // without this reset the test further down would read an uninitialised
    // value on the first entry and the previous entry's id afterwards.
    wordID_t id = 0;
    entry >> id;  // may be no id (i.e. file may just be a word list)
    if (id == 0 && word != GetkOOVWord())
      id = m_ids2words.size() + 1; // assign ids sequentially starting from 1

    // neither the id nor the word may have been seen before
    UTIL_THROW_IF2(m_ids2words.count(id) != 0 || m_words2ids.count(word) != 0,
                   "Error");

    m_ids2words[id] = word;
    m_words2ids[word] = id;
  }
  m_closed = closed;  // once loaded fix vocab ?
  std::cerr << "Loaded vocab with " << m_ids2words.size() << " words." << std::endl;
  return true;
}
/**
 * Returns the id of the word given as a plain string.
 *
 * The string is turned into a terminal Input word with the single factor
 * type 0 and then resolved through GetWordID(const Word&).
 */
wordID_t Vocab::GetWordID(const std::string& word_str)
{
  // single factor, type 0
  FactorList singleFactor;
  singleFactor.push_back(0);

  Word lookupWord;
  lookupWord.CreateFromString(Input, singleFactor, word_str, false);

  return GetWordID(lookupWord);
}
// get wordID_t index for word represented as string wordID_t Vocab::GetWordID(const std::string& word_str, const FactorDirection& direction, const FactorList& factors, bool isNonTerminal) { // get id for factored string Word word; word.CreateFromString( direction, factors, word_str, isNonTerminal); return GetWordID( word); }
/**
 * Collects the raw lexical counts needed to score a phrase pair, without
 * applying any model weights.
 *
 * For every index of the alignment vector one entry is appended to the
 * result. That entry holds one (joint counts, marginal counts) pair per
 * aligned word of phraseS, or a single pair computed against the NULL word
 * when the position is unaligned. Each count vector has m_numModels elements.
 *
 * \param phraseS    phrase the alignment indices point into
 * \param phraseT    phrase whose words are being explained
 * \param alignment  for each position of phraseT, the aligned positions in phraseS
 * \param tables     lexical tables, forwarded to the FillLexicalCounts* helpers
 * \param is_input   selects whether the NULL word is created with the
 *                   Input/m_input or the Output/m_output configuration
 * \return the per-position count cache
 */
lexicalCache PhraseDictionaryMultiModelCounts::CacheLexicalStatistics( const Phrase &phraseS, const Phrase &phraseT, AlignVector &alignment, const vector<lexicalTable*> &tables, bool is_input )
{
  // placeholder word used for positions that have no alignment point
  Word nullWord;
  if (is_input) {
    nullWord.CreateFromString(Input, m_input, "NULL", false);
  } else {
    nullWord.CreateFromString(Output, m_output, "NULL", false);
  }

  lexicalCache cache;

  const size_t numPositions = alignment.size();
  for (size_t tgtPos = 0; tgtPos != numPositions; ++tgtPos) {
    const set< size_t > &alignedSrc = alignment[ tgtPos ];
    Word targetWord = phraseT.GetWord(tgtPos);
    vector<lexicalPair> pairsForWord;

    if (alignedSrc.empty()) {
      // unaligned: counts against NULL
      vector<float> jointCounts (m_numModels);
      vector<float> marginalCounts (m_numModels);

      FillLexicalCountsJoint(nullWord, targetWord, jointCounts, tables);
      FillLexicalCountsMarginal(nullWord, marginalCounts, tables);

      pairsForWord.push_back(make_pair(jointCounts, marginalCounts));
    } else {
      // aligned: one pair of count vectors per linked word
      set< size_t >::const_iterator srcIt = alignedSrc.begin();
      while (srcIt != alignedSrc.end()) {
        Word sourceWord = phraseS.GetWord(*srcIt);
        vector<float> jointCounts (m_numModels);
        vector<float> marginalCounts (m_numModels);

        FillLexicalCountsJoint(sourceWord, targetWord, jointCounts, tables);
        FillLexicalCountsMarginal(sourceWord, marginalCounts, tables);

        pairsForWord.push_back(make_pair(jointCounts, marginalCounts));
        ++srcIt;
      }
    }

    cache.push_back(pairsForWord);
  }

  return cache;
}
void Phrase::CreateFromStringNewFormat(FactorDirection direction , const std::vector<FactorType> &factorOrder , const std::string &phraseString , const std::string &factorDelimiter , Word &lhs) { m_arity = 0; // parse vector<string> annotatedWordVector; Tokenize(annotatedWordVector, phraseString); // KOMMA|none ART|Def.Z NN|Neut.NotGen.Sg VVFIN|none // to // "KOMMA|none" "ART|Def.Z" "NN|Neut.NotGen.Sg" "VVFIN|none" for (size_t phrasePos = 0 ; phrasePos < annotatedWordVector.size() - 1 ; phrasePos++) { string &annotatedWord = annotatedWordVector[phrasePos]; bool isNonTerminal; if (annotatedWord.substr(0, 1) == "[" && annotatedWord.substr(annotatedWord.size()-1, 1) == "]") { // non-term isNonTerminal = true; size_t nextPos = annotatedWord.find("[", 1); assert(nextPos != string::npos); if (direction == Input) annotatedWord = annotatedWord.substr(1, nextPos - 2); else annotatedWord = annotatedWord.substr(nextPos + 1, annotatedWord.size() - nextPos - 2); m_arity++; } else { isNonTerminal = false; } Word &word = AddWord(); word.CreateFromString(direction, factorOrder, annotatedWord, isNonTerminal); } // lhs string &annotatedWord = annotatedWordVector.back(); assert(annotatedWord.substr(0, 1) == "[" && annotatedWord.substr(annotatedWord.size()-1, 1) == "]"); annotatedWord = annotatedWord.substr(1, annotatedWord.size() - 2); lhs.CreateFromString(direction, factorOrder, annotatedWord, true); assert(lhs.IsNonTerminal()); }
/**
 * Creates a special vocabulary word from its string form.
 *
 * The word is built as a terminal Input word whose only factor (type 0)
 * carries the given string.
 *
 * \param word_str  textual form of the special word
 * \return the created word
 */
const Word Vocab::InitSpecialWord( const std::string& word_str)
{
  // the special word string goes into the first (and only) factor
  FactorList singleFactor;
  singleFactor.push_back(0);

  // Input is an enum defined in ../typedef.h; isNonTerminal is false.
  // TODO not sure if this will work properly:
  //  - comparisons may fail when other words were created with a different
  //    value for the last CreateFromString() parameter (isNonTerminal)
  //  - the special word is an Input word, but what about Output words?
  //  - Input/Output is currently not stored in class Word, but in the future???
  Word specialWord;
  specialWord.CreateFromString( Input, singleFactor, word_str, false );

  return specialWord;
}
/**
 * Decodes target phrases for sourcePhrase from an encoded bit stream and
 * appends them to tpv.
 *
 * The decoder is a small state machine that runs while bits remain in the
 * stream: New -> Symbol -> Score -> (Alignment, only when
 * m_containsAlignmentInfo) -> Add -> New ...
 *
 * How a symbol is turned into target words depends on m_coding:
 *  - REnc:  the symbol is a plain target word (type 1) or a rank into the
 *           translations of a source word (types 2 and 3);
 *  - PREnc: the symbol is a plain target word (type 1) or a pointer to a
 *           target phrase of a sub-span of the source phrase;
 *  - otherwise the symbol is looked up directly with GetTargetSymbol().
 *
 * \param tpv               collection that receives the decoded phrases; a
 *                          non-empty tpv on entry marks the call as "extending"
 * \param encodedBitStream  bit stream to read from
 * \param sourcePhrase      source phrase the collection belongs to
 * \param topLevel          when false (PREnc only), decoding stops once tpv
 *                          holds at least m_maxRank phrases (if m_maxRank != 0)
 * \return tpv, or an empty TargetPhraseVectorPtr when one of the consistency
 *         checks on the decoded symbols fails
 */
TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
  TargetPhraseVectorPtr tpv, BitWrapper<> &encodedBitStream,
  const Phrase &sourcePhrase, bool topLevel)
{

  // true when tpv already contained phrases on entry
  bool extending = tpv->size();
  // number of bits still unread; handed to the decoding cache at the end (PREnc)
  size_t bitsLeft = encodedBitStream.TellFromEnd();

  typedef std::pair<size_t, size_t> AlignPointSizeT;

  // REnc only: ids of the source words, indexed by source position
  std::vector<int> sourceWords;
  if(m_coding == REnc) {
    for(size_t i = 0; i < sourcePhrase.GetSize(); i++) {
      std::string sourceWord
      = sourcePhrase.GetWord(i).GetString(*m_input, false);
      unsigned idx = GetSourceSymbolId(sourceWord);
      sourceWords.push_back(idx);
    }
  }

  // sentinels that terminate the symbol sequence and the alignment sequence
  unsigned phraseStopSymbol = 0;
  AlignPoint alignStopSymbol(-1, -1);

  // scores and alignment points of the phrase currently being decoded
  std::vector<float> scores;
  std::set<AlignPointSizeT> alignment;

  enum DecodeState { New, Symbol, Score, Alignment, Add } state = New;

  size_t srcSize = sourcePhrase.GetSize();

  // points at the last element of tpv while a phrase is being decoded
  TargetPhrase* targetPhrase = NULL;
  while(encodedBitStream.TellFromEnd()) {

    if(state == New) {
      // Append a fresh TargetPhrase to tpv and decode into it in place
      tpv->push_back(TargetPhrase(Output));
      targetPhrase = &tpv->back();

      targetPhrase->SetSourcePhrase(sourcePhrase);
      alignment.clear();
      scores.clear();

      state = Symbol;
    }

    if(state == Symbol) {
      unsigned symbol = m_symbolTree->Read(encodedBitStream);
      if(symbol == phraseStopSymbol) {
        // all words of this phrase have been read; scores follow
        state = Score;
      } else {
        if(m_coding == REnc) {
          std::string wordString;
          size_t type = GetREncType(symbol);

          if(type == 1) {
            // plain target word
            unsigned decodedSymbol = DecodeREncSymbol1(symbol);
            wordString = GetTargetSymbol(decodedSymbol);
          } else if (type == 2) {
            // rank-th translation of the source word at an explicit position
            size_t rank = DecodeREncSymbol2Rank(symbol);
            size_t srcPos = DecodeREncSymbol2Position(symbol);

            // position outside the source phrase: give up on this collection
            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();

            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
            if(m_phraseDictionary.m_useAlignmentInfo) {
              size_t trgPos = targetPhrase->GetSize();
              alignment.insert(AlignPoint(srcPos, trgPos));
            }
          } else if(type == 3) {
            // rank-th translation of the source word at the current target position
            size_t rank = DecodeREncSymbol3(symbol);
            size_t srcPos = targetPhrase->GetSize();

            // position outside the source phrase: give up on this collection
            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();

            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
            if(m_phraseDictionary.m_useAlignmentInfo) {
              size_t trgPos = srcPos;
              alignment.insert(AlignPoint(srcPos, trgPos));
            }
          }

          // NOTE(review): for any other type wordString stays empty here
          Word word;
          word.CreateFromString(Output, *m_output, wordString, false);
          targetPhrase->AddWord(word);
        } else if(m_coding == PREnc) {
          // if the symbol is just a word
          if(GetPREncType(symbol) == 1) {
            unsigned decodedSymbol = DecodePREncSymbol1(symbol);

            Word word;
            word.CreateFromString(Output, *m_output,
                                  GetTargetSymbol(decodedSymbol), false);
            targetPhrase->AddWord(word);
          }
          // if the symbol is a subphrase pointer
          else {
            int left = DecodePREncSymbol2Left(symbol);
            int right = DecodePREncSymbol2Right(symbol);
            unsigned rank = DecodePREncSymbol2Rank(symbol);

            // source span covered by the subphrase
            int srcStart = left + targetPhrase->GetSize();
            int srcEnd   = srcSize - right - 1;

            // false positive consistency check
            if(0 > srcStart || srcStart > srcEnd || unsigned(srcEnd) >= srcSize)
              return TargetPhraseVectorPtr();

            // false positive consistency check
            if(m_maxRank && rank > m_maxRank)
              return TargetPhraseVectorPtr();

            // set subphrase by default to itself
            TargetPhraseVectorPtr subTpv = tpv;

            // if range smaller than source phrase retrieve subphrase
            if(unsigned(srcEnd - srcStart + 1) != srcSize) {
              Phrase subPhrase = sourcePhrase.GetSubString(WordsRange(srcStart, srcEnd));
              subTpv = CreateTargetPhraseCollection(subPhrase, false);
            }

            // false positive consistency check
            if(subTpv != NULL && rank < subTpv->size()) {
              // insert the subphrase into the main target phrase
              TargetPhrase& subTp = subTpv->at(rank);
              if(m_phraseDictionary.m_useAlignmentInfo) {
                // reconstruct the alignment data based on the alignment of the subphrase
                for(AlignmentInfo::const_iterator it = subTp.GetAlignmentInfo().begin();
                    it != subTp.GetAlignmentInfo().end(); it++) {
                  alignment.insert(AlignPointSizeT(srcStart + it->first,
                                                   targetPhrase->GetSize() + it->second));
                }
              }
              targetPhrase->Append(subTp);
            } else
              return TargetPhraseVectorPtr();
          }
        } else {
          // no special coding: the symbol maps directly to a target word
          Word word;
          word.CreateFromString(Output, *m_output,
                                GetTargetSymbol(symbol), false);
          targetPhrase->AddWord(word);
        }
      }
    } else if(state == Score) {
      // one score tree per component, or a single shared tree
      size_t idx = m_multipleScoreTrees ? scores.size() : 0;
      float score = m_scoreTrees[idx]->Read(encodedBitStream);
      scores.push_back(score);

      // all components read: hand the scores to the phrase
      if(scores.size() == m_numScoreComponent) {
        targetPhrase->SetScore(m_feature, scores, ScoreComponentCollection() /*sparse*/,*m_weight, m_weightWP, *m_languageModels);

        if(m_containsAlignmentInfo)
          state = Alignment;
        else
          state = Add;
      }
    } else if(state == Alignment) {
      AlignPoint alignPoint = m_alignTree->Read(encodedBitStream);
      if(alignPoint == alignStopSymbol) {
        state = Add;
      } else {
        // alignment points are always consumed, but only kept when requested
        if(m_phraseDictionary.m_useAlignmentInfo)
          alignment.insert(AlignPointSizeT(alignPoint));
      }
    }

    // not an else-branch: a phrase completed above is finalised in the same iteration
    if(state == Add) {
      if(m_phraseDictionary.m_useAlignmentInfo)
        targetPhrase->SetAlignmentInfo(alignment);

      if(m_coding == PREnc) {
        // remember the stream position as long as the rank limit is not exceeded
        if(!m_maxRank || tpv->size() <= m_maxRank)
          bitsLeft = encodedBitStream.TellFromEnd();

        // nested (non top-level) decoding stops at the rank limit
        if(!topLevel && m_maxRank && tpv->size() >= m_maxRank)
          break;
      }

      // 8 or fewer bits left: stop decoding
      // NOTE(review): presumably these are padding bits - confirm
      if(encodedBitStream.TellFromEnd() <= 8)
        break;

      state = New;
    }
  }

  // PREnc: cache the result together with the remaining bit count
  // (a count of 8 or less is stored as 0), unless tpv was extended
  if(m_coding == PREnc && !extending) {
    bitsLeft = bitsLeft > 8 ? bitsLeft : 0;
    m_decodingCache.Cache(sourcePhrase, tpv, bitsLeft, m_maxRank);
  }

  return tpv;
}
void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to) { // unknown word, add as trans opt const StaticData &staticData = StaticData::Instance(); const UnknownWordPenaltyProducer &unknownWordPenaltyProducer = UnknownWordPenaltyProducer::Instance(); size_t isDigit = 0; if (staticData.GetDropUnknown()) { const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface const StringPiece s = f->GetString(); isDigit = s.find_first_of("0123456789"); if (isDigit == string::npos) isDigit = 0; else isDigit = 1; // modify the starting bitmap } Phrase* unksrc = new Phrase(1); unksrc->AddWord() = sourceWord; Word &newWord = unksrc->GetWord(0); newWord.SetIsOOV(true); m_unksrcs.push_back(unksrc); //TranslationOption *transOpt; if (! staticData.GetDropUnknown() || isDigit) { // loop const UnknownLHSList &lhsList = staticData.GetUnknownLHS(); UnknownLHSList::const_iterator iterLHS; for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) { const string &targetLHSStr = iterLHS->first; float prob = iterLHS->second; // lhs //const Word &sourceLHS = staticData.GetInputDefaultNonTerminal(); Word *targetLHS = new Word(true); targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true); UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS"); // add to dictionary TargetPhrase *targetPhrase = new TargetPhrase(); Word &targetWord = targetPhrase->AddWord(); targetWord.CreateUnknownWord(sourceWord); // scores float unknownScore = FloorScore(TransformScore(prob)); targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore); targetPhrase->Evaluate(*unksrc); targetPhrase->SetTargetLHS(targetLHS); targetPhrase->SetAlignmentInfo("0-0"); if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) { targetPhrase->SetProperty("Tree","[ " + (*targetLHS)[0]->GetString().as_string() + " "+sourceWord[0]->GetString().as_string()+" ]"); 
} // chart rule to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range); } // for (iterLHS } else { // drop source word. create blank trans opt float unknownScore = FloorScore(-numeric_limits<float>::infinity()); TargetPhrase *targetPhrase = new TargetPhrase(); // loop const UnknownLHSList &lhsList = staticData.GetUnknownLHS(); UnknownLHSList::const_iterator iterLHS; for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) { const string &targetLHSStr = iterLHS->first; //float prob = iterLHS->second; Word *targetLHS = new Word(true); targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true); UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS"); targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore); targetPhrase->Evaluate(*unksrc); targetPhrase->SetTargetLHS(targetLHS); // chart rule to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range); } } }
/**
 * Process a sentence with xml annotation.
 * Xml tags may specify span labels for the input or additional
 * translation options.
 *
 * Tags must be properly nested; unary tags ("<x/>") are opened and closed in
 * one step. The span of a tag is the range of words between its opening and
 * closing tag unless a "span" attribute ("i" or "i-j") overrides it.
 *  - A tag with a "label" but no "translation" attribute adds an entry to
 *    sourceLabels.
 *  - A tag with a "translation" attribute (alternatives separated by "||",
 *    with optional parallel "label" and "prob" lists) adds one XmlOption per
 *    alternative to xmlOptions, unless the xml input type is XmlIgnore.
 *
 * \param line         in: sentence, out: sentence without the xml
 *                     (only replaced when the function returns true and the
 *                     sentence contained a '<')
 * \param sourceLabels receives the span labels specified by xml
 * \param xmlOptions   receives the translation options specified by xml
 *                     (allocated with new)
 * \return true on success (including a line without any '<'); false if the
 *         markup is malformed, after reporting the problem via TRACE_ERR
 */
bool TreeInput::ProcessAndStripXMLTags(string &line, std::vector<XMLParseOutput> &sourceLabels, std::vector<XmlOption*> &xmlOptions)
{
  //parse XML markup in translation line

  // no xml tag? we're done.
  if (line.find_first_of('<') == string::npos) {
    return true;
  }

  // break up input into a vector of xml tags and text
  // example: (this), (<b>), (is a), (</b>), (test .)
  vector<string> xmlTokens = TokenizeXml(line);

  // we need to store opened tags, until they are closed
  // tags are stored as triples (tagname, startpos, contents)
  typedef pair< string, pair< size_t, string > > OpenedTag;
  vector< OpenedTag > tagStack; // stack that contains active opened tags

  string cleanLine; // return string (text without xml)
  size_t wordPos = 0; // position in sentence (in terms of number of words)

  // keep this handy for later
  const vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
  const string &factorDelimiter = StaticData::Instance().GetFactorDelimiter();

  // loop through the tokens
  for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
    // not a xml tag, but regular text (may contain many words)
    if(!isXmlTag(xmlTokens[xmlTokenPos])) {
      // add a space at boundary, if necessary
      if (cleanLine.size()>0 &&
          cleanLine[cleanLine.size() - 1] != ' ' &&
          xmlTokens[xmlTokenPos][0] != ' ') {
        cleanLine += " ";
      }
      cleanLine += xmlTokens[xmlTokenPos]; // add to output
      wordPos = Tokenize(cleanLine).size(); // count all the words
    }

    // process xml tag
    else {
      // *** get essential information about tag ***

      // strip extra boundary spaces and "<" and ">"
      string tag =  Trim(TrimXml(xmlTokens[xmlTokenPos]));
      VERBOSE(3,"XML TAG IS: " << tag << std::endl);

      if (tag.size() == 0) {
        TRACE_ERR("ERROR: empty tag name: " << line << endl);
        return false;
      }

      // check if unary (e.g., "<wall/>")
      bool isUnary = ( tag[tag.size() - 1] == '/' );

      // check if closing tag (e.g. "</a>") or opening tag (e.g. "<a>")
      bool isClosed = ( tag[0] == '/' );
      bool isOpen = !isClosed;

      if (isClosed && isUnary) {
        TRACE_ERR("ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl);
        return false;
      }

      if (isClosed)
        tag = tag.substr(1); // remove "/" at the beginning
      if (isUnary)
        tag = tag.substr(0,tag.size()-1); // remove "/" at the end

      // find the tag name and contents
      // (name = text up to the first space, contents = everything after it)
      string::size_type endOfName = tag.find_first_of(' ');
      string tagName = tag;
      string tagContent = "";
      if (endOfName != string::npos) {
        tagName = tag.substr(0,endOfName);
        tagContent = tag.substr(endOfName+1);
      }

      // *** process new tag ***

      if (isOpen || isUnary) {
        // put the tag on the tag stack
        OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
        tagStack.push_back( openedTag );
        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl);
      }

      // *** process completed tag ***

      if (isClosed || isUnary) {
        // pop last opened tag from stack;
        if (tagStack.size() == 0) {
          TRACE_ERR("ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl);
          return false;
        }
        OpenedTag openedTag = tagStack.back();
        tagStack.pop_back();

        // tag names have to match
        if (openedTag.first != tagName) {
          TRACE_ERR("ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl );
          return false;
        }

        // assemble remaining information about tag
        // (this tagContent shadows the outer one: it is the content of the
        // opening tag taken from the stack)
        size_t startPos = openedTag.second.first;
        string tagContent = openedTag.second.second;
        size_t endPos = wordPos;

        // span attribute overwrites position
        string span = ParseXmlTagAttribute(tagContent,"span");
        if (! span.empty()) {
          vector<string> ij = Tokenize(span, "-");
          if (ij.size() != 1 && ij.size() != 2) {
            TRACE_ERR("ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl);
            return false;
          }
          startPos = atoi(ij[0].c_str());
          // endPos is exclusive: "i" covers one word, "i-j" covers i..j
          if (ij.size() == 1) endPos = startPos + 1;
          else endPos = atoi(ij[1].c_str()) + 1;
        }

        VERBOSE(3,"XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl);

        if (startPos >= endPos) {
          TRACE_ERR("ERROR: tag " << tagName << " must span at least one word: " << line << endl);
          return false;
        }

        // may be either a input span label ("label"), or a specified output translation "translation"
        string label = ParseXmlTagAttribute(tagContent,"label");
        string translation = ParseXmlTagAttribute(tagContent,"translation");

        // specified label
        if (translation.length() == 0 && label.length() > 0) {
          WordsRange range(startPos,endPos-1); // really?
          XMLParseOutput item(label, range);
          sourceLabels.push_back(item);
        }

        // specified translations -> vector of phrases, separated by "||"
        if (translation.length() > 0 && StaticData::Instance().GetXmlInputType() != XmlIgnore) {
          vector<string> altTexts = TokenizeMultiCharSeparator(translation, "||");
          vector<string> altLabel = TokenizeMultiCharSeparator(label, "||");
          vector<string> altProbs = TokenizeMultiCharSeparator(ParseXmlTagAttribute(tagContent,"prob"), "||");
          //TRACE_ERR("number of translations: " << altTexts.size() << endl);
          for (size_t i=0; i<altTexts.size(); ++i) {
            // set target phrase
            TargetPhrase targetPhrase;
            targetPhrase.CreateFromString(Output, outputFactorOrder,altTexts[i],factorDelimiter, NULL);

            // set constituent label
            // (the i-th given label, or else the first entry of the unknown-LHS list)
            string targetLHSstr;
            if (altLabel.size() > i && altLabel[i].size() > 0) {
              targetLHSstr = altLabel[i];
            } else {
              // NOTE(review): dereferences begin() without checking that the list is non-empty
              const UnknownLHSList &lhsList = StaticData::Instance().GetUnknownLHS();
              UnknownLHSList::const_iterator iterLHS = lhsList.begin();
              targetLHSstr = iterLHS->first;
            }
            Word *targetLHS = new Word(true);
            targetLHS->CreateFromString(Output, outputFactorOrder, targetLHSstr, true);
            CHECK(targetLHS->GetFactor(0) != NULL);
            targetPhrase.SetTargetLHS(targetLHS);

            // not tested
            Phrase sourcePhrase = this->GetSubString(WordsRange(startPos,endPos-1));

            // get probability (defaults to 1 when none is given for this alternative)
            float probValue = 1;
            if (altProbs.size() > i && altProbs[i].size() > 0) {
              probValue = Scan<float>(altProbs[i]);
            }
            // convert from prob to log-prob
            float scoreValue = FloorScore(TransformScore(probValue));
            targetPhrase.SetXMLScore(scoreValue);
            targetPhrase.Evaluate(sourcePhrase);

            // set span and create XmlOption
            // (the option's range is shifted by one relative to the range used for sourcePhrase above)
            WordsRange range(startPos+1,endPos);
            XmlOption *option = new XmlOption(range,targetPhrase);
            CHECK(option);
            xmlOptions.push_back(option);

            VERBOSE(2,"xml translation = [" << range << "] " << targetLHSstr << " -> " << altTexts[i] << " prob: " << probValue << endl);
          }
          altTexts.clear();
          altProbs.clear();
        }
      }
    }
  }
  // we are done. check if there are tags that are still open
  if (tagStack.size() > 0) {
    TRACE_ERR("ERROR: some opened tags were never closed: " << line << endl);
    return false;
  }

  // return de-xml'ed sentence in line
  line = cleanLine;
  return true;
}
//! populate this InputType with data from in stream int ForestInput:: Read(std::istream &in, std::vector<FactorType> const& factorOrder, AllOptions const& opts) { using Syntax::F2S::Forest; m_forest = boost::make_shared<Forest>(); m_rootVertex = NULL; m_vertexSet.clear(); std::string line; if (std::getline(in, line, '\n').eof()) { return 0; } // The first line contains the sentence number. We ignore this and skip // straight to the second line, which contains the sentence string. std::string sentence; std::getline(in, sentence); // If the next line is blank then there was a parse failure. Otherwise, // the next line and any subsequent non-blank lines contain hyperedges. std::getline(in, line); if (line == "") { // Parse failure. We treat this as an empty sentence. sentence = ""; // The next line will be blank too. std::getline(in, line); } else { do { ParseHyperedgeLine(line, factorOrder); std::getline(in, line); } while (line != ""); } // Do base class Read(). // TODO Check if this is actually necessary. TreeInput does it, but I'm // not sure ForestInput needs to. std::stringstream strme; strme << "<s> " << sentence << " </s>" << std::endl; Sentence::Read(strme, factorOrder, opts); // Find the maximum end position of any vertex (0 if forest is empty). std::size_t maxEnd = FindMaxEnd(*m_forest); // Determine which vertices are the top vertices. std::vector<Forest::Vertex *> topVertices; if (!m_forest->vertices.empty()) { FindTopVertices(*m_forest, topVertices); assert(topVertices.size() >= 1); } // Add <s> vertex. Forest::Vertex *startSymbol = NULL; { Word symbol; symbol.CreateFromString(Input, factorOrder, "<s>", false); Syntax::PVertex pvertex(Range(0, 0), symbol); startSymbol = new Forest::Vertex(pvertex); m_forest->vertices.push_back(startSymbol); } // Add </s> vertex. 
Forest::Vertex *endSymbol = NULL; { Word symbol; symbol.CreateFromString(Input, factorOrder, "</s>", false); Syntax::PVertex pvertex(Range(maxEnd+1, maxEnd+1), symbol); endSymbol = new Forest::Vertex(pvertex); m_forest->vertices.push_back(endSymbol); } // Add root vertex. { Word symbol; symbol.CreateFromString(Input, factorOrder, "Q", true); Syntax::PVertex pvertex(Range(0, maxEnd+1), symbol); m_rootVertex = new Forest::Vertex(pvertex); m_forest->vertices.push_back(m_rootVertex); } // Add root's incoming hyperedges. if (topVertices.empty()) { Forest::Hyperedge *e = new Forest::Hyperedge(); e->head = m_rootVertex; e->tail.push_back(startSymbol); e->tail.push_back(endSymbol); m_rootVertex->incoming.push_back(e); } else { // Add a hyperedge between [Q] and each top vertex. for (std::vector<Forest::Vertex *>::const_iterator p = topVertices.begin(); p != topVertices.end(); ++p) { Forest::Hyperedge *e = new Forest::Hyperedge(); e->head = m_rootVertex; e->tail.push_back(startSymbol); e->tail.push_back(*p); e->tail.push_back(endSymbol); m_rootVertex->incoming.push_back(e); } } return 1; }