// Converts a SyntaxNode tree to a Moses::GHKM::ParseTree. std::auto_ptr<ParseTree> XmlTreeParser::ConvertTree( const SyntaxNode &tree, const std::vector<std::string> &words) { std::auto_ptr<ParseTree> root(new ParseTree(tree.GetLabel())); const std::vector<SyntaxNode*> &children = tree.GetChildren(); if (children.empty()) { if (tree.GetStart() != tree.GetEnd()) { std::ostringstream msg; msg << "leaf node covers multiple words (" << tree.GetStart() << "-" << tree.GetEnd() << "): this is currently unsupported"; throw Exception(msg.str()); } std::auto_ptr<ParseTree> leaf(new ParseTree(words[tree.GetStart()])); leaf->SetParent(root.get()); root->AddChild(leaf.release()); } else { for (std::vector<SyntaxNode*>::const_iterator p = children.begin(); p != children.end(); ++p) { assert(*p); std::auto_ptr<ParseTree> child = ConvertTree(**p, words); child->SetParent(root.get()); root->AddChild(child.release()); } } return root; }
/*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector is so we can link things up afterwards. We can't create TranslationOptions as we parse because we don't have the completed source parsed until after this function removes all the markup from it (CreateFromString in Sentence::Read). */ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &labelCollection, map< string, int > &topLabelCollection ) { //parse XML markup in translation line // no xml tag? we're done. if (line.find_first_of('<') == string::npos) { return true; } // break up input into a vector of xml tags and text // example: (this), (<b>), (is a), (</b>), (test .) vector<string> xmlTokens = TokenizeXml(line); // we need to store opened tags, until they are closed // tags are stored as tripled (tagname, startpos, contents) typedef pair< string, pair< size_t, string > > OpenedTag; vector< OpenedTag > tagStack; // stack that contains active opened tags string cleanLine; // return string (text without xml) size_t wordPos = 0; // position in sentence (in terms of number of words) bool isLinked = false; // loop through the tokens for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) { // not a xml tag, but regular text (may contain many words) if(!isXmlTag(xmlTokens[xmlTokenPos])) { // add a space at boundary, if necessary if (cleanLine.size()>0 && cleanLine[cleanLine.size() - 1] != ' ' && xmlTokens[xmlTokenPos][0] != ' ') { cleanLine += " "; } cleanLine += unescape(xmlTokens[xmlTokenPos]); // add to output wordPos = Tokenize(cleanLine).size(); // count all the words } // process xml tag else { // *** get essential information about tag *** // strip extra boundary spaces and "<" and ">" string tag = Trim(TrimXml(xmlTokens[xmlTokenPos])); // cerr << "XML TAG IS: " << tag << std::endl; if (tag.size() == 0) { cerr << "ERROR: empty tag name: " << line << endl; return false; } // check if unary (e.g., "<wall/>") bool isUnary = ( tag[tag.size() - 1] == '/' ); // check if opening tag (e.g. "<a>", not "</a>")g bool isClosed = ( tag[0] == '/' ); bool isOpen = !isClosed; if (isClosed && isUnary) { cerr << "ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl; return false; } if (isClosed) tag = tag.substr(1); // remove "/" at the beginning if (isUnary) tag = tag.substr(0,tag.size()-1); // remove "/" at the end // find the tag name and contents string::size_type endOfName = tag.find_first_of(' '); string tagName = tag; string tagContent = ""; if (endOfName != string::npos) { tagName = tag.substr(0,endOfName); tagContent = tag.substr(endOfName+1); } // *** process new tag *** if (isOpen || isUnary) { // put the tag on the tag stack OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) ); tagStack.push_back( openedTag ); // cerr << "XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl; } // *** process completed tag *** if (isClosed || isUnary) { // pop last opened tag from stack; if (tagStack.size() == 0) { cerr << "ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl; return false; } OpenedTag openedTag = tagStack.back(); tagStack.pop_back(); // tag names have to match if (openedTag.first != tagName) { cerr << "ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl; return false; } // assemble remaining information about tag size_t startPos = openedTag.second.first; string tagContent = openedTag.second.second; size_t endPos = wordPos; // span attribute overwrites position string span = ParseXmlTagAttribute(tagContent,"span"); if (! span.empty()) { vector<string> ij = Tokenize(span, "-"); if (ij.size() != 1 && ij.size() != 2) { cerr << "ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl; return false; } startPos = atoi(ij[0].c_str()); if (ij.size() == 1) endPos = startPos + 1; else endPos = atoi(ij[1].c_str()) + 1; } // cerr << "XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl; if (startPos >= endPos) { cerr << "ERROR: tag " << tagName << " must span at least one word (" << startPos << "-" << endPos << "): " << line << endl; return false; } string label = ParseXmlTagAttribute(tagContent,"label"); labelCollection.insert( label ); string scoreString = ParseXmlTagAttribute(tagContent,"score"); float score = scoreString == "" ? 0.0f : std::atof(scoreString.c_str()); // report what we have processed so far if (0) { cerr << "XML TAG NAME IS: '" << tagName << "'" << endl; cerr << "XML TAG LABEL IS: '" << label << "'" << endl; cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl; } SyntaxNode *node = tree.AddNode( startPos, endPos-1, label ); node->SetScore(score); } } } // we are done. check if there are tags that are still open if (tagStack.size() > 0) { cerr << "ERROR: some opened tags were never closed: " << line << endl; return false; } // collect top labels const vector< SyntaxNode* >& topNodes = tree.GetNodes( 0, wordPos-1 ); for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) { SyntaxNode *n = *node; const string &label = n->GetLabel(); if (topLabelCollection.find( label ) == topLabelCollection.end()) topLabelCollection[ label ] = 0; topLabelCollection[ label ]++; } // return de-xml'ed sentence in line line = cleanLine; return true; }