Example #1
0
/*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector
	is so we can link things up afterwards. We can't create TranslationOptions as we
	parse because we don't have the completed source parsed until after this function
	removes all the markup from it (CreateFromString in Sentence::Read).
*/
/**
 * Strip XML markup from a sentence and record every completed tag as a node
 * in a syntax tree.
 *
 * The line is split into XML tags and plain-text segments. Text segments are
 * appended to the cleaned sentence; every tag that gets completed (a closing
 * tag matching the innermost open tag, or a unary tag such as "<x/>") is added
 * to `tree` via AddNode(), covering the words seen between the opening and the
 * closing of the tag. A `span="i-j"` or `span="i"` attribute on the opening
 * tag overrides that word range (both ends inclusive). The `label` attribute
 * becomes the node label, and the `pcfg` attribute (0 when absent) becomes its
 * PCFG score.
 *
 * @param line                  in: marked-up sentence; out: the sentence with
 *                              all tags removed. Only overwritten on success.
 *                              Left untouched if it contains no '<'.
 * @param tree                  receives one node per completed tag.
 * @param labelCollection       receives the `label` value of every completed tag.
 * @param topLabelCollection    for each node returned by
 *                              tree.GetNodes(0, wordCount-1), the count stored
 *                              under the node's label is incremented.
 * @param unescapeSpecialChars  if true, text segments are passed through
 *                              unescape() before being appended.
 * @return true on success; false (after printing a message to cerr) on a
 *         malformed tag, mismatched or unclosed tags, a bad `span` attribute,
 *         or a tag that covers no words. On failure `tree` and
 *         `labelCollection` may already hold entries for tags processed
 *         before the error was found.
 */
bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &labelCollection, map< string, int > &topLabelCollection, bool unescapeSpecialChars )
{
    // no xml tag? we're done.
    if (line.find_first_of('<') == string::npos) {
        return true;
    }

    // break up input into a vector of xml tags and text
    // example: (this), (<b>), (is a), (</b>), (test .)
    vector<string> xmlTokens = TokenizeXml(line);

    // opened tags are kept until they are closed;
    // each entry is a triple (tagname, startpos, contents)
    typedef pair< string, pair< size_t, string > > OpenedTag;
    vector< OpenedTag > tagStack; // stack of currently open tags

    string cleanLine;   // sentence text without xml
    size_t wordPos = 0; // number of words emitted into cleanLine so far

    for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
        string &token = xmlTokens[xmlTokenPos];

        // *** regular text (may contain many words) ***
        if (!isXmlTag(token)) {
            // add a space at the boundary, if neither side provides one
            if (cleanLine.size() > 0 &&
                    cleanLine[cleanLine.size() - 1] != ' ' &&
                    token[0] != ' ') {
                cleanLine += " ";
            }
            // add words to output
            if (unescapeSpecialChars) {
                cleanLine += unescape(token);
            } else {
                cleanLine += token;
            }
            // recount over the whole cleaned line so that words joined or split
            // at segment boundaries are counted exactly once
            wordPos = Tokenize(cleanLine).size();
            continue;
        }

        // *** xml tag: get essential information about it ***

        // strip extra boundary spaces and "<" and ">"
        string tag = Trim(TrimXml(token));

        if (tag.size() == 0) {
            cerr << "ERROR: empty tag name: " << line << endl;
            return false;
        }

        // unary tag, e.g. "<wall/>"
        bool isUnary = ( tag[tag.size() - 1] == '/' );

        // closing tag, e.g. "</a>"; anything else counts as opening
        bool isClosed = ( tag[0] == '/' );
        bool isOpen = !isClosed;

        if (isClosed && isUnary) {
            cerr << "ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl;
            return false;
        }

        if (isClosed)
            tag = tag.substr(1); // remove "/" at the beginning
        if (isUnary)
            tag = tag.substr(0,tag.size()-1); // remove "/" at the end

        // split into tag name and the rest (attribute text)
        string::size_type endOfName = tag.find_first_of(' ');
        string tagName = tag;
        string tagContent = "";
        if (endOfName != string::npos) {
            tagName = tag.substr(0,endOfName);
            tagContent = tag.substr(endOfName+1);
        }

        // *** process new tag ***

        if (isOpen || isUnary) {
            // remember where (in words) the tag started, and its attributes
            tagStack.push_back( make_pair( tagName, make_pair( wordPos, tagContent ) ) );
        }

        // *** process completed tag ***

        if (isClosed || isUnary) {
            // pop last opened tag from stack
            if (tagStack.size() == 0) {
                cerr << "ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl;
                return false;
            }
            OpenedTag openedTag = tagStack.back();
            tagStack.pop_back();

            // tag names have to match
            if (openedTag.first != tagName) {
                cerr << "ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl;
                return false;
            }

            // attributes always come from the opening tag, not the closing one
            size_t startPos = openedTag.second.first;
            string openedContent = openedTag.second.second;
            size_t endPos = wordPos; // exclusive

            // span attribute overwrites position ("i-j" or "i", inclusive)
            string span = ParseXmlTagAttribute(openedContent,"span");
            if (! span.empty()) {
                vector<string> ij = Tokenize(span, "-");
                if (ij.size() != 1 && ij.size() != 2) {
                    cerr << "ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl;
                    return false;
                }
                startPos = atoi(ij[0].c_str());
                if (ij.size() == 1) endPos = startPos + 1;
                else endPos = atoi(ij[1].c_str()) + 1;
            }

            // an empty span would make endPos-1 below meaningless
            if (startPos >= endPos) {
                cerr << "ERROR: tag " << tagName << " must span at least one word (" << startPos << "-" << endPos << "): " << line << endl;
                return false;
            }

            string label = ParseXmlTagAttribute(openedContent,"label");
            labelCollection.insert( label );

            string pcfgString = ParseXmlTagAttribute(openedContent,"pcfg");
            float pcfgScore = pcfgString == "" ? 0.0f
                              : std::atof(pcfgString.c_str());

            // AddNode takes an inclusive end position
            SyntaxNode *node = tree.AddNode( startPos, endPos-1, label );
            node->SetPcfgScore(pcfgScore);
        }
    }

    // we are done. check if there are tags that are still open
    if (tagStack.size() > 0) {
        cerr << "ERROR: some opened tags were never closed: " << line << endl;
        return false;
    }

    // collect top labels: nodes spanning the whole sentence.
    // With zero words there is no sentence span, and wordPos-1 would wrap
    // around (size_t), so skip the lookup entirely in that case.
    if (wordPos > 0) {
        const vector< SyntaxNode* >& topNodes = tree.GetNodes( 0, wordPos-1 );
        for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); ++node ) {
            // operator[] value-initialises a missing count to 0 before the increment
            topLabelCollection[ (*node)->GetLabel() ]++;
        }
    }

    // return de-xml'ed sentence in line
    line = cleanLine;
    return true;
}