Exemple #1
0
/*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector
	is so we can link things up afterwards. We can't create TranslationOptions as we
	parse because we don't have the completed source parsed until after this function
	removes all the markup from it (CreateFromString in Sentence::Read).
*/
bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &labelCollection, map< string, int > &topLabelCollection )
{
  //parse XML markup in translation line

  // no xml tag? we're done.
  if (line.find_first_of('<') == string::npos) {
    return true;
  }

  // break up input into a vector of xml tags and text
  // example: (this), (<b>), (is a), (</b>), (test .)
  vector<string> xmlTokens = TokenizeXml(line);

  // we need to store opened tags, until they are closed
  // tags are stored as tripled (tagname, startpos, contents)
  typedef pair< string, pair< size_t, string > > OpenedTag;
  vector< OpenedTag > tagStack; // stack that contains active opened tags

  string cleanLine; // return string (text without xml)
  size_t wordPos = 0; // position in sentence (in terms of number of words)
  bool isLinked = false;

  // loop through the tokens
  for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
    // not a xml tag, but regular text (may contain many words)
    if(!isXmlTag(xmlTokens[xmlTokenPos])) {
      // add a space at boundary, if necessary
      if (cleanLine.size()>0 &&
          cleanLine[cleanLine.size() - 1] != ' ' &&
          xmlTokens[xmlTokenPos][0] != ' ') {
        cleanLine += " ";
      }
      cleanLine += unescape(xmlTokens[xmlTokenPos]); // add to output
      wordPos = Tokenize(cleanLine).size(); // count all the words
    }

    // process xml tag
    else {
      // *** get essential information about tag ***

      // strip extra boundary spaces and "<" and ">"
      string tag =  Trim(TrimXml(xmlTokens[xmlTokenPos]));
      // cerr << "XML TAG IS: " << tag << std::endl;

      if (tag.size() == 0) {
        cerr << "ERROR: empty tag name: " << line << endl;
        return false;
      }

      // check if unary (e.g., "<wall/>")
      bool isUnary = ( tag[tag.size() - 1] == '/' );

      // check if opening tag (e.g. "<a>", not "</a>")g
      bool isClosed = ( tag[0] == '/' );
      bool isOpen = !isClosed;

      if (isClosed && isUnary) {
        cerr << "ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl;
        return false;
      }

      if (isClosed)
        tag = tag.substr(1); // remove "/" at the beginning
      if (isUnary)
        tag = tag.substr(0,tag.size()-1); // remove "/" at the end

      // find the tag name and contents
      string::size_type endOfName = tag.find_first_of(' ');
      string tagName = tag;
      string tagContent = "";
      if (endOfName != string::npos) {
        tagName = tag.substr(0,endOfName);
        tagContent = tag.substr(endOfName+1);
      }

      // *** process new tag ***

      if (isOpen || isUnary) {
        // put the tag on the tag stack
        OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
        tagStack.push_back( openedTag );
        // cerr << "XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl;
      }

      // *** process completed tag ***

      if (isClosed || isUnary) {
        // pop last opened tag from stack;
        if (tagStack.size() == 0) {
          cerr << "ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl;
          return false;
        }
        OpenedTag openedTag = tagStack.back();
        tagStack.pop_back();

        // tag names have to match
        if (openedTag.first != tagName) {
          cerr << "ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl;
          return false;
        }

        // assemble remaining information about tag
        size_t startPos = openedTag.second.first;
        string tagContent = openedTag.second.second;
        size_t endPos = wordPos;

        // span attribute overwrites position
        string span = ParseXmlTagAttribute(tagContent,"span");
        if (! span.empty()) {
          vector<string> ij = Tokenize(span, "-");
          if (ij.size() != 1 && ij.size() != 2) {
            cerr << "ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl;
            return false;
          }
          startPos = atoi(ij[0].c_str());
          if (ij.size() == 1) endPos = startPos + 1;
          else endPos = atoi(ij[1].c_str()) + 1;
        }

        // cerr << "XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl;

        if (startPos >= endPos) {
          cerr << "ERROR: tag " << tagName << " must span at least one word (" << startPos << "-" << endPos << "): " << line << endl;
          return false;
        }

        string label = ParseXmlTagAttribute(tagContent,"label");
        labelCollection.insert( label );

        string scoreString = ParseXmlTagAttribute(tagContent,"score");
        float score = scoreString == "" ? 0.0f : std::atof(scoreString.c_str());

        // report what we have processed so far
        if (0) {
          cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
          cerr << "XML TAG LABEL IS: '" << label << "'" << endl;
          cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
        }
        SyntaxNode *node = tree.AddNode( startPos, endPos-1, label );
        node->SetScore(score);
      }
    }
  }
  // we are done. check if there are tags that are still open
  if (tagStack.size() > 0) {
    cerr << "ERROR: some opened tags were never closed: " << line << endl;
    return false;
  }

  // collect top labels
  const vector< SyntaxNode* >& topNodes = tree.GetNodes( 0, wordPos-1 );
  for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) {
    SyntaxNode *n = *node;
    const string &label = n->GetLabel();
    if (topLabelCollection.find( label ) == topLabelCollection.end())
      topLabelCollection[ label ] = 0;
    topLabelCollection[ label ]++;
  }

  // return de-xml'ed sentence in line
  line = cleanLine;
  return true;
}