void NistXmlDocument::annotateSentence(uint sentno, const std::string &annot) { typedef Arabica::DOM::Traversal::DocumentTraversal<std::string> Traversal; typedef Arabica::DOM::Comment<std::string> Comment; Traversal dt = outnode_.getOwnerDocument().createDocumentTraversal(); SegNodeFilter filter; Traversal::TreeWalkerT it = dt.createTreeWalker(outnode_, static_cast<unsigned long>(Arabica::DOM::Traversal::SHOW_TEXT), filter, true); for(uint i = 0; i < sentno; i++) assert(it.nextNode() != 0); Traversal::NodeT n = it.nextNode(); // the filter finds the next node inside the <seg> element assert(n != 0); n = n.getParentNode(); // get the <seg> Comment comm = n.getOwnerDocument().createComment(" SEG " + annot + " "); Traversal::NodeT p = n.getPreviousSibling(); Traversal::NodeT txt; if(p != 0 && p.getNodeType() == Arabica::DOM::Node<std::string>::TEXT_NODE) { txt = p; p = p.getPreviousSibling(); } if(p != 0 && p.getNodeType() == Arabica::DOM::Node<std::string>::COMMENT_NODE && boost::starts_with(p.getNodeValue(), " SEG ")) p.getParentNode().replaceChild(comm, p); else { n.getParentNode().insertBefore(comm, n); if(txt != 0) n.getParentNode().insertBefore(txt.cloneNode(false), n); } }
void NistXmlDocument::setTranslation(const PlainTextDocument &doc) { typedef Arabica::DOM::Traversal::DocumentTraversal<std::string> Traversal; Traversal dt = outnode_.getOwnerDocument().createDocumentTraversal(); SegNodeFilter filter; Traversal::TreeWalkerT it = dt.createTreeWalker(outnode_, static_cast<unsigned long>(Arabica::DOM::Traversal::SHOW_TEXT), filter, true); uint i = 0; for(;;) { Traversal::NodeT n = it.nextNode(); if(n == 0) break; std::ostringstream os; std::copy(doc.sentence_begin(i), doc.sentence_end(i), std::ostream_iterator<Word>(os, " ")); i++; std::string str = os.str(); str.erase(str.end() - 1); n.setNodeValue(str); } }
/**
 * Builds a PlainTextDocument from the text nodes yielded by a SegNodeFilter
 * walk over topnode_.
 *
 * Each node value is trimmed and then split at every single space character;
 * the resulting tokens form one sentence. Consecutive spaces are not merged,
 * so they produce empty tokens.
 *
 * @return a PlainTextDocument constructed from the collected sentences.
 */
PlainTextDocument NistXmlDocument::asPlainTextDocument() const {
	typedef Arabica::DOM::Traversal::DocumentTraversal<std::string> Traversal;

	Traversal traversal = topnode_.getOwnerDocument().createDocumentTraversal();
	SegNodeFilter segFilter;
	Traversal::TreeWalkerT walker = traversal.createTreeWalker(topnode_,
		static_cast<unsigned long>(Arabica::DOM::Traversal::SHOW_TEXT), segFilter, true);

	std::vector<std::vector<Word> > sentences;
	for(Traversal::NodeT node = walker.nextNode(); node != 0; node = walker.nextNode()) {
		std::string text = node.getNodeValue();
		boost::trim(text);

		// Append an empty sentence and tokenise straight into it.
		sentences.resize(sentences.size() + 1);
		boost::split(sentences.back(), text, boost::is_any_of(" "));
	}

	return PlainTextDocument(sentences);
}
/**
 * Builds an MMAXDocument from the text nodes yielded by a SegNodeFilter walk
 * over topnode_.
 *
 * Each node value is trimmed, split at every single space character
 * (consecutive spaces are not merged) and added to the document as one
 * sentence.
 *
 * @return shared pointer to the newly created, read-only MMAXDocument.
 */
boost::shared_ptr<const MMAXDocument> NistXmlDocument::asMMAXDocument() const {
	typedef Arabica::DOM::Traversal::DocumentTraversal<std::string> Traversal;

	Traversal traversal = topnode_.getOwnerDocument().createDocumentTraversal();
	SegNodeFilter segFilter;
	Traversal::TreeWalkerT walker = traversal.createTreeWalker(topnode_,
		static_cast<unsigned long>(Arabica::DOM::Traversal::SHOW_TEXT), segFilter, true);

	boost::shared_ptr<MMAXDocument> result = boost::make_shared<MMAXDocument>();
	for(Traversal::NodeT node = walker.nextNode(); node != 0; node = walker.nextNode()) {
		std::string text = node.getNodeValue();
		boost::trim(text);

		std::vector<Word> words;
		boost::split(words, text, boost::is_any_of(" "));
		result->addSentence(words.begin(), words.end());
	}

	return result;
}