// Parses the HTML input `is` into a DOM document using the Taggle SAX parser
// as the SAX2DOM backend.
//
// Errors are collected by a CatchErrorHandler during parsing; if any were
// reported, an `exception` built from the collected error text is thrown
// instead of returning a document.
arabica_document html_parser::_parse(sax_source &is) {
  typedef Arabica::SAX::Taggle<string_type> taggle_type;

  Arabica::SAX2DOM::Parser<string_type, taggle_type> dom_builder;
  Arabica::SAX::CatchErrorHandler<string_type> error_sink;
  dom_builder.setErrorHandler(error_sink);

  // Do not explicitly set default attributes.  Without this,
  //   <a href="foo">bar</a>
  // gets turned into
  //   <a href="foo" shape="rect">bar</a>
  dom_builder.setFeature(taggle_type::defaultAttributesFeature, false);

  // Disables the xmlns attribute on the html element.
  dom_builder.setFeature(taggle_type::useSchemaNSFeature, false);

  // Lets 'inline' elements contain blocks: <a><div>foo</div></a>
  // NOTE(review): the spelling "Feautre" is kept exactly as in the original
  // code; presumably it matches the library's declaration -- confirm.
  dom_builder.setFeature(taggle_type::flexibleElementHierarchyFeautre, true);

  dom_builder.parse(is);

  // Only hand back a document when the parse produced no reported errors.
  if (!error_sink.errorsReported()) {
    return dom_builder.getDocument();
  }
  throw exception(error_sink.errors());
}
// Parses the XML input `is` into a DOM document with the default SAX2DOM
// parser.
//
// Errors are collected by a CatchErrorHandler during parsing; if any were
// reported, an `exception` built from the collected error text is thrown
// instead of returning a document.
arabica_document xml_parser::_parse(sax_source &is) {
  Arabica::SAX2DOM::Parser<string_type> dom_builder;
  Arabica::SAX::CatchErrorHandler<string_type> error_sink;
  dom_builder.setErrorHandler(error_sink);

  dom_builder.parse(is);

  // Only hand back a document when the parse produced no reported errors.
  if (!error_sink.errorsReported()) {
    return dom_builder.getDocument();
  }
  throw exception(error_sink.errors());
}
// Builds a test set from the XML file named by `file`.
//
// The file is parsed into a DOM tree.  Every node matched by
// /mteval/srcset/doc is wrapped in a NistXmlDocument and appended to
// documents_ in document order.  A deep copy of the tree is stored in
// outdoc_; in that copy the first node returned for /mteval/srcset is
// replaced by a new <tstset> element that receives all of its children, the
// "setid" and "srclang" attribute values of the srcset, and the placeholder
// attributes trglang="TRGLANG" and sysid="SYSID".  Each <doc> element moved
// this way is registered as the output node of the next entry in documents_.
//
// Error handling: errors reported by the parser's error handler are logged
// and processing continues.  If no document is produced, or if the document
// contains no /mteval/srcset element, an error is logged and the process is
// terminated with exit(1).
NistXmlTestset::NistXmlTestset(const std::string &file)
    : logger_(logkw::channel = "NistXmlTestset") {
  Arabica::SAX2DOM::Parser<std::string> domParser;
  Arabica::SAX::InputSource<std::string> is(file);
  Arabica::SAX::CatchErrorHandler<std::string> errh;
  domParser.setErrorHandler(errh);
  domParser.parse(is);
  if(errh.errorsReported())
    BOOST_LOG_SEV(logger_, error) << errh.errors();

  Arabica::DOM::Document<std::string> doc = domParser.getDocument();
  if(doc == 0) {
    BOOST_LOG_SEV(logger_, error) << "Error parsing input file: " << file;
    exit(1);
  }

  doc.getDocumentElement().normalize();

  // Collect the source documents in document order.
  Arabica::XPath::XPath<std::string> xp;
  Arabica::XPath::NodeSet<std::string> docnodes =
    xp.compile("/mteval/srcset/doc").evaluateAsNodeSet(doc.getDocumentElement());
  docnodes.to_document_order();
  BOOST_FOREACH(Arabica::DOM::Node<std::string> n, docnodes)
    documents_.push_back(boost::make_shared<NistXmlDocument>(n));

  // The output document starts as a deep copy of the input.
  outdoc_ = static_cast<Arabica::DOM::Document<std::string> >(doc.cloneNode(true));

  // Indexing an empty node set would be undefined behaviour, so a file
  // without a srcset element is rejected explicitly, in the same way as an
  // unparsable file above.
  Arabica::XPath::NodeSet<std::string> srcsets =
    xp.compile("/mteval/srcset").evaluateAsNodeSet(outdoc_.getDocumentElement());
  if(srcsets.empty()) {
    BOOST_LOG_SEV(logger_, error) << "No /mteval/srcset element found in input file: " << file;
    exit(1);
  }
  Arabica::DOM::Element<std::string> srcset =
    static_cast<Arabica::DOM::Element<std::string> >(srcsets[0]);

  // Move every child of the srcset into a fresh tstset element, pairing each
  // moved <doc> element with the corresponding entry of documents_.
  Arabica::DOM::Element<std::string> tstset = outdoc_.createElement("tstset");
  int docno = 0;
  while(srcset.hasChildNodes()) {
    Arabica::DOM::Node<std::string> n = srcset.removeChild(srcset.getFirstChild());
    tstset.appendChild(n);
    if(n.getNodeType() == Arabica::DOM::Node<std::string>::ELEMENT_NODE && n.getNodeName() == "doc")
      documents_[docno++]->setOutputNode(n);
  }

  tstset.setAttribute("setid", srcset.getAttribute("setid"));
  tstset.setAttribute("srclang", srcset.getAttribute("srclang"));
  // Placeholder values, written literally into the output document.
  tstset.setAttribute("trglang", "TRGLANG");
  tstset.setAttribute("sysid", "SYSID");

  srcset.getParentNode().replaceChild(tstset, srcset);
}