예제 #1
0
void NistXmlDocument::annotateSentence(uint sentno, const std::string &annot) {
	typedef Arabica::DOM::Traversal::DocumentTraversal<std::string> Traversal;
	typedef Arabica::DOM::Comment<std::string> Comment;

	Traversal dt = outnode_.getOwnerDocument().createDocumentTraversal();
	SegNodeFilter filter;
	Traversal::TreeWalkerT it = dt.createTreeWalker(outnode_,
					static_cast<unsigned long>(Arabica::DOM::Traversal::SHOW_TEXT),
					filter, true);

	for(uint i = 0; i < sentno; i++)
		assert(it.nextNode() != 0);

	Traversal::NodeT n = it.nextNode(); // the filter finds the next node inside the <seg> element
	assert(n != 0);
	n = n.getParentNode(); // get the <seg>

	Comment comm = n.getOwnerDocument().createComment(" SEG " + annot + " ");

	Traversal::NodeT p = n.getPreviousSibling();
	Traversal::NodeT txt;
	if(p != 0 && p.getNodeType() == Arabica::DOM::Node<std::string>::TEXT_NODE) {
		txt = p;
		p = p.getPreviousSibling();
	}

	if(p != 0 && p.getNodeType() == Arabica::DOM::Node<std::string>::COMMENT_NODE &&
			boost::starts_with(p.getNodeValue(), " SEG "))
		p.getParentNode().replaceChild(comm, p);
	else {
		n.getParentNode().insertBefore(comm, n);
		if(txt != 0)
			n.getParentNode().insertBefore(txt.cloneNode(false), n);
	}
}
예제 #2
0
/**
 * Overwrites the sentence texts below outnode_ with the sentences of doc.
 *
 * The text nodes accepted by SegNodeFilter are visited in walker order; the
 * i-th visited node receives the words of sentence i of doc, joined by single
 * spaces. A sentence without words yields an empty node value.
 *
 * @param doc source of the replacement sentences. It is indexed with
 *            0, 1, 2, ... for every node the walker yields; that doc actually
 *            holds that many sentences is not checked here.
 */
void NistXmlDocument::setTranslation(const PlainTextDocument &doc) {
	typedef Arabica::DOM::Traversal::DocumentTraversal<std::string> Traversal;

	Traversal dt = outnode_.getOwnerDocument().createDocumentTraversal();
	SegNodeFilter filter;
	Traversal::TreeWalkerT it = dt.createTreeWalker(outnode_,
					static_cast<unsigned long>(Arabica::DOM::Traversal::SHOW_TEXT),
					filter, true);

	uint i = 0;
	for(;;) {
		Traversal::NodeT n = it.nextNode();
		if(n == 0)
			break;
		// ostream_iterator writes the delimiter after every word, so the
		// result carries one trailing space whenever there is at least one word.
		std::ostringstream os;
		std::copy(doc.sentence_begin(i), doc.sentence_end(i), std::ostream_iterator<Word>(os, " "));
		i++;
		std::string str = os.str();
		// Drop the trailing delimiter. An empty sentence produces an empty
		// string, for which end() - 1 would be undefined behaviour.
		if(!str.empty())
			str.erase(str.end() - 1);
		n.setNodeValue(str);
	}
}
예제 #3
0
/**
 * Builds a PlainTextDocument from the sentence texts below topnode_.
 *
 * Every text node accepted by SegNodeFilter contributes one sentence: its
 * value is trimmed and then split at each single space character.
 *
 * @return a PlainTextDocument constructed from the collected sentences, in
 *         walker order.
 */
PlainTextDocument NistXmlDocument::asPlainTextDocument() const {
	typedef Arabica::DOM::Traversal::DocumentTraversal<std::string> Traversal;

	Traversal traversal = topnode_.getOwnerDocument().createDocumentTraversal();
	SegNodeFilter segFilter;
	Traversal::TreeWalkerT walker = traversal.createTreeWalker(topnode_,
					static_cast<unsigned long>(Arabica::DOM::Traversal::SHOW_TEXT),
					segFilter, true);

	std::vector<std::vector<Word> > sentences;

	// The walker signals the end of the traversal with a null node.
	for(Traversal::NodeT node = walker.nextNode(); node != 0; node = walker.nextNode()) {
		std::string text = node.getNodeValue();
		boost::trim(text);

		// Tokenise at every space; adjacent delimiters are not merged.
		std::vector<Word> words;
		boost::split(words, text, boost::is_any_of(" "));
		sentences.push_back(words);
	}

	return PlainTextDocument(sentences);
}
예제 #4
0
/**
 * Builds an MMAXDocument from the sentence texts below topnode_.
 *
 * Every text node accepted by SegNodeFilter is trimmed, split at each single
 * space character and appended to the new document via addSentence(), in
 * walker order.
 *
 * @return shared pointer to the newly created document, handed out as
 *         pointer-to-const.
 */
boost::shared_ptr<const MMAXDocument> NistXmlDocument::asMMAXDocument() const {
	typedef Arabica::DOM::Traversal::DocumentTraversal<std::string> Traversal;

	Traversal traversal = topnode_.getOwnerDocument().createDocumentTraversal();
	SegNodeFilter segFilter;
	Traversal::TreeWalkerT walker = traversal.createTreeWalker(topnode_,
					static_cast<unsigned long>(Arabica::DOM::Traversal::SHOW_TEXT),
					segFilter, true);

	boost::shared_ptr<MMAXDocument> result = boost::make_shared<MMAXDocument>();

	// The walker signals the end of the traversal with a null node.
	Traversal::NodeT node = walker.nextNode();
	while(node != 0) {
		std::string text = node.getNodeValue();
		boost::trim(text);

		// Tokenise at every space; adjacent delimiters are not merged.
		std::vector<Word> words;
		boost::split(words, text, boost::is_any_of(" "));
		result->addSentence(words.begin(), words.end());

		node = walker.nextNode();
	}

	return result;
}