/**
 * Annotates an internal node, and recursively every internal node below it,
 * with its head constituent.
 *
 * The children are visited first so that their head annotations exist
 * before the rule for this node's category is consulted. The rule picks an
 * initial head index; if the child immediately before that head is a
 * coordinating conjunction (CC or CONJP), the head is moved to the nearest
 * preceding child that is not a punctuation token. If every preceding
 * candidate is punctuation, the rule-selected head is kept.
 *
 * @param inode the node to annotate (modified in place)
 */
void head_finder::operator()(internal_node& inode)
{
    // recurse, as we need the head annotations of all child nodes first
    inode.each_child([&](node* child) { child->accept(*this); });

    if (rules_.find(inode.category()) == rules_.end())
        LOG(fatal) << "No rule found for category " << inode.category()
                   << " in rule table" << ENDLG;

    // run the head finder for the syntactic category of the current node
    // NOTE(review): if LOG(fatal) does not terminate the program, the
    // lookup below is what ultimately fails for a missing category.
    auto idx = rules_.at(inode.category())->find_head(inode);
    inode.head(inode.child(idx));

    // the clean-up below inspects the child at idx - 1 and needs at least
    // one candidate at idx - 2, so there is nothing to do for idx < 2
    if (idx < 2)
        return;

    static auto is_punctuation = [](const class_label& cat) {
        static std::unordered_set<class_label> punctuation
            = {"''"_cl,    "``"_cl, "-LRB-"_cl, "-RRB-"_cl,
               "."_cl,     ":"_cl,  ";"_cl};
        return punctuation.find(cat) != punctuation.end();
    };

    // clean up stage for handling coordinating clauses
    const auto& prev_category = inode.child(idx - 1)->category();
    if (prev_category == "CC"_cl || prev_category == "CONJP"_cl)
    {
        // walk leftwards over the candidates idx - 2, idx - 3, ..., 0
        for (auto nidx = idx - 1; nidx-- > 0;)
        {
            auto child = inode.child(nidx);

            // Only a leaf carrying a punctuation tag is skipped. The
            // previous test (`is_leaf() || !is_punctuation(...)`) accepted
            // every leaf, so punctuation tokens were never skipped.
            // NOTE(review): assumes punctuation tags only label leaf
            // (pre-terminal) nodes -- confirm against the tree format.
            const bool punctuation_token
                = child->is_leaf() && is_punctuation(child->category());
            if (!punctuation_token)
            {
                inode.head(child);
                return;
            }
        }
    }
}
/**
 * Builds a binarized copy of an internal node.
 *
 * Nodes with at most two children are copied as-is (children are
 * transformed recursively and the head annotation is carried over).
 * Larger nodes are rewritten as a chain of intermediate nodes labeled
 * "<category>*": children to the left of the head are peeled off first
 * (the remainder hangs off the right child, which becomes the head), then
 * children to the right of the head are peeled off from the outside in
 * (the remainder hangs off the left child, which becomes the head).
 * Finally a lexicon_populator is run over the new subtree.
 *
 * @param in the node to binarize (left untouched)
 * @return the root of the newly built subtree
 * @throws exception if `in` has more than two children and no head
 * constituent is set
 */
std::unique_ptr<node> binarizer::operator()(const internal_node& in)
{
    auto root = make_unique<internal_node>(in.category());
    const auto child_count = in.num_children();

    // already binary (or smaller): copy children, preserving the head
    if (child_count <= 2)
    {
        in.each_child([&](const node* original) {
            root->add_child(original->accept(*this));
            if (original == in.head_constituent())
                root->head(root->child(root->num_children() - 1));
        });
        return std::move(root);
    }

    // label shared by every intermediate node introduced below
    auto bin_lbl = class_label{static_cast<std::string>(in.category()) + "*"};

    // locate head node
    auto head = in.head_constituent();
    if (!head)
        throw exception{"Head constituent not labeled"};

    uint64_t head_idx = 0;
    for (uint64_t pos = 0; pos < child_count; ++pos)
    {
        if (in.child(pos) == head)
            head_idx = pos;
    }
    const bool head_is_last = (head_idx + 1 == child_count);

    // eat left nodes: each one becomes the left child of the current
    // level, and the rest of the constituent continues in the right child
    auto cursor = root.get();
    for (uint64_t left = 0; left < head_idx; ++left)
    {
        cursor->add_child(in.child(left)->accept(*this));

        // special case: the head is the very last child and is the only
        // thing remaining, so it is attached directly instead of opening
        // another intermediate level
        if (head_is_last && left + 1 == head_idx)
        {
            cursor->add_child(in.child(head_idx)->accept(*this));
            cursor->head(cursor->child(1));
            break;
        }

        auto intermediate = make_unique<internal_node>(bin_lbl);
        auto deeper = intermediate.get();
        cursor->add_child(std::move(intermediate));
        cursor->head(cursor->child(1));
        cursor = deeper;
    }

    // eat right nodes, outermost first: every right child except the one
    // adjacent to the head gets its own level, with the rest of the
    // constituent continuing in the left child
    uint64_t right = child_count - 1;
    for (; right > head_idx + 1; --right)
    {
        auto intermediate = make_unique<internal_node>(bin_lbl);
        auto deeper = intermediate.get();
        cursor->add_child(std::move(intermediate));
        cursor->head(cursor->child(0));
        cursor->add_child(in.child(right)->accept(*this));
        cursor = deeper;
    }

    // innermost level: the head itself together with its right neighbour
    // (skipped when the head has nothing to its right)
    if (right > head_idx)
    {
        cursor->add_child(in.child(head_idx)->accept(*this));
        cursor->head(cursor->child(0));
        cursor->add_child(in.child(right)->accept(*this));
    }

    lexicon_populator pop;
    root->accept(pop);

    // explicit move kept: the return converts unique_ptr<internal_node>
    // to unique_ptr<node>
    return std::move(root);
}