Example #1
0
// Annotates `inode` with its head constituent.
//
// All children are visited first (so their own heads are already set), then
// the rule registered in `rules_` for this node's category picks the index
// of the head child. If the child immediately to the left of that head is a
// coordinator ("CC" or "CONJP"), the head is moved to the nearest acceptable
// child further to the left, when one exists.
void head_finder::operator()(internal_node& inode)
{
    // bottom-up: every child gets its head annotation before this node does
    inode.each_child([this](node* child) { child->accept(*this); });

    const auto& category = inode.category();

    // NOTE(review): unless LOG(fatal) terminates the program, control still
    // reaches rules_.at() below for a category with no rule -- confirm that
    // this is the intended failure mode.
    if (rules_.find(category) == rules_.end())
    {
        LOG(fatal) << "No rule found for category " << category
                   << " in rule table" << ENDLG;
    }

    // ask the category-specific rule which child is the head
    auto idx = rules_.at(category)->find_head(inode);
    inode.head(inode.child(idx));

    // the coordination fix-up needs at least two children left of the head
    if (idx < 2)
        return;

    const auto is_punctuation = [](const class_label& label)
    {
        static const std::unordered_set<class_label> punctuation_tags = {
            "''"_cl, "``"_cl, "-LRB-"_cl, "-RRB-"_cl, "."_cl, ":"_cl, ";"_cl};

        return punctuation_tags.count(label) > 0;
    };

    // only applies when the head's left neighbour is a coordinator
    const auto& left_neighbour = inode.child(idx - 1)->category();
    const bool coordinated
        = left_neighbour == "CC"_cl || left_neighbour == "CONJP"_cl;
    if (!coordinated)
        return;

    // walk leftwards from the child just before the coordinator (idx - 2)
    // down to child 0; `pos` is always one past the child being inspected
    for (auto pos = idx - 1; pos > 0; --pos)
    {
        auto candidate = inode.child(pos - 1);

        // NOTE(review): a leaf is accepted whatever its category, so only
        // non-leaf children are ever skipped as punctuation -- confirm that
        // this is intended.
        if (!candidate->is_leaf() && is_punctuation(candidate->category()))
            continue;

        inode.head(candidate);
        return;
    }
}
Example #2
0
// Produces a copy of `in` in which no node built here has more than two
// children.
//
// Nodes with at most two children are copied directly (children are
// transformed recursively through this visitor). Wider nodes are unfolded
// into a chain of intermediate nodes labelled with the original category
// plus "*":
//   - children left of the head are peeled off one per level, each level
//     holding [left child, intermediate] with the intermediate as head;
//   - children right of the head are then peeled off from the outside in,
//     each level holding [intermediate, right child] with the intermediate
//     as head;
//   - the innermost level holds the head itself next to its closest sibling.
//
// Throws `exception` if a node with more than two children has no head
// constituent set.
std::unique_ptr<node> binarizer::operator()(const internal_node& in)
{
    auto root = make_unique<internal_node>(in.category());
    const auto child_count = in.num_children();

    if (child_count <= 2)
    {
        // already narrow enough: copy the children and carry the head over
        // NOTE(review): this path returns without running lexicon_populator,
        // unlike the path below -- confirm that is intended.
        const auto original_head = in.head_constituent();
        in.each_child([&](const node* kid)
                      {
                          root->add_child(kid->accept(*this));
                          if (kid != original_head)
                              return;
                          root->head(root->child(root->num_children() - 1));
                      });

        return std::move(root);
    }

    auto intermediate_label
        = class_label{static_cast<std::string>(in.category()) + "*"};

    const auto head = in.head_constituent();
    if (!head)
        throw exception{"Head constituent not labeled"};

    // position of the head among the children; scanning from the back and
    // stopping at the first hit yields the last matching index
    // NOTE(review): if the head is not one of the children this silently
    // stays at 0 -- confirm callers guarantee it is a child.
    uint64_t head_idx = 0;
    for (uint64_t pos = child_count; pos-- > 0;)
    {
        if (in.child(pos) == head)
        {
            head_idx = pos;
            break;
        }
    }

    // appends a fresh intermediate node to `parent` and hands back a
    // non-owning pointer to it so the caller can keep building inside it
    auto open_level = [&](internal_node* parent)
    {
        auto fresh = make_unique<internal_node>(intermediate_label);
        auto raw = fresh.get();
        parent->add_child(std::move(fresh));
        return raw;
    };

    const bool head_is_last = head_idx + 1 == child_count;
    auto attach = root.get();

    // consume the children to the left of the head, outermost first
    for (uint64_t left = 0; left < head_idx; ++left)
    {
        attach->add_child(in.child(left)->accept(*this));

        // nothing lies to the right of the head: pair the final left child
        // with the head directly instead of opening another level
        if (head_is_last && left + 1 == head_idx)
        {
            attach->add_child(in.child(head_idx)->accept(*this));
            attach->head(attach->child(1));
            break;
        }

        auto below = open_level(attach);
        attach->head(attach->child(1));
        attach = below;
    }

    // consume the children to the right of the head, outermost first
    for (uint64_t right = child_count - 1; right > head_idx; --right)
    {
        // innermost level: the head followed by its right-hand neighbour
        if (right == head_idx + 1)
        {
            attach->add_child(in.child(head_idx)->accept(*this));
            attach->head(attach->child(0));
            attach->add_child(in.child(right)->accept(*this));
            break;
        }

        auto below = open_level(attach);
        attach->head(attach->child(0));
        attach->add_child(in.child(right)->accept(*this));
        attach = below;
    }

    // run a lexicon_populator visitor over the finished subtree
    lexicon_populator populator;
    root->accept(populator);
    return std::move(root);
}