Ejemplo n.º 1
0
/**
 * Collects the CRF features that occur in a labelled data set and returns
 * them as an array.
 *
 * For every sequence the function adds
 *   - one transition feature (previous label -> current label) for each pair
 *     of adjacent items, weighted by the sequence weight, and
 *   - one state feature (attribute -> current label) for each attribute of
 *     each item, weighted by sequence weight times attribute value.
 * Zero-frequency features can additionally be added for every
 * (attribute, label) pair and every (label, label) pair.
 *
 *  @param  ptr_num_features    Forwarded to featureset_generate()
 *                              (presumably receives the number of features
 *                              in the returned array -- confirm there).
 *  @param  ds                  Data set to scan; must not be NULL.
 *  @param  num_labels          Number of labels L; the value L itself is
 *                              used internally as the "no previous label"
 *                              marker.
 *  @param  num_attributes      Not used by this function.
 *  @param  connect_all_attrs   Non-zero to add a zero-frequency state feature
 *                              between every attribute seen and every label.
 *  @param  connect_all_edges   Non-zero to add a zero-frequency transition
 *                              feature for every ordered pair of labels.
 *  @param  minfreq             Forwarded to featureset_generate()
 *                              (presumably the minimum frequency a feature
 *                              needs to be kept -- confirm there).
 *  @param  func                Logging callback used for progress reports.
 *  @param  instance            Opaque pointer stored alongside func.
 *  @return                     The array produced by featureset_generate(),
 *                              or NULL if the feature set could not be
 *                              created.
 */
crf1df_feature_t* crf1df_generate(
    int *ptr_num_features,
    dataset_t *ds,
    int num_labels,
    int num_attributes,
    int connect_all_attrs,
    int connect_all_edges,
    floatval_t minfreq,
    crfsuite_logging_callback func,
    void *instance
    )
{
    int c, i, j, s, t;
    crf1df_feature_t f;
    crf1df_feature_t *features = NULL;
    featureset_t* set = NULL;
    const int N = ds->num_instances;
    const int L = num_labels;
    logging_t lg;

    /* Kept in the signature for callers; nothing below needs it. */
    (void)num_attributes;

    lg.func = func;
    lg.instance = instance;
    lg.percent = 0;

    /* Create an instance of feature set; without one there is nothing to
       add features to, so give up before touching it. */
    set = featureset_new();
    if (set == NULL) {
        return NULL;
    }

    /* Loop over the sequences in the training data. */
    logging_progress_start(&lg);

    for (s = 0;s < N;++s) {
        /* prev == L means "no previous item yet" (start of the sequence). */
        int prev = L, cur = 0;
        const crfsuite_item_t* item = NULL;
        const crfsuite_instance_t* seq = dataset_get(ds, s);
        const int T = seq->num_items;

        /* Loop over the items in the sequence. */
        for (t = 0;t < T;++t) {
            item = &seq->items[t];
            cur = seq->labels[t];

            /* Transition feature: label #prev -> label #cur.
               The first item of a sequence has no predecessor, so no
               transition feature is generated for it. */
            if (prev != L) {
                f.type = FT_TRANS;
                f.src = prev;
                f.dst = cur;
                f.freq = seq->weight;
                featureset_add(set, &f);
            }

            for (c = 0;c < item->num_contents;++c) {
                /* State feature: attribute #aid -> label #cur. */
                f.type = FT_STATE;
                f.src = item->contents[c].aid;
                f.dst = cur;
                f.freq = seq->weight * item->contents[c].value;
                featureset_add(set, &f);

                /* Generate state features connecting this attribute with
                   all output labels. Pairs that are not observed in the
                   training data get zero frequency (zero expectations). */
                if (connect_all_attrs) {
                    for (i = 0;i < L;++i) {
                        f.type = FT_STATE;
                        f.src = item->contents[c].aid;
                        f.dst = i;
                        f.freq = 0;
                        featureset_add(set, &f);
                    }
                }
            }

            prev = cur;
        }

        /* Widen before multiplying: s * 100 overflows int for large data
           sets. The quotient is always below 100, so the cast is safe. */
        logging_progress(&lg, (int)((long long)s * 100 / N));
    }
    logging_progress_end(&lg);

    /* Generate edge features representing all pairs of labels.
       Pairs that are not observed in the training data get zero
       frequency (zero expectations). */
    if (connect_all_edges) {
        for (i = 0;i < L;++i) {
            for (j = 0;j < L;++j) {
                f.type = FT_TRANS;
                f.src = i;
                f.dst = j;
                f.freq = 0;
                featureset_add(set, &f);
            }
        }
    }

    /* Convert the feature set to a feature array. */
    features = featureset_generate(ptr_num_features, set, minfreq);

    /* Delete the feature set. */
    featureset_delete(set);

    return features;
}
Ejemplo n.º 2
0
/**
 * Collects the features that occur in a labelled data set and returns them
 * as an array. The way transition features are produced depends on ftype:
 *
 *   - FTYPE_CRF1TREE: for every item, one transition feature from the label
 *     of each child node to the label of the item itself.
 *   - FTYPE_SEMIMCRF: no transition feature is added to the feature set
 *     here; label segments are reported to sm->update() instead and all
 *     label-pair edges are delegated to sm->generate_all_edges().
 *   - any other value: one transition feature per pair of adjacent items
 *     (previous label -> current label).
 *
 * State features (attribute -> label) are produced the same way for all
 * types.
 *
 *  @param  ptr_num_features    Forwarded to featureset_generate()
 *                              (presumably receives the number of features
 *                              in the returned array -- confirm there).
 *  @param  sm                  Semi-Markov helper object. Dereferenced only
 *                              when ftype is FTYPE_SEMIMCRF; always passed
 *                              on to featureset_generate().
 *  @param  max_items           In/out: raised to the length of the longest
 *                              sequence seen if that exceeds the value it
 *                              holds on entry. Must be initialized by the
 *                              caller.
 *  @param  opt                 Options; the fields feature_possible_states,
 *                              feature_possible_transitions, feature_minfreq,
 *                              feature_max_order and feature_max_seg_len are
 *                              read.
 *  @param  ds                  Data set to scan; must not be NULL.
 *  @param  ftype               Model type, see above.
 *  @param  num_labels          Number of labels L; the value L itself is
 *                              used internally as the "no previous label"
 *                              marker.
 *  @param  func                Logging callback used for progress reports.
 *  @param  instance            Opaque pointer stored alongside func.
 *  @return                     The array produced by featureset_generate(),
 *                              or NULL if the feature set could not be
 *                              created or if sm->initialize() /
 *                              sm->finalize() returned non-zero.
 */
crf1df_feature_t* crf1df_generate(int *ptr_num_features,
                                  crf1de_semimarkov_t *sm,
                                  int *max_items,
                                  const crf1de_option_t *opt,
                                  const dataset_t *ds,
                                  const int ftype,
                                  const int num_labels,
                                  const crfsuite_logging_callback func,
                                  void *instance)
{
    int c, i, j, s, t;
    int prev, cur, seg_len;

    crf1df_feature_t f;
    crf1df_feature_t *features = NULL;
    featureset_t* set = NULL;

    const int N = ds->num_instances;
    const int L = num_labels;
    const int connect_all_attrs = opt->feature_possible_states ? 1 : 0;
    const int connect_all_edges = opt->feature_possible_transitions ? 1 : 0;
    const int max_order = opt->feature_max_order;

    /* Auxiliary variables for iteration. Declared ahead of the first goto
       so that no jump skips over an initializer. */
    const crfsuite_item_t* item = NULL;
    const crfsuite_node_t *node_p = NULL, *chld_node_p = NULL;

    logging_t lg;
    lg.func = func;
    lg.instance = instance;
    lg.percent = 0;

    /* Create an instance of feature set; without one there is nothing to
       add features to, so give up before touching it. */
    set = featureset_new();
    if (set == NULL) {
        return NULL;
    }

    /* Initialize semi-markov data storage if needed. */
    if (ftype == FTYPE_SEMIMCRF && sm->initialize(sm, max_order, opt->feature_max_seg_len, L)) {
        goto final_steps;
    }

    /* Iterate over training instances. */
    logging_progress_start(&lg);
    for (s = 0; s < N; ++s) {
        const crfsuite_instance_t* seq = dataset_get(ds, s);
        const int T = seq->num_items;
        if (T > *max_items) {
            *max_items = T;
        }

        /* Reset counters; prev == L means "no previous item yet". */
        prev = L;
        cur = 0;
        seg_len = 1;
        if (ftype == FTYPE_SEMIMCRF) {
            sm->m_ring->reset(sm->m_ring);
        }

        /* Loop over items in the sequence. */
        for (t = 0; t < T; ++t) {
            item = &seq->items[t];
            cur = seq->labels[t];

            if (ftype == FTYPE_CRF1TREE) {
                /* Tree model: add all edges from children to the current
                   node as features (nothing is added for leaves). */

                /* Obtain pointer to node which corresponds to this item. */
                assert(item->id >= 0 && item->id < seq->num_items);
                node_p = &seq->tree[item->id];

                /* Iterate over all children of that node. A separate index
                   is used so the function-level `i` is not shadowed. */
                for (int k = 0; k < node_p->num_children; ++k) {
                    f.type = FT_TRANS;
                    chld_node_p = &seq->tree[node_p->children[k]];
                    f.src = seq->labels[chld_node_p->self_item_id];
                    f.dst = cur;
                    f.freq = 1;
                    featureset_add(set, &f);
                }
            } else if (prev != L) {
                if (ftype == FTYPE_SEMIMCRF) {
                    /* Semi-markov model: report a finished segment to sm
                       instead of adding a transition feature. A segment ends
                       when the label changes, or at every item when
                       m_seg_len_lim is non-negative. (Per the original
                       author's note, sm generates all possible prefixes and
                       affixes up to and including max order.) */
                    if (prev != cur || sm->m_seg_len_lim >= 0) {
                        sm->update(sm, prev, seg_len);
                        seg_len = 1;
                    } else {
                        ++seg_len;
                    }
                } else {
                    /* Transition feature: label #prev -> label #cur. */
                    f.type = FT_TRANS;
                    f.src = prev;
                    f.dst = cur;
                    f.freq = 1;
                    featureset_add(set, &f);
                }
            }

            /* Iterate over state features. */
            for (c = 0; c < item->num_contents; ++c) {
                /* State feature: attribute #aid -> label #cur. */
                f.type = FT_STATE;
                f.src = item->contents[c].aid;
                f.dst = cur;
                f.freq = item->contents[c].value;
                featureset_add(set, &f);

                /* Generate state features connecting this attribute with
                   all output labels. Pairs that are not observed in the
                   training data get zero frequency (zero expectations). */
                if (connect_all_attrs) {
                    for (i = 0; i < L; ++i) {
                        f.type = FT_STATE;
                        f.src = item->contents[c].aid;
                        f.dst = i;
                        f.freq = 0;
                        featureset_add(set, &f);
                    }
                }
            }
            prev = cur;
        }

        /* Report the segment that was still open at the end of the sequence.
           NOTE(review): for an empty sequence prev is still the marker L
           here -- confirm that sm->update() tolerates it. */
        if (ftype == FTYPE_SEMIMCRF) {
            sm->update(sm, prev, seg_len);
        }

        /* Widen before multiplying: s * 100 overflows int for large data
           sets. The quotient is always below 100, so the cast is safe. */
        logging_progress(&lg, (int)((long long)s * 100 / N));
    }
    logging_progress_end(&lg);

    /* Generate edge features representing all pairs of labels.
       Pairs that are not observed in the training data get zero
       frequency (zero expectations). */
    if (connect_all_edges) {
        if (ftype == FTYPE_SEMIMCRF) {
            sm->generate_all_edges(sm);
        } else {
            for (i = 0; i < L; ++i) {
                for (j = 0; j < L; ++j) {
                    f.type = FT_TRANS;
                    f.src = i;
                    f.dst = j;
                    f.freq = 0;
                    featureset_add(set, &f);
                }
            }
        }
    }

    if (ftype == FTYPE_SEMIMCRF && sm->finalize(sm)) {
        goto final_steps;
    }

    /* Convert feature set to feature array. The threshold is forwarded
       without an intermediate int copy so that a fractional value is not
       truncated. NOTE(review): assumes feature_minfreq may be non-integral
       -- confirm against the declaration of crf1de_option_t. */
    features = featureset_generate(ptr_num_features, set, opt->feature_minfreq, sm);

    /* Delete the feature set (also reached on initialize/finalize failure,
       in which case features is still NULL). */
final_steps:
    featureset_delete(set);
    return features;
}