/**
 * Build the feature array (state and transition features) for a first-order
 * CRF from a training data set.
 *
 * Every (attribute, label) pair observed at an item becomes a state feature
 * and every (previous label, label) pair observed inside a sequence becomes a
 * transition feature.  Observed features are added with a frequency derived
 * from the instance weight (times the attribute value for state features).
 *
 *  @param  ptr_num_features    Passed through to featureset_generate(), which
 *                              is expected to store the number of generated
 *                              features there.
 *  @param  ds                  Training data set; read through
 *                              ds->num_instances and dataset_get().
 *  @param  num_labels          Number of labels (L).  The value L itself is
 *                              used internally as the "no previous label"
 *                              (beginning of sequence) marker.
 *  @param  num_attributes      Unused by this function.
 *  @param  connect_all_attrs   If non-zero, additionally add a zero-frequency
 *                              state feature for every (observed attribute,
 *                              label) pair.
 *  @param  connect_all_edges   If non-zero, additionally add a zero-frequency
 *                              transition feature for every label pair.
 *  @param  minfreq             Frequency threshold forwarded to
 *                              featureset_generate().
 *  @param  func                Logging callback stored in the logging context.
 *  @param  instance            User data stored in the logging context.
 *  @return                     The array produced by featureset_generate(),
 *                              or NULL if the temporary feature set could not
 *                              be created.
 */
crf1df_feature_t* crf1df_generate(
    int *ptr_num_features,
    dataset_t *ds,
    int num_labels,
    int num_attributes,
    int connect_all_attrs,
    int connect_all_edges,
    floatval_t minfreq,
    crfsuite_logging_callback func,
    void *instance
    )
{
    int c, i, j, s, t;
    crf1df_feature_t f;
    crf1df_feature_t *features = NULL;
    featureset_t *set = NULL;
    const int N = ds->num_instances;
    const int L = num_labels;
    logging_t lg;

    /* The parameter is part of the public signature but is not needed here. */
    (void)num_attributes;

    lg.func = func;
    lg.instance = instance;
    lg.percent = 0;

    /* Create an instance of feature set. */
    set = featureset_new();
    if (set == NULL) {
        /* Never hand a NULL set to featureset_add()/featureset_generate(). */
        return NULL;
    }

    /* Loop over the sequences in the training data. */
    logging_progress_start(&lg);
    for (s = 0; s < N; ++s) {
        /* prev == L means "no previous label yet" (beginning of sequence). */
        int prev = L, cur = 0;
        const crfsuite_item_t *item = NULL;
        const crfsuite_instance_t *seq = dataset_get(ds, s);
        const int T = seq->num_items;

        /* Loop over the items in the sequence. */
        for (t = 0; t < T; ++t) {
            item = &seq->items[t];
            cur = seq->labels[t];

            /* Transition feature: label #prev -> label #cur.
               No transition feature is emitted for the first item of a
               sequence (prev == L). */
            if (prev != L) {
                f.type = FT_TRANS;
                f.src = prev;
                f.dst = cur;
                f.freq = seq->weight;
                featureset_add(set, &f);
            }

            for (c = 0; c < item->num_contents; ++c) {
                /* State feature: attribute #aid -> label #cur. */
                f.type = FT_STATE;
                f.src = item->contents[c].aid;
                f.dst = cur;
                f.freq = seq->weight * item->contents[c].value;
                featureset_add(set, &f);

                /* Generate state features connecting attributes with all
                   output labels. These features are unobserved in the
                   training data (zero expectations). */
                if (connect_all_attrs) {
                    for (i = 0; i < L; ++i) {
                        f.type = FT_STATE;
                        f.src = item->contents[c].aid;
                        f.dst = i;
                        f.freq = 0;
                        featureset_add(set, &f);
                    }
                }
            }

            prev = cur;
        }

        /* Compute the percentage in a wider type: s * 100 overflows int for
           very large data sets. The quotient is always below 100. */
        logging_progress(&lg, (int)((long long)s * 100 / N));
    }
    logging_progress_end(&lg);

    /* Generate edge features representing all pairs of labels.
       These features are unobserved in the training data
       (zero expectations). */
    if (connect_all_edges) {
        for (i = 0; i < L; ++i) {
            for (j = 0; j < L; ++j) {
                f.type = FT_TRANS;
                f.src = i;
                f.dst = j;
                f.freq = 0;
                featureset_add(set, &f);
            }
        }
    }

    /* Convert the feature set to a feature array. */
    features = featureset_generate(ptr_num_features, set, minfreq);

    /* Delete the feature set. */
    featureset_delete(set);

    return features;
}
crf1df_feature_t* crf1df_generate(int *ptr_num_features, \ crf1de_semimarkov_t *sm, \ int *max_items, \ const crf1de_option_t *opt, \ const dataset_t *ds, \ const int ftype, \ const int num_labels, \ const crfsuite_logging_callback func, \ void *instance) { int c, i, j, s, t; int prev, cur, seg_len; crf1df_feature_t f; crf1df_feature_t *features = NULL; featureset_t* set = NULL; const int N = ds->num_instances; const int L = num_labels; const int connect_all_attrs = opt->feature_possible_states ? 1 : 0; const int connect_all_edges = opt->feature_possible_transitions ? 1 : 0; const int minfreq = opt->feature_minfreq; const int max_order = opt->feature_max_order; logging_t lg; lg.func = func; lg.instance = instance; lg.percent = 0; /* Create an instance of feature set. */ set = featureset_new(); /* Initialize semi-markov data storage if needed */ if (ftype == FTYPE_SEMIMCRF && sm->initialize(sm, max_order, opt->feature_max_seg_len, L)) goto final_steps; // auxiliary variables for iteration const crfsuite_item_t* item = NULL; const crfsuite_node_t *node_p = NULL, *chld_node_p = NULL; // iterate over training instances logging_progress_start(&lg); for (s = 0; s < N; ++s) { const crfsuite_instance_t* seq = dataset_get(ds, s); const int T = seq->num_items; if (T > *max_items) *max_items = T; /* reset counters */ prev = L; cur = 0; seg_len = 1; if (ftype == FTYPE_SEMIMCRF) sm->m_ring->reset(sm->m_ring); /* Loop over items in the sequence. */ for (t = 0; t < T; ++t) { item = &seq->items[t]; cur = seq->labels[t]; // Transition feature: label #prev -> label #(item->yid) /* If feature type is tree, add all edges from children to the current node as features (nothing is added for leaves). 
*/ if (ftype == FTYPE_CRF1TREE) { // obtain pointer to node which corresponds to this item assert(item->id >= 0 && item->id < seq->num_items); node_p = &seq->tree[item->id]; // iterate over all children of that node for (int i = 0; i < node_p->num_children; ++i) { f.type = FT_TRANS; chld_node_p = &seq->tree[node_p->children[i]]; f.src = seq->labels[chld_node_p->self_item_id]; f.dst = cur; f.freq = 1; featureset_add(set, &f); } /* In semi-markov model, we generate all possible prefixes and affixes up to and including max order. */ } else if (prev != L) { /* generate transition features for semi-markov model */ if (ftype == FTYPE_SEMIMCRF) { if (prev != cur || sm->m_seg_len_lim >= 0) { sm->update(sm, prev, seg_len); seg_len = 1; } else { ++seg_len; } } else { f.type = FT_TRANS; f.src = prev; f.dst = cur; f.freq = 1; featureset_add(set, &f); } } /* Iterate over state features. */ for (c = 0; c < item->num_contents; ++c) { /* State feature: attribute #a -> state #(item->yid). */ f.type = FT_STATE; f.src = item->contents[c].aid; f.dst = cur; f.freq = item->contents[c].value; featureset_add(set, &f); /* Generate state features connecting attributes with all output labels. These features are not unobserved in the training data (zero expexctations). */ if (connect_all_attrs) { for (i = 0; i < L; ++i) { f.type = FT_STATE; f.src = item->contents[c].aid; f.dst = i; f.freq = 0; featureset_add(set, &f); } } } prev = cur; } if (ftype == FTYPE_SEMIMCRF) sm->update(sm, prev, seg_len); logging_progress(&lg, s * 100 / N); } logging_progress_end(&lg); /* Generate edge features representing all pairs of labels. These features are not unobserved in the training data (zero expexcations). 
*/ if (connect_all_edges) { if (ftype == FTYPE_SEMIMCRF) { sm->generate_all_edges(sm); } else { for (i = 0; i < L; ++i) { for (j = 0; j < L; ++j) { f.type = FT_TRANS; f.src = i; f.dst = j; f.freq = 0; featureset_add(set, &f); } } } } if (ftype == FTYPE_SEMIMCRF && sm->finalize(sm)) goto final_steps; /* Convert feature set to feature array. */ features = featureset_generate(ptr_num_features, set, minfreq, sm); /* Delete the feature set. */ final_steps: featureset_delete(set); return features; }