/*
 * Initialize an item and pre-allocate zeroed storage for its attributes.
 *
 * The item is first reset with crfsuite_item_init(), then num_contents and
 * cap_contents are both set to `num_contents` and `contents` is pointed at a
 * zero-filled array of that many crfsuite_attribute_t.
 *
 * If the allocation fails (this includes a negative `num_contents`, which
 * becomes an unsatisfiable size), `contents` is NULL and both counters are
 * reset to 0 so the item never advertises entries it does not own. The
 * function returns void, so callers that need to detect failure should check
 * `item->contents` when they asked for a non-zero count.
 */
void crfsuite_item_init_n(crfsuite_item_t* item, int num_contents)
{
    crfsuite_item_init(item);

    item->num_contents = num_contents;
    item->cap_contents = num_contents;
    item->contents = calloc(num_contents, sizeof(crfsuite_attribute_t));

    if (item->contents == NULL) {
        /* Keep the counters consistent with the (missing) storage. */
        item->num_contents = 0;
        item->cap_contents = 0;
    }
}
/*
 * Dispose of an item's attribute array.
 *
 * Frees `item->contents` and then hands the item to crfsuite_item_init(), so
 * the structure is left in whatever state that initializer establishes and
 * does not keep the freed pointer.
 */
void crfsuite_item_finish(crfsuite_item_t* item)
{
    /* free() accepts NULL, so an item without storage needs no special case. */
    free(item->contents);

    /* Re-initialize instead of leaving a dangling `contents` pointer behind. */
    crfsuite_item_init(item);
}
static crfsuite_instance_t* create_phrase_instance ( SItem* phrase, crfsuite_dictionary_t* attrs, crfsuite_dictionary_t* labels, s_erc *error) { crfsuite_instance_t * result = malloc ( sizeof(crfsuite_instance_t) ); int i = 0; int L = labels->num(labels); const SItem* itrItem = NULL; const SItem* itrItemNext = NULL; const SItem* finishItem = NULL; const char* lbl[] = {"num", "sym", "cap", "p1", "p2", "p3", "s1", "s2", "s3", "P1", "P2", "P3", "P4", "S1", "S2", "S3", "S4", "S5", "S6", "w" }; const int words_length = 19; const char* words[19] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}; int position = 0; int lbl_counter = 0; char buffer[8192]; crfsuite_instance_init ( result ); itrItemNext = SItemPathToItem (phrase, "daughter.R:Token.parent", error); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"SItemPathToItem\" failed")) return NULL; finishItem = SItemPathToItem (phrase, "n.daughter.R:Token.parent", error); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"SItemPathToItem\" failed")) return NULL; int counter = 0; while ( itrItemNext != finishItem && itrItemNext != NULL && counter < 9) { words[counter] = SItemGetName(itrItemNext, error); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"SItemGetName\" failed")) return NULL; counter++; itrItemNext = SItemNext(itrItemNext, error); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"SItemNext\" failed")) return NULL; } itrItem = SItemPathToItem (phrase, "daughter.R:Token.parent", error); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"SItemPathToItem\" failed")) return NULL; while ( itrItem != finishItem && itrItem != NULL) { /* Extraction of the features for each token */ const char *tokenName = SItemGetName (itrItem, error); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"SItemGetName\" failed")) return NULL; lbl_counter = 0; 
/* Extraction of label's ID */ int tokenID = labels->to_id(labels, "UNK"); int attribute_id; /* If unknown the set the 0 labels (unknown) */ if (tokenID < 0) tokenID = L; crfsuite_item_t itemToken; crfsuite_attribute_t attribute; crfsuite_item_init(&itemToken); const char *feat = NULL; s_bool found = FALSE; /* if token contains numbers */ found = hasNumber (tokenName, error); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"hasNumber\" failed")) return NULL; sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, found? "Y" : "N"); lbl_counter++; attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); /* if token contains symbols */ found = hasSymbol (tokenName, error); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"hasSymbol\" failed")) return NULL; sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, found? "Y" : "N"); lbl_counter++; attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); /* if token contains Capitals */ found = hasCapital (tokenName, error); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"hasCapital\" failed")) return NULL; sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, found? 
"Y" : "N"); lbl_counter++; attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); /* Prefixes of 1 to 3 chars of token */ i = 1; while ( i < 4 ) { feat = getFirstChars ( tokenName, i, error ); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"getFirstChars\" failed")) return NULL; if ( feat == NULL ) feat = "__nil__"; sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, feat); lbl_counter++; attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); i += 1; } /* Suffixes of 1 to 3 chars of token */ i = 1; while ( i < 4 ) { feat = getLastChars ( tokenName, i, error ); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"getLastChars\" failed")) return NULL; if ( feat == NULL ) feat = "__nil__"; sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, feat); lbl_counter++; attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); i += 1; } /* Prefixes of 1 to 4 chars of token without duplicates adjacent */ i = 1; while ( i < 5 ) { feat = getFirstChars ( removeDoubles(tokenName, error), i, error ); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"removeDoubles\" failed")) return NULL; if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"getFirstChars\" failed")) return NULL; if ( feat == NULL ) feat = "__nil__"; sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, feat); lbl_counter++; attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); i += 1; } /* Suffixes of 1 to 6 chars of token without duplicates adjacent */ i = 1; while ( i < 7 ) { feat = getLastChars ( removeDoubles(tokenName, error), i, error ); if 
(S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"removeDoubles\" failed")) return NULL; if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"getFirstChars\" failed")) return NULL; if ( feat == NULL ) feat = "__nil__"; sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, feat); lbl_counter++; attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); i += 1; } /* Words features */ sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, words[(position+0+words_length)%words_length]); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); const char *tmp = words[(position-1+words_length)%words_length]; if(tmp!=NULL) { sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], -1, tmp); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); } tmp = words[(position+1+words_length)%words_length]; if(tmp!=NULL) { sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 1, tmp); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); } tmp = words[(position-2+words_length)%words_length]; if(tmp!=NULL) { sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], -2, tmp); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); } tmp = words[(position+2+words_length)%words_length]; if(tmp!=NULL) { sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 2, tmp); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); } const char * tmp1 = NULL; i = 0; while ( i < 4 ) { tmp = 
words[(position-2+i+words_length)%words_length]; tmp1 = words[(position-1+i+words_length)%words_length]; if( tmp!=NULL && tmp1 != NULL ) { sprintf(buffer, "%s[%d]|%s[%d]=%s|%s", lbl[lbl_counter], -2+i, lbl[lbl_counter], -1+i, tmp, tmp1 ); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); } i++; } const char * tmp2 = NULL; i = 0; while ( i < 3 ) { tmp = words[(position-2+i+words_length)%words_length]; tmp1 = words[(position-1+i+words_length)%words_length]; tmp2 = words[(position+0+i+words_length)%words_length]; if( tmp!=NULL && tmp1 != NULL && tmp2 != NULL ) { sprintf(buffer, "%s[%d]|%s[%d]|%s[%d]=%s|%s|%s", lbl[lbl_counter], -2+i, lbl[lbl_counter], -1+i, lbl[lbl_counter], +0+i, tmp, tmp1, tmp2 ); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); } i++; } const char * tmp3 = NULL; i = 0; while ( i < 2 ) { tmp = words[(position-2+i+words_length)%words_length]; tmp1 = words[(position-1+i+words_length)%words_length]; tmp2 = words[(position+0+i+words_length)%words_length]; tmp3 = words[(position+1+i+words_length)%words_length]; if( tmp!=NULL && tmp1 != NULL && tmp2 != NULL && tmp3 != NULL ) { sprintf(buffer, "%s[%d]|%s[%d]|%s[%d]|%s[%d]=%s|%s|%s|%s", lbl[lbl_counter], -2+i, lbl[lbl_counter], -1+i, lbl[lbl_counter], +0+i, lbl[lbl_counter], +1+i, tmp, tmp1, tmp2, tmp3 ); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); } i++; } tmp = words[(position-2+words_length)%words_length]; tmp1 = words[(position-1+words_length)%words_length]; tmp2 = words[(position+0+words_length)%words_length]; tmp3 = words[(position+1+words_length)%words_length]; const char * tmp4 = words[(position+2+words_length)%words_length]; if( tmp!=NULL && tmp1 != NULL && tmp2 != NULL && tmp3 
!= NULL && tmp4 != NULL ) { sprintf(buffer, "%s[%d]|%s[%d]|%s[%d]|%s[%d]|%s[%d]=%s|%s|%s|%s|%s", lbl[lbl_counter], -2, lbl[lbl_counter], -1, lbl[lbl_counter], +0, lbl[lbl_counter], +1, lbl[lbl_counter], +2, tmp, tmp1, tmp2, tmp3, tmp4 ); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); } tmp = words[(position+0+words_length)%words_length]; i = 1; while ( i < 10 ) { tmp1 = words[(position-i+words_length)%words_length]; if ( tmp1 != NULL ) { sprintf(buffer, "%s[%d]|%s[%d]=%s|%s", lbl[lbl_counter], 0, lbl[lbl_counter], 0-i, tmp, tmp1 ); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); } i++; } i = 1; while ( i < 10 ) { tmp1 = words[(position+i+words_length)%words_length]; if ( tmp1 != NULL ) { sprintf(buffer, "%s[%d]|%s[%d]=%s|%s", lbl[lbl_counter], 0, lbl[lbl_counter], 0+i, tmp, tmp1 ); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); } i++; } /* Update words array */ position++; if ( itrItemNext != NULL ) { words[position+9] = SItemGetName (itrItemNext, error); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"SItemNext\" failed")) return NULL; itrItemNext = SItemNext ( itrItemNext, error ); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"SItemNext\" failed")) return NULL; } else words[position+9] = NULL; itrItem = SItemNext ( itrItem, error ); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"SItemNext\" failed")) return NULL; crfsuite_instance_append(result, &itemToken, tokenID); crfsuite_item_finish(&itemToken); } return result; }
/*
 * Read instances in IWA format from `fpi` and append them to `data`.
 *
 * Within an item, the first field is either an "@..." declaration (only
 * "@weight" is accepted; it sets the instance weight) or the label field,
 * which is parsed with parse_fuzzy_labels().  Every following field is an
 * attribute, registered through attrs->get(); its value is atof(value) when a
 * non-empty value is present and 1.0 otherwise.  An item is appended to the
 * current instance only if a label id was obtained for it.  On each
 * IWA_NONE / IWA_EOF token the current instance is handed to
 * crfsuite_data_append() and reset.
 *
 * Progress is reported on `fpo` through progress(), based on the file offset
 * relative to the size measured from the stream position at entry.
 *
 * Parameters:
 *   fpi   - input stream, read from its current position.
 *   fpo   - stream for progress and error messages.
 *   data  - destination data set; its attrs/labels dictionaries are used.
 *   group - group number stored in every instance.
 *
 * Returns the number of IWA_NONE / IWA_EOF tokens processed, or -1 if the
 * reader could not be created or an unrecognized declaration was found.
 */
int read_data(FILE *fpi, FILE *fpo, crfsuite_data_t* data, int group)
{
    int n = 0;
    int lid = -1;
    crfsuite_instance_t inst;
    crfsuite_item_t item;
    crfsuite_attribute_t cont;
    crfsuite_fuzzy_labels_t fuzzy;
    iwa_t* iwa = NULL;
    crfsuite_dictionary_t *attrs = data->attrs;
    crfsuite_dictionary_t *labels = data->labels;
    const iwa_token_t *token = NULL;
    long filesize = 0, begin = 0, offset = 0;
    int prev = 0, current = 0;

    /* Initialize the instance. */
    crfsuite_instance_init(&inst);
    inst.group = group;

    /* Initialize the item up front so the error path can always finish it. */
    crfsuite_item_init(&item);

    /* Obtain the file size. */
    begin = ftell(fpi);
    fseek(fpi, 0, SEEK_END);
    filesize = ftell(fpi) - begin;
    fseek(fpi, begin, SEEK_SET);

    /* Start the progress line. */
    fprintf(fpo, "0");
    fflush(fpo);
    prev = 0;

    iwa = iwa_reader(fpi);
    if (iwa == NULL) {
        /* Defensive: without a reader there is nothing to parse. */
        fprintf(fpo, "\n");
        crfsuite_item_finish(&item);
        crfsuite_instance_finish(&inst);
        return -1;
    }

    while (token = iwa_read(iwa), token != NULL) {
        /* Progress report.  Skipped when the size is unknown or zero (empty
         * or unseekable input): dividing by it would yield inf/NaN, and
         * converting that to int is undefined. */
        if (0 < filesize) {
            offset = ftell(fpi);
            current = (int)((offset - begin) * 100.0 / (double)filesize);
            prev = progress(fpo, prev, current);
        }

        switch (token->type) {
        case IWA_BOI:
            /* Initialize an item. */
            lid = -1;
            crfsuite_item_init(&item);
            break;

        case IWA_EOI:
            /* Append the item to the instance. */
            if (0 <= lid) {
                crfsuite_instance_append(&inst, &item, &fuzzy, lid);
            }
            crfsuite_item_finish(&item);
            break;

        case IWA_ITEM:
            if (lid == -1) {
                if (strncmp(token->attr, "@", 1) == 0) {
                    /* Declaration. */
                    if (strcmp(token->attr, "@weight") == 0) {
                        /* Instance weighting. */
                        inst.weight = atof(token->value);
                    } else {
                        /* Unrecognized declaration: report it (before the
                         * reader that owns the token is deleted), release
                         * what was built so far, and give up. */
                        fprintf(fpo, "\n");
                        fprintf(fpo, "ERROR: unrecognized declaration: %s\n", token->attr);
                        crfsuite_item_finish(&item);
                        crfsuite_instance_finish(&inst);
                        iwa_delete(iwa);
                        return -1;
                    }
                } else {
                    /* Label. */
                    crfsuite_fuzzy_labels_init(&fuzzy);
                    parse_fuzzy_labels(token->attr, &fuzzy, labels);
                    if (1 == fuzzy.num_labels) {
                        lid = fuzzy.labels[0];
                    } else if (1 < fuzzy.num_labels) {
                        lid = 0; /* Just a pseudo label. */
                    } else {
                        /* No label parsed: lid stays -1, so the next field of
                         * this item is again treated as the label field. */
                        fprintf(stderr, "?\n");
                    }
                }
            } else {
                /* Attribute of an already-labelled item. */
                crfsuite_attribute_init(&cont);
                cont.aid = attrs->get(attrs, token->attr);
                if (token->value && *token->value) {
                    cont.value = atof(token->value);
                } else {
                    cont.value = 1.0;
                }
                crfsuite_item_append_attribute(&item, &cont);
            }
            break;

        case IWA_NONE:
        case IWA_EOF:
            /* Put the training instance. */
            crfsuite_data_append(data, &inst);
            crfsuite_instance_finish(&inst);
            inst.group = group;
            inst.weight = 1.;
            ++n;
            break;

        default:
            /* Other token types carry nothing to record. */
            break;
        }
    }

    progress(fpo, prev, 100);
    fprintf(fpo, "\n");

    iwa_delete(iwa);
    return n;
}