Example #1
0
static crfsuite_instance_t* create_phrase_instance ( SItem* phrase,
                                                 crfsuite_dictionary_t* attrs,
                                                 crfsuite_dictionary_t* labels,
                                                 s_erc *error)
{
	crfsuite_instance_t * result = malloc ( sizeof(crfsuite_instance_t) );
	int i = 0;
	int L = labels->num(labels);
	const SItem* itrItem = NULL;
	const SItem* itrItemNext = NULL;
	const SItem* finishItem = NULL;

	const char* lbl[] = {"num", "sym", "cap", "p1", "p2", "p3",
							"s1", "s2", "s3", "P1", "P2", "P3", "P4",
							"S1", "S2", "S3", "S4", "S5", "S6", "w" };

	const int words_length = 19;
	const char* words[19] = {			 NULL, NULL, NULL, NULL, NULL,
										NULL, NULL, NULL, NULL, NULL,
										NULL, NULL, NULL, NULL, NULL,
										NULL, NULL, NULL, NULL};
	int position = 0;

	int lbl_counter = 0;
	char buffer[8192];

	crfsuite_instance_init ( result );

	itrItemNext = SItemPathToItem (phrase, "daughter.R:Token.parent", error);
	if (S_CHK_ERR(error, S_CONTERR,
			  "create_phrase_instance",
			  "Call to \"SItemPathToItem\" failed"))
		return NULL;

	finishItem = SItemPathToItem (phrase, "n.daughter.R:Token.parent", error);
	if (S_CHK_ERR(error, S_CONTERR,
			  "create_phrase_instance",
			  "Call to \"SItemPathToItem\" failed"))
		return NULL;

	int counter = 0;

	while ( itrItemNext != finishItem && itrItemNext != NULL && counter < 9)
	{
		words[counter] = SItemGetName(itrItemNext, error);
		if (S_CHK_ERR(error, S_CONTERR,
				  "create_phrase_instance",
				  "Call to \"SItemGetName\" failed"))
			return NULL;

		counter++;

		itrItemNext = SItemNext(itrItemNext, error);
		if (S_CHK_ERR(error, S_CONTERR,
				  "create_phrase_instance",
				  "Call to \"SItemNext\" failed"))
			return NULL;
	}

	itrItem = SItemPathToItem (phrase, "daughter.R:Token.parent", error);
	if (S_CHK_ERR(error, S_CONTERR,
			  "create_phrase_instance",
			  "Call to \"SItemPathToItem\" failed"))
		return NULL;

	while ( itrItem != finishItem && itrItem != NULL)
	{
		/* Extraction of the features for each token */
		const char *tokenName = SItemGetName (itrItem, error);
		if (S_CHK_ERR(error, S_CONTERR,
				  "create_phrase_instance",
				  "Call to \"SItemGetName\" failed"))
			return NULL;

        lbl_counter = 0;

		/* Extraction of label's ID */
		int tokenID = labels->to_id(labels, "UNK");
		int attribute_id;

		/* If unknown the set the 0 labels (unknown) */
		if (tokenID < 0)
			tokenID = L;

		crfsuite_item_t itemToken;
		crfsuite_attribute_t attribute;

		crfsuite_item_init(&itemToken);

		const char *feat = NULL;
		s_bool found = FALSE;


		/* if token contains numbers */
		found = hasNumber (tokenName, error);
		if (S_CHK_ERR(error, S_CONTERR,
				  "create_phrase_instance",
				  "Call to \"hasNumber\" failed"))
			return NULL;

		sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, found? "Y" : "N");
		lbl_counter++;


		attribute_id = attrs->to_id (attrs,buffer);
		crfsuite_attribute_set (&attribute, attribute_id, 1.0);
		crfsuite_item_append_attribute(&itemToken, &attribute);


		/* if token contains symbols */
		found = hasSymbol (tokenName, error);
		if (S_CHK_ERR(error, S_CONTERR,
				  "create_phrase_instance",
				  "Call to \"hasSymbol\" failed"))
			return NULL;

		sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, found? "Y" : "N");
		lbl_counter++;

		attribute_id = attrs->to_id (attrs,buffer);
		crfsuite_attribute_set (&attribute, attribute_id, 1.0);
		crfsuite_item_append_attribute(&itemToken, &attribute);


		/* if token contains Capitals */
		found = hasCapital (tokenName, error);
		if (S_CHK_ERR(error, S_CONTERR,
				  "create_phrase_instance",
				  "Call to \"hasCapital\" failed"))
			return NULL;

		sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, found? "Y" : "N");
		lbl_counter++;

		attribute_id = attrs->to_id (attrs,buffer);
		crfsuite_attribute_set (&attribute, attribute_id, 1.0);
		crfsuite_item_append_attribute(&itemToken, &attribute);


		/* Prefixes of 1 to 3 chars of token */
		i = 1;
		while ( i < 4 )
		{
			feat = getFirstChars ( tokenName, i, error );
			if (S_CHK_ERR(error, S_CONTERR,
					  "create_phrase_instance",
					  "Call to \"getFirstChars\" failed"))
				return NULL;

			if ( feat == NULL )
				feat = "__nil__";

			sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, feat);
			lbl_counter++;

			attribute_id = attrs->to_id (attrs,buffer);
			crfsuite_attribute_set (&attribute, attribute_id, 1.0);
			crfsuite_item_append_attribute(&itemToken, &attribute);

			i += 1;
		}

		/* Suffixes of 1 to 3 chars of token */
		i = 1;
		while ( i < 4 )
		{
			feat = getLastChars ( tokenName, i, error );
			if (S_CHK_ERR(error, S_CONTERR,
					  "create_phrase_instance",
					  "Call to \"getLastChars\" failed"))
				return NULL;

			if ( feat == NULL )
				feat = "__nil__";

			sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, feat);
			lbl_counter++;

			attribute_id = attrs->to_id (attrs,buffer);
			crfsuite_attribute_set (&attribute, attribute_id, 1.0);
			crfsuite_item_append_attribute(&itemToken, &attribute);

			i += 1;
		}

		/* Prefixes of 1 to 4 chars of token without duplicates adjacent */
		i = 1;
		while ( i < 5 )
		{
			feat = getFirstChars ( removeDoubles(tokenName, error), i, error );
			if (S_CHK_ERR(error, S_CONTERR,
					  "create_phrase_instance",
					  "Call to \"removeDoubles\" failed"))
				return NULL;
			if (S_CHK_ERR(error, S_CONTERR,
					  "create_phrase_instance",
					  "Call to \"getFirstChars\" failed"))
				return NULL;

			if ( feat == NULL )
				feat = "__nil__";

			sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, feat);
			lbl_counter++;

			attribute_id = attrs->to_id (attrs,buffer);
			crfsuite_attribute_set (&attribute, attribute_id, 1.0);
			crfsuite_item_append_attribute(&itemToken, &attribute);

			i += 1;
		}

		/* Suffixes of 1 to 6 chars of token without duplicates adjacent */
		i = 1;
		while ( i < 7 )
		{
			feat = getLastChars ( removeDoubles(tokenName, error), i, error );
			if (S_CHK_ERR(error, S_CONTERR,
					  "create_phrase_instance",
					  "Call to \"removeDoubles\" failed"))
				return NULL;
			if (S_CHK_ERR(error, S_CONTERR,
					  "create_phrase_instance",
					  "Call to \"getFirstChars\" failed"))
				return NULL;

			if ( feat == NULL )
				feat = "__nil__";

			sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, feat);
			lbl_counter++;

			attribute_id = attrs->to_id (attrs,buffer);
			crfsuite_attribute_set (&attribute, attribute_id, 1.0);
			crfsuite_item_append_attribute(&itemToken, &attribute);

			i += 1;
		}

		/* Words features */

		sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, words[(position+0+words_length)%words_length]);

		attribute_id = attrs->to_id (attrs,buffer);
		crfsuite_attribute_set (&attribute, attribute_id, 1.0);
		crfsuite_item_append_attribute(&itemToken, &attribute);

		const char *tmp = words[(position-1+words_length)%words_length];
		if(tmp!=NULL)
		{
			sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], -1, tmp);

			attribute_id = attrs->to_id (attrs,buffer);
			crfsuite_attribute_set (&attribute, attribute_id, 1.0);
			crfsuite_item_append_attribute(&itemToken, &attribute);
		}

		tmp = words[(position+1+words_length)%words_length];
		if(tmp!=NULL)
		{
			sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 1, tmp);

			attribute_id = attrs->to_id (attrs,buffer);
			crfsuite_attribute_set (&attribute, attribute_id, 1.0);
			crfsuite_item_append_attribute(&itemToken, &attribute);
		}

		tmp = words[(position-2+words_length)%words_length];
		if(tmp!=NULL)
		{
			sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], -2, tmp);

			attribute_id = attrs->to_id (attrs,buffer);
			crfsuite_attribute_set (&attribute, attribute_id, 1.0);
			crfsuite_item_append_attribute(&itemToken, &attribute);
		}

		tmp = words[(position+2+words_length)%words_length];
		if(tmp!=NULL)
		{
			sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 2, tmp);

			attribute_id = attrs->to_id (attrs,buffer);
			crfsuite_attribute_set (&attribute, attribute_id, 1.0);
			crfsuite_item_append_attribute(&itemToken, &attribute);
		}

		const char * tmp1 = NULL;
		i = 0;
		while ( i < 4 )
		{
			tmp = words[(position-2+i+words_length)%words_length];
			tmp1 = words[(position-1+i+words_length)%words_length];
			if( tmp!=NULL && tmp1 != NULL )
			{
				sprintf(buffer, "%s[%d]|%s[%d]=%s|%s", lbl[lbl_counter], -2+i,
													   lbl[lbl_counter], -1+i,
													   tmp, tmp1 );

				attribute_id = attrs->to_id (attrs,buffer);
				crfsuite_attribute_set (&attribute, attribute_id, 1.0);
				crfsuite_item_append_attribute(&itemToken, &attribute);
			}
			i++;
		}

		const char * tmp2 = NULL;
		i = 0;
		while ( i < 3 )
		{
			tmp = words[(position-2+i+words_length)%words_length];
			tmp1 = words[(position-1+i+words_length)%words_length];
			tmp2 = words[(position+0+i+words_length)%words_length];
			if( tmp!=NULL && tmp1 != NULL && tmp2 != NULL )
			{
				sprintf(buffer, "%s[%d]|%s[%d]|%s[%d]=%s|%s|%s",
													   lbl[lbl_counter], -2+i,
													   lbl[lbl_counter], -1+i,
													   lbl[lbl_counter], +0+i,
													   tmp, tmp1, tmp2 );

				attribute_id = attrs->to_id (attrs,buffer);
				crfsuite_attribute_set (&attribute, attribute_id, 1.0);
				crfsuite_item_append_attribute(&itemToken, &attribute);
			}
			i++;
		}

		const char * tmp3 = NULL;
		i = 0;
		while ( i < 2 )
		{
			tmp  = words[(position-2+i+words_length)%words_length];
			tmp1 = words[(position-1+i+words_length)%words_length];
			tmp2 = words[(position+0+i+words_length)%words_length];
			tmp3 = words[(position+1+i+words_length)%words_length];
			if( tmp!=NULL && tmp1 != NULL && tmp2 != NULL && tmp3 != NULL )
			{
				sprintf(buffer, "%s[%d]|%s[%d]|%s[%d]|%s[%d]=%s|%s|%s|%s",
													   lbl[lbl_counter], -2+i,
													   lbl[lbl_counter], -1+i,
													   lbl[lbl_counter], +0+i,
													   lbl[lbl_counter], +1+i,
													   tmp, tmp1, tmp2, tmp3 );

				attribute_id = attrs->to_id (attrs,buffer);
				crfsuite_attribute_set (&attribute, attribute_id, 1.0);
				crfsuite_item_append_attribute(&itemToken, &attribute);
			}
			i++;
		}

		tmp  = words[(position-2+words_length)%words_length];
		tmp1 = words[(position-1+words_length)%words_length];
		tmp2 = words[(position+0+words_length)%words_length];
		tmp3 = words[(position+1+words_length)%words_length];
		const char * tmp4 = words[(position+2+words_length)%words_length];
		if( tmp!=NULL && tmp1 != NULL && tmp2 != NULL && tmp3 != NULL && tmp4 != NULL )
		{
			sprintf(buffer, "%s[%d]|%s[%d]|%s[%d]|%s[%d]|%s[%d]=%s|%s|%s|%s|%s",
												   lbl[lbl_counter], -2,
												   lbl[lbl_counter], -1,
												   lbl[lbl_counter], +0,
												   lbl[lbl_counter], +1,
												   lbl[lbl_counter], +2,
												   tmp, tmp1, tmp2, tmp3, tmp4 );

			attribute_id = attrs->to_id (attrs,buffer);
			crfsuite_attribute_set (&attribute, attribute_id, 1.0);
			crfsuite_item_append_attribute(&itemToken, &attribute);
		}

		tmp  = words[(position+0+words_length)%words_length];

		i = 1;
		while ( i < 10 )
		{
			tmp1  = words[(position-i+words_length)%words_length];
			if ( tmp1 != NULL )
			{
				sprintf(buffer, "%s[%d]|%s[%d]=%s|%s", lbl[lbl_counter], 0,
													   lbl[lbl_counter], 0-i,
													   tmp, tmp1 );

				attribute_id = attrs->to_id (attrs,buffer);
				crfsuite_attribute_set (&attribute, attribute_id, 1.0);
				crfsuite_item_append_attribute(&itemToken, &attribute);
			}
			i++;
		}

		i = 1;
		while ( i < 10 )
		{
			tmp1  = words[(position+i+words_length)%words_length];
			if ( tmp1 != NULL )
			{
				sprintf(buffer, "%s[%d]|%s[%d]=%s|%s", lbl[lbl_counter], 0,
													   lbl[lbl_counter], 0+i,
													   tmp, tmp1 );

				attribute_id = attrs->to_id (attrs,buffer);
				crfsuite_attribute_set (&attribute, attribute_id, 1.0);
				crfsuite_item_append_attribute(&itemToken, &attribute);
			}
			i++;
		}




		/* Update words array */
		position++;
		if ( itrItemNext != NULL )
		{
			words[position+9] = SItemGetName (itrItemNext, error);
			if (S_CHK_ERR(error, S_CONTERR,
					  "create_phrase_instance",
					  "Call to \"SItemNext\" failed"))
				return NULL;

			itrItemNext = SItemNext ( itrItemNext, error );
			if (S_CHK_ERR(error, S_CONTERR,
					  "create_phrase_instance",
					  "Call to \"SItemNext\" failed"))
				return NULL;
		}
		else
			words[position+9] = NULL;

		itrItem = SItemNext ( itrItem, error );
		if (S_CHK_ERR(error, S_CONTERR,
				  "create_phrase_instance",
				  "Call to \"SItemNext\" failed"))
			return NULL;

		crfsuite_instance_append(result, &itemToken, tokenID);
		crfsuite_item_finish(&itemToken);
	}

	return result;

}
Example #2
0
int read_data(FILE *fpi, FILE *fpo, crfsuite_data_t* data, int group)
{
    int i = 0;
    int n = 0;
    int lid = -1;
    int * lids = NULL;
    crfsuite_instance_t inst;
    crfsuite_item_t item;
    crfsuite_attribute_t cont;
    crfsuite_fuzzy_labels_t fuzzy;
    iwa_t* iwa = NULL;
    crfsuite_dictionary_t *attrs = data->attrs;
    crfsuite_dictionary_t *labels = data->labels;
    const iwa_token_t *token = NULL;
    long filesize = 0, begin = 0, offset = 0;
    int prev = 0, current = 0;

    /* Initialize the instance.*/
    crfsuite_instance_init(&inst);
    inst.group = group;

    /* Obtain the file size. */
    begin = ftell(fpi);
    fseek(fpi, 0, SEEK_END);
    filesize = ftell(fpi) - begin;
    fseek(fpi, begin, SEEK_SET);

    /* */
    fprintf(fpo, "0");
    fflush(fpo);
    prev = 0;

    /* allocate a lid from the data*/
    iwa = iwa_reader(fpi);

    while (token = iwa_read(iwa), token != NULL) {
        /* Progress report. */
        offset = ftell(fpi);
        current = (int)((offset - begin) * 100.0 / (double)filesize);
        prev = progress(fpo, prev, current);

        switch (token->type) {
        case IWA_BOI:
            /* Initialize an item. */
            lid = -1;
            crfsuite_item_init(&item);
            break;
        case IWA_EOI:
            /* Append the item to the instance. */
            if (0 <= lid) {
                crfsuite_instance_append(&inst, &item, &fuzzy, lid);
            }
            crfsuite_item_finish(&item);
            break;
        case IWA_ITEM:
            if (lid == -1) {
                if (strncmp(token->attr, "@", 1) == 0) {
                    /* Declaration. */
                    if (strcmp(token->attr, "@weight") == 0) {
                        /* Instance weighting. */
                        inst.weight = atof(token->value);
                    } else {
                        /* Unrecognized declaration. */
                        fprintf(fpo, "\n");
                        fprintf(fpo, "ERROR: unrecognized declaration: %s\n", token->attr);
                        iwa_delete(iwa);
                        return -1;
                    }
                } else {
                    /* Label. */
                    crfsuite_fuzzy_labels_init(&fuzzy);
                    parse_fuzzy_labels(token->attr, &fuzzy, labels);
                    if (1 == fuzzy.num_labels) {
                        lid = fuzzy.labels[0];
                    } else if (1 < fuzzy.num_labels) {
                        lid = 0; /* Just a Pseudo label */
                    } else {
                        fprintf(stderr, "?\n");
                    }
                }
            } else {
                crfsuite_attribute_init(&cont);
                cont.aid = attrs->get(attrs, token->attr);
                if (token->value && *token->value) {
                    cont.value = atof(token->value);
                } else {
                    cont.value = 1.0;
                }
                crfsuite_item_append_attribute(&item, &cont);
            }
            break;
        case IWA_NONE:
        case IWA_EOF:
            /* Put the training instance. */
            crfsuite_data_append(data, &inst);
            crfsuite_instance_finish(&inst);
            inst.group = group;
            inst.weight = 1.;
            ++n;
            break;
        }
    }

    progress(fpo, prev, 100);
    fprintf(fpo, "\n");

    iwa_delete(iwa);

    return n;
}