Exemple #1
0
/*
 * Look up the sentence type for the token that follows a phrase's
 * "daughtern" word.
 *
 * The item reached from @phrase through the path "daughtern" is viewed in
 * the "Token" relation, its parent token is taken, and the name of the token
 * that comes *after* that parent is used as a key into @puncMap.
 *
 * phrase   item the "daughtern" path is resolved from
 * puncMap  map from token name to a string value (per the original note the
 *          values are expected to be "decl", "excl" or "interrog")
 * error    Speect error code; cleared on entry
 *
 * Returns the string stored in @puncMap for that token name, or NULL when
 * the name is not a key of the map.  NULL is also returned, with @error set,
 * when any of the calls below fails.
 *
 * NOTE(review): when there is no following token, SItemNext yields NULL and
 * that NULL is handed to SItemGetName; whether that is reported as an error
 * depends on SItemGetName - confirm.
 */
static const char* setSentenceType(const SItem *phrase, SMap *puncMap, s_erc *error)
{
	S_CLR_ERR(error);

	/* stop at sentence's last token */
	const SItem *wordFromCurrentPhrase = SItemPathToItem(phrase, "daughtern", error);
	if (S_CHK_ERR(error, S_CONTERR,
		      "setSentenceType",
		      "Call to \"SItemPathToItem\" failed"))
		return NULL;

	SItem *wordAsToken = SItemAs(wordFromCurrentPhrase, "Token", error);
	if (S_CHK_ERR(error, S_CONTERR,
		      "setSentenceType",
		      "Call to \"SItemAs\" failed"))
		return NULL;

	/* The parent lookup used to go unchecked, so a failure here was
	 * overwritten by the SItemNext call that followed. */
	SItem *tokenItem = SItemParent(wordAsToken, error);
	if (S_CHK_ERR(error, S_CONTERR,
		      "setSentenceType",
		      "Call to \"SItemParent\" failed"))
		return NULL;

	tokenItem = SItemNext(tokenItem, error);
	if (S_CHK_ERR(error, S_CONTERR,
		      "setSentenceType",
		      "Call to \"SItemNext\" failed"))
		return NULL;

	const char *punctStr = SItemGetName(tokenItem, error);
	if (S_CHK_ERR(error, S_CONTERR,
		      "setSentenceType",
		      "Call to \"SItemGetName\" failed"))
		return NULL;

	s_bool found = SMapObjectPresent(puncMap, punctStr, error);
	if (S_CHK_ERR(error, S_CONTERR,
		      "setSentenceType",
		      "Call to \"SMapObjectPresent\" failed"))
		return NULL;

	/* Unknown token name: no sentence type. */
	if (found != TRUE)
		return NULL;

	const char *result = SMapGetString(puncMap, punctStr, error);
	if (S_CHK_ERR(error, S_CONTERR,
		      "setSentenceType",
		      "Call to \"SMapGetString\" failed"))
		return NULL;

	return result;
}
Exemple #2
0
/*
 * Classify a sentence by its final punctuation.
 *
 * Starting at the token that owns the first word ("daughter") of @phrase,
 * the tokens are walked forward until one carrying the "IsPunctuation"
 * feature is named ".", "!" or "?":
 *   "."  -> "decl"
 *   "!"  -> "excl"
 *   "?"  -> "interrogW" when the filtered "POS" value of the *first* token
 *           is found (via searchStringMap) in the map stored under
 *           "firstPosInQuestionW" in @prosSymbols, "interrog" otherwise.
 * If the tokens run out first, the default "decl" is returned.
 *
 * phrase       phrase item the scan starts from
 * prosSymbols  symbol map; only the key "firstPosInQuestionW" is read here
 * error        Speect error code; cleared on entry
 *
 * Returns a pointer to a string literal (never to be freed or modified), or
 * NULL with @error set when walking the tokens fails.
 *
 * NOTE(review): inside the "?" branch a missing "firstPosInQuestionW" key or
 * a failing call leaves the result at "decl" and ends the scan; a failure
 * there is logged through S_CHK_ERR but the function still returns "decl".
 * Confirm this is intended.
 */
static char* setSentenceType(const SItem *phrase, SMap *prosSymbols, s_erc *error)
{
	S_CLR_ERR(error);

	char* result = "decl";

	/* types: "decl, "excl", "interrog" */
	/* stop at sentence's last token */
	const SItem *wordFromCurrentPhrase = SItemPathToItem(phrase, "daughter", error);
	if (S_CHK_ERR(error, S_CONTERR,
		      "setSentenceType",
		      "Call to \"SItemPathToItem\" failed"))
		return NULL;

	SItem *wordAsToken = SItemAs(wordFromCurrentPhrase, "Token", error);
	if (S_CHK_ERR(error, S_CONTERR,
		      "setSentenceType",
		      "Call to \"SItemAs\" failed"))
		return NULL;

	/* This lookup used to go unchecked. */
	SItem *tokenItem = SItemParent(wordAsToken, error);
	if (S_CHK_ERR(error, S_CONTERR,
		      "setSentenceType",
		      "Call to \"SItemParent\" failed"))
		return NULL;

	/* Remembered because the "?" branch inspects the POS of the first token,
	 * not of the punctuation token. */
	SItem *firstTokenItem = tokenItem;

	s_bool isPunct;
	s_bool isFinalPunct = FALSE;

	while (isFinalPunct == FALSE)
	{
		isPunct = SItemFeatureIsPresent(tokenItem, "IsPunctuation", error);
		if (S_CHK_ERR(error, S_CONTERR,
			      "setSentenceType",
			      "Call to \"SItemFeatureIsPresent\" failed"))
			return NULL;

		if (isPunct)
		{
			const char *punctStr = SItemGetName(tokenItem, error);
			if (S_CHK_ERR(error, S_CONTERR,
				      "setSentenceType",
				      "Call to \"SItemGetName\" failed"))
				return NULL;

			if (s_strcmp(punctStr, ".", error) == 0)
			{
				isFinalPunct = TRUE;
				result = "decl";
			}
			else if (s_strcmp(punctStr, "!", error) == 0)
			{
				isFinalPunct = TRUE;
				result = "excl";
			}
			else if (s_strcmp(punctStr, "?", error) == 0)
			{
				isFinalPunct = TRUE;
				const char *posValueStr = NULL;
				char *posValueStr_filtered = NULL;
				s_bool currPosInCurrList;
				s_bool have_symbols = FALSE;
				SMap* valueMap = NULL;

				have_symbols = SMapObjectPresent(prosSymbols, "firstPosInQuestionW", error);
				if (S_CHK_ERR(error, S_CONTERR,
					      "SetSentenceType",
					      "Call to \"SMapObjectPresent\" failed"))
					goto quit_error;

				if (have_symbols)
				{
					valueMap = S_CAST(SMapGetObject(prosSymbols, "firstPosInQuestionW", error), SMap, error);
					if (S_CHK_ERR(error, S_CONTERR,
						      "SetSentenceType",
						      "Call to \"SMapGetObject\" failed"))
						goto quit_error;
				}
				else
					goto quit_error;

				posValueStr = SItemGetString(firstTokenItem, "POS", error);
				if (S_CHK_ERR(error, S_CONTERR,
					      "SetSentenceType",
					      "Call to \"SItemGetString\" failed"))
					goto quit_error;

				/* filterPosTag hands back a string that is released with
				 * S_FREE at quit_error below. */
				posValueStr_filtered = filterPosTag(posValueStr, error);
				if (S_CHK_ERR(error, S_CONTERR,
					      "SetSentenceType",
					      "Call to \"filterPosTag\" failed"))
					goto quit_error;

				currPosInCurrList = searchStringMap(valueMap, posValueStr_filtered, error);
				if (currPosInCurrList == TRUE)
				{
					result = "interrogW";
				}
				else
				{
					result = "interrog";
				}
				quit_error:
					if (posValueStr_filtered)
					{
						S_FREE(posValueStr_filtered);
					}
					/* a "?" always ends the scan, on success and on failure */
					break;
			}
		}

		tokenItem = SItemNext(tokenItem, error);
		if (S_CHK_ERR(error, S_CONTERR,
			      "setSentenceType",
			      "Call to \"SItemNext\" failed"))
			return NULL;
		if(tokenItem == NULL) {
			/* ran out of tokens: keep whatever result we have */
			isFinalPunct = TRUE;
		}
	}
	return result;
}
Exemple #3
0
/*
 * Feature processor: distance, in tokens, from the item's token to the next
 * punctuation token.
 *
 * The scan starts at the item reached through "R:Token.parent" and moves
 * forward with SItemNext.  A token counts as punctuation when it has a "POS"
 * feature whose value is a key of the map stored under "pos punctuation" in
 * the processor's symbols.
 *
 * self   feature processor, cast to SWordsToNextPuncFeatProc
 * item   item to extract the feature for; NULL yields NULL without an error
 * error  Speect error code; cleared on entry
 *
 * Returns a new integer object holding the zero-based index of the first
 * punctuation token relative to the starting token.  When no punctuation
 * token is found the value is the number of tokens visited minus one (-1 if
 * there was no starting token).  Returns NULL with @error set on failure.
 */
static SObject *Run(const SFeatProcessor *self, const SItem *item,
					s_erc *error)
{
	SObject *extractedFeat = NULL;
	SWordsToNextPuncFeatProc *castSelf;
	SMap *posPunctuation;
	const SItem *itrItem;
	sint32 count = -1;
	s_bool found = FALSE;

	/* Clear first, as the other processors do, so the check after S_CAST
	 * only ever sees errors raised in this function. */
	S_CLR_ERR(error);

	castSelf = S_CAST(self, SWordsToNextPuncFeatProc, error);
	if (S_CHK_ERR(error, S_CONTERR,
		      "Run",
		      "Call to S_CAST failed"))
		goto quit_error;

	if (item == NULL)
		return NULL;

	itrItem = SItemPathToItem (item, "R:Token.parent", error);
	if (S_CHK_ERR(error, S_CONTERR,
				  "Run",
				  "Call to \"SItemPathToItem\" failed"))
		goto quit_error;

	/* One check covers both the map lookup and the cast: they share @error,
	 * so a second check could never fire. */
	posPunctuation = S_CAST( SMapGetObject ( castSelf->symbols, "pos punctuation", error ), SMap, error );
	if (S_CHK_ERR(error, S_CONTERR,
				  "Run",
				  "Call to \"SMapGetObject\" failed"))
		goto quit_error;

	while (found == FALSE && itrItem != NULL)
	{
		s_bool hasPos;

		/* counted before the test, so the punctuation token itself is
		 * included in the index */
		count++;

		hasPos = SItemFeatureIsPresent ( itrItem, "POS", error);
		if (S_CHK_ERR(error, S_CONTERR,
					  "Run",
					  "Call to \"SItemFeatureIsPresent\" failed"))
			goto quit_error;

		if (hasPos)
		{
			const char* keyPos = SItemGetString (itrItem, "POS", error);
			if (S_CHK_ERR(error, S_CONTERR,
						  "Run",
						  "Call to \"SItemGetString\" failed"))
				goto quit_error;

			found = SMapObjectPresent(posPunctuation, keyPos, error);
			if (S_CHK_ERR(error, S_CONTERR,
						  "Run",
						  "Call to \"SMapObjectPresent\" failed"))
				goto quit_error;
		}

		itrItem = SItemNext(itrItem, error);
		if (S_CHK_ERR(error, S_CONTERR,
					  "Run",
					  "Call to \"SItemNext\" failed"))
			goto quit_error;
	}

	extractedFeat = SObjectSetInt(count, error);
	if (S_CHK_ERR(error, S_CONTERR,
				  "Run",
				  "Call to \"SObjectSetInt\" failed"))
		goto quit_error;

	/* all OK here */
	return extractedFeat;

	/* error cleanup */
quit_error:
	if (extractedFeat != NULL)
		S_DELETE(extractedFeat, "Run", error);

	return NULL;
}
Exemple #4
0
/*
 * Feature processor: number of accented syllables between the given syllable
 * and the end of its phrase.
 *
 * The phrase is found through "R:SylStructure.parent.R:Phrase.parent".  Its
 * words are visited from last to first, and within each word the syllables
 * from last to first, counting those for which syl_is_accented() is true.
 * The walk stops as soon as the given syllable is met; that syllable itself
 * is not counted.
 *
 * self   unused
 * item   syllable item; NULL yields NULL without an error
 * error  Speect error code; cleared on entry
 *
 * Returns a new integer object with the count, or NULL with @error set.
 */
static SObject *Run(const SFeatProcessor *self, const SItem *item,
					s_erc *error)
{
	SObject *extractedFeat = NULL;
	const SItem *phrase;
	const SItem *word;
	sint32 accentCount = 0;
	s_bool reachedCurrent = FALSE;


	S_CLR_ERR(error);

	if (item == NULL)
		return NULL;

	/* phrase that contains the syllable */
	phrase = SItemPathToItem(item, "R:SylStructure.parent.R:Phrase.parent",
							 error);
	if (S_CHK_ERR(error, S_CONTERR,
				  "Run",
				  "Call to \"SItemPathToItem\" failed"))
		goto quit_error;

	if (phrase == NULL)
	{
		S_CTX_ERR(error, S_FAILURE,
				  "Run",
				  "Failed to get phrase of given syllable");
		goto quit_error;
	}

	/* start from the phrase's last word and move backwards */
	word = SItemLastDaughter(phrase, error);
	if (S_CHK_ERR(error, S_CONTERR,
				  "Run",
				  "Call to \"SItemLastDaughter\" failed"))
		goto quit_error;

	while (word != NULL && !reachedCurrent)
	{
		const SItem *wordInSylStruct;
		const SItem *syl;

		wordInSylStruct = SItemAs(word, "SylStructure", error);
		if (S_CHK_ERR(error, S_CONTERR,
					  "Run",
					  "Call to \"SItemAs\" failed"))
			goto quit_error;

		/* last syllable of this word */
		syl = SItemLastDaughter(wordInSylStruct, error);
		if (S_CHK_ERR(error, S_CONTERR,
					  "Run",
					  "Call to \"SItemLastDaughter\" failed"))
			goto quit_error;

		while (syl != NULL)
		{
			s_bool accented;

			reachedCurrent = SItemEqual(syl, item, error);
			if (S_CHK_ERR(error, S_CONTERR,
						  "Run",
						  "Call to \"SItemEqual\" failed"))
				goto quit_error;

			if (reachedCurrent)
				break;

			accented = syl_is_accented(syl, error);
			if (S_CHK_ERR(error, S_CONTERR,
						  "Run",
						  "Call to \"syl_is_accented\" failed"))
				goto quit_error;

			if (accented)
				accentCount++;

			syl = SItemPrev(syl, error);
			if (S_CHK_ERR(error, S_CONTERR,
						  "Run",
						  "Call to \"SItemPrev\" failed"))
				goto quit_error;
		}

		/* only step to the previous word while the syllable is still ahead */
		if (!reachedCurrent)
		{
			word = SItemPrev(word, error);
			if (S_CHK_ERR(error, S_CONTERR,
						  "Run",
						  "Call to \"SItemPrev\" failed"))
				goto quit_error;
		}
	}

	extractedFeat = SObjectSetInt(accentCount, error);
	if (S_CHK_ERR(error, S_CONTERR,
				  "Run",
				  "Call to \"SObjectSetInt\" failed"))
		goto quit_error;

	/* success */
	return extractedFeat;

	/* failure: release anything created so far */
quit_error:
	if (extractedFeat != NULL)
		S_DELETE(extractedFeat, "Run", error);

	return NULL;

	S_UNUSED(self);
}
Exemple #5
0
/*
 * Find the phoneset that applies to an item.
 *
 * If the item's voice has a "voices" feature it is treated as multilingual:
 * the "lang" feature of the item's token (path
 * "R:SylStructure.parent.R:Token.parent") selects a voice from the "voices"
 * map and that voice's "phoneset" data is used.  Otherwise the "phoneset"
 * data of the item's own voice is used.
 *
 * item          item whose phoneset is wanted
 * multilingual  out: TRUE when the voice has a "voices" feature, FALSE
 *               otherwise; written before the phoneset lookup, so it may
 *               already be set when the function fails later on
 * error         Speect error code; cleared on entry
 *
 * Returns the phoneset, or NULL with @error set.
 */
static const SPhoneset *_get_phoneset(const SItem *item, s_bool *multilingual, s_erc *error)
{
	const SPhoneset *phoneset;
	const SVoice *voice;
	const SVoice *phonesetVoice;
	s_bool is_present;


	S_CLR_ERR(error);

	/* get the voice */
	voice = SItemVoice(item, error);
	if (S_CHK_ERR(error, S_CONTERR,
				  "_get_phoneset",
				  "Call to \"SItemVoice\" failed"))
		return NULL;

	if (voice == NULL)
	{
		S_CTX_ERR(error, S_FAILURE,
				  "_get_phoneset",
				  "Item voice is NULL, voice is required to get phoneset");
		return NULL;
	}

	/*
	 * do we have a 'voices' feature in the voice,
	 * i.e. is this a multilingual voice
	 */
	is_present = SVoiceFeatureIsPresent(voice, "voices", error);
	if (S_CHK_ERR(error, S_CONTERR,
				  "_get_phoneset",
				  "Call to \"SVoiceFeatureIsPresent\" failed"))
		return NULL;

	if (is_present)
	{
		/* This is a multilingual voice.
		 * Get language feature of item, which is language feature
		 * of item's token.
		 */
		const SItem *tokenItem;
		const char *lang;
		const SMap *voicesMap;


		(*multilingual) = TRUE;
		tokenItem = SItemPathToItem(item, "R:SylStructure.parent.R:Token.parent",
									error);
		if (S_CHK_ERR(error, S_CONTERR,
					  "_get_phoneset",
					  "Call to \"SItemPathToItem\" failed"))
			return NULL;

		if (tokenItem == NULL)
		{
			S_CTX_ERR(error, S_FAILURE,
					  "_get_phoneset",
					  "Failed to find item's token, which is required to get language feature");
			return NULL;
		}

		lang = SItemGetString(tokenItem, "lang", error);
		if (S_CHK_ERR(error, S_CONTERR,
					  "_get_phoneset",
					  "Call to \"SItemGetString\" failed"))
			return NULL;

		/* pick the voice registered for this language */
		voicesMap = (const SMap*)SVoiceGetFeature(voice, "voices", error);
		if (S_CHK_ERR(error, S_CONTERR,
					  "_get_phoneset",
					  "Call to \"SVoiceGetFeature\" failed"))
			return NULL;

		phonesetVoice = (const SVoice*)SMapGetObjectDef(voicesMap, lang, NULL, error);
		if (S_CHK_ERR(error, S_CONTERR,
					  "_get_phoneset",
					  "Call to \"SMapGetObjectDef\" failed"))
			return NULL;

		if (phonesetVoice == NULL)
		{
			S_CTX_ERR(error, S_FAILURE,
					  "_get_phoneset",
					  "Failed to find the voice for language '%s', which is required to get the phoneset", lang);
			return NULL;
		}
	}
	else
	{
		/* not multilingual voice */
		(*multilingual) = FALSE;
		phonesetVoice = voice;
	}

	/* Both branches end in the same lookup.  The monolingual branch used to
	 * report failures under the context "Run"; it now names this function. */
	phoneset = S_PHONESET(SVoiceGetData(phonesetVoice, "phoneset", error));
	if (S_CHK_ERR(error, S_CONTERR,
				  "_get_phoneset",
				  "Call to \"SVoiceGetData\" failed"))
		return NULL;

	if (phoneset == NULL)
	{
		S_CTX_ERR(error, S_FAILURE,
				  "_get_phoneset",
				  "Item phoneset is NULL, required to extract phone features");
		return NULL;
	}

	return phoneset;
}
Exemple #6
0
/**
 * Tag the tokens reachable from @p relation_head with hunpos, chunk by chunk.
 * @private
 *
 * Tokens are gathered into @p data until the token list ends, the phrase
 * changes (only when @p is_phrase_present is true) or
 * <tt>hunposProc->max_tokens_number</tt> tokens have been collected.  When
 * the limit stops the collection and a token with a positive "IsPunctuation"
 * value other than the chunk's first token was seen, the chunk is shortened
 * to end right after the last such token and collection resumes with the
 * token that follows it.  Each chunk is handed to hunpos_tagger_tag() with
 * the read_token / set_tag callbacks.
 *
 * @param hunposProc processor providing the hunpos instance and the maximum
 *        number of tokens per chunk.
 * @param relation_head starting SItem. It must be a Phrase SItem if
 *        @p is_phrase_present is true, a Token otherwise.
 * @param data scratch array the collected SItem pointers are written to;
 *        entries 0 .. max_tokens_number-1 may be written.
 * @param is_phrase_present tells if we're using phrases or directly tokens
 * @param error Error code.
 *
 */
static void call_hunpos(const SHunposUttProc *hunposProc, const SItem* relation_head, const SItem** data, s_bool is_phrase_present, s_erc *error)
{
	const SItem* phrase_start_item;
	const SItem* current_token;
	const SItem* start_token; /* start point of the next phrase */
	const SItem* last_safe_cut_token; /* last token from where we can cut for the next phrase */
	int last_safe_cut_count;
	int tokens_count;
	S_CLR_ERR(error);

	phrase_start_item = relation_head;

	/* we already have the starting item if we have only tokens, else grab the token */
	start_token = phrase_start_item;
	if (is_phrase_present)
	{
		/* go grab the first token of this phrase */
		start_token = SItemPathToItem(relation_head, "R:Phrase.daughter.R:Token.parent", error);
		if (S_CHK_ERR(error, S_CONTERR,
			      "call_hunpos",
			      "Call to \"SItemPathToItem\" failed"))
			return;
	}

	current_token = start_token;

	/* for each phrase */
	while (current_token != NULL)
	{
		tokens_count = 0;
		last_safe_cut_count = 0;
		start_token = current_token;
		last_safe_cut_token = current_token;

		/* we stop collecting tokens if the next item is null or if we reached the maximum number of tokens allowed */
		while (current_token != NULL && tokens_count < hunposProc->max_tokens_number)
		{
			data[tokens_count] = current_token;

			/* check if it's a safe cut point */
			s_bool is_present = SItemFeatureIsPresent(current_token, "IsPunctuation", error);
			if (S_CHK_ERR(error, S_CONTERR,
				      "call_hunpos",
				      "Call to \"SItemFeatureIsPresent\" failed"))
				return;
			if (is_present)
			{
				/* is it a punctuation token? */
				sint32 is_punctuation = SItemGetInt(current_token, "IsPunctuation", error);
				/* this read used to go unchecked */
				if (S_CHK_ERR(error, S_CONTERR,
					      "call_hunpos",
					      "Call to \"SItemGetInt\" failed"))
					return;
				if (is_punctuation > 0)
				{
					last_safe_cut_token = current_token;
					/* +1: the punctuation token stays in the chunk */
					last_safe_cut_count = tokens_count + 1;
				}
			}

			current_token = SItemNext(current_token, error);
			if (S_CHK_ERR(error, S_CONTERR,
				      "call_hunpos",
				      "Call to \"SItemNext\" failed"))
				return;

			/* if we're using phrases, check if the current phrase finished */
			if (is_phrase_present)
			{
				if (current_token != NULL)
				{
					const SItem* next_token_parent = SItemPathToItem(current_token, "R:Token.daughter.R:Phrase.parent", error);
					if (S_CHK_ERR(error, S_CONTERR,
						      "call_hunpos",
						      "Call to \"SItemPathToItem\" failed"))
						return;
					if (next_token_parent != phrase_start_item)
					{
						/* save the next phrase start and stop the cycle */
						phrase_start_item = next_token_parent;
						start_token = current_token;
						current_token = NULL;
					}
				}
				else
				{
					phrase_start_item = NULL;
					start_token = NULL;
				}
			}

			tokens_count++;
		}

		/* do we need to cut it? (only when the token limit, not a phrase
		 * boundary or the end of the list, stopped the collection) */
		if (current_token != NULL && last_safe_cut_token != start_token)
		{
			tokens_count = last_safe_cut_count;
			current_token = SItemNext(last_safe_cut_token, error);
			if (S_CHK_ERR(error, S_CONTERR,
				      "call_hunpos",
				      "Call to \"SItemNext\" failed"))
				return;
		}

		/* do the tagging; data serves as both the read and the write context */
		int hunpos_error = 0;
		hunpos_tagger_tag(hunposProc->hunpos_instance, tokens_count, data, &read_token, data, &set_tag, &hunpos_error);
		if (hunpos_error !=0)
		{
			S_CTX_ERR(error, S_FAILURE,
				  "call_hunpos",
				  "Call to \"hunpos_tagger_tag\" failed");
			return;
		}

		/* if we're using phrases, go on with the next one */
		if (is_phrase_present && current_token == NULL)
			current_token = start_token;
	}

	return;
}
Exemple #7
0
/*
 * Utterance processor: tag every token of every phrase with a "POS" feature
 * predicted by a CRFsuite model.
 *
 * The model named by the processor's model_file is loaded, its label and
 * attribute dictionaries and a tagger are obtained once, and then for each
 * item of the "Phrase" relation:
 *   1. create_phrase_instance() builds the CRFsuite instance,
 *   2. the tagger computes the Viterbi label sequence,
 *   3. the labels are written, in order, to the "POS" feature of the tokens
 *      from the phrase's first token up to (not including) the first token
 *      of the next phrase.
 *
 * self   processor, used as SCrfSuiteUttProc
 * utt    utterance holding the "Phrase" relation
 * error  Speect error code, set when a Speect call fails
 *
 * Every exit path goes through exit_cleanup, which releases the CRFsuite
 * objects through their own release() members and frees what this function
 * allocated.
 *
 * NOTE(review): as before, a failing CRFsuite call (model load, tagger,
 * Viterbi, ...) ends the function without setting @error - confirm whether
 * callers should be told.
 */
static void Run(const SUttProcessor *self, SUtterance *utt,
		s_erc *error)
{
	SCrfSuiteUttProc *crfsuiteProc = (SCrfSuiteUttProc*)self;

	/* All of these are handed out by CRFsuite; they must not be
	 * pre-allocated (the old malloc'd placeholders were simply leaked). */
	crfsuite_model_t *ptr_model = NULL;
	crfsuite_tagger_t *ptr_tagger = NULL;
	crfsuite_dictionary_t *ptr_attrs = NULL;
	crfsuite_dictionary_t *ptr_labels = NULL;

	crfsuite_instance_t *instance = NULL; /* owned here once returned */
	int *output = NULL;                   /* label ids of the current phrase */

	const SRelation *phrase;
	SItem *itrPhrase;

	/* Initialize model object */
	if ( crfsuite_create_instance_from_file( crfsuiteProc->model_file,
	                                         (void**)&ptr_model
	                                       ) != 0 || ptr_model == NULL ) {
		ptr_model = NULL;
		goto exit_cleanup;
	}

	/* The interfaces do not depend on the phrase, so fetch them once
	 * instead of once per phrase. */

	/* Obtain the dictionary interface representing the labels in the model. */
	if ( ptr_model->get_labels(ptr_model, &ptr_labels) != 0) {
		ptr_labels = NULL;
		goto exit_cleanup;
	}

	/* Obtain the dictionary interface representing the attributes in the model. */
	if ( ptr_model->get_attrs(ptr_model, &ptr_attrs) != 0) {
		ptr_attrs = NULL;
		goto exit_cleanup;
	}

	/* Obtain the tagger interface. */
	if ( ptr_model->get_tagger(ptr_model, &ptr_tagger) != 0) {
		ptr_tagger = NULL;
		goto exit_cleanup;
	}

	phrase = SUtteranceGetRelation(utt, "Phrase", error);
	if (S_CHK_ERR(error, S_CONTERR,
			  "Run",
			  "Call to \"SUtteranceGetRelation\" failed"))
		goto exit_cleanup;

	itrPhrase = SRelationHead( phrase, error );
	if (S_CHK_ERR(error, S_CONTERR,
			  "Run",
			  "Call to \"SRelationHead\" failed"))
		goto exit_cleanup;

	while ( itrPhrase != NULL )
	{
		instance = create_phrase_instance ( itrPhrase, ptr_attrs, ptr_labels, error );
		if (S_CHK_ERR(error, S_CONTERR,
				  "Run",
				  "Call to \"create_phrase_instance\" failed"))
			goto exit_cleanup;
		if ( instance == NULL )
			goto exit_cleanup;

		/* A phrase without tokens has nothing to tag. */
		if ( instance->num_items > 0 )
		{
			const SItem *tokenTMP;
			const SItem *lastToken;
			SItem *token;
			floatval_t score = 0;
			int i = 0;

			output = calloc((size_t)instance->num_items, sizeof *output);
			if ( output == NULL )
				goto exit_cleanup;

			/* Set the instance to the tagger. */
			if ( ptr_tagger->set(ptr_tagger, instance) != 0) {
				goto exit_cleanup;
			}

			/* Obtain the viterbi label sequence. */
			if (ptr_tagger->viterbi(ptr_tagger, output, &score) != 0) {
				goto exit_cleanup;
			}

			/* Extract the output and insert in the POS attribute */

			tokenTMP = SItemPathToItem ( itrPhrase, "daughter.R:Token", error );
			if (S_CHK_ERR(error, S_CONTERR,
					  "Run",
					  "Call to \"SItemPathToItem\" failed"))
				goto exit_cleanup;

			token = SItemParent (tokenTMP, error);
			if (S_CHK_ERR(error, S_CONTERR,
					  "Run",
					  "Call to \"SItemParent\" failed"))
				goto exit_cleanup;

			/* first token of the next phrase = one past our last token */
			lastToken = SItemPathToItem ( itrPhrase, "n.daughter.R:Token.parent", error );
			if (S_CHK_ERR(error, S_CONTERR,
					  "Run",
					  "Call to \"SItemPathToItem\" failed"))
				goto exit_cleanup;

			/* i is bounded as well, so output[] is never read past the
			 * labels the tagger produced */
			while ( token != NULL && token != lastToken && i < instance->num_items )
			{
				const char *str = NULL;

				/* to_string hands out the label text itself; no buffer
				 * has to be provided for it */
				if ( ptr_labels->to_string (ptr_labels, output[i], &str) != 0
				     || str == NULL )
					goto exit_cleanup;

				i += 1;

				SItemSetString (token, "POS", str, error);
				/* give the string back to the dictionary before looking
				 * at the result, so it is returned on failure as well */
				ptr_labels->free (ptr_labels, str);
				if (S_CHK_ERR(error, S_CONTERR,
						  "Run",
						  "Call to \"SItemSetString\" failed"))
					goto exit_cleanup;

				token = SItemNext(token, error);
				if (S_CHK_ERR(error, S_CONTERR,
						  "Run",
						  "Call to \"SItemNext\" failed"))
					goto exit_cleanup;
			}

			free(output);
			output = NULL;
		}

		/* instance_finish releases the contents, free() the struct that
		 * create_phrase_instance allocated */
		crfsuite_instance_finish(instance);
		free(instance);
		instance = NULL;

		itrPhrase = SItemNext(itrPhrase, error);
		if (S_CHK_ERR(error, S_CONTERR,
				  "Run",
				  "Call to \"SItemNext\" failed"))
			goto exit_cleanup;
	}

	/* here all is OK; fall through to the shared cleanup */

exit_cleanup:
	free ( output );

	if ( instance != NULL )
	{
		crfsuite_instance_finish(instance);
		free ( instance );
	}

	/* CRFsuite objects are reference counted: release them, never free()
	 * them.  The model goes last because it backs the others. */
	if ( ptr_tagger != NULL )
		ptr_tagger->release ( ptr_tagger );

	if ( ptr_attrs != NULL )
		ptr_attrs->release ( ptr_attrs );

	if ( ptr_labels != NULL )
		ptr_labels->release ( ptr_labels );

	if ( ptr_model != NULL )
		ptr_model->release ( ptr_model );
}
Exemple #8
0
static crfsuite_instance_t* create_phrase_instance ( SItem* phrase,
                                                 crfsuite_dictionary_t* attrs,
                                                 crfsuite_dictionary_t* labels,
                                                 s_erc *error)
{
	crfsuite_instance_t * result = malloc ( sizeof(crfsuite_instance_t) );
	int i = 0;
	int L = labels->num(labels);
	const SItem* itrItem = NULL;
	const SItem* itrItemNext = NULL;
	const SItem* finishItem = NULL;

	const char* lbl[] = {"num", "sym", "cap", "p1", "p2", "p3",
							"s1", "s2", "s3", "P1", "P2", "P3", "P4",
							"S1", "S2", "S3", "S4", "S5", "S6", "w" };

	const int words_length = 19;
	const char* words[19] = {			 NULL, NULL, NULL, NULL, NULL,
										NULL, NULL, NULL, NULL, NULL,
										NULL, NULL, NULL, NULL, NULL,
										NULL, NULL, NULL, NULL};
	int position = 0;

	int lbl_counter = 0;
	char buffer[8192];

	crfsuite_instance_init ( result );

	itrItemNext = SItemPathToItem (phrase, "daughter.R:Token.parent", error);
	if (S_CHK_ERR(error, S_CONTERR,
			  "create_phrase_instance",
			  "Call to \"SItemPathToItem\" failed"))
		return NULL;

	finishItem = SItemPathToItem (phrase, "n.daughter.R:Token.parent", error);
	if (S_CHK_ERR(error, S_CONTERR,
			  "create_phrase_instance",
			  "Call to \"SItemPathToItem\" failed"))
		return NULL;

	int counter = 0;

	while ( itrItemNext != finishItem && itrItemNext != NULL && counter < 9)
	{
		words[counter] = SItemGetName(itrItemNext, error);
		if (S_CHK_ERR(error, S_CONTERR,
				  "create_phrase_instance",
				  "Call to \"SItemGetName\" failed"))
			return NULL;

		counter++;

		itrItemNext = SItemNext(itrItemNext, error);
		if (S_CHK_ERR(error, S_CONTERR,
				  "create_phrase_instance",
				  "Call to \"SItemNext\" failed"))
			return NULL;
	}

	itrItem = SItemPathToItem (phrase, "daughter.R:Token.parent", error);
	if (S_CHK_ERR(error, S_CONTERR,
			  "create_phrase_instance",
			  "Call to \"SItemPathToItem\" failed"))
		return NULL;

	while ( itrItem != finishItem && itrItem != NULL)
	{
		/* Extraction of the features for each token */
		const char *tokenName = SItemGetName (itrItem, error);
		if (S_CHK_ERR(error, S_CONTERR,
				  "create_phrase_instance",
				  "Call to \"SItemGetName\" failed"))
			return NULL;

        lbl_counter = 0;

		/* Extraction of label's ID */
		int tokenID = labels->to_id(labels, "UNK");
		int attribute_id;

		/* If unknown the set the 0 labels (unknown) */
		if (tokenID < 0)
			tokenID = L;

		crfsuite_item_t itemToken;
		crfsuite_attribute_t attribute;

		crfsuite_item_init(&itemToken);

		const char *feat = NULL;
		s_bool found = FALSE;


		/* if token contains numbers */
		found = hasNumber (tokenName, error);
		if (S_CHK_ERR(error, S_CONTERR,
				  "create_phrase_instance",
				  "Call to \"hasNumber\" failed"))
			return NULL;

		sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, found? "Y" : "N");
		lbl_counter++;


		attribute_id = attrs->to_id (attrs,buffer);
		crfsuite_attribute_set (&attribute, attribute_id, 1.0);
		crfsuite_item_append_attribute(&itemToken, &attribute);


		/* if token contains symbols */
		found = hasSymbol (tokenName, error);
		if (S_CHK_ERR(error, S_CONTERR,
				  "create_phrase_instance",
				  "Call to \"hasSymbol\" failed"))
			return NULL;

		sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, found? "Y" : "N");
		lbl_counter++;

		attribute_id = attrs->to_id (attrs,buffer);
		crfsuite_attribute_set (&attribute, attribute_id, 1.0);
		crfsuite_item_append_attribute(&itemToken, &attribute);


		/* if token contains Capitals */
		found = hasCapital (tokenName, error);
		if (S_CHK_ERR(error, S_CONTERR,
				  "create_phrase_instance",
				  "Call to \"hasCapital\" failed"))
			return NULL;

		sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, found? "Y" : "N");
		lbl_counter++;

		attribute_id = attrs->to_id (attrs,buffer);
		crfsuite_attribute_set (&attribute, attribute_id, 1.0);
		crfsuite_item_append_attribute(&itemToken, &attribute);


		/* Prefixes of 1 to 3 chars of token */
		i = 1;
		while ( i < 4 )
		{
			feat = getFirstChars ( tokenName, i, error );
			if (S_CHK_ERR(error, S_CONTERR,
					  "create_phrase_instance",
					  "Call to \"getFirstChars\" failed"))
				return NULL;

			if ( feat == NULL )
				feat = "__nil__";

			sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, feat);
			lbl_counter++;

			attribute_id = attrs->to_id (attrs,buffer);
			crfsuite_attribute_set (&attribute, attribute_id, 1.0);
			crfsuite_item_append_attribute(&itemToken, &attribute);

			i += 1;
		}

		/* Suffixes of 1 to 3 chars of token */
		i = 1;
		while ( i < 4 )
		{
			feat = getLastChars ( tokenName, i, error );
			if (S_CHK_ERR(error, S_CONTERR,
					  "create_phrase_instance",
					  "Call to \"getLastChars\" failed"))
				return NULL;

			if ( feat == NULL )
				feat = "__nil__";

			sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, feat);
			lbl_counter++;

			attribute_id = attrs->to_id (attrs,buffer);
			crfsuite_attribute_set (&attribute, attribute_id, 1.0);
			crfsuite_item_append_attribute(&itemToken, &attribute);

			i += 1;
		}

		/* Prefixes of 1 to 4 chars of token without duplicates adjacent */
		i = 1;
		while ( i < 5 )
		{
			feat = getFirstChars ( removeDoubles(tokenName, error), i, error );
			if (S_CHK_ERR(error, S_CONTERR,
					  "create_phrase_instance",
					  "Call to \"removeDoubles\" failed"))
				return NULL;
			if (S_CHK_ERR(error, S_CONTERR,
					  "create_phrase_instance",
					  "Call to \"getFirstChars\" failed"))
				return NULL;

			if ( feat == NULL )
				feat = "__nil__";

			sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, feat);
			lbl_counter++;

			attribute_id = attrs->to_id (attrs,buffer);
			crfsuite_attribute_set (&attribute, attribute_id, 1.0);
			crfsuite_item_append_attribute(&itemToken, &attribute);

			i += 1;
		}

		/* Suffixes of 1 to 6 chars of token without duplicates adjacent */
		i = 1;
		while ( i < 7 )
		{
			feat = getLastChars ( removeDoubles(tokenName, error), i, error );
			if (S_CHK_ERR(error, S_CONTERR,
					  "create_phrase_instance",
					  "Call to \"removeDoubles\" failed"))
				return NULL;
			if (S_CHK_ERR(error, S_CONTERR,
					  "create_phrase_instance",
					  "Call to \"getFirstChars\" failed"))
				return NULL;

			if ( feat == NULL )
				feat = "__nil__";

			sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, feat);
			lbl_counter++;

			attribute_id = attrs->to_id (attrs,buffer);
			crfsuite_attribute_set (&attribute, attribute_id, 1.0);
			crfsuite_item_append_attribute(&itemToken, &attribute);

			i += 1;
		}

		/* Words features */

		sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, words[(position+0+words_length)%words_length]);

		attribute_id = attrs->to_id (attrs,buffer);
		crfsuite_attribute_set (&attribute, attribute_id, 1.0);
		crfsuite_item_append_attribute(&itemToken, &attribute);

		const char *tmp = words[(position-1+words_length)%words_length];
		if(tmp!=NULL)
		{
			sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], -1, tmp);

			attribute_id = attrs->to_id (attrs,buffer);
			crfsuite_attribute_set (&attribute, attribute_id, 1.0);
			crfsuite_item_append_attribute(&itemToken, &attribute);
		}

		tmp = words[(position+1+words_length)%words_length];
		if(tmp!=NULL)
		{
			sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 1, tmp);

			attribute_id = attrs->to_id (attrs,buffer);
			crfsuite_attribute_set (&attribute, attribute_id, 1.0);
			crfsuite_item_append_attribute(&itemToken, &attribute);
		}

		tmp = words[(position-2+words_length)%words_length];
		if(tmp!=NULL)
		{
			sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], -2, tmp);

			attribute_id = attrs->to_id (attrs,buffer);
			crfsuite_attribute_set (&attribute, attribute_id, 1.0);
			crfsuite_item_append_attribute(&itemToken, &attribute);
		}

		tmp = words[(position+2+words_length)%words_length];
		if(tmp!=NULL)
		{
			sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 2, tmp);

			attribute_id = attrs->to_id (attrs,buffer);
			crfsuite_attribute_set (&attribute, attribute_id, 1.0);
			crfsuite_item_append_attribute(&itemToken, &attribute);
		}

		const char * tmp1 = NULL;
		i = 0;
		while ( i < 4 )
		{
			tmp = words[(position-2+i+words_length)%words_length];
			tmp1 = words[(position-1+i+words_length)%words_length];
			if( tmp!=NULL && tmp1 != NULL )
			{
				sprintf(buffer, "%s[%d]|%s[%d]=%s|%s", lbl[lbl_counter], -2+i,
													   lbl[lbl_counter], -1+i,
													   tmp, tmp1 );

				attribute_id = attrs->to_id (attrs,buffer);
				crfsuite_attribute_set (&attribute, attribute_id, 1.0);
				crfsuite_item_append_attribute(&itemToken, &attribute);
			}
			i++;
		}

		const char * tmp2 = NULL;
		i = 0;
		while ( i < 3 )
		{
			tmp = words[(position-2+i+words_length)%words_length];
			tmp1 = words[(position-1+i+words_length)%words_length];
			tmp2 = words[(position+0+i+words_length)%words_length];
			if( tmp!=NULL && tmp1 != NULL && tmp2 != NULL )
			{
				sprintf(buffer, "%s[%d]|%s[%d]|%s[%d]=%s|%s|%s",
													   lbl[lbl_counter], -2+i,
													   lbl[lbl_counter], -1+i,
													   lbl[lbl_counter], +0+i,
													   tmp, tmp1, tmp2 );

				attribute_id = attrs->to_id (attrs,buffer);
				crfsuite_attribute_set (&attribute, attribute_id, 1.0);
				crfsuite_item_append_attribute(&itemToken, &attribute);
			}
			i++;
		}

		const char * tmp3 = NULL;
		i = 0;
		while ( i < 2 )
		{
			tmp  = words[(position-2+i+words_length)%words_length];
			tmp1 = words[(position-1+i+words_length)%words_length];
			tmp2 = words[(position+0+i+words_length)%words_length];
			tmp3 = words[(position+1+i+words_length)%words_length];
			if( tmp!=NULL && tmp1 != NULL && tmp2 != NULL && tmp3 != NULL )
			{
				sprintf(buffer, "%s[%d]|%s[%d]|%s[%d]|%s[%d]=%s|%s|%s|%s",
													   lbl[lbl_counter], -2+i,
													   lbl[lbl_counter], -1+i,
													   lbl[lbl_counter], +0+i,
													   lbl[lbl_counter], +1+i,
													   tmp, tmp1, tmp2, tmp3 );

				attribute_id = attrs->to_id (attrs,buffer);
				crfsuite_attribute_set (&attribute, attribute_id, 1.0);
				crfsuite_item_append_attribute(&itemToken, &attribute);
			}
			i++;
		}

		tmp  = words[(position-2+words_length)%words_length];
		tmp1 = words[(position-1+words_length)%words_length];
		tmp2 = words[(position+0+words_length)%words_length];
		tmp3 = words[(position+1+words_length)%words_length];
		const char * tmp4 = words[(position+2+words_length)%words_length];
		if( tmp!=NULL && tmp1 != NULL && tmp2 != NULL && tmp3 != NULL && tmp4 != NULL )
		{
			sprintf(buffer, "%s[%d]|%s[%d]|%s[%d]|%s[%d]|%s[%d]=%s|%s|%s|%s|%s",
												   lbl[lbl_counter], -2,
												   lbl[lbl_counter], -1,
												   lbl[lbl_counter], +0,
												   lbl[lbl_counter], +1,
												   lbl[lbl_counter], +2,
												   tmp, tmp1, tmp2, tmp3, tmp4 );

			attribute_id = attrs->to_id (attrs,buffer);
			crfsuite_attribute_set (&attribute, attribute_id, 1.0);
			crfsuite_item_append_attribute(&itemToken, &attribute);
		}

		tmp  = words[(position+0+words_length)%words_length];

		i = 1;
		while ( i < 10 )
		{
			tmp1  = words[(position-i+words_length)%words_length];
			if ( tmp1 != NULL )
			{
				sprintf(buffer, "%s[%d]|%s[%d]=%s|%s", lbl[lbl_counter], 0,
													   lbl[lbl_counter], 0-i,
													   tmp, tmp1 );

				attribute_id = attrs->to_id (attrs,buffer);
				crfsuite_attribute_set (&attribute, attribute_id, 1.0);
				crfsuite_item_append_attribute(&itemToken, &attribute);
			}
			i++;
		}

		i = 1;
		while ( i < 10 )
		{
			tmp1  = words[(position+i+words_length)%words_length];
			if ( tmp1 != NULL )
			{
				sprintf(buffer, "%s[%d]|%s[%d]=%s|%s", lbl[lbl_counter], 0,
													   lbl[lbl_counter], 0+i,
													   tmp, tmp1 );

				attribute_id = attrs->to_id (attrs,buffer);
				crfsuite_attribute_set (&attribute, attribute_id, 1.0);
				crfsuite_item_append_attribute(&itemToken, &attribute);
			}
			i++;
		}




		/* Update words array */
		position++;
		if ( itrItemNext != NULL )
		{
			words[position+9] = SItemGetName (itrItemNext, error);
			if (S_CHK_ERR(error, S_CONTERR,
					  "create_phrase_instance",
					  "Call to \"SItemNext\" failed"))
				return NULL;

			itrItemNext = SItemNext ( itrItemNext, error );
			if (S_CHK_ERR(error, S_CONTERR,
					  "create_phrase_instance",
					  "Call to \"SItemNext\" failed"))
				return NULL;
		}
		else
			words[position+9] = NULL;

		itrItem = SItemNext ( itrItem, error );
		if (S_CHK_ERR(error, S_CONTERR,
				  "create_phrase_instance",
				  "Call to \"SItemNext\" failed"))
			return NULL;

		crfsuite_instance_append(result, &itemToken, tokenID);
		crfsuite_item_finish(&itemToken);
	}

	return result;

}
Exemple #9
0
/*
 * Feature processor: number of content words that precede the given word
 * inside its phrase.
 *
 * The phrase is found through "R:Phrase.parent".  Its daughters are visited
 * from the first one onwards, counting those for which word_is_content() is
 * true, until the given word is met; that word itself is not counted.
 *
 * self   unused
 * item   word item; NULL yields NULL without an error
 * error  Speect error code; cleared on entry
 *
 * Returns a new integer object with the count, or NULL with @error set.
 */
static SObject *Run(const SFeatProcessor *self, const SItem *item,
					s_erc *error)
{
	SObject *extractedFeat = NULL;
	const SItem *phrase;
	const SItem *cursor;
	sint32 contentBefore = 0;
	s_bool reachedItem = FALSE;


	S_CLR_ERR(error);

	if (item == NULL)
		return NULL;

	/* phrase that contains the word */
	phrase = SItemPathToItem(item, "R:Phrase.parent",
							 error);
	if (S_CHK_ERR(error, S_CONTERR,
				  "Run",
				  "Call to \"SItemPathToItem\" failed"))
		goto quit_error;

	if (phrase == NULL)
	{
		S_CTX_ERR(error, S_FAILURE,
				  "Run",
				  "Failed to get phrase of given word");
		goto quit_error;
	}

	/* first word of the phrase */
	cursor = SItemDaughter(phrase, error);
	if (S_CHK_ERR(error, S_CONTERR,
				  "Run",
				  "Call to \"SItemDaughter\" failed"))
		goto quit_error;

	while (!reachedItem && cursor != NULL)
	{
		s_bool content;

		reachedItem = SItemEqual(cursor, item, error);
		if (S_CHK_ERR(error, S_CONTERR,
					  "Run",
					  "Call to \"SItemEqual\" failed"))
			goto quit_error;

		/* the loop condition ends the walk on the next test */
		if (reachedItem)
			continue;

		content = word_is_content(cursor, error);
		if (S_CHK_ERR(error, S_CONTERR,
					  "Run",
					  "Call to \"word_is_content\" failed"))
			goto quit_error;

		if (content)
			contentBefore++;

		cursor = SItemNext(cursor, error);
		if (S_CHK_ERR(error, S_CONTERR,
					  "Run",
					  "Call to \"SItemNext\" failed"))
			goto quit_error;
	}

	extractedFeat = SObjectSetInt(contentBefore, error);
	if (S_CHK_ERR(error, S_CONTERR,
				  "Run",
				  "Call to \"SObjectSetInt\" failed"))
		goto quit_error;

	/* success */
	return extractedFeat;

	/* failure: release anything created so far */
quit_error:
	if (extractedFeat != NULL)
		S_DELETE(extractedFeat, "Run", error);

	return NULL;

	S_UNUSED(self);
}