static const char* setSentenceType(const SItem *phrase, SMap *puncMap, s_erc *error) { S_CLR_ERR(error); const char* result = NULL; /* types: "decl, "excl", "interrog" */ /* stop at sentence's last token */ const SItem *wordFromCurrentPhrase = SItemPathToItem(phrase, "daughtern", error); if (S_CHK_ERR(error, S_CONTERR, "setSentenceType", "Call to \"SItemPathToItem\" failed")) return NULL; SItem *wordAsToken = SItemAs(wordFromCurrentPhrase, "Token", error); if (S_CHK_ERR(error, S_CONTERR, "setSentenceType", "Call to \"SItemAs\" failed")) return NULL; SItem *tokenItem = SItemParent(wordAsToken, error); tokenItem = SItemNext(tokenItem, error); if (S_CHK_ERR(error, S_CONTERR, "setSentenceType", "Call to \"SItemNext\" failed")) return NULL; const char *punctStr = SItemGetName(tokenItem, error); if (S_CHK_ERR(error, S_CONTERR, "setSentenceType", "Call to \"SItemGetName\" failed")) return NULL; s_bool found= SMapObjectPresent(puncMap, punctStr, error); if (S_CHK_ERR(error, S_CONTERR, "setSentenceType", "Call to \"SMapObjectPresent\" failed")) return NULL; result = punctStr; if( found == TRUE) { result = SMapGetString ( puncMap, punctStr, error); if (S_CHK_ERR(error, S_CONTERR, "setSentenceType", "Call to \"SMapGetString\" failed")) return NULL; } else { result = NULL; } return result; }
/* setSentenceType should be made out of two parts: * 1) the first section searchs for the last punctuation element of the sentence * -> if it is a '.' --> set "decl" type (where should I set this feature value?) * -> if it is a '!' --> set "excl" type (where should I set this feature value?) * -> if it is a '?' --> set "interrog" type (where should I set this feature value?) * 2) if the first part decides for "interrog" type, there should be other controls * to establish the sentence's complete type * */ static char* setSentenceType(const SItem *phrase, SMap *prosSymbols, s_erc *error) { S_CLR_ERR(error); char* result = "decl"; /* types: "decl, "excl", "interrog" */ /* stop at sentence's last token */ const SItem *wordFromCurrentPhrase = SItemPathToItem(phrase, "daughter", error); if (S_CHK_ERR(error, S_CONTERR, "setSentenceType", "Call to \"SItemPathToItem\" failed")) return NULL; SItem *wordAsToken = SItemAs(wordFromCurrentPhrase, "Token", error); if (S_CHK_ERR(error, S_CONTERR, "setSentenceType", "Call to \"SItemAs\" failed")) return NULL; SItem *tokenItem = SItemParent(wordAsToken, error); SItem *firstTokenItem = tokenItem; s_bool isPunct = SItemFeatureIsPresent(tokenItem, "IsPunctuation", error); if (S_CHK_ERR(error, S_CONTERR, "setSentenceType", "Call to \"SItemFeatureIsPresent\" failed")) return NULL; s_bool isFinalPunct = FALSE; while (isFinalPunct == FALSE) { isPunct = SItemFeatureIsPresent(tokenItem, "IsPunctuation", error); if (S_CHK_ERR(error, S_CONTERR, "setSentenceType", "Call to \"SItemFeatureIsPresent\" failed")) return NULL; if (isPunct) { const char *punctStr = SItemGetName(tokenItem, error); if (S_CHK_ERR(error, S_CONTERR, "setSentenceType", "Call to \"SItemGetName\" failed")) return NULL; if (s_strcmp(punctStr, ".", error) == 0) { isFinalPunct = TRUE; result = "decl"; } else if (s_strcmp(punctStr, "!", error) == 0) { isFinalPunct = TRUE; result = "excl"; } else if (s_strcmp(punctStr, "?", error) == 0) { isFinalPunct = TRUE; const char *posValueStr = NULL; char *posValueStr_filtered = NULL; s_bool currPosInCurrList; s_bool have_symbols = FALSE; SMap* valueMap = NULL; have_symbols = SMapObjectPresent(prosSymbols, "firstPosInQuestionW", error); if (S_CHK_ERR(error, S_CONTERR, "SetSentenceType", "Call to \"SMapObjectPresent\" failed")) goto quit_error; if (have_symbols) { valueMap = S_CAST(SMapGetObject(prosSymbols, "firstPosInQuestionW", error), SMap, error); if (S_CHK_ERR(error, S_CONTERR, "SetSentenceType", "Call to \"SMapGetObject\" failed")) goto quit_error; } else goto quit_error; posValueStr = SItemGetString(firstTokenItem, "POS", error); if (S_CHK_ERR(error, S_CONTERR, "SetSentenceType", "Call to \"SItemGetString\" failed")) goto quit_error; /* filter the current POS tag, remember to free the memory * pointed to by 'posValueStr_filtered' pointer */ posValueStr_filtered = filterPosTag(posValueStr, error); if (S_CHK_ERR(error, S_CONTERR, "SetSentenceType", "Call to \"filterPosTag\" failed")) goto quit_error; currPosInCurrList = searchStringMap(valueMap, posValueStr_filtered, error); if (currPosInCurrList == TRUE) { result = "interrogW"; } else { result = "interrog"; } quit_error: if (posValueStr_filtered) { S_FREE(posValueStr_filtered); } break; } } tokenItem = SItemNext(tokenItem, error); if (S_CHK_ERR(error, S_CONTERR, "setSentenceType", "Call to \"SItemNext\" failed")) return NULL; if(tokenItem == NULL) { isFinalPunct = TRUE; } } return result; }
static SObject *Run(const SFeatProcessor *self, const SItem *item, s_erc *error) { SObject *extractedFeat = NULL; const SItem *itrItem; sint32 count; SWordsToNextPuncFeatProc *castSelf = S_CAST(self, SWordsToNextPuncFeatProc, error); if (S_CHK_ERR(error, S_CONTERR, "Initialize", "Call to S_CAST failed")) goto quit_error; S_CLR_ERR(error); if (item == NULL) return NULL; itrItem = SItemPathToItem (item, "R:Token.parent", error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SItemPathToItem\" failed")) goto quit_error; SMap* posPunctuation = S_CAST( SMapGetObject ( castSelf->symbols, "pos punctuation", error ), SMap, error ); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SMapGetObject\" failed")) goto quit_error; if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"S_CAST\" failed")) goto quit_error; count = -1; s_bool found = FALSE; while (found == FALSE && itrItem != NULL) { count++; s_bool hasPos = SItemFeatureIsPresent ( itrItem, "POS", error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SItemFeatureIsPresent\" failed")) goto quit_error; if (hasPos) { const char* keyPos = SItemGetString (itrItem, "POS", error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SItemGetString\" failed")) goto quit_error; found= SMapObjectPresent(posPunctuation, keyPos, error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SMapObjectPresent\" failed")) goto quit_error; } itrItem = SItemNext(itrItem, error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SItemNext\" failed")) goto quit_error; } extractedFeat = SObjectSetInt(count, error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SObjectSetInt\" failed")) goto quit_error; /* all OK here */ return extractedFeat; S_UNUSED(self); /* error cleanup */ quit_error: if (extractedFeat != NULL) S_DELETE(extractedFeat, "Run", error); return NULL; }
static SObject *Run(const SFeatProcessor *self, const SItem *item, s_erc *error) { SObject *extractedFeat = NULL; const SItem *phraseItem; const SItem *wordItem; const SItem *sylStructWordItem; const SItem *syllableItem; sint32 num_accented = 0; s_bool is_current_syl = FALSE; S_CLR_ERR(error); if (item == NULL) return NULL; /* get current phrase */ phraseItem = SItemPathToItem(item, "R:SylStructure.parent.R:Phrase.parent", error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SItemPathToItem\" failed")) goto quit_error; if (phraseItem == NULL) { S_CTX_ERR(error, S_FAILURE, "Run", "Failed to get phrase of given syllable"); goto quit_error; } /* last word in phrase */ wordItem = SItemLastDaughter(phraseItem, error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SItemLastDaughter\" failed")) goto quit_error; while (wordItem != NULL) { sylStructWordItem = SItemAs(wordItem, "SylStructure", error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SItemAs\" failed")) goto quit_error; /* get syllables */ syllableItem = SItemLastDaughter(sylStructWordItem, error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SItemLastDaughter\" failed")) goto quit_error; while (syllableItem != NULL) { s_bool is_accented; is_current_syl = SItemEqual(syllableItem, item, error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SItemEqual\" failed")) goto quit_error; if (is_current_syl) break; is_accented = syl_is_accented(syllableItem, error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"syl_is_accented\" failed")) goto quit_error; if (is_accented) num_accented++; syllableItem = SItemPrev(syllableItem, error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SItemPrev\" failed")) goto quit_error; } if (is_current_syl) break; wordItem = SItemPrev(wordItem, error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SItemPrev\" failed")) goto quit_error; } extractedFeat = SObjectSetInt(num_accented, error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SObjectSetInt\" failed")) goto quit_error; /* all OK here */ return extractedFeat; /* error cleanup */ quit_error: if (extractedFeat != NULL) S_DELETE(extractedFeat, "Run", error); return NULL; S_UNUSED(self); }
static const SPhoneset *_get_phoneset(const SItem *item, s_bool *multilingual, s_erc *error) { const SPhoneset *phoneset; const SVoice *voice; s_bool is_present; S_CLR_ERR(error); /* get the voice */ voice = SItemVoice(item, error); if (S_CHK_ERR(error, S_CONTERR, "_get_phoneset", "Call to \"SItemVoice\" failed")) return NULL; if (voice == NULL) { S_CTX_ERR(error, S_FAILURE, "_get_phoneset", "Item voice is NULL, voice is required to get phoneset"); return NULL; } /* * do we have a 'voices' feature in the voice, * i.e. is this a multilingual voice */ is_present = SVoiceFeatureIsPresent(voice, "voices", error); if (S_CHK_ERR(error, S_CONTERR, "_get_phoneset", "Call to \"SVoiceFeatureIsPresent\" failed")) return NULL; if (is_present) { /* This is a multilingual voice. * Get language feature of item, which is language feature * of item's token. */ const SItem *tokenItem; const char *lang; const SMap *voicesMap; const SVoice *thisVoice; (*multilingual) = TRUE; tokenItem = SItemPathToItem(item, "R:SylStructure.parent.R:Token.parent", error); if (S_CHK_ERR(error, S_CONTERR, "_get_phoneset", "Call to \"SItemPathToItem\" failed")) return NULL; if (tokenItem == NULL) { S_CTX_ERR(error, S_FAILURE, "_get_phoneset", "Failed to find item's token, which is required to get language feature"); return NULL; } lang = SItemGetString(tokenItem, "lang", error); if (S_CHK_ERR(error, S_CONTERR, "_get_phoneset", "Call to \"SItemGetString\" failed")) return NULL; /* now get the phoneset */ voicesMap = (const SMap*)SVoiceGetFeature(voice, "voices", error); if (S_CHK_ERR(error, S_CONTERR, "_get_phoneset", "Call to \"SVoiceGetFeature\" failed")) return NULL; thisVoice = (const SVoice*)SMapGetObjectDef(voicesMap, lang, NULL, error); if (S_CHK_ERR(error, S_CONTERR, "_get_phoneset", "Call to \"SMapGetObjectDef\" failed")) return NULL; if (thisVoice == NULL) { S_CTX_ERR(error, S_FAILURE, "_get_phoneset", "Failed to find the voice for language '%s', which is required to get the phoneset", lang); return NULL; } phoneset = S_PHONESET(SVoiceGetData(thisVoice, "phoneset", error)); if (S_CHK_ERR(error, S_CONTERR, "_get_phoneset", "Call to \"SVoiceGetData\" failed")) return NULL; } else { /* not multilingual voice */ (*multilingual) = FALSE; phoneset = S_PHONESET(SVoiceGetData(voice, "phoneset", error)); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SVoiceGetData\" failed")) return NULL; } if (phoneset == NULL) { S_CTX_ERR(error, S_FAILURE, "_get_phoneset", "Item phoneset is NULL, required to extract phone features"); return NULL; } return phoneset; }
/** * Prepare the structure for the next call to hunpos. * @private * * @param relation_head starting SItem for this new phrase. It must be a Phrase SItem if @p is_phrase_present is true, a Token otherwise. * @param data data structure to fill with new SItem pointers. * @param is_phrase_present tells if we're using phrases or directly tokens * @param error Error code. * */ static void call_hunpos(const SHunposUttProc *hunposProc, const SItem* relation_head, const SItem** data, s_bool is_phrase_present, s_erc *error) { const SItem* phrase_start_item; const SItem* current_token; const SItem* start_token; /* start point of the next phrase */ const SItem* last_safe_cut_token; /* last token from where we can cut for the next phrase */ int last_safe_cut_count; int tokens_count; S_CLR_ERR(error); phrase_start_item = relation_head; /* we already have the starting item if we have only tokens, else grab the token */ start_token = phrase_start_item; if (is_phrase_present) { /* go grab the first token of this phrase */ start_token = SItemPathToItem(relation_head, "R:Phrase.daughter.R:Token.parent", error); if (S_CHK_ERR(error, S_CONTERR, "call_hunpos", "Call to \"SItemPathToItem\" failed")) return; } current_token = start_token; /* for each phrase */ while (current_token != NULL) { tokens_count = 0; last_safe_cut_count = 0; start_token = current_token; last_safe_cut_token = current_token; /* we stop collecting tokens if the next item is null or if we reached the maximum number of tokens allowed */ while (current_token != NULL && tokens_count < hunposProc->max_tokens_number) { data[tokens_count] = current_token; /* check if it's a safe cut point */ s_bool is_present = SItemFeatureIsPresent(current_token, "IsPunctuation", error); if (S_CHK_ERR(error, S_CONTERR, "call_hunpos", "Call to \"SItemFeatureIsPresent\" failed")) return; if (is_present) { /* is it a punctiation token? */ sint32 is_punctuation = SItemGetInt(current_token, "IsPunctuation", error); if (is_punctuation > 0) { last_safe_cut_token = current_token; last_safe_cut_count = tokens_count + 1; } } current_token = SItemNext(current_token, error); if (S_CHK_ERR(error, S_CONTERR, "call_hunpos", "Call to \"SItemNext\" failed")) return; /* if we're using phrases, check if the current phrase finished */ if (is_phrase_present) { if (current_token != NULL) { const SItem* next_token_parent = SItemPathToItem(current_token, "R:Token.daughter.R:Phrase.parent", error); if (S_CHK_ERR(error, S_CONTERR, "call_hunpos", "Call to \"SItemPathToItem\" failed")) return; if (next_token_parent != phrase_start_item) { /* save the next phrase start and stop the cycle*/ phrase_start_item = next_token_parent; start_token = current_token; current_token = NULL; } } else { phrase_start_item = NULL; start_token = NULL; } } tokens_count++; } /* do we need to cut it? */ if (current_token != NULL && last_safe_cut_token != start_token) { tokens_count = last_safe_cut_count; current_token = SItemNext(last_safe_cut_token, error); if (S_CHK_ERR(error, S_CONTERR, "call_hunpos", "Call to \"SItemNext\" failed")) return; } /* do the tagging */ int hunpos_error = 0; hunpos_tagger_tag(hunposProc->hunpos_instance, tokens_count, data, &read_token, data, &set_tag, &hunpos_error); if (hunpos_error !=0) { S_CTX_ERR(error, S_FAILURE, "call_hunpos", "Call to \"hunpos_tagger_tag\" failed"); return; } /* if we're using phrases, go on with the next one */ if (is_phrase_present && current_token == NULL) current_token = start_token; } return; }
static void Run(const SUttProcessor *self, SUtterance *utt, s_erc *error) { SCrfSuiteUttProc *crfsuiteProc = (SCrfSuiteUttProc*)self; crfsuite_model_t * ptr_model = malloc (sizeof(crfsuite_tagger_t)); crfsuite_tagger_t * ptr_tagger = malloc (sizeof(crfsuite_tagger_t)); crfsuite_dictionary_t * ptr_attrs = malloc (sizeof(crfsuite_dictionary_t)); crfsuite_dictionary_t * ptr_labels = malloc (sizeof(crfsuite_dictionary_t)); crfsuite_instance_t * instance = malloc (sizeof(crfsuite_instance_t)); /* Initialize model object */ if ( crfsuite_create_instance_from_file( crfsuiteProc->model_file, (void**)&ptr_model ) != 0 ) { goto exit_cleanup; } const SRelation* phrase = SUtteranceGetRelation(utt, "Phrase", error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SUtteranceGetRelation\" failed")) return; SItem* itrPhrase = SRelationHead( phrase, error ); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SRelationHead\" failed")) return; while ( itrPhrase != NULL ) { /* Obtain the dictionary interface representing the labels in the model. */ if ( ptr_model->get_labels(ptr_model, &ptr_labels) != 0) { goto exit_cleanup; } /* Obtain the dictionary interface representing the attributes in the model. */ if ( ptr_model->get_attrs(ptr_model, &ptr_attrs) != 0) { goto exit_cleanup; } /* Obtain the tagger interface. */ if ( ptr_model->get_tagger(ptr_model, &ptr_tagger) != 0) { goto exit_cleanup; } instance = create_phrase_instance ( itrPhrase, ptr_attrs, ptr_labels, error ); int *output = calloc(sizeof(int), instance->num_items); floatval_t score = 0; /* Set the instance to the tagger. */ if ( ptr_tagger->set(ptr_tagger, instance) != 0) { goto exit_cleanup; } /* Obtain the viterbi label sequence. */ if (ptr_tagger->viterbi(ptr_tagger, output, &score) != 0) { goto exit_cleanup; } /* Extract the output and insert in the POS attribute */ const SItem* tokenTMP = SItemPathToItem ( itrPhrase, "daughter.R:Token", error ); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SItemPathToItem\" failed")) return; SItem* token = SItemParent (tokenTMP, error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SItemParent\" failed")) return; const SItem* lastToken = SItemPathToItem ( itrPhrase, "n.daughter.R:Token.parent", error ); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SItemPathToItem\" failed")) return; int i = 0; while ( token != NULL && token != lastToken ) { const char * str = malloc (sizeof (char)*16); ptr_labels->to_string (ptr_labels, output[i], &str); i += 1; SItemSetString (token, "POS", str, error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SItemSetString\" failed")) return; token = SItemNext(token, error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SItemNext\" failed")) return; } free(output); crfsuite_instance_finish(instance); itrPhrase = SItemNext(itrPhrase, error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SItemNext\" failed")) return; } /* here all is OK */ S_UNUSED(utt); exit_cleanup: if ( ptr_model != NULL ) free ( ptr_model ); if ( ptr_tagger != NULL ) free ( ptr_tagger ); if ( ptr_attrs != NULL ) free ( ptr_attrs ); if ( ptr_labels != NULL ) free ( ptr_labels ); if ( instance != NULL ) free ( instance ); }
static crfsuite_instance_t* create_phrase_instance ( SItem* phrase, crfsuite_dictionary_t* attrs, crfsuite_dictionary_t* labels, s_erc *error) { crfsuite_instance_t * result = malloc ( sizeof(crfsuite_instance_t) ); int i = 0; int L = labels->num(labels); const SItem* itrItem = NULL; const SItem* itrItemNext = NULL; const SItem* finishItem = NULL; const char* lbl[] = {"num", "sym", "cap", "p1", "p2", "p3", "s1", "s2", "s3", "P1", "P2", "P3", "P4", "S1", "S2", "S3", "S4", "S5", "S6", "w" }; const int words_length = 19; const char* words[19] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL}; int position = 0; int lbl_counter = 0; char buffer[8192]; crfsuite_instance_init ( result ); itrItemNext = SItemPathToItem (phrase, "daughter.R:Token.parent", error); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"SItemPathToItem\" failed")) return NULL; finishItem = SItemPathToItem (phrase, "n.daughter.R:Token.parent", error); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"SItemPathToItem\" failed")) return NULL; int counter = 0; while ( itrItemNext != finishItem && itrItemNext != NULL && counter < 9) { words[counter] = SItemGetName(itrItemNext, error); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"SItemGetName\" failed")) return NULL; counter++; itrItemNext = SItemNext(itrItemNext, error); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"SItemNext\" failed")) return NULL; } itrItem = SItemPathToItem (phrase, "daughter.R:Token.parent", error); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"SItemPathToItem\" failed")) return NULL; while ( itrItem != finishItem && itrItem != NULL) { /* Extraction of the features for each token */ const char *tokenName = SItemGetName (itrItem, error); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"SItemGetName\" failed")) return NULL; lbl_counter = 0; /* Extraction of label's ID */ int tokenID = labels->to_id(labels, "UNK"); int attribute_id; /* If unknown the set the 0 labels (unknown) */ if (tokenID < 0) tokenID = L; crfsuite_item_t itemToken; crfsuite_attribute_t attribute; crfsuite_item_init(&itemToken); const char *feat = NULL; s_bool found = FALSE; /* if token contains numbers */ found = hasNumber (tokenName, error); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"hasNumber\" failed")) return NULL; sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, found? "Y" : "N"); lbl_counter++; attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); /* if token contains symbols */ found = hasSymbol (tokenName, error); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"hasSymbol\" failed")) return NULL; sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, found? "Y" : "N"); lbl_counter++; attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); /* if token contains Capitals */ found = hasCapital (tokenName, error); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"hasCapital\" failed")) return NULL; sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, found? "Y" : "N"); lbl_counter++; attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); /* Prefixes of 1 to 3 chars of token */ i = 1; while ( i < 4 ) { feat = getFirstChars ( tokenName, i, error ); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"getFirstChars\" failed")) return NULL; if ( feat == NULL ) feat = "__nil__"; sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, feat); lbl_counter++; attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); i += 1; } /* Suffixes of 1 to 3 chars of token */ i = 1; while ( i < 4 ) { feat = getLastChars ( tokenName, i, error ); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"getLastChars\" failed")) return NULL; if ( feat == NULL ) feat = "__nil__"; sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, feat); lbl_counter++; attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); i += 1; } /* Prefixes of 1 to 4 chars of token without duplicates adjacent */ i = 1; while ( i < 5 ) { feat = getFirstChars ( removeDoubles(tokenName, error), i, error ); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"removeDoubles\" failed")) return NULL; if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"getFirstChars\" failed")) return NULL; if ( feat == NULL ) feat = "__nil__"; sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, feat); lbl_counter++; attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); i += 1; } /* Suffixes of 1 to 6 chars of token without duplicates adjacent */ i = 1; while ( i < 7 ) { feat = getLastChars ( removeDoubles(tokenName, error), i, error ); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"removeDoubles\" failed")) return NULL; if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"getFirstChars\" failed")) return NULL; if ( feat == NULL ) feat = "__nil__"; sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, feat); lbl_counter++; attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); i += 1; } /* Words features */ sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 0, words[(position+0+words_length)%words_length]); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); const char *tmp = words[(position-1+words_length)%words_length]; if(tmp!=NULL) { sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], -1, tmp); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); } tmp = words[(position+1+words_length)%words_length]; if(tmp!=NULL) { sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 1, tmp); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); } tmp = words[(position-2+words_length)%words_length]; if(tmp!=NULL) { sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], -2, tmp); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); } tmp = words[(position+2+words_length)%words_length]; if(tmp!=NULL) { sprintf(buffer, "%s[%d]=%s", lbl[lbl_counter], 2, tmp); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); } const char * tmp1 = NULL; i = 0; while ( i < 4 ) { tmp = words[(position-2+i+words_length)%words_length]; tmp1 = words[(position-1+i+words_length)%words_length]; if( tmp!=NULL && tmp1 != NULL ) { sprintf(buffer, "%s[%d]|%s[%d]=%s|%s", lbl[lbl_counter], -2+i, lbl[lbl_counter], -1+i, tmp, tmp1 ); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); } i++; } const char * tmp2 = NULL; i = 0; while ( i < 3 ) { tmp = words[(position-2+i+words_length)%words_length]; tmp1 = words[(position-1+i+words_length)%words_length]; tmp2 = words[(position+0+i+words_length)%words_length]; if( tmp!=NULL && tmp1 != NULL && tmp2 != NULL ) { sprintf(buffer, "%s[%d]|%s[%d]|%s[%d]=%s|%s|%s", lbl[lbl_counter], -2+i, lbl[lbl_counter], -1+i, lbl[lbl_counter], +0+i, tmp, tmp1, tmp2 ); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); } i++; } const char * tmp3 = NULL; i = 0; while ( i < 2 ) { tmp = words[(position-2+i+words_length)%words_length]; tmp1 = words[(position-1+i+words_length)%words_length]; tmp2 = words[(position+0+i+words_length)%words_length]; tmp3 = words[(position+1+i+words_length)%words_length]; if( tmp!=NULL && tmp1 != NULL && tmp2 != NULL && tmp3 != NULL ) { sprintf(buffer, "%s[%d]|%s[%d]|%s[%d]|%s[%d]=%s|%s|%s|%s", lbl[lbl_counter], -2+i, lbl[lbl_counter], -1+i, lbl[lbl_counter], +0+i, lbl[lbl_counter], +1+i, tmp, tmp1, tmp2, tmp3 ); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); } i++; } tmp = words[(position-2+words_length)%words_length]; tmp1 = words[(position-1+words_length)%words_length]; tmp2 = words[(position+0+words_length)%words_length]; tmp3 = words[(position+1+words_length)%words_length]; const char * tmp4 = words[(position+2+words_length)%words_length]; if( tmp!=NULL && tmp1 != NULL && tmp2 != NULL && tmp3 != NULL && tmp4 != NULL ) { sprintf(buffer, "%s[%d]|%s[%d]|%s[%d]|%s[%d]|%s[%d]=%s|%s|%s|%s|%s", lbl[lbl_counter], -2, lbl[lbl_counter], -1, lbl[lbl_counter], +0, lbl[lbl_counter], +1, lbl[lbl_counter], +2, tmp, tmp1, tmp2, tmp3, tmp4 ); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); } tmp = words[(position+0+words_length)%words_length]; i = 1; while ( i < 10 ) { tmp1 = words[(position-i+words_length)%words_length]; if ( tmp1 != NULL ) { sprintf(buffer, "%s[%d]|%s[%d]=%s|%s", lbl[lbl_counter], 0, lbl[lbl_counter], 0-i, tmp, tmp1 ); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); } i++; } i = 1; while ( i < 10 ) { tmp1 = words[(position+i+words_length)%words_length]; if ( tmp1 != NULL ) { sprintf(buffer, "%s[%d]|%s[%d]=%s|%s", lbl[lbl_counter], 0, lbl[lbl_counter], 0+i, tmp, tmp1 ); attribute_id = attrs->to_id (attrs,buffer); crfsuite_attribute_set (&attribute, attribute_id, 1.0); crfsuite_item_append_attribute(&itemToken, &attribute); } i++; } /* Update words array */ position++; if ( itrItemNext != NULL ) { words[position+9] = SItemGetName (itrItemNext, error); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"SItemNext\" failed")) return NULL; itrItemNext = SItemNext ( itrItemNext, error ); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"SItemNext\" failed")) return NULL; } else words[position+9] = NULL; itrItem = SItemNext ( itrItem, error ); if (S_CHK_ERR(error, S_CONTERR, "create_phrase_instance", "Call to \"SItemNext\" failed")) return NULL; crfsuite_instance_append(result, &itemToken, tokenID); crfsuite_item_finish(&itemToken); } return result; }
static SObject *Run(const SFeatProcessor *self, const SItem *item, s_erc *error) { SObject *extractedFeat = NULL; const SItem *phraseItem; const SItem *wordItem; sint32 num_content = 0; S_CLR_ERR(error); if (item == NULL) return NULL; /* get current phrase */ phraseItem = SItemPathToItem(item, "R:Phrase.parent", error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SItemPathToItem\" failed")) goto quit_error; if (phraseItem == NULL) { S_CTX_ERR(error, S_FAILURE, "Run", "Failed to get phrase of given word"); goto quit_error; } wordItem = SItemDaughter(phraseItem, error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SItemDaughter\" failed")) goto quit_error; while (wordItem != NULL) { s_bool is_content; s_bool is_current_word; is_current_word = SItemEqual(wordItem, item, error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SItemEqual\" failed")) goto quit_error; if (is_current_word) break; is_content = word_is_content(wordItem, error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"word_is_content\" failed")) goto quit_error; if (is_content) num_content++; wordItem = SItemNext(wordItem, error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SItemNext\" failed")) goto quit_error; } extractedFeat = SObjectSetInt(num_content, error); if (S_CHK_ERR(error, S_CONTERR, "Run", "Call to \"SObjectSetInt\" failed")) goto quit_error; /* all OK here */ return extractedFeat; /* error cleanup */ quit_error: if (extractedFeat != NULL) S_DELETE(extractedFeat, "Run", error); return NULL; S_UNUSED(self); }