int cq_fields_to_utf8(char *buf, size_t buflen, size_t fieldc, char **fieldnames, bool usequotes) { UChar *buf16; UErrorCode status = U_ZERO_ERROR; size_t num_left = fieldc; int rc = 0; if (num_left == 0) return 1; buf16 = calloc(buflen, sizeof(UChar)); if (buf16 == NULL) return -1; for (size_t i = 0; i < fieldc; ++i) { UChar *temp = calloc(buflen, sizeof(UChar)); if (temp == NULL) { rc = -2; break; } u_strFromUTF8(temp, buflen, NULL, fieldnames[i], strlen(fieldnames[i]), &status); if (!U_SUCCESS(status)) { rc = 2; free(temp); break; } bool isstr = false; if (usequotes) { for (int32_t j = 0; j < u_strlen(temp); ++j) { if (!isdigit(temp[j])) { isstr = true; break; } } } if (isstr) u_strcat(buf16, u"'"); u_strcat(buf16, temp); if (isstr) u_strcat(buf16, u"'"); free(temp); if (--num_left > 0) { u_strcat(buf16, u","); } } u_strToUTF8(buf, buflen, NULL, buf16, u_strlen(buf16), &status); if (!U_SUCCESS(status)) rc = 3; free(buf16); return rc; }
// // this function explores a sub-graph, considering tokens as strings // void explorer_sub_automate_normalization_string(Fst2* automate,int n, struct normalization_tree* noeud_normalization, unichar* output,struct norm_info** TEMP_LIST) { Fst2State etat; etat=automate->states[n]; if (is_final_state(etat)) { // if we are in a final state (*TEMP_LIST)=insert_in_norm_info_list(output,noeud_normalization,(*TEMP_LIST)); } Transition* trans; trans=etat->transitions; unichar tmp[1000]; while (trans!=NULL) { if (trans->tag_number<0) { // case of a sub-graph struct norm_info* TMP=NULL; explorer_sub_automate_normalization_string(automate,automate->initial_states[-(trans->tag_number)],noeud_normalization, output,&TMP); while (TMP!=NULL) { // we continue to explore the current automaton explorer_sub_automate_normalization_string(automate,trans->state_number,TMP->node, TMP->output,TEMP_LIST); struct norm_info* z=TMP; TMP=TMP->next; free_norm_info(z); } } else { // normal transition Fst2Tag etiq; etiq=automate->tags[trans->tag_number]; u_strcpy(tmp,output); u_strcat(tmp," "); if (etiq->output!=NULL && u_strcmp(etiq->output,"") && u_strcmp(etiq->output,"<E>") && !only_spaces(etiq->output)) { // we append the output if it exists and is not epsilon u_strcat(tmp,etiq->output); } struct normalization_tree_transition* trans_norm; trans_norm=get_trans_arbre_normalization_string(etiq->input,noeud_normalization->trans); if (trans_norm==NULL) { // if the transition does not exist in the tree, we create it trans_norm=new_trans_arbre_normalization_string(etiq->input); // we also create the destination node trans_norm->node=new_normalization_tree(); trans_norm->next=noeud_normalization->trans; noeud_normalization->trans=trans_norm; } explorer_sub_automate_normalization_string(automate,trans->state_number,trans_norm->node, tmp,TEMP_LIST); } trans=trans->next; } }
/**
 * Allocates, initializes and returns a new corpus_entry structure.
 *
 * The input line is expected to have the shape "word/POS" or
 * "word/POS:morphocodes"; 'word' is everything before the last '/',
 * 'pos_code' the codes after it (up to the last ':' when present), and
 * 'overall_codes' the full code string after the '/'.
 * Terminates the program via fatal_error/fatal_alloc_error on bad input
 * or allocation failure.  The caller owns the returned entry.
 */
struct corpus_entry* new_corpus_entry(const unichar* line){
struct corpus_entry* entry = (corpus_entry*)malloc(sizeof(corpus_entry));
if(entry == NULL){
fatal_alloc_error("compute_corpus_entry");
}
/* we fill corpus entry with information extracted from the corpus line */
/* NOTE(review): this project's u_strrchr returns an index, -1 when absent */
int pos = u_strrchr(line,'/');
if(pos == -1){
fatal_error("Wrong format for line %S\n",line);
}
/* pos+1 slots: 'pos' characters of the word plus the terminating null */
entry->word = (unichar*)malloc(sizeof(unichar)*(pos+1));
if(entry->word == NULL){
fatal_alloc_error("compute_corpus_entry");
}
unichar* tmp = u_strcpy_sized(entry->word,pos+1,line);
/* NOTE(review): concatenating "\0" is a no-op — u_strcpy_sized presumably
 * already terminates the string; kept as-is */
u_strcat(tmp,"\0");
int code_pos = u_strrchr(line,':');
/* there are no morphological codes associated to this entry */
if(code_pos == -1){
/* u_strlen(line)-pos = length of the suffix after '/' plus its null */
entry->pos_code = (unichar*)malloc(sizeof(unichar)*(u_strlen(line)-pos));
if(entry->pos_code == NULL){
fatal_alloc_error("new_corpus_entry");
}
u_strcpy(entry->pos_code,&line[pos+1]);
entry->overall_codes = u_strdup(entry->pos_code);
}
else{
/* NOTE(review): assumes the last ':' comes after the last '/'; a ':' inside
 * the word part would make code_pos-pos negative — TODO confirm input format */
entry->pos_code = (unichar*)malloc(sizeof(unichar)*(code_pos-pos));
if(entry->pos_code == NULL){
fatal_alloc_error("new_corpus_entry");
}
entry->overall_codes = (unichar*)malloc(sizeof(unichar)*(u_strlen(line)-pos));
if(entry->overall_codes == NULL){
fatal_alloc_error("new_corpus_entry");
}
/* pos_code = codes between '/' and ':'; overall_codes = everything after '/' */
unichar* tmp2 = u_strcpy_sized(entry->pos_code,code_pos-pos,&line[pos+1]);
u_strcat(tmp2,"\0");
u_strcpy(entry->overall_codes,&line[pos+1]);
}
/* if the token is not annotated in the corpus, we put "UNK" */
if(u_strlen(entry->pos_code) == 0){
free(entry->pos_code);
free(entry->overall_codes);
entry->pos_code = u_strdup("UNK");
entry->overall_codes = u_strdup("UNK");
}
return entry;
}
/*
 * Demonstrates the ANSI-C-style ICU string functions (u_strlen, u_strcat,
 * u_strncat, u_strcmp, u_strcasecmp) on stack UChar buffers, printing
 * expected results and reporting comparison mismatches.
 */
static void demo_C_Unicode_strings() {
    printf("\n* demo_C_Unicode_strings() --------- ***\n\n");

    static const UChar text[]={ 0x41, 0x42, 0x43, 0 };          /* "ABC" */
    static const UChar appendText[]={ 0x61, 0x62, 0x63, 0 };    /* "abc" */
    static const UChar cmpText[]={ 0x61, 0x53, 0x73, 0x43, 0 }; /* "aSsC" */
    UChar buffer[32];
    int32_t compare;
    int32_t length=u_strlen(text); /* length=3 */

    /* simple ANSI C-style functions */
    buffer[0]=0;                  /* empty, NUL-terminated string */
    u_strncat(buffer, text, 1);   /* append just n=1 character ('A') */
    u_strcat(buffer, appendText); /* buffer=="Aabc" */
    length=u_strlen(buffer);      /* length=4 */
    printUString("should be \"Aabc\": ", buffer, -1);

    /* bitwise comparing buffer with text; 'a' (0x61) > 'B' (0x42), so the
     * result must be positive */
    compare=u_strcmp(buffer, text);
    if(compare<=0) {
        printf("String comparison error, expected \"Aabc\" > \"ABC\"\n");
    }

    /* Build "A<sharp s>C" in the buffer... */
    u_strcpy(buffer, text);
    buffer[1]=0xdf; /* sharp s, case-compares equal to "ss" */
    printUString("should be \"A<sharp s>C\": ", buffer, -1);

    /* Compare two strings case-insensitively using full case folding */
    compare=u_strcasecmp(buffer, cmpText, U_FOLD_CASE_DEFAULT);
    if(compare!=0) {
        /* NOTE(review): the strings quoted in this message are stale; the
         * actual comparison is "A<sharp s>C" vs "aSsC" */
        printf("String case insensitive comparison error, expected \"AbC\" to be equal to \"ABC\"\n");
    }
}
/**
 * Reads the whole content of f into a freshly allocated unichar string,
 * growing the buffer by READ_FILE_BUFFER_SIZE-sized chunks.
 * Terminates the program via fatal_alloc_error on allocation failure.
 * The caller owns (and must free) the returned string.
 */
unichar* read_file(U_FILE *f){
unichar *text = NULL;
text = (unichar *)malloc(sizeof(unichar));
if(text==NULL){
fatal_alloc_error("malloc");
}
text[0]='\0';
int total_read = 0;
int read;
do {
/* +1 slot guarantees the chunk stays null-terminated after u_fread */
unichar buffer[READ_FILE_BUFFER_SIZE+1];
memset(buffer,0,sizeof(unichar)*(READ_FILE_BUFFER_SIZE+1));
int ok=1;
read = u_fread(buffer,READ_FILE_BUFFER_SIZE,f,&ok);
/* NOTE(review): counts u_strlen(buffer), not 'read' — assumes the file
 * contains no embedded null characters; TODO confirm */
total_read += u_strlen(buffer);
/* Overwriting 'text' directly would leak on realloc failure, but
 * fatal_alloc_error terminates the program, so this is acceptable here */
text = (unichar *)realloc(text,sizeof(unichar)*(total_read+1));
if(text==NULL){
fatal_alloc_error("realloc");
}
u_strcat(text,buffer);
} while (read == READ_FILE_BUFFER_SIZE); /* a short read means EOF */
text[total_read]='\0';
return text;
}
/**
 * Builds the corpus_entry for one simple word belonging to a compound:
 * the word itself is duplicated and the POS / overall codes of 'entry'
 * are copied, with "+B" appended when start!=0 and "+I" otherwise.
 *
 * Fix: the three malloc calls were unchecked, unlike the sibling
 * new_corpus_entry() which calls fatal_alloc_error() on failure; the
 * same policy is now applied here.
 */
corpus_entry* new_simple_word_entry(const unichar* word,corpus_entry* entry,int start){
corpus_entry* wentry = (corpus_entry*)malloc(sizeof(corpus_entry));
if(wentry == NULL){
fatal_alloc_error("new_simple_word_entry");
}
wentry->word = u_strdup(word);
/* +3 = room for the 2-character "+I"/"+B" suffix plus the terminating null */
wentry->pos_code = (unichar*)malloc(sizeof(unichar)*(u_strlen(entry->pos_code)+3));
if(wentry->pos_code == NULL){
fatal_alloc_error("new_simple_word_entry");
}
wentry->overall_codes = (unichar*)malloc(sizeof(unichar)*(u_strlen(entry->overall_codes)+3));
if(wentry->overall_codes == NULL){
fatal_alloc_error("new_simple_word_entry");
}
unichar* tmp = u_strcpy_sized(wentry->pos_code,u_strlen(entry->pos_code)+1,entry->pos_code);
unichar* tmp2 = u_strcpy_sized(wentry->overall_codes,u_strlen(entry->overall_codes)+1,entry->overall_codes);
if(start == 0){
u_strcat(tmp,"+I\0");
u_strcat(tmp2,"+I\0");
}
else {
u_strcat(tmp,"+B\0");
u_strcat(tmp2,"+B\0");
}
return wentry;
}
// // returns 1 if the INF code refers to a valid right component, 0 else // char check_valid_right_component_for_one_INF_code_german(const unichar* s) { unichar temp[2000]; u_strcpy(temp,"x,"); u_strcat(temp,s); struct dela_entry* d=tokenize_DELAF_line(temp,0); char res=check_N_not_FF(d); free_dela_entry(d); return res; }
int check_is_valid_for_one_INF_code(const unichar* t, const unichar* s) { unichar temp[MAX_DICT_LINE_LENGTH]; u_strcpy(temp,"x,"); u_strcat(temp,s); struct dela_entry* d = tokenize_DELAF_line(temp,0); int res = check_is_valid(t, d); free_dela_entry(d); return res; }
/**
 * Returns 1 if the given INF code is a ":a" one.
 */
char check_a(unichar* INF_code) {
/* Turn the INF code into a fake DELAF line ("x," + code) so that
 * tokenize_DELAF_line() gives us the grammatical and inflectional
 * codes in a structured way. */
unichar line[2000];
u_strcpy(line,"x,");
u_strcat(line,INF_code);
struct dela_entry* entry=tokenize_DELAF_line(line,0);
/* Delegate the actual test to the dela_entry overload of check_a */
char result=check_a(entry);
/* We free the artificial dictionary entry */
free_dela_entry(entry);
return result;
}
/**
 * Returns 1 if the INF code refers to a valid right component, 0 otherwise.
 *
 * NOTE(review): the previous comment said "left component", but the function
 * name and the test performed (N or A, and not Nsie) indicate a RIGHT
 * component check — compare with check_valid_left_component_for_one_INF_code.
 */
char check_valid_right_component_for_one_INF_code(const unichar* INF_code) {
/* We produce an artifical dictionary entry with the given INF code,
 * and then, we tokenize it in order to get grammatical and inflectional
 * codes in a structured way. */
unichar temp[2000];
u_strcpy(temp,"x,");
u_strcat(temp,INF_code);
struct dela_entry* d=tokenize_DELAF_line(temp,0);
/* valid = (noun or adjective) and not an Nsie entry */
char res=(check_N(d)||check_A(d)/*||check_V_but_not_Y(d)*/)&&(!check_Nsie(d));
/* We free the artifical dictionary entry */
free_dela_entry(d);
return res;
}
/** * Returns 1 if the INF code refers to a valid left component, 0 otherwise. */ char check_valid_left_component_for_one_INF_code(const unichar* INF_code) { /* We produce an artifical dictionary entry with the given INF code, * and then, we tokenize it in order to get grammatical and inflectional * codes in a structured way. */ unichar temp[2000]; u_strcpy(temp,"x,"); u_strcat(temp,INF_code); struct dela_entry* d=tokenize_DELAF_line(temp,0); /* Now, we can use this structured representation to check if the INF code * corresponds to a valid left component. */ char res=check_Nsia(d)||check_Nsie(d)||check_Nsig(d)||check_Asio(d)||check_Asie(d)||check_VW(d)||check_ADV(d); /* Finally, we free the artificial dictionary entry */ free_dela_entry(d); return res; }
/**
 * Returns 1 if the line is a valid right "A" component.
 */
char check_A_right_component(unichar* s) {
/* Build an artificial DELAF line "x,<INF code>" and tokenize it to get
 * the grammatical/inflectional codes in a structured way. */
unichar line[2000];
u_strcpy(line,"x,");
u_strcat(line,s);
struct dela_entry* entry=tokenize_DELAF_line(line,0);
/* The codes we look for: grammatical "A", inflectional "sie" */
unichar gram_A[2];
unichar infl_sie[4];
u_strcpy(gram_A,"A");
u_strcpy(infl_sie,"sie");
/* Valid iff the entry is an adjective that does NOT carry "sie" */
char result=dic_entry_contain_gram_code(entry,gram_A)
         && !dic_entry_contain_inflectional_code(entry,infl_sie);
/* Release the artificial dictionary entry */
free_dela_entry(entry);
return result;
}
/*
 * Makes 'name' absolute (prefixing the current directory when needed),
 * normalizes backslashes to slashes and collapses "//", "./" and "../"
 * components in place, then copies at most rsiz characters into 'result'.
 * 'result' may alias 'name'.  Returns 'result'.
 */
unichar_t *u_GFileGetAbsoluteName(unichar_t *name, unichar_t *result, int rsiz) {
    /* result may be the same as name */
    unichar_t buffer[1000];

    if ( ! u_GFileIsAbsolute(name) ) {
	unichar_t *pt, *spt, *rpt, *bpt;

	/* Lazily cache the current working directory in dirname_ */
	if ( dirname_[0]=='\0' ) {
	    getcwd(dirname_,sizeof(dirname_));
	}
	uc_strcpy(buffer,dirname_);
	if ( buffer[u_strlen(buffer)-1]!='/' )
	    uc_strcat(buffer,"/");
	/* NOTE(review): no bounds check against sizeof(buffer) — assumes
	 * dirname_ + name fit in 1000 units; TODO confirm callers */
	u_strcat(buffer,name);
	_u_backslash_to_slash(buffer);

	/* Normalize out any .. */
	spt = rpt = buffer;
	while ( *spt!='\0' ) {
	    if ( *spt=='/' ) ++spt;
	    /* pt = end of the current path component */
	    for ( pt = spt; *pt!='\0' && *pt!='/'; ++pt );
	    if ( pt==spt )	/* Found // in a path spec, reduce to / (we've*/
		u_strcpy(spt,pt);	/* skipped past the :// of the machine name) */
	    else if ( pt==spt+1 && spt[0]=='.' && *pt=='/' )	/* Noop */
		u_strcpy(spt,spt+2);
	    else if ( pt==spt+2 && spt[0]=='.' && spt[1]=='.' ) {
		/* "..": scan backwards for the previous component start */
		for ( bpt=spt-2 ; bpt>rpt && *bpt!='/'; --bpt );
		if ( bpt>=rpt && *bpt=='/' ) {
		    /* drop the previous component together with the ".." */
		    u_strcpy(bpt,pt);
		    spt = bpt;
		} else {
		    /* can't back up past rpt (e.g. leading ".."): keep it */
		    rpt = pt;
		    spt = pt;
		}
	    } else
		spt = pt;
	}
	name = buffer;
    }
    if (result!=name) {
	/* u_strncpy may not terminate; force a null in the last slot */
	u_strncpy(result,name,rsiz);
	result[rsiz-1]='\0';
	_u_backslash_to_slash(result);
    }
return(result);
}
/*
 * Reverse add for Unicode strings: computes obj2 + obj1 (obj1 is the
 * right-hand operand seen from the caller).  Returns a new ustring;
 * neither argument is modified.
 */
static EC_OBJ ustring_radd( EC_OBJ obj1, EC_OBJ obj2 )
{
	EC_OBJ res;
	EcInt len;

	if (! EC_USTRINGP(obj2))
		/* NOTE(review): ustring_add reports tc_none here while this
		 * function reports tc_string — confirm which is intended */
		return EcTypeError( EC_NIL, EC_NIL, 2, tc_string, obj2, TRUE, "string radd" );
	EC_ASSERT( EC_USTRINGP(obj1) );
	EC_ASSERT( EC_USTRINGP(obj2) );

	len = EC_USTRLEN(obj1) + EC_USTRLEN(obj2);
	/* Copy obj2's data into a buffer sized for the full result, then
	 * concatenate obj1 in place.
	 * NOTE(review): 'len' exceeds obj2's own length — assumes
	 * EcMakeUString only copies up to obj2's terminator while allocating
	 * 'len' units; TODO confirm against its definition */
	res = EcMakeUString( EC_USTRDATA(obj2), len , EcTrue );
	u_strcat(EC_USTRDATA(res), EC_USTRDATA(obj1) );
	EC_USTRLEN(res) = len;
	return res;
}
/*
 * Add (concatenation) for Unicode strings: computes obj1 + obj2.
 * Returns a new ustring; neither argument is modified.
 */
static EC_OBJ ustring_add( EC_OBJ obj1, EC_OBJ obj2 )
{
	EC_OBJ res;
	EcInt len;

	/* add chars later */
	if (/* (! EC_CHARP(obj2)) && */(! EC_USTRINGP(obj2)))
		return EcTypeError( EC_NIL, EC_NIL, 2, tc_none, obj2, TRUE, "string add" );
	EC_ASSERT( EC_USTRINGP(obj1) );

	len = EC_USTRLEN(obj1) + EC_USTRLEN(obj2);
	/* Copy obj1's data into a buffer sized for the full result, then
	 * concatenate obj2 in place.
	 * NOTE(review): 'len' exceeds obj1's own length — assumes
	 * EcMakeUString only copies up to obj1's terminator while allocating
	 * 'len' units; TODO confirm against its definition */
	res = EcMakeUString( EC_USTRDATA(obj1), len , EcTrue );
	u_strcat(EC_USTRDATA(res), EC_USTRDATA(obj2) );
	EC_USTRLEN(res) = len;
	return res;
}
/** * This function analyzes an INF code and returns a value that indicates * if it is a valid left component or not. */ int get_valid_left_component_type_for_one_INF_code(const unichar* INF_code) { /* We produce an artifical dictionary entry with the given INF code, * and then, we tokenize it in order to get grammatical and inflectional * codes in a structured way. */ unichar temp[2000]; u_strcpy(temp,"x,"); u_strcat(temp,INF_code); struct dela_entry* d=tokenize_DELAF_line(temp,0); int res; /* Now we can test if the INF code corresponds to a valid left component */ if (check_Nsia(d)) res=N_SIA; else if (check_Nsie(d)) res=N_SIE; else if (check_Nsig(d)) res=N_SIG; else if (check_Asio(d)) res=A_SIO; else if (check_Asie(d)) res=A_SIE; else if (check_VW(d)) res=V_W; else if (check_ADV(d)) res=ADV; else res=INVALID_LEFT_COMPONENT; /* Finally we free the artifical dictionary entry */ free_dela_entry(d); return res; }
/*
 * Exercises uidna_compare: for every entry of unicodeIn/asciiIn, builds
 * "www.<label>.com" variants and checks (a) self-equality, (b) equality of
 * the Unicode and ASCII (Punycode) forms, and (c)/(d) inequality against a
 * different label's Unicode and ASCII forms.
 */
static void TestCompare(){
    int32_t i;

    const char* testName ="uidna_compare";
    CompareFunc func = uidna_compare;

    UChar www[] = {0x0057, 0x0057, 0x0057, 0x002E, 0x0000};          /* "WWW." */
    UChar com[] = {0x002E, 0x0043, 0x004F, 0x004D, 0x0000};          /* ".COM" */
    /* buf is pre-seeded with "WWW."; labels are written at offset 4 below */
    UChar buf[MAX_DEST_SIZE]={0x0057, 0x0057, 0x0057, 0x002E, 0x0000};
    UChar source[MAX_DEST_SIZE]={0},
          uni0[MAX_DEST_SIZE]={0},
          uni1[MAX_DEST_SIZE]={0},
          ascii0[MAX_DEST_SIZE]={0},
          ascii1[MAX_DEST_SIZE]={0},
          temp[MAX_DEST_SIZE] ={0};

    /* uni0/uni1 = "<unicode label>.COM" for the first two test labels */
    u_strcat(uni0,unicodeIn[0]);
    u_strcat(uni0,com);
    u_strcat(uni1,unicodeIn[1]);
    u_strcat(uni1,com);

    /* ascii0/ascii1 = "<ascii label>.COM" for the first two test labels */
    u_charsToUChars(asciiIn[0], temp, (int32_t)strlen(asciiIn[0]));
    u_strcat(ascii0,temp);
    u_strcat(ascii0,com);

    memset(temp, 0, U_SIZEOF_UCHAR * MAX_DEST_SIZE);

    u_charsToUChars(asciiIn[1], temp, (int32_t)strlen(asciiIn[1]));
    u_strcat(ascii1,temp);
    u_strcat(ascii1,com);

    /* prepend www. */
    u_strcat(source, www);

    for(i=0;i< (int32_t)(sizeof(unicodeIn)/sizeof(unicodeIn[0])); i++){
        UChar* src;
        int32_t srcLen;

        /* buf = "WWW.<ascii label i>.COM": wipe everything after "WWW." */
        memset(buf+4, 0, (MAX_DEST_SIZE-4) * U_SIZEOF_UCHAR);
        u_charsToUChars(asciiIn[i],buf+4, (int32_t)strlen(asciiIn[i]));
        u_strcat(buf,com);

        /* for every entry in unicodeIn array prepend www. and append .com */
        source[4]=0; /* truncate back to "WWW." */
        u_strncat(source,unicodeIn[i], u_strlen(unicodeIn[i]));
        u_strcat(source,com);

        /* a) compare it with itself */
        src = source;
        srcLen = u_strlen(src);

        testCompareWithSrc(src,srcLen,src,srcLen,testName, func, TRUE);

        /* b) compare it with asciiIn equivalent */
        testCompareWithSrc(src,srcLen,buf,u_strlen(buf),testName, func,TRUE);

        /* c) compare it with unicodeIn not equivalent */
        if(i==0){
            testCompareWithSrc(src,srcLen,uni1,u_strlen(uni1),testName, func,FALSE);
        }else{
            testCompareWithSrc(src,srcLen,uni0,u_strlen(uni0),testName, func,FALSE);
        }
        /* d) compare it with asciiIn not equivalent */
        if(i==0){
            testCompareWithSrc(src,srcLen,ascii1,u_strlen(ascii1),testName, func,FALSE);
        }else{
            testCompareWithSrc(src,srcLen,ascii0,u_strlen(ascii0),testName, func,FALSE);
        }
    }
}
/**
 * Explores the given dictionary to match the given word.
 *
 * Recursive spell-checking exploration: walks the compressed dictionary from
 * 'offset' while consuming 'word' from 'pos_word', accumulating the matched
 * dictionary form in 'inflected' and the produced output in 'output'.
 * Besides exact (case-tolerant) matches, it tries four error operations,
 * each bounded by its own counter in cfg: SP_SWAP (two swapped letters),
 * SP_CHANGE (substitution: default/diacritic/case/keyboard variants),
 * SP_SUPPR (a letter of the dictionary form missing in the word) and
 * SP_INSERT (an extra letter in the word).  Error positions/kinds are pushed
 * on cfg->pairs and rolled back after each attempt; hypotheses are collected
 * in *list by deal_with_matches().
 */
static void explore_dic(int offset,unichar* word,int pos_word,Dictionary* d,SpellCheckConfig* cfg,
                        Ustring* output,SpellCheckHypothesis* *list,int base,Ustring* inflected) {
int original_offset=offset;
int original_base=base;
int final,n_transitions,inf_code;
/* Snapshot of the output, restored before each transition and on exit */
int z=save_output(output);
int size_pairs=cfg->pairs->nbelems;
offset=read_dictionary_state(d,offset,&final,&n_transitions,&inf_code);
if (final) {
    if (word[pos_word]=='\0') {
        /* If we have a match */
        deal_with_matches(d,inflected->str,inf_code,output,cfg,base,list);
    }
    base=output->len;
}
/* If we are at the end of the token, then we stop */
if (word[pos_word]=='\0') {
    return;
}
/* Length of 'inflected' on entry, restored after each error attempt */
unsigned int l2=inflected->len;
unichar c;
int dest_offset;
for (int i=0;i<n_transitions;i++) {
    restore_output(z,output);
    offset=read_dictionary_transition(d,offset,&c,&dest_offset,output);
    /* For backup_output, see comment below */
    int backup_output=save_output(output);
    if (c==word[pos_word] || word[pos_word]==u_toupper(c)) {
        /* Exact match (possibly modulo an uppercase word letter) */
        u_strcat(inflected,c);
        explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
    } else {
        /* We deal with the SP_SWAP case, made of 2 SP_CHANGE_XXX */
        if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_SWAP!=cfg->max_SP_SWAP
                && is_letter_swap(cfg,word,pos_word,inflected,c)) {
            /* We don't modify the number of errors since we override an existing
             * SP_CHANGE_XXX one */
            cfg->current_SP_SWAP++;
            /* We override the previous change */
            int a=cfg->pairs->tab[cfg->pairs->nbelems-2];
            int b=cfg->pairs->tab[cfg->pairs->nbelems-1];
            cfg->pairs->tab[cfg->pairs->nbelems-2]=pos_word-1;
            cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_SWAP_DEFAULT;
            u_strcat(inflected,c);
            explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
            /* Roll the override back */
            cfg->pairs->tab[cfg->pairs->nbelems-2]=a;
            cfg->pairs->tab[cfg->pairs->nbelems-1]=b;
            cfg->current_SP_SWAP--;
        } else
        /* We deal with the SP_CHANGE case */
        if (cfg->current_errors!=cfg->max_errors &&
                cfg->current_SP_CHANGE!=cfg->max_SP_CHANGE
                /* We want letters, not spaces or anything else */
                && is_letter(c,NULL)
                /* We do not allow the replacement of a lowercase letter by an uppercase
                 * letter at the beginning of the word like Niserable, unless the whole word
                 * is in uppercase or the letter is the same, module the case */
                && (cfg->allow_uppercase_initial || pos_word>0 ||
                    (!is_upper(word[0],NULL) || is_upper(word[1],NULL) || word[0]==u_toupper(c)))) {
            cfg->current_errors++;
            cfg->current_SP_CHANGE++;
            /* Now we test all possible kinds of change */
            vector_int_add(cfg->pairs,pos_word);
            u_strcat(inflected,c);
            /* We always add the default case */
            vector_int_add(cfg->pairs,SP_CHANGE_DEFAULT);
            int n_elem=cfg->pairs->nbelems;
            explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
            /* Then we test the accent case */
            if (u_deaccentuate(c)==u_deaccentuate(word[pos_word])) {
                /* After a call to explore_dic, we must restore the output.
                 * But, when dealing with SP_CHANGE_XXX ops, we must restore the
                 * output including the output associated to the current transition,
                 * which is why we don't use z (output before the current transition)
                 * but backup_output */
                restore_output(backup_output,output);
                cfg->pairs->nbelems=n_elem;
                cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_DIACRITIC;
                explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
            }
            /* And the case variations */
            if (u_tolower(c)==u_tolower(word[pos_word])) {
                restore_output(backup_output,output);
                cfg->pairs->nbelems=n_elem;
                cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_CASE;
                explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
            }
            /* And finally the position on keyboard */
            if (areCloseOnKeyboard(c,word[pos_word],cfg->keyboard)) {
                restore_output(backup_output,output);
                cfg->pairs->nbelems=n_elem;
                cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_KEYBOARD;
                explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
            }
            cfg->pairs->nbelems=size_pairs;
            cfg->current_errors--;
            cfg->current_SP_CHANGE--;
            /* End of the SP_CHANGE case */
        }
    }
    restore_output(backup_output,output);
    /* NOTE(review): 'truncate' is a project helper on Ustring, not the POSIX
     * file function; it resets 'inflected' to its entry length */
    truncate(inflected,l2);
    /* Now we deal with the SP_SUPPR case: the dictionary letter c has no
     * counterpart in the word, so we advance in the dictionary only */
    if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_SUPPR!=cfg->max_SP_SUPPR
            /* We want letters, not spaces or anything else */
            && is_letter(c,NULL)) {
        cfg->current_errors++;
        cfg->current_SP_SUPPR++;
        vector_int_add(cfg->pairs,pos_word);
        if (pos_word>=1 && c==word[pos_word-1]) {
            vector_int_add(cfg->pairs,SP_SUPPR_DOUBLE);
        } else {
            vector_int_add(cfg->pairs,SP_SUPPR_DEFAULT);
        }
        u_strcat(inflected,c);
        explore_dic(dest_offset,word,pos_word,d,cfg,output,list,original_base,inflected);
        truncate(inflected,l2);
        cfg->pairs->nbelems=size_pairs;
        cfg->current_errors--;
        cfg->current_SP_SUPPR--;
    }
}
restore_output(z,output);
/* Finally, we deal with the SP_INSERT case, by calling again the current
 * function with the same parameters, except pos_word that will be increased of 1 */
if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_INSERT!=cfg->max_SP_INSERT
        /* We want letters, not spaces or anything else */
        && is_letter(word[pos_word],NULL)
        /* We do not allow the insertion of a capital letter at the beginning of
         * the word like Astreet, unless the whole word is in uppercase like ASTREET */
        && (cfg->allow_uppercase_initial || pos_word>0 ||
            (!is_upper(word[0],NULL) || is_upper(word[1],NULL)))) {
    cfg->current_errors++;
    cfg->current_SP_INSERT++;
    vector_int_add(cfg->pairs,pos_word);
    if (pos_word>=1 && word[pos_word]==word[pos_word-1]) {
        vector_int_add(cfg->pairs,SP_INSERT_DOUBLE);
    } else {
        vector_int_add(cfg->pairs,SP_INSERT_DEFAULT);
    }
    explore_dic(original_offset,word,pos_word+1,d,cfg,output,list,original_base,inflected);
    truncate(inflected,l2);
    cfg->pairs->nbelems=size_pairs;
    cfg->current_errors--;
    cfg->current_SP_INSERT--;
}
/* Finally, we restore the output as it was when we enter the function */
restore_output(z,output);
}
/**
 * Explores all the partial matches to produce outputs in MERGE or REPLACE mode.
 *
 * If *var_starts!=NULL, it means that there are pending $var_start( tags
 * that wait for being taken into account when a text dependent tag is found.
 *
 * Recursion over items[current_item..]: each item is a tfst_match whose fst2
 * transition may carry an output and whose text_tag_numbers list enumerates
 * the tfst tags it can correspond to (-1 = text independent).  The produced
 * output accumulates in s; complete outputs are saved when current_item
 * reaches items->nbelems.  All variable/output state mutated along a branch
 * is restored on backtrack.
 */
void explore_match_for_MERGE_or_REPLACE_mode(struct locate_tfst_infos* infos,
                                   struct tfst_simple_match_list* element,
                                   vector_ptr* items,int current_item,Ustring* s,
                                   int last_text_dependent_tfst_tag,
                                   struct list_pointer* *var_starts) {
if (current_item==items->nbelems) {
    /* If we have finished, we can save the current output */
    element->output=s->str;
    infos->matches=add_element_to_list(infos,infos->matches,element);
    element->output=NULL;
    return;
}
/* We save the length because it will be modified */
int len=s->len;
struct tfst_match* item=(struct tfst_match*)(items->tab[current_item]);
if (item==NULL) {
    fatal_error("Unexpected NULL item in explore_match_for_MERGE_mode\n");
}
if (item->debug_output!=NULL) {
    /* If we have a debug output, we deal it */
    u_strcat(s,item->debug_output);
    explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_text_dependent_tfst_tag,var_starts);
    /* Restore s before returning */
    s->len=len;
    s->str[len]='\0';
    return;
}
unichar* output=infos->fst2->tags[item->fst2_transition->tag_number]->output;
unichar name[MAX_TRANSDUCTION_VAR_LENGTH];
int capture;
struct dela_entry* old_value_dela=NULL;
capture=is_capture_variable(output,name);
if (capture) {
    /* If we have a capture variable $:X$, we must save the previous value
     * for this dictionary variable */
    old_value_dela=clone_dela_entry(get_dic_variable(name,infos->dic_variables));
}
Match saved_element=element->m;
struct list_int* text_tags=item->text_tag_numbers;
int captured_chars=0;
/* We explore all the text tags */
while (text_tags!=NULL) {
    /* First, we restore the output string */
    s->len=len;
    s->str[len]='\0';
    captured_chars=0;
    /* We deal with the fst2 tag output, if any */
    if (item->first_time) {
        /* We only have to process the output only once, since it will
         * have the same effect on all tfst tags.
         *
         * Example: the fst2 tag "cybercrime/ZZ" may match the two tfst tags "cyber" and
         * "crime", but we must process the "ZZ" output only before the first tfst tag "cyber" */
        if (capture) {
            /* If we have a capture variable, then we have to check whether the tfst tag
             * is a tagged token or not */
            int tfst_tag_number=text_tags->n;
            int fst2_tag_number=item->fst2_transition->tag_number;
            if (!do_variable_capture(tfst_tag_number,fst2_tag_number,infos,name)) {
                goto restore_dic_variable;
            }
        } else if (!deal_with_output_tfst(s,output,infos,&captured_chars)) {
            /* We do not take into account matches with variable errors if the
             * process_output_for_tfst_match function has decided that backtracking
             * was necessary, either because of a variable error of because of a
             * $a.SET$ or $a.UNSET$ test */
            goto restore_dic_variable;
        }
    }
    int last_tag=last_text_dependent_tfst_tag;
    TfstTag* current_tag=NULL;
    if (text_tags->n==-1) {
        /* We have a text independent match */
        Fst2Tag fst2_tag=infos->fst2->tags[item->fst2_transition->tag_number];
        if (fst2_tag->type==BEGIN_OUTPUT_VAR_TAG) {
            /* If we an output variable start $|a( */
            int var_index=get_value_index(fst2_tag->variable,infos->output_variables->variable_index);
            Ustring* old_value = new_Ustring();
            swap_output_variable_content(infos->output_variables, var_index, old_value);
            // now old_value contain the backup
            set_output_variable_pending(infos->output_variables,fst2_tag->variable);
            explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
            unset_output_variable_pending(infos->output_variables,fst2_tag->variable);
            // restore the good content from backup
            swap_output_variable_content(infos->output_variables, var_index, old_value);
            free_Ustring(old_value);
            goto restore_dic_variable;
        } else if (fst2_tag->type==END_OUTPUT_VAR_TAG) {
            /* If we an output variable end $|a) */
            unset_output_variable_pending(infos->output_variables,fst2_tag->variable);
            explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
            set_output_variable_pending(infos->output_variables,fst2_tag->variable);
            goto restore_dic_variable;
        } else if (fst2_tag->type==BEGIN_VAR_TAG) {
            /* If we have a variable start tag $a(, we add it to our
             * variable tag list */
            struct transduction_variable* v=get_transduction_variable(infos->input_variables,fst2_tag->variable);
            int old_value=v->start_in_tokens;
            /* We add the address of the start field to our list */
            (*var_starts)=new_list_pointer(&(v->start_in_tokens),(var_starts==NULL)?NULL:(*var_starts));
            /* Then, we go on the next item */
            explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
            /* After the exploration, there are 2 cases:
             * 1) *var_starts is NULL: nothing to do
             * 2) *var_starts is not NULL: we reached the end of the items without findind any
             *    text dependent match, so we can free the list */
            free_list_pointer(*var_starts);
            (*var_starts)=NULL;
            v->start_in_tokens=old_value;
            /* If we have a $a( tag, we know that we can only have just one text tag
             * with special value -1 */
            goto restore_dic_variable;
        } else if (fst2_tag->type==END_VAR_TAG) {
            /* If we have found a $a) tag */
            if (last_tag==-1) {
                /* If we have no tfst tag to use, then it's a variable definition error,
                 * and we have nothing special to do */
                explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
                goto restore_dic_variable;
            } else {
                /* We can set the end of the variable, it's 'last_tag' */
                struct transduction_variable* v=get_transduction_variable(infos->input_variables,fst2_tag->variable);
                int old_value=v->end_in_tokens;
                v->end_in_tokens=last_tag;
                explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
                v->end_in_tokens=old_value;
                goto restore_dic_variable;
            }
        } else if (fst2_tag->type==LEFT_CONTEXT_TAG) {
            /* If we have found a $* tag, we must reset the stack string and the
             * start position, so we save them */
            unichar* old_stack=u_strdup(s->str);
            int old_pos_token=element->m.start_pos_in_token;
            int old_pos_char=element->m.start_pos_in_char;
            int old_pos_letter=element->m.start_pos_in_letter;
            /* We set the new values */
            empty(s);
            element->m.start_pos_in_token=LEFT_CONTEXT_PENDING;
            /* We must reset last_tag to -1, because is not, we will have an
             * extra space on the left of the match */
            explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,-1,var_starts);
            /* And we restore previous values */
            element->m.start_pos_in_token=old_pos_token;
            element->m.start_pos_in_char=old_pos_char;
            element->m.start_pos_in_letter=old_pos_letter;
            u_strcpy(s,old_stack);
            free(old_stack);
            /* If we have a $* tag, we know that we can only have just one text tag
             * with special value -1 */
            goto restore_dic_variable;
        } else if (fst2_tag->type==BEGIN_POSITIVE_CONTEXT_TAG) {
            fatal_error("problem $[\n");
        }
    } else {
        current_tag=(TfstTag*)(infos->tfst->tags->tab[text_tags->n]);
        /* We update the last tag */
        last_tag=text_tags->n;
        /* If the current text tag is not a text independent one */
        /* If there are some pending $a( tags, we set them to the current tag */
        if (var_starts!=NULL) {
            struct list_pointer* ptr=(*var_starts);
            while (ptr!=NULL) {
                int* start=(int*)(ptr->pointer);
                (*start)=text_tags->n;
                ptr=ptr->next;
            }
        }
        int previous_start_token,previous_start_char;
        if (last_text_dependent_tfst_tag!=-1) {
            /* If the item is not the first, we must insert the original text that is
             * between the end of the previous merged text and the beginning of the
             * current one, typically to insert spaces */
            TfstTag* previous_tag=(TfstTag*)(infos->tfst->tags->tab[last_text_dependent_tfst_tag]);
            previous_start_token=previous_tag->m.end_pos_in_token;
            previous_start_char=previous_tag->m.end_pos_in_char;
            /* We start just after the end of the previous match */
            if (infos->tfst->token_content[previous_start_token][previous_start_char+1]!='\0') {
                /* If we were not at the end of the previous text token, we just
                 * inscrease the char position */
                previous_start_char++;
            } else {
                /* Otherwise, we go on the next token */
                previous_start_token++;
                previous_start_char=0;
            }
        } else {
            /* Otherwise, we start on the beginning of the current text tag */
            //error("current item=%d\n",text_tags->n);
            previous_start_token=current_tag->m.start_pos_in_token;
            previous_start_char=current_tag->m.start_pos_in_char;
        }
        /* Here we have to insert the text that is between current_start and current_end,
         * and then, the ouput of the fst2 transition */
        if (infos->output_policy==MERGE_OUTPUTS) {
            insert_text_interval_tfst(infos,s,previous_start_token,previous_start_char, current_tag->m.end_pos_in_token,current_tag->m.end_pos_in_char);
        }
    }
    /* Then, we go on the next item */
    struct list_pointer* ptr2=NULL;
    if (element->m.start_pos_in_token==LEFT_CONTEXT_PENDING && current_tag!=NULL) {
        /* A pending left context gets its real start from the current tag */
        element->m.start_pos_in_token=infos->tfst->offset_in_tokens+current_tag->m.start_pos_in_token;
        element->m.start_pos_in_char=current_tag->m.start_pos_in_char;
        element->m.start_pos_in_letter=current_tag->m.start_pos_in_letter;
    }
    explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag
            ,&ptr2 /* We have encountered a text dependent tag, so there is no
                    * more pending start tag like $a( */
            );
    element->m=saved_element;
    /* If there was a $* tag pending */
    free_list_pointer(ptr2);
    if (infos->ambiguous_output_policy==IGNORE_AMBIGUOUS_OUTPUTS) {
        /* If we don't want ambiguous outputs, then the first path is
         * enough for our purpose */
        goto restore_dic_variable;
    }
    text_tags=text_tags->next;
    remove_chars_from_output_variables(infos->output_variables,captured_chars);
    /* We reset to 0, because if we exit the while normally, we don't want to
     * modify output variables twice when reaching the 'restore_dic_variable'
     * label */
    captured_chars=0;
}
restore_dic_variable:
/* We redo this about output variables here, since we may have jumped here directly */
remove_chars_from_output_variables(infos->output_variables,captured_chars);
if (capture) {
    /* If we have a capture variable $:X$, we must restore the previous value
     * for this dictionary variable */
    set_dic_variable(name,old_value_dela,&(infos->dic_variables),0);
}
}
int cq_dlist_to_update_utf8(char *buf, size_t buflen, struct dlist list, struct drow row) { UChar *buf16; UErrorCode status = U_ZERO_ERROR; size_t num_left = list.fieldc; int rc = 0; if (num_left == 0) return 1; buf16 = calloc(buflen, sizeof(UChar)); if (buf16 == NULL) return -2; for (size_t i = 0; i < list.fieldc; ++i) { if (!strcmp(list.fieldnames[i], list.primkey)) { --num_left; continue; } UChar *ftemp = calloc(buflen, sizeof(UChar)); if (ftemp == NULL) { rc = -3; break; } UChar *vtemp = calloc(buflen, sizeof(UChar)); if (vtemp == NULL) { rc = -4; free(ftemp); break; } u_strFromUTF8(ftemp, buflen, NULL, list.fieldnames[i], strlen(list.fieldnames[i]), &status); if (!U_SUCCESS(status)) { rc = 2; free(ftemp); free(vtemp); break; } u_strFromUTF8(vtemp, buflen, NULL, row.values[i], strlen(row.values[i]), &status); if (!U_SUCCESS(status)) { rc = 3; free(ftemp); free(vtemp); break; } bool isstr = false; for (int32_t j = 0; j < u_strlen(vtemp); ++j) if (!isdigit(vtemp[j])) isstr = true; u_strcat(buf16, ftemp); u_strcat(buf16, u"="); if (isstr) u_strcat(buf16, u"'"); u_strcat(buf16, vtemp); if (isstr) u_strcat(buf16, u"'"); free(ftemp); free(vtemp); if (--num_left > 0) u_strcat(buf16, u","); } u_strToUTF8(buf, buflen, NULL, buf16, u_strlen(buf16), &status); if (!U_SUCCESS(status)) rc = 4; free(buf16); return rc; }
//
// this function explores the dictionary to decompose the word mot
//
/*
 * Recursively walks the compressed dictionary automaton 'tableau_bin' to
 * decompose a word into affix + component sequences, applying composition
 * rules between consecutive components.
 *
 * adresse: byte offset of the current automaton node in 'tableau_bin'
 * current_component: buffer receiving the simple word currently being read
 * pos_in_current_component: write position inside 'current_component'
 * original_word: the complete word being analyzed (never modified)
 * remaining_word/pos_in_remaining_word: the part of the word left to explore
 * decomposition: analysis built so far, components joined by " +++ "
 * lemma_prefix: lemma text accumulated from previously read components
 * L: output — every successful decomposition is unshifted onto this list
 * n_decomp: number of components read so far (1 for the first component)
 * rule_list_called/dic_entr_called: composition rules and dictionary entry
 *     of the PREVIOUS component (0 on the initial call)
 * prefix/suffix: per-INF-index flags marking valid left/right affix entries
 *
 * NOTE(review): unlike the sibling explore_state/explore_state_german
 * functions, 't = adresse+5' is assigned INSIDE the
 * 'pos_in_current_component >= 1' branch, so a terminal node reached with
 * an empty component would leave t==0 for the transition loop below —
 * presumably unreachable in practice (the root is not terminal); verify.
 */
void explore_state (int adresse, unichar* current_component, int pos_in_current_component, const unichar* original_word, const unichar* remaining_word, int pos_in_remaining_word, const unichar* decomposition, const unichar* lemma_prefix, struct decomposed_word_list** L, int n_decomp, struct rule_list* rule_list_called, const struct dela_entry* dic_entr_called, const unsigned char* tableau_bin, const struct INF_codes* inf_codes, const bool* prefix,const bool* suffix,const Alphabet* alphabet, U_FILE* debug_file,struct utags UTAG, vector_ptr* rules,vector_ptr* entries) {
/* Node header: high bit clear => terminal state; low 15 bits = transition count */
int c = tableau_bin[adresse]*256+tableau_bin[adresse+1];
int index;
int t = 0;
if ( !(c&32768) ) { // if we are in a terminal state
/* 3-byte INF code index follows the node header */
index = tableau_bin[adresse+2]*256*256+tableau_bin[adresse+3]*256+tableau_bin[adresse+4];
current_component[pos_in_current_component] = '\0';
if (pos_in_current_component >= 1) { // go on if word length equals zero
#if DDEBUG > 0
{ u_fprintf(debug_file,". %S\n",current_component); }
#endif
/* Try every INF code (i.e. every dictionary reading) of this component */
struct list_ustring* l = inf_codes->codes[index];
while ( l != 0 ) {
// int one_rule_already_matched = 0; // one rule matched each entry is enough
unichar entry[MAX_DICT_LINE_LENGTH];
uncompress_entry(current_component, l->string, entry);
#if DDEBUG > 0
{ u_fprintf(debug_file,": %S\n",entry); }
#endif
struct dela_entry* dic_entr = new_dic_entry(entry,entries);
unichar lemma_prefix_new[MAX_DICT_LINE_LENGTH];
struct rule_list* rule_list_new = 0;
unichar next_remaining_word[MAX_WORD_LENGTH];
struct rule_list* rule_list = 0;
/* Affix entries carry explicit composition rules; plain entries get a
 * default (empty) rule so the matching loops below always have one */
if (prefix_is_valid(index,prefix) || suffix_is_valid(index,suffix))
rule_list = parse_rules(entry,UTAG,rules);
else {
rule_list = new_rule_list(rules);
rule_list->rule = new_composition_rule();
}
// entry is now cleaned from rules for composition and derivation
// log decomposition of word
// ("cleaned" entries for better overview)
unichar decomposition_new[MAX_DICT_LINE_LENGTH];
u_strcpy(decomposition_new, decomposition);
if (decomposition_new[0] != '\0') u_strcat(decomposition_new, " +++ ");
u_strcat(decomposition_new, entry);
// loop on all composition_rules called
struct rule_list* called = rule_list_called;
do {
// while ( rule_list* called != 0 )
// if (one_rule_already_matched)
// break;
struct composition_rule* rule_called =
( called != 0 ) ? called->rule : 0; // may be undefined
// loop on all actual composition_rules
struct rule_list* r_list = rule_list;
while ( r_list != 0 ) {
// if (one_rule_already_matched)
// break;
struct composition_rule* rule = r_list->rule; // ever defined, see upwards
/* Case 1: the whole word is consumed and this component is a valid
 * rightmost one (the previous rule's 'after' and this rule's 'before'
 * both match), or the word is a simple entry with no affix reading */
if (remaining_word[pos_in_remaining_word]=='\0' && // we have explored the entire original word
((((dic_entr_called != 0) &&
composition_rule_matches_entry(rule->before, dic_entr_called,debug_file)) &&
((rule_called != 0) &&
composition_rule_matches_entry(rule_called->after, dic_entr,debug_file))) ||
// and we have a valid right component, i.e. rules match
((dic_entr_called == 0) && // or a simple entry (i.e. no prefix),
(! affix_is_valid(index,prefix,suffix))) // but no affix
) ) {
// one_rule_already_matched = 1;
unichar inflected[MAX_WORD_LENGTH];
unichar lemma[MAX_WORD_LENGTH];
unichar codes[MAX_DICT_LINE_LENGTH];
tokenize_DELA_line_into_3_parts(entry, inflected, lemma, codes);
/* generating new lexicon entry */
unichar new_dela_line[MAX_DICT_LINE_LENGTH];
/* word form */
u_strcpy(new_dela_line, original_word);
u_strcat(new_dela_line, ",");
/* lemma */
// lemmatize word
if (rule->then.repl[0] == '\0' // if there are no replace codes
&& (rule_called != 0 // either in actual nor in preceeding rule
&& rule_called->then.repl[0] == '\0')) {
u_strcat(new_dela_line, lemma_prefix);
unichar affix[MAX_WORD_LENGTH];
u_strcpy(affix, lemma);
substring_operation(affix, rule->then.substr_act);
if (rule_called != 0 && rule_called->then.undo_substr_next[0] != '\0')
substring_operation(affix, rule_called->then.undo_substr_next);
u_strcat(new_dela_line, affix);
} else {
u_strcat(new_dela_line, original_word);
}
/* codes */
u_strcat(new_dela_line,".");
if (rule->then.repl[0] != '\0') { // replacing codes by
u_strcat(new_dela_line,rule->then.repl); // suffix' ones
} else if (rule_called == 0) { // prohibit SGV
u_strcat(new_dela_line,codes);
} else if (rule_called->then.repl[0] != '\0') {
u_strcat(new_dela_line,rule_called->then.repl); // prefix' ones
}
// replace replaces all and blocks adding and deleting
// maybe this is not optimal ???
else {
if (rule_called->then.add[0] != '\0') { // add codes
if (!dic_entry_contain_gram_code(dic_entr, rule_called->then.add)) {
/* Insert the added grammatical code just before the first ':'
 * (i.e. before the inflectional part), or append it if there
 * is no inflectional part at all */
bool done = 0;
unichar tmp[MAX_COMPOSITION_RULE_LENGTH];
int j = 0;
for (int i = 0; codes[i] != '\0'; i++) {
if (codes[i] == ':' && (!done)) {
tmp[j++] = '+';
tmp[j] = '\0';
u_strcat(new_dela_line,tmp);
u_strcat(new_dela_line,rule_called->then.add);
done = 1;
j = 0;
}
tmp[j++] = codes[i];
}
tmp[j] = '\0';
u_strcat(new_dela_line,tmp);
if (!done) {
u_strcat(new_dela_line,"+");
u_strcat(new_dela_line,rule_called->then.add);
}
} else {
u_strcat(new_dela_line,codes);
}
} else if (rule_called->then.del[0] != '\0') { // delete codes
/* NOTE(review): deletion is accepted but intentionally emits no
 * codes at all — confirm this is the intended behavior */
} else {
u_strcat(new_dela_line,codes);
}
}
#if DDEBUG > 0
{ u_fprintf(debug_file,"= %S\n",new_dela_line); }
#endif
struct decomposed_word* wd = new_decomposed_word();
wd->n_parts = n_decomp;
u_strcpy(wd->decomposition,decomposition_new);
u_strcpy(wd->dela_line,new_dela_line);
struct decomposed_word_list* wdl=new_decomposed_word_list();
// unshift actual decomposition to decomposition list L
wdl->element = wd;
wdl->suivant = (*L);
(*L) = wdl;
} // end if end of word and valid right component
/* Case 2: the component is a valid left one — collect the rule so the
 * rest of the word can be explored after the INF-code loop */
else if // beginning or middle of word: explore the rest of the original word
(prefix_is_valid(index,prefix) && check_is_valid(UTAG.PREFIX, dic_entr) &&
// but only if the current component was a valid left one
// we go on with the next component
( (n_decomp == 1) // prefix as first part of a word: no rule matching
|| ( // prefix in the middle of a word
(rule_called && composition_rule_matches_entry(rule_called->after, dic_entr,debug_file)) &&
(dic_entr_called && composition_rule_matches_entry(rule->before, dic_entr_called,debug_file))
) )) {
// one_rule_already_matched = 1;
u_strcpy(lemma_prefix_new, lemma_prefix);
unichar affix[MAX_WORD_LENGTH];
u_strcpy(affix, current_component);
if (rule_called != 0 && rule_called->then.undo_substr_next[0] != '\0') {
substring_operation(affix, rule_called->then.undo_substr_next);
u_fprintf(debug_file,"yes\n");
}
substring_operation(affix, rule->then.substr_act);
u_strcat(lemma_prefix_new, affix);
/* Copy the unexplored tail of the word, then apply the rule's
 * substring operation on the next component if requested */
int j = 0;
for (int i = pos_in_remaining_word; remaining_word[i] != '\0'; i++) {
next_remaining_word[j++] = remaining_word[i];
}
next_remaining_word[j] = '\0';
if (rule->then.substr_next[0] != '\0') {
substring_operation(next_remaining_word, rule->then.substr_next);
#if DDEBUG > 0
{ u_fprintf(debug_file,"| %S|%S\n",affix,next_remaining_word); }
#endif
}
#if DDEBUG > 0
{ u_fprintf(debug_file,"- %S\n",entry); }
#endif
/* Append a copy of the matching rule to rule_list_new */
struct rule_list* tmp = new_rule_list(rules);
tmp->rule = new_composition_rule();
copy_composition_rule(tmp->rule, rule);
tmp->next = 0;
if ( rule_list_new == 0 ) {
rule_list_new = tmp;
}
else {
struct rule_list* trl = rule_list_new;
while ( trl->next != 0 ) {
trl=trl->next;
}
trl->next = tmp;
}
}
else {
// no valid suffix nor prefix
}
r_list = r_list->next;
} // while ( rule_list* r_list != 0 )
if ( called != 0 ) called = called->next;
} while ( called != 0 );
// prefix found, try to decomposite rest of word
if ( rule_list_new != 0 && dic_entr != 0 ) {
unichar next_component[MAX_WORD_LENGTH];
#if DDEBUG > 0
{ u_fprintf(debug_file,"> %S\n",next_remaining_word); }
#endif
/* Restart from the dictionary root (offset 4) on the remaining text */
explore_state(4, next_component, 0, original_word, next_remaining_word, 0, decomposition_new, lemma_prefix_new, L, n_decomp+1, rule_list_new, dic_entr, tableau_bin,inf_codes,prefix,suffix,alphabet,debug_file,UTAG,rules,entries);
}
else {
// free_dic_entry(dic_entr);
// free_rule_list(rule_list);
}
l = l->next;
} // end of while (token_list* l != 0)
t = adresse+5;
} // end of word length >= 1
} else { // not a final state
/* Clear the high bit to recover the transition count; transitions
 * start right after the 2-byte header */
c = c-32768;
t = adresse+2;
}
if (remaining_word[pos_in_remaining_word]=='\0') {
// if we have finished, we return
// free_dic_entry(dic_entr_called);
// free_rule_list(rule_list_called);
return;
}
// if not, we go on with the next letter
/* Each transition is 5 bytes: 2-byte character + 3-byte destination offset.
 * Case-insensitive comparison is done in both directions. */
for (int i=0;i<c;i++) {
if (is_equal_or_uppercase((unichar)(tableau_bin[t]*256+tableau_bin[t+1]),
remaining_word[pos_in_remaining_word], alphabet)
|| is_equal_or_uppercase(remaining_word[pos_in_remaining_word],
(unichar)(tableau_bin[t]*256+tableau_bin[t+1]), alphabet)) {
index = tableau_bin[t+2]*256*256+tableau_bin[t+3]*256+tableau_bin[t+4];
current_component[pos_in_current_component] = (unichar)(tableau_bin[t]*256+tableau_bin[t+1]);
explore_state(index, current_component, pos_in_current_component+1, original_word, remaining_word, pos_in_remaining_word+1, decomposition, lemma_prefix, L, n_decomp, rule_list_called, dic_entr_called, tableau_bin, inf_codes,prefix,suffix,alphabet,debug_file,UTAG,rules,entries);
}
t += 5;
}
}
/**
 * This explores the dictionary in order to decompose the given word into a valid
 * sequence of simple words. For instance, if we have the word "Sommervarmt", we will
 * first explore the dictionary and find that "sommer" is a valid left component that
 * corresponds to the dictionary entry "sommer,.N:msia". Then we will
 * look if the following word "varmt" is in the dictionary. It is
 * the case, with the entry "varmt,varm.A:nsio". As we are at the end of the word to
 * analyze and as "varmt" is a valid rightmost component, we will generate an entry
 * according to the following things:
 *
 * 'output_dela_line'="sommervarmt,sommervarm.A:nsio"
 * 'analysis'="sommer,.N:msia +++ varmt,varm.A:nsio"
 * 'number_of_components'=2
 *
 * Note that the initial "S" was put in lowercase, because the dictionary
 * contains "sommer" and not "Sommer". The lemma is obtained with
 * the lemma of the rightmost component (here "varm"), and the word inherits
 * from the grammatical information of its rightmost component.
 *
 * 'offset': offset of the current node in the binary array 'infos->bin'
 * 'current_component': string that represents the current simple word
 * 'pos_in_current_component': position in the string 'current_component'
 * 'word_to_analyze': the word to analyze
 * 'pos_in_word_to_analyze': position in the string 'word_to_analyze'
 * 'analysis': string that represents the analysis as a concatenation like
 *             "sommer,.N:msia +++ varmt,varm.A:nsio"
 * 'output_dela_line': string that contains the final DELA line. The lemma is
 *                     obtained by replacing the rightmost term of
 *                     the word to analyze by its lemma.
 * 'L': list of all analysis for the given word
 * 'number_of_components': number of components that compose the word.
 * 'infos': global settings.
 */
void explore_state(int offset,unichar* current_component,int pos_in_current_component,
const unichar* word_to_analyze,int pos_in_word_to_analyze,const unichar* analysis,
const unichar* output_dela_line,struct word_decomposition_list** L,
int number_of_components,struct norwegian_infos* infos) {
int c;
int index,t;
/* Node header: high bit clear => final state; low 15 bits = transition count */
c=infos->bin[offset]*256+infos->bin[offset+1];
if (!(c&32768)) {
/* If we are in a final state, we compute the index of the
 * corresponding INF line */
index=infos->bin[offset+2]*256*256+infos->bin[offset+3]*256+infos->bin[offset+4];
/* We can set the end of our current component */
current_component[pos_in_current_component]='\0';
/* We don't consider components with a length of 1 */
if (pos_in_current_component>1) {
if (word_to_analyze[pos_in_word_to_analyze]=='\0') {
/* If we have explored the entire original word */
if (get_value_index(current_component,infos->forbidden_words,DONT_INSERT)==NO_VALUE_INDEX) {
/* And if we do not have forbidden word in last position */
struct list_ustring* l=infos->inf->codes[index];
/* We will look at all the INF codes of the last component in order
 * to produce analysis */
while (l!=NULL) {
unichar dec[2000];
u_strcpy(dec,analysis);
if (dec[0]!='\0') {
/* If we have already something in the analysis (i.e. if
 * we have not a simple word), we insert the concatenation
 * mark before the entry to come */
u_strcat(dec," +++ ");
}
unichar entry[2000];
/* We get the dictionary line that corresponds to the current INF code */
uncompress_entry(current_component,l->string,entry);
/* And we add it to the analysis */
u_strcat(dec,entry);
unichar new_dela_line[2000];
/* We copy the current output DELA line that contains
 * the concatenation of the previous components */
u_strcpy(new_dela_line,output_dela_line);
/* Then we tokenize the DELA line that corresponds the current INF
 * code in order to obtain its lemma and grammatical/inflectional
 * information */
struct dela_entry* tmp_entry=tokenize_DELAF_line(entry,1);
/* We concatenate the inflected form of the last component to
 * the output DELA line */
u_strcat(new_dela_line,tmp_entry->inflected);
/* We put the comma that separates the inflected form and the lemma */
u_strcat(new_dela_line,",");
/* And we build the lemma in the same way than the inflected form */
u_strcat(new_dela_line,output_dela_line);
u_strcat(new_dela_line,tmp_entry->lemma);
/* We put the dot that separates the lemma and the grammatical/inflectional
 * information */
u_strcat(new_dela_line,".");
/* And finally we put the grammatical/inflectional information */
u_strcat(new_dela_line,tmp_entry->semantic_codes[0]);
int k;
for (k=1;k<tmp_entry->n_semantic_codes;k++) {
u_strcat(new_dela_line,"+");
u_strcat(new_dela_line,tmp_entry->semantic_codes[k]);
}
for (k=0;k<tmp_entry->n_inflectional_codes;k++) {
u_strcat(new_dela_line,":");
u_strcat(new_dela_line,tmp_entry->inflectional_codes[k]);
}
free_dela_entry(tmp_entry);
/*
 * Now we can build an analysis in the form of a word decomposition
 * structure, but only if the last component is a valid
 * right one or if it is a verb long enough, or if we find out
 * that the word to analyze was in fact a simple word
 * in the dictionary */
if (verb_of_more_than_4_letters(entry)
|| check_valid_right_component_for_one_INF_code(l->string)
|| number_of_components==1) {
/*
 * We set the number of components, the analysis, the actual
 * DELA line and information about */
struct word_decomposition* wd=new_word_decomposition();
wd->n_parts=number_of_components;
u_strcpy(wd->decomposition,dec);
u_strcpy(wd->dela_line,new_dela_line);
wd->is_a_valid_right_N=check_N_right_component(l->string);
wd->is_a_valid_right_A=check_A_right_component(l->string);
/* Then we add the decomposition word structure to the list that
 * contains all the analysis for the word to analyze */
struct word_decomposition_list* wdl=new_word_decomposition_list();
wdl->element=wd;
wdl->next=(*L);
(*L)=wdl;
}
/* We go on with the next INF code of the last component */
l=l->next;
}
}
/* If we are at the end of the word to analyze, we have nothing more to do */
return;
} else {
/* If we are not at the end of the word to analyze, we must
 * 1) look if the current component is a valid left one
 * 2) look if it is not a forbidden component and
 * 3) explore the rest of the original word */
if (infos->valid_left_component[index] &&
(get_value_index(current_component,infos->forbidden_words,DONT_INSERT)==NO_VALUE_INDEX)) {
/* If we have a valid component, we look first if we are
 * in the case of a word ending by a double letter like "kupp" */
if (pos_in_current_component>2 &&
(current_component[pos_in_current_component-1]==current_component[pos_in_current_component-2])) {
/* If we have such a word, we add it to the current analysis,
 * putting "+++" if the current component is not the first one */
unichar dec[2000];
u_strcpy(dec,analysis);
if (dec[0]!='\0') {
u_strcat(dec," +++ ");
}
/* In order to print the component in the analysis, we arbitrarily
 * take a valid left component among all those that are available
 * for the current component */
unichar sia_code[2000];
unichar entry[2000];
unichar line[2000];
get_first_valid_left_component(infos->inf->codes[index],sia_code);
uncompress_entry(current_component,sia_code,entry);
u_strcat(dec,entry);
u_strcpy(line,output_dela_line);
u_strcat(line,current_component);
/* As we have a double letter at the end of the word,
 * we must remove a character */
line[u_strlen(line)-1]='\0';
unichar temp[2000];
unichar dec_temp[2000];
u_strcpy(dec_temp,dec);
/* Then, we explore the dictionary in order to analyze the
 * next component. We start at the root of the dictionary
 * (offset=4) and we go back one position in the word to analyze.
 * For instance, if we have "kupplaner", we read "kupp" and then
 * we try to analyze "planner". */
explore_state(4,temp,0,word_to_analyze,pos_in_word_to_analyze-1,
dec_temp,line,L,number_of_components+1,infos);
}
/* Now, we try to analyze the component normally, even if
 * it was ended by double letter, because we can have things
 * like "oppbrent = opp,.ADV +++ brent,brenne.V:K" */
unichar dec[2000];
unichar line[2000];
u_strcpy(dec,analysis);
if (dec[0]!='\0') {
/* We add the "+++" mark if the current component is not the first one */
u_strcat(dec," +++ ");
}
unichar sia_code[2000];
unichar entry[2000];
/* In order to print the component in the analysis, we arbitrarily
 * take a valid left component among all those that are available
 * for the current component */
get_first_valid_left_component(infos->inf->codes[index],sia_code);
uncompress_entry(current_component,sia_code,entry);
u_strcat(dec,entry);
u_strcpy(line,output_dela_line);
u_strcat(line,current_component);
unichar temp[2000];
unichar dec_temp[2000];
u_strcpy(dec_temp,dec);
/* Then, we explore the dictionary in order to analyze the
 * next component. We start at the root of the dictionary
 * (offset=4). */
explore_state(4,temp,0,word_to_analyze,pos_in_word_to_analyze,
dec_temp,line,L,number_of_components+1,infos);
}
}
}
/* Once we have finished to deal with the current final dictionary node,
 * we go on because we may match a longer word */
t=offset+5;
} else {
/* If the node is not a final one, we compute the number of transitions by
 * removing the highest bit */
c=c-32768;
t=offset+2;
}
/* We examine each transition that goes out from the node
 * (each transition is 5 bytes: 2-byte character + 3-byte destination) */
for (int i=0;i<c;i++) {
if (is_equal_or_uppercase((unichar)(infos->bin[t]*256+infos->bin[t+1]),word_to_analyze[pos_in_word_to_analyze],infos->alphabet)) {
/* If the transition's letter is case compatible with the current letter of the
 * word to analyze, we follow it */
index=infos->bin[t+2]*256*256+infos->bin[t+3]*256+infos->bin[t+4];
current_component[pos_in_current_component]=(unichar)(infos->bin[t]*256+infos->bin[t+1]);
explore_state(index,current_component,pos_in_current_component+1,word_to_analyze,pos_in_word_to_analyze+1,
analysis,output_dela_line,L,number_of_components,infos);
}
/* We move the offset to the next transition */
t=t+5;
}
}
//
// this function explores the dictionary to decompose the word mot
//
/*
 * German variant of the dictionary-based word decomposition: recursively
 * walks the compressed dictionary 'tableau_bin' to split 'original_word'
 * into a sequence of components.
 *
 * adresse: offset of the current node in 'tableau_bin'
 * current_component/pos_in_current_component: component being read and its length
 * original_word/pos_in_original_word: word being analyzed and current position
 * decomposition: analysis built so far, components joined by " +++ "
 * dela_line: concatenation of the inflected forms of previous components
 *            (may be modified in place: its first letter is capitalized)
 * L: output — successful decompositions are unshifted onto this list
 * n_decomp: number of components read so far
 * left/right: per-INF-index flags marking valid left/right components
 */
void explore_state_german(int adresse,unichar* current_component,int pos_in_current_component,
const unichar* original_word,int pos_in_original_word,const unichar* decomposition,
unichar* dela_line,struct german_word_decomposition_list** L,int n_decomp,
const char* left,const char* right,
const struct INF_codes* inf_codes,const Alphabet* alphabet,
const unsigned char* tableau_bin) {
int c;
int index,t;
/* Node header: high bit clear => terminal state; low 15 bits = transition count */
c=tableau_bin[adresse]*256+tableau_bin[adresse+1];
if (!(c&32768)) {
// if we are in a terminal state
index=tableau_bin[adresse+2]*256*256+tableau_bin[adresse+3]*256+tableau_bin[adresse+4];
current_component[pos_in_current_component]='\0';
if (pos_in_current_component>1) {
// we don't consider words with a length of 1
if (original_word[pos_in_original_word]=='\0') {
// if we have explored the entire original word
if (right[index]) {
// and if we have a valid right component
struct list_ustring* l=inf_codes->codes[index];
while (l!=NULL) {
unichar dec[500];
u_strcpy(dec,decomposition);
if (dec[0]!='\0') {u_strcat(dec," +++ ");}
unichar entry[500];
uncompress_entry(current_component,l->string,entry);
u_strcat(dec,entry);
unichar new_dela_line[500];
struct dela_entry* tmp_entry=tokenize_DELAF_line(entry,1);
if (tmp_entry==NULL) {
/* If there was an error in the dictionary, we skip the entry */
l=l->next;
continue;
}
// change case if there is a prefix
// prefixes are downcase, nouns (=suffixes) uppercase:
// "investitionsObjekte" -> "Investitionsobjekte"
if ( u_strlen(dela_line) != 0 ) {
// capitalize dela_line
dela_line[0] = u_toupper((unichar) dela_line[0]);
// downcase lemma and inflected
tmp_entry->inflected[0] = u_tolower(tmp_entry->inflected[0]);
tmp_entry->lemma[0] = u_tolower(tmp_entry->lemma[0]);
}
/* Build "prefix+inflected,prefix+lemma.codes" */
u_strcpy(new_dela_line,dela_line);
u_strcat(new_dela_line,tmp_entry->inflected);
u_strcat(new_dela_line,",");
u_strcat(new_dela_line,dela_line);
u_strcat(new_dela_line,tmp_entry->lemma);
u_strcat(new_dela_line,".");
u_strcat(new_dela_line,tmp_entry->semantic_codes[0]);
int k;
for (k=1;k<tmp_entry->n_semantic_codes;k++) {
u_strcat(new_dela_line,"+");
u_strcat(new_dela_line,tmp_entry->semantic_codes[k]);
}
for (k=0;k<tmp_entry->n_inflectional_codes;k++) {
u_strcat(new_dela_line,":");
u_strcat(new_dela_line,tmp_entry->inflectional_codes[k]);
}
free_dela_entry(tmp_entry);
struct german_word_decomposition* wd=new_german_word_decomposition();
wd->n_parts=n_decomp;
u_strcpy(wd->decomposition,dec);
u_strcpy(wd->dela_line,new_dela_line);
if (check_valid_right_component_for_one_INF_code_german(l->string)) {
// if we got a correct right component (N-FF)
struct german_word_decomposition_list* wdl=new_german_word_decomposition_list();
wdl->element=wd;
wdl->suivant=(*L);
(*L)=wdl;
} else {
free_german_word_decomposition(wd);
}
l=l->next;
}
}
} else {
// else, we must explore the rest of the original word
if (left[index]) {
// but only if the current component was a valid left one
// we go on with the next component
unichar dec[2000];
unichar line[500];
u_strcpy(dec,decomposition);
if (dec[0]!='\0') {u_strcat(dec," +++ ");}
unichar sia_code[500];
unichar entry[500];
/* An arbitrary valid left reading is used to print the component
 * in the analysis */
get_first_sia_code_german(index,sia_code,inf_codes);
uncompress_entry(current_component,sia_code,entry);
u_strcat(dec,entry);
u_strcpy(line,dela_line);
u_strcat(line,current_component);
unichar temp[500];
/* Restart from the dictionary root (offset 4) on the rest of the word */
explore_state_german(4,temp,0,original_word,pos_in_original_word,
dec,line,L,n_decomp+1,left,right,inf_codes,alphabet,tableau_bin);
}
}
}
t=adresse+5;
} else {
/* Not a terminal state: clear the high bit to get the transition count */
c=c-32768;
t=adresse+2;
}
if (original_word[pos_in_original_word]=='\0') {
// if we have finished, we return
return;
}
// if not, we go on with the next letter
/* Each transition is 5 bytes: 2-byte character + 3-byte destination offset */
for (int i=0;i<c;i++) {
if (is_equal_or_uppercase((unichar)(tableau_bin[t]*256+tableau_bin[t+1]),original_word[pos_in_original_word],alphabet)
|| is_equal_or_uppercase(original_word[pos_in_original_word],(unichar)(tableau_bin[t]*256+tableau_bin[t+1]),alphabet)) {
index=tableau_bin[t+2]*256*256+tableau_bin[t+3]*256+tableau_bin[t+4];
current_component[pos_in_current_component]=(unichar)(tableau_bin[t]*256+tableau_bin[t+1]);
explore_state_german(index,current_component,pos_in_current_component+1,original_word,pos_in_original_word+1,
decomposition,dela_line,L,n_decomp,left,right,inf_codes,alphabet,tableau_bin);
}
t=t+5;
}
}
/**
 * Computes training by extracting statistics from a tagged corpus file.
 *
 * input_text: the tagged corpus, in one of two formats (auto-detected from
 *             the first line by check_corpus_entry):
 *             - Tagger format: one word per line, "word/tag"
 *             - Unitex tagged format: one sentence per line, tokens like
 *               "{word,lemma.tag}"
 * rforms_file: if non-NULL, receives raw-form statistics
 * iforms_file: if non-NULL, receives inflected-form statistics
 *
 * For each token pushed into the sliding context matrix, n-gram statistics
 * are accumulated into the corresponding hash tables; the tables are then
 * dumped as "(tuple,value)" dictionary lines, followed by a special
 * "CODE\tFEATURES,.value" line telling whether the file contains raw (0)
 * or inflected (1) form tuples.
 */
void do_training(U_FILE* input_text,U_FILE* rforms_file,U_FILE* iforms_file){
/* these two hash tables are respectively for simple and compound entries */
struct string_hash_ptr* rforms_table = NULL, *iforms_table = NULL;
if(rforms_file != NULL){
rforms_table = new_string_hash_ptr(200000);
}
if(iforms_file != NULL){
iforms_table = new_string_hash_ptr(200000);
}
/* we initialize a contextual matrix */
struct corpus_entry** context = new_context_matrix();
initialize_context_matrix(context);
unichar line[MAX_TAGGED_CORPUS_LINE];
/* check the format of the corpus: read the first line, then rewind */
long previous_file_position = ftell(input_text);
if(u_fgets(line,input_text) == EOF){
fatal_error("File is empty");
}
fseek(input_text,previous_file_position,SEEK_SET);
int format_corpus = check_corpus_entry(line);
if(format_corpus == 0){
// the corpus is in the Tagger format, one word per line where line=word/tag
while(u_fgets(line,input_text) !=EOF){
if(u_strlen(line) == 0){
/* An empty line marks a sentence boundary: reset the context */
initialize_context_matrix(context);
}
else{
corpus_entry* entry = new_corpus_entry(line);
/* A '_' inside the word (not leading) marks a compound:
 * split it into its simple words */
if(u_strchr(line,'_')!=NULL && line[0]!='_'){
corpus_entry** entries = extract_simple_words(entry);
free_corpus_entry(entry);
for(int i=0;entries[i]!=NULL;i++){
push_corpus_entry(entries[i],context);
add_statistics(context,rforms_table,iforms_table);
}
free(entries);
}
else {
push_corpus_entry(entry,context);
add_statistics(context,rforms_table,iforms_table);
}
}
}
}
else {
// the corpus is in the Unitex tagged format, one sentence per line where token={word,lemma.tag}
unichar *tmp,*s = (unichar*)malloc(sizeof(unichar)*(MAX_TAGGED_CORPUS_LINE));
int current_len,len;
unsigned int i;
while(u_fgets(line,input_text) != EOF){
current_len = 0, len = 0;
/* extract each token of the sentence */
for (;;) {
/* length of the current "{...}" token, computed from the distance
 * to the next '}' */
len = 1+u_strlen(line+current_len)-u_strlen(u_strchr(line+current_len,'}'));
tmp = u_strcpy_sized(s,len-1,line+current_len+1);
u_strcat(tmp,"\0");
/* "{S}" is the sentence delimiter: stop here */
if(u_strcmp(s,"S") == 0)
break;
//particular case: '\},\}.PONCT'
if(line[current_len+2] == '}'){
/* The token itself contains an escaped '}': skip to the '}'
 * that is followed by a space, which really closes the token */
int start = current_len+3;
do{
tmp = u_strchr(line+start,'}');
start += 1+u_strlen(line+start)-u_strlen(tmp);
} while(*(tmp+1) != ' ');
tmp = u_strcpy_sized(s,start-current_len-1,line+current_len+1);
u_strcat(tmp,"\0");
len += start-current_len-3;
}
/* format the {XX.YY} into standard tagger format, XX/YY */
/* NOTE(review): 8096 looks like a typo for 8192 — harmless as long as
 * tokens stay shorter, but verify */
unichar* newline = (unichar*)malloc(sizeof(unichar)*(8096));
if(u_strchr(s,',')[1] == ','){
/* the word form is itself a comma */
u_strcpy(newline,",");
}
else
u_strcpy_sized(newline,1+u_strlen(s)-u_strlen(u_strchr(s,',')),s);
u_sprintf(newline,"%S/%S\0",newline,s+u_strrchr(s,'.')+1);
/* spaces inside compound tokens are encoded as '_' */
for(i=0;i<u_strlen(newline);i++){
if(newline[i] == ' ') newline[i] = '_';
}
//create corpus entry
corpus_entry* entry = new_corpus_entry(newline);
if(u_strchr(newline,'_') != NULL && newline[0] != '_'){
corpus_entry** entries = extract_simple_words(entry);
free_corpus_entry(entry);
for(int j=0;entries[j]!=NULL;j++){
push_corpus_entry(entries[j],context);
add_statistics(context,rforms_table,iforms_table);
}
free(entries);
}
else {
push_corpus_entry(entry,context);
add_statistics(context,rforms_table,iforms_table);
}
free(newline);
current_len += len+1;
}
/* end of sentence: reset the context */
initialize_context_matrix(context);
}
free(s);
}
free_context_matrix(context);
/* we fill dictionary files with pairs (tuple,value) and then
 * we add a special line "CODE\tFEATURES,.value" in order to
 * specify whether the dictionary contains inflected or raw form tuples*/
unichar* str = u_strdup("");
if(rforms_table != NULL){
write_keys_values(rforms_table,rforms_table->hash->root,str,rforms_file);
u_fprintf(rforms_file,"%s,.%d\n","CODE\tFEATURES",0);
free_string_hash_ptr(rforms_table,NULL);
}
if(iforms_table != NULL){
write_keys_values(iforms_table,iforms_table->hash->root,str,iforms_file);
u_fprintf(iforms_file,"%s,.%d\n","CODE\tFEATURES",1);
free_string_hash_ptr(iforms_table,NULL);
}
free(str);
}
/** * This function produces a normalized version of 'input' and stores it into 'ouput'. * The following rules are applied in the given order: * * 1) If there is a { at the current position, we try to read a {S}, a {STOP} or * a tag token like {today,.ADV}. If we fail, we replace the { and the }, if any, * according to the replacement rules. Otherwise, we let the token unchanged. * 2) If there is one or more replacement rules that can apply to the current * position in 'input', then we apply the longest one. * 3) If we we find a separator (space, tab, new line) sequence, we replace it: * - by a new line if the sequence contains one and if 'carriage_return_policy' is * set to KEEP_CARRIAGE_RETURN; * - by a space otherwise. * 4) We copy the character that was read to the output. * * Note that 'replacements' is supposed to contain replacement rules for { and } */ int normalize(const char *fin, const char *fout, Encoding encoding_output, int bom_output, int mask_encoding_compatibility_input, int carriage_return_policy, const char *rules) { U_FILE* input; input = u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,fin,U_READ); if (input == NULL) { error("Cannot open file %s\n", fin); return 1; } U_FILE* output; output = u_fopen_creating_versatile_encoding(encoding_output,bom_output,fout,U_WRITE); if (output == NULL) { error("Cannot create file %s\n", fout); u_fclose(input); return 1; } struct string_hash* replacements=NULL; if(rules != NULL && rules[0]!='\0') { replacements=load_key_value_list(rules,mask_encoding_compatibility_input,'\t'); if (replacements==NULL) { error("Cannot load replacement rules file %s\n", rules); replacements=new_string_hash(); } } /* If there is no replacement rules file, we simulate one */ else { replacements=new_string_hash(); } /* If there is a replacement rule file, we ensure that there are replacement * rules for { and }. 
If not, we add our default ones, so that in any case, * we are sure to have rules for { and } */ unichar key[2]; unichar value[2]; u_strcpy(key,"{"); u_strcpy(value,"["); get_value_index(key,replacements,INSERT_IF_NEEDED,value); u_strcpy(key,"}"); u_strcpy(value,"]"); get_value_index(key,replacements,INSERT_IF_NEEDED,value); struct OUTBUF OutBuf; OutBuf.pos=0; unichar tmp[MAX_TAG_LENGTH]; //struct buffer* buffer=new_buffer_for_file(UNICHAR_BUFFER,input); long save_pos=ftell(input); fseek(input,0,SEEK_END); long file_size_input=ftell(input); fseek(input,save_pos,SEEK_SET); int line_buffer_size = (int)(((file_size_input+1) < MAX_LINE_BUFFER_SIZE) ? (file_size_input+1) : MAX_LINE_BUFFER_SIZE); unichar *line_read; line_read=(unichar*)malloc((line_buffer_size+0x10)*sizeof(unichar)); if (line_read==NULL) { fatal_alloc_error("normalize"); } /* We define some things that will be used for parsing the buffer */ static const unichar stop_chars[]= { '{', '}', 0 }; static const unichar forbidden_chars[]= { '\n', 0 }; static const unichar open_bracket[]= { '{', 0 }; static const unichar close_bracket[]= { '}', 0 }; static const unichar empty_string[]= { 0 }; int corrupted_file=0; int eof_found=0; /* First, we fill the buffer */ int lastline_was_terminated=0; while (eof_found==0) { int current_start_pos=0; int found_null=0; const unichar*buff=line_read; int result_read = 0; result_read = u_fgets_treat_cr_as_lf(line_read,line_buffer_size,input,1,&found_null); if ((found_null != 0) && (corrupted_file==0)) { corrupted_file=1; error("Corrupted text file containing NULL characters!\n"); error("They have been ignored by Normalize, but you should clean your text\n"); } if (result_read>0) if (line_read[result_read-1]==0x0d) line_read[result_read-1]='\n'; if (result_read==EOF) break; if (lastline_was_terminated != 0) while (current_start_pos<result_read) { if (buff[current_start_pos]!=' ' && buff[current_start_pos]!='\t' && buff[current_start_pos]!=0x0d && buff[current_start_pos]!='\n') 
break; current_start_pos++; } lastline_was_terminated = 0; if (result_read > 0) if ((buff[result_read-1]=='\n') || (buff[result_read-1]==0x0d)) lastline_was_terminated = 1; while (current_start_pos<result_read) { if ((lastline_was_terminated == 0) && (eof_found == 0) && (current_start_pos + MINIMAL_CHAR_IN_BUFFER_BEFORE_CONTINUE_LINE >= result_read)) { int i; int nb_to_keep = result_read-current_start_pos; for (i=0;i<nb_to_keep;i++) line_read[i]=line_read[current_start_pos+i]; int found_null_read=0; int result_read_continue = u_fgets_treat_cr_as_lf(line_read+nb_to_keep,line_buffer_size-nb_to_keep,input,1,&found_null_read); if ((found_null_read != 0) && (corrupted_file==0)) { corrupted_file=1; error("Corrupted text file containing NULL characters!\n"); error("They have been ignored by Normalize, but you should clean your text\n"); } if (result_read_continue>0) if (line_read[(result_read_continue+nb_to_keep)-1]==0x0d) line_read[(result_read_continue+nb_to_keep)-1]='\n'; lastline_was_terminated = 0; if (result_read_continue==EOF) eof_found = lastline_was_terminated = 1; if (result_read_continue > 0) if ((buff[(result_read_continue+nb_to_keep)-1]=='\n') || (buff[(result_read_continue+nb_to_keep)-1]==0x0d)) lastline_was_terminated = 1; result_read = nb_to_keep; current_start_pos = 0; if (result_read_continue > 0) result_read += result_read_continue; } if (buff[current_start_pos]=='{') { /* If we have a {, we try to find a sequence like {....}, that does not contain * new lines. If the sequence contains protected character, we want to keep them * protected. 
*/ int old_position=current_start_pos; /* If we don't increase the position, the parse will stop on the initial { */ current_start_pos++; tmp[0]='{'; int code=parse_string(buff,¤t_start_pos,&(tmp[1]),stop_chars,forbidden_chars,NULL); if (code==P_FORBIDDEN_CHAR || code==P_BACKSLASH_AT_END || buff[current_start_pos]!='}') { /* If we have found a new line or a {, or if there is * a backslash at the end of the buffer, or if we have reached the end * of the buffer, we assume that the initial * { was not a tag beginning, so we print the substitute of { */ WriteOufBuf(&OutBuf,replacements->value[get_value_index(open_bracket,replacements)],output, 0); /* And we rewind the current position after the { */ current_start_pos=old_position+1; } else { /* If we have read a sequence like {....}, we assume that there won't be * a buffer overflow if we add the } */ u_strcat(tmp,close_bracket); if (!u_strcmp(tmp,"{S}") || !u_strcmp(tmp,"{STOP}") || check_tag_token(tmp)) { /* If this is a special tag or a valid tag token, we just print * it to the output */ WriteOufBuf(&OutBuf,tmp,output, 0); current_start_pos++; } else { /* If we have a non valid tag token, we print the equivalent of { * and we rewind the current position after the { */ WriteOufBuf(&OutBuf,replacements->value[get_value_index(open_bracket,replacements)],output, 0); current_start_pos=old_position+1; } } } else { /* If we have a character that is not {, first we try to look if there * is a replacement to do */ int key_length; int index=get_longest_key_index(&buff[current_start_pos],&key_length,replacements); if (index!=NO_VALUE_INDEX) { /* If there is something to replace */ WriteOufBuf(&OutBuf,replacements->value[index],output, 0); current_start_pos=current_start_pos+key_length; } else { if (buff[current_start_pos]==' ' || buff[current_start_pos]=='\t' || buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) { /* If we have a separator, we try to read the longest separator sequence * that we can read. 
By the way, we note if it contains a new line */ int new_line=0; while (buff[current_start_pos]==' ' || buff[current_start_pos]=='\t' || buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) { /* Note 1: no bound check is needed, since an unichar buffer is always * ended by a \0 * * Note 2: we don't take into account the case of a buffer ended by * separator while it's not the end of file: that would mean * that the text contains something like MARGIN_BEFORE_BUFFER_END * contiguous separators. Such a text would not be a reasonable one. */ if (buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) { new_line=1; } current_start_pos++; } if (new_line && (carriage_return_policy==KEEP_CARRIAGE_RETURN)) { /* We print a new line if the sequence contains one and if we are * allowed to; otherwise, we print a space. */ WriteOufBuf(&OutBuf,'\n',output, 0); } else { WriteOufBuf(&OutBuf,' ',output, 0); } } else { /* If, finally, we have a normal character to normalize, we just print it */ WriteOufBuf(&OutBuf,buff[current_start_pos++],output, 0); } } } } } WriteOufBuf(&OutBuf,empty_string,output, 1); free(line_read); free_string_hash(replacements); u_fclose(input); u_fclose(output); return 0; }
/** * This function explore the normalization grammar to construct * the normalization tree. If the 'list' parameter is NULL, then we * are in the main call to the main graph; otherwise, we are within * a subgraph. */ void explore_normalization_fst2(Fst2* fst2,int current_state, struct normalization_tree* node, struct string_hash* tokens,const unichar* output, const Alphabet* alph,struct norm_info** list) { Fst2State state=fst2->states[current_state]; if (is_final_state(state)) { /* If we are in a final state, we behave differently if we are in a subgraph * or in the main call to the main graph. */ if (list!=NULL) { (*list)=insert_in_norm_info_list(output,node,(*list)); } else { node->outputs=sorted_insert(output,node->outputs); } } Transition* trans=state->transitions; unichar tmp[1024]; while (trans!=NULL) { if (trans->tag_number<0) { /* Case of a subgraph call */ struct norm_info* tmp_list=NULL; explore_normalization_fst2(fst2,fst2->initial_states[-(trans->tag_number)],node, tokens,output,alph,&tmp_list); while (tmp_list!=NULL) { /* We continue to explore the current graph */ explore_normalization_fst2(fst2,trans->state_number,tmp_list->node, tokens,tmp_list->output,alph,list); struct norm_info* z=tmp_list; tmp_list=tmp_list->next; free_norm_info(z); } } else { /* If we have a normal transition */ Fst2Tag tag=fst2->tags[trans->tag_number]; u_strcpy(tmp,output); u_strcat(tmp," "); if (tag->output!=NULL && tag->output[0]!='\0' && u_strcmp(tag->output,"<E>") && !only_spaces(tag->output)) { /* We append the output if it exists and is not epsilon */ u_strcat(tmp,tag->output); } if (!u_strcmp(tag->input,"<E>")) { /* If we have an epsilon transition, we go on in the fst2, but * we don't move in the normalization tree */ explore_normalization_fst2(fst2,trans->state_number,node,tokens,tmp,alph,list); } else { /* If we have a normal transition, we explore all the tokens that match it */ struct list_int* l=get_token_list_for_sequence(tag->input,alph,tokens); while 
(l!=NULL) { /* Then, we add a branch in the normalization tree for * each token. Note that it may introduce combinatory explosions * if the the fst2 matches large sequences */ struct normalization_tree_transition* trans_norm; trans_norm=get_transition(l->n,node->trans); if (trans_norm==NULL) { /* If the transition does not exist in the tree, we create it */ trans_norm=new_normalization_tree_transition(l->n,new_normalization_tree(),node->trans); node->trans=trans_norm; } explore_normalization_fst2(fst2,trans->state_number,trans_norm->node, tokens,tmp,alph,list); struct list_int* L=l; l=l->next; free(L); } } } trans=trans->next; } }
int composition_rule_matches_entry (const struct pattern* rule, const struct dela_entry* d,U_FILE* #if DDEBUG > 1 debug_file #endif ) { int ok = 1; // "ok = 0;" may be replaced by "return 0;" int flex_code_already_matched = 1; #if DDEBUG > 1 u_strcat(tmp, " trying "); #endif for (int i = 0; i < MAX_NUMBER_OF_COMPOSITION_RULES; i++) { if (rule[i].string[0] == '\0') break; // last rule reached: return 1 #if DDEBUG > 1 { if (rule[i].type == 'f') u_strcat(tmp, ":"); else if (rule[i].YesNo) u_strcat(tmp, "+"); else u_strcat(tmp, "-"); u_strcat(tmp, rule[i].string); } #endif if (rule[i].YesNo) { // rule '+' => pattern must be in entry, too if (rule[i].type == 'g') { if (dic_entry_contain_gram_code(d,rule[i].string)) continue; // rule matched, try next one ok = 0; } else if (rule[i].type == 'f') { if (dic_entry_contain_inflectional_code(d,rule[i].string)) { // rule matched, try next one, but mark flex codes as matched flex_code_already_matched = 2; continue; } else if (flex_code_already_matched == 2) { // no matter if any flex code already matched continue; } else { // no-matches before first match flex_code_already_matched = 0; } } } else { // rule '-' => pattern must not be in entry if (rule[i].type == 'g') { if (dic_entry_contain_gram_code(d,rule[i].string)) ok = 0; } else if (rule[i].type == 'f') { // implemented although not possible in rule syntax if (dic_entry_contain_inflectional_code(d,rule[i].string)) ok = 0; } } } #if DDEBUG > 1 { if (ok && flex_code_already_matched) u_fprintf(debug_file,"\n === matched "); else u_fprintf(debug_file,"\n === not matched "); if ( d->semantic_codes != 0 ) { for (int i = 0; i < d->n_semantic_codes; i++) { u_fprintf(debug_file,"+%S",d->semantic_codes[i]); } } if ( d->inflectional_codes != 0 ) { for (int i = 0; i < d->n_inflectional_codes; i++) { u_fprintf(debug_file,":%S",d->inflectional_codes[i]); } } u_fprintf(debug_file,"\n"); } #endif return (ok && flex_code_already_matched); }
///////////////////////////////////////////////////////////////////////////////// // Puts an inflected multi-word form 'f' corresponding to the DELAC entry 'dlc_entry' into the DELACF format ('entry'). // The resulting enntry may takes up to 'max' characters. // 'entry' almready has its space allocated. // Returns 1 on error, 0 otherwise. int DLC_format_form(struct l_morpho_t* pL_MORPHO,unichar* entry, int max, MU_f_T f, DLC_entry_T* dlc_entry, d_class_equiv_T* D_CLASS_EQUIV) { int l; //length of the entry //Inflected form l = u_strlen(f.form); if (l >= max) return 1; u_strcpy(entry, f.form); //Comma l++; if (l >= max) return 1; u_strcat(entry, ","); //Lemma int u; //index of the current unit in the lemma of the MW form for (u = 0; u < dlc_entry->lemma->no_units; u++) l = l + u_strlen(dlc_entry->lemma->units[u]->form); if (l >= max) return 1; for (u = 0; u < dlc_entry->lemma->no_units; u++) u_strcat(entry, dlc_entry->lemma->units[u]->form); //Full stop l++; if (l >= max) return 1; u_strcat(entry, "."); //Inflection paradigm //l = l + strlen(dlc_entry->lemma->paradigm); //if (l >= max) return 1; //u_strcat(entry,dlc_entry->lemma->paradigm); //Inflection class l = l + u_strlen(d_get_str_class(dlc_entry->lemma->cl, D_CLASS_EQUIV)); if (l >= max) return 1; u_strcat(entry, d_get_str_class(dlc_entry->lemma->cl, D_CLASS_EQUIV)); //Semantic codes int c; //index of the current semantic code for (c = 0; dlc_entry->codes[c]; c++) l = l + u_strlen(dlc_entry->codes[c]) + 1; if (l >= max) return 1; for (c = 0; dlc_entry->codes[c]; c++) { u_strcat(entry, "+"); u_strcat(entry, dlc_entry->codes[c]); } //Inflection features unichar* feat; //sequence of single-letter inflection features, e.g. 
'sIf' if (f.features && f.features->no_cats > 0) { feat = d_get_str_feat(pL_MORPHO,f.features); l = l + u_strlen(feat) + 1; //Place for a ':' and all features if (l >= max) return 1; u_strcat(entry, ":"); u_strcat(entry, feat); free(feat); } //Comment if (dlc_entry->comment && u_strlen(dlc_entry->comment)) { l = l + u_strlen(dlc_entry->comment);//Place for a '/' and the comment if (l >= max) return 1; u_strcat(entry, "/"); u_strcat(entry, dlc_entry->comment); } return 0; }