/** * Loads a compound word file, adding each word to the keywords. */ void load_compound_words(char* name,VersatileEncodingConfig* vec, struct string_hash_ptr* keywords) { U_FILE* f=u_fopen(vec,name,U_READ); if (f==NULL) return; Ustring* line=new_Ustring(256); Ustring* lower=new_Ustring(256); while (EOF!=readline(line,f)) { if (line->str[0]=='{') { /* We skip tags */ continue; } u_strcpy(lower,line->str); u_tolower(lower->str); int index=get_value_index(lower->str,keywords,INSERT_IF_NEEDED,NULL); if (index==-1) { fatal_error("Internal error in load_tokens_by_freq\n"); } KeyWord* value=(KeyWord*)keywords->value[index]; add_keyword(&value,line->str,1); keywords->value[index]=value; } free_Ustring(line); free_Ustring(lower); u_fclose(f); }
/** * Loads the initial keyword list from a tok_by_freq.txt file, * and turns all those tokens in a list whose primary key is the * lower case token: * The/20 THE/2 the/50 => the->(The/20 THE/2 the/50) */ struct string_hash_ptr* load_tokens_by_freq(char* name,VersatileEncodingConfig* vec) { U_FILE* f=u_fopen(vec,name,U_READ); if (f==NULL) return NULL; Ustring* line=new_Ustring(128); Ustring* lower=new_Ustring(128); struct string_hash_ptr* res=new_string_hash_ptr(1024); int val,pos; /* We skip the first line of the file, containing the number * of tokens */ if (EOF==readline(line,f)) { fatal_error("Invalid empty file %s\n",name); } while (EOF!=readline(line,f)) { if (1!=u_sscanf(line->str,"%d%n",&val,&pos)) { fatal_error("Invalid line in file %s:\n%S\n",name,line->str); } u_strcpy(lower,line->str+pos); u_tolower(lower->str); int index=get_value_index(lower->str,res,INSERT_IF_NEEDED,NULL); if (index==-1) { fatal_error("Internal error in load_tokens_by_freq\n"); } KeyWord* value=(KeyWord*)res->value[index]; res->value[index]=new_KeyWord(val,line->str+pos,value); } free_Ustring(line); free_Ustring(lower); u_fclose(f); return res; }
/** * This function moves outputs from final nodes to transitions leading to final nodes. */ static void subsequential_to_normal_transducer(struct dictionary_node* root, struct dictionary_node* node, struct string_hash* inf_codes, int pos,unichar* z, Ustring* normalizedOutput) { struct dictionary_node_transition* tmp=node->trans; int prefix_set=0; Ustring* prefix=new_Ustring(); while (tmp!=NULL) { z[pos]=tmp->letter; z[pos+1]='\0'; subsequential_to_normal_transducer(root,tmp->node,inf_codes,pos+1,z,normalizedOutput); /* First, if the destination state is final, we place its output on the output * of the current transition */ if (tmp->node->single_INF_code_list!=NULL) { //error("<%S>: output=<%S>\n",z,normalizedOutput->str); tmp->output=u_strdup(inf_codes->value[tmp->node->INF_code]); } if (normalizedOutput->len!=0) { /* Then, we add the normalized output obtained recursively, if any */ //error("<%S>: moving normalized output <%S>\n",z,normalizedOutput->str); if (tmp->output==NULL) { tmp->output=u_strdup(normalizedOutput->str); } else { tmp->output=(unichar*)realloc(tmp->output,sizeof(unichar)*(1+normalizedOutput->len+u_strlen(tmp->output))); } } if (!prefix_set) { prefix_set=1; u_strcpy(prefix,tmp->output); } else { get_longest_common_prefix(prefix,tmp->output); } tmp=tmp->next; } if (node==root || node->single_INF_code_list!=NULL) { /* If we are in the initial state or a final one, we let the transitions as they are, since * their outputs can not move more to the left */ z[pos]='\0'; free_Ustring(prefix); empty(normalizedOutput); return; } tmp=node->trans; while (tmp!=NULL) { //error("prefix removal: <%S> => ",tmp->output); remove_prefix(prefix->len,tmp->output); //error("<%S>\n",tmp->output); tmp=tmp->next; } z[pos]='\0'; u_strcpy(normalizedOutput,prefix); free_Ustring(prefix); }
/**
 * Prints the given spell-check hypotheses for 'word' to cfg->out, and,
 * depending on cfg->input_op, also prints the word itself to the
 * modified input file.
 *
 * Each hypothesis entry is re-tagged: its inflected form is replaced by
 * the original (possibly misspelled) 'word', a SP_ERR semantic code is
 * added, and a SP_INF=<original inflected form> code records the
 * suggested correction.
 */
static void display_hypotheses(unichar* word,SpellCheckHypothesis* list,SpellCheckConfig* cfg) {
Ustring* line=new_Ustring(128);
int printed=0;
while (list!=NULL) {
   printed=1;
   struct dela_entry* entry=tokenize_DELAF_line(list->entry);
   if (entry==NULL) {
      fatal_error("Internal error in display_hypotheses; cannot tokenize entry:\n%S\n",list->entry);
   }
   /* We swap the entry's inflected form for the misspelled word, keeping
    * the original form so it can be reported as SP_INF=... */
   unichar* inflected=entry->inflected;
   entry->inflected=u_strdup(word);
   entry->semantic_codes[entry->n_semantic_codes++]=u_strdup("SP_ERR");
   u_sprintf(line,"SP_INF=%S",inflected);
   entry->semantic_codes[entry->n_semantic_codes++]=u_strdup(line->str);
   dela_entry_to_string(line,entry);
   u_fprintf(cfg->out,"%S/score=%d\n",line->str,list->score);
   free(inflected);
   free_dela_entry(entry);
   list=list->next;
}
free_Ustring(line);
/* Now, we may have to print the word to the modified input file */
if (cfg->input_op=='M') {
   /* If we must keep matched words, then we print the word if it had matched */
   if (printed) u_fprintf(cfg->modified_input,"%S\n",word);
} else if (cfg->input_op=='U') {
   /* If we must keep unmatched words, then we print the word if it had NOT matched
    * (the previous comment here wrongly said "if it had matched") */
   if (!printed) u_fprintf(cfg->modified_input,"%S\n",word);
}
}
/** * This function takes a lexicographic tree with inf codes stored as * integer on nodes, and turns it into a real transducer where outputs * are stored on transitions. */ void move_outputs_on_transitions(struct dictionary_node* root,struct string_hash* inf_codes) { int pos=0; unichar z[0x400]; Ustring* normalizedOutput=new_Ustring(); subsequential_to_normal_transducer(root,root,inf_codes,pos,z,normalizedOutput); free_Ustring(normalizedOutput); }
void lemmatize(struct dela_entry* e,struct string_hash_ptr* keywords,Alphabet* alphabet) { unichar* lower=u_strdup(e->inflected); u_tolower(lower); KeyWord* k_inflected=(KeyWord*)get_value(lower,keywords); free(lower); if (k_inflected==NULL) return; Ustring* tmp=new_Ustring(64); u_sprintf(tmp,"%S.%S",e->lemma,e->semantic_codes[0]); KeyWord* k_lemma=(KeyWord*)get_value(tmp->str,keywords); if (k_lemma==NULL) { k_lemma=new_KeyWord(0,tmp->str,NULL); k_lemma->lemmatized=LEMMATIZED_KEYWORD; get_value_index(tmp->str,keywords,INSERT_IF_NEEDED,k_lemma); } /* Now, we look for all the case compatible tokens, and we add * their weights to the new lemmatized element */ while (k_inflected!=NULL) { if (k_inflected->sequence!=NULL && is_equal_or_uppercase(e->inflected,k_inflected->sequence,alphabet)) { /* We have a match */ k_lemma->weight+=k_inflected->weight; k_inflected->lemmatized=1; } k_inflected=k_inflected->next; } free_Ustring(tmp); }
// // this function reads words in the word file and try analyse them // void analyse_word_list(Dictionary* d, U_FILE* words, U_FILE* result, U_FILE* debug, U_FILE* new_unknown_words, const Alphabet* alph, const bool* prefix,const bool* suffix, struct utags UTAG, vector_ptr* rules, vector_ptr* entries) { u_printf("Analysing russian unknown words...\n"); int n=0; int words_done = 0; Ustring* s=new_Ustring(MAX_WORD_LENGTH); while (EOF!=readline(s,words)) { if (!analyse_word(s->str,d,debug,result,prefix,suffix,alph,UTAG,rules,entries)) { // if the analysis has failed, we store the word in the new unknown word file u_fprintf(new_unknown_words,"%S\n",s->str); } else { n++; } if ( (++words_done % 10000) == 0) u_printf("%d words done", words_done); } free_Ustring(s); u_printf("%d words decomposed as compound words\n",n); }
/**
 * Loads a match list. Match lists are supposed to have been
 * generated by the Locate program.
 *
 * The file starts with a "#X" header character that selects the output
 * policy (D=debug, M=merge, R/T/X=replace, I/other=ignore); if
 * 'output_policy' is non-NULL the detected policy is stored there, and
 * if 'header' is non-NULL the raw header char is stored there. Each
 * following line holds a match as "a.b.c d.e.f" token/char/letter
 * coordinates, optionally followed by a space and an output string.
 * Returns the (possibly empty) list of matches, allocated with
 * 'prv_alloc'.
 */
struct match_list* load_match_list(U_FILE* f,OutputPolicy *output_policy,unichar *header,Abstract_allocator prv_alloc) {
struct match_list* l=NULL;
struct match_list* end_of_list=NULL;
int start,end,start_char,end_char,start_letter,end_letter;
Ustring* line=new_Ustring();
char is_an_output;
/* We read the header */
unichar foo=0;
if (header==NULL) {
   /* The caller does not care about the header: use a local dummy */
   header=&foo;
}
u_fscanf(f,"#%C\n",header);
OutputPolicy policy;
switch(*header) {
case 'D': {
   policy=DEBUG_OUTPUTS;
   /* In debug mode, we have to skip the debug header */
   int n_graphs;
   u_fscanf(f,"%d\n",&n_graphs);
   while ((n_graphs--)>-1) {
      /* -1, because we also have to skip the #[IMR] line */
      readline(line,f);
   }
   break;
}
case 'M': policy=MERGE_OUTPUTS; break;
case 'R': case 'T': case 'X': policy=REPLACE_OUTPUTS; break;
case 'I': default: policy=IGNORE_OUTPUTS; break;
}
if (output_policy!=NULL) {
   (*output_policy)=policy;
}
/* Each match line: start.char.letter end.char.letter [output] */
while (6==u_fscanf(f,"%d.%d.%d %d.%d.%d",&start,&start_char,&start_letter,&end,&end_char,&end_letter)) {
   /* We look if there is an output or not, i.e. a space or a new line */
   int c=u_fgetc(f);
   if (c==' ') {
      /* If we have an output to read */
      readline(line,f);
      /* In debug mode, we have to stop at the char #1
       * (the debug info separator embedded in the output) */
      int i=-1;
      while (line->str[++i]!=1 && line->str[i]!='\0') {
      }
      line->str[i]='\0';
   }
   /* NOTE(review): if the policy implies outputs but a line has none,
    * 'line' still holds the previous line's content — presumably Locate
    * always writes an output in that case; verify against the producer */
   is_an_output=(policy!=IGNORE_OUTPUTS);
   if (l==NULL) {
      l=new_match(start,end,start_char,end_char,start_letter,end_letter,is_an_output?line->str:NULL,-1,NULL,prv_alloc);
      end_of_list=l;
   } else {
      /* Append at the tail to preserve file order */
      end_of_list->next=new_match(start,end,start_char,end_char,start_letter,end_letter,is_an_output?line->str:NULL,-1,NULL,prv_alloc);
      end_of_list=end_of_list->next;
   }
}
free_Ustring(line);
return l;
}
/** * This function explores the partial matches that constitute the given match in order to produce * one or all possible outputs, depending on infos->ambiguous_output_policy. * The output(s) is(are) then used to add matches to the infos->matches list. */ void explore_match_to_get_outputs(struct locate_tfst_infos* infos,struct tfst_match* m, struct tfst_simple_match_list* element) { /* As m is a reversed list, we first need to get its elements in the right order */ vector_ptr* items=new_vector_ptr(16); fill_vector(items,m); Ustring* s=new_Ustring(1024); /* In MERGE/REPLACE mode, we have to explore the combination of partial matches */ struct list_pointer* ptr=NULL; explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,0,s,-1,&ptr); free_list_pointer(ptr); free_Ustring(s); free_vector_ptr(items); }
/** * We remove every keyword that is tagged with the forbidden code. If * a forbidden keyword has several tags, all of them are removed: * * the,.DET + the,.XXX => all 'the' keywords are removed */ struct string_hash* compute_forbidden_lemmas(struct string_hash_ptr* keywords,unichar* code) { struct string_hash* hash=new_string_hash(DONT_USE_VALUES,DONT_ENLARGE); Ustring* tmp=new_Ustring(); for (int i=0;i<keywords->size;i++) { KeyWord* list=(KeyWord*)(keywords->value[i]); while (list!=NULL) { if (get_forbidden_keyword(list,code,tmp)) { get_value_index(tmp->str,hash); } list=list->next; } } free_Ustring(tmp); return hash; }
/**
 * Tests whether the given keyword has a forbidden lemma. For a
 * lemmatized keyword of the form XXX.YYY, only the XXX part is looked
 * up in 'lemmas'; otherwise the whole sequence is looked up.
 * Returns non-zero if the lemma is forbidden.
 */
int has_forbidden_lemma(KeyWord* list,struct string_hash* lemmas) {
if (list==NULL || list->sequence==NULL) return 0;
int dot=last_index_of(list->sequence,(unichar)'.');
if (dot==-1) {
   /* Not a lemmatized keyword: test the sequence itself */
   return (-1!=get_value_index(list->sequence,lemmas,DONT_INSERT));
}
/* Lemmatized keyword XXX.YYY: keep only the XXX part */
Ustring* lemma=new_Ustring(list->sequence);
truncate(lemma,dot);
int found=get_value_index(lemma->str,lemmas,DONT_INSERT);
free_Ustring(lemma);
return found!=-1;
}
/** * Loads the given DELAF and modifies the given keywords accordingly by * replacing any non removed token that appear in a DELAF entry * by its lemma. If there are ambiguities, several keywords are * generated. Doing that may merge keywords by adding their weights: * eats/2 + eaten/3 => eat/5 */ void filter_keywords_with_dic(struct string_hash_ptr* keywords,char* name, VersatileEncodingConfig* vec,Alphabet* alphabet) { U_FILE* f=u_fopen(vec,name,U_READ); if (f==NULL) { error("Cannot load file %s\n",name); return; } Ustring* line=new_Ustring(128); while (EOF!=readline(line,f)) { struct dela_entry* e=tokenize_DELAF_line(line->str); if (e==NULL) continue; lemmatize(e,keywords,alphabet); free_dela_entry(e); } free_Ustring(line); u_fclose(f); }
/**
 * Reads the start and end positions of each token stored in the file
 * produced by Tokenize's --output_offsets option. Each line holds three
 * integers (token number, start, end); only start and end are kept.
 * Returns NULL if the file cannot be opened.
 */
vector_uima_offset* load_uima_offsets(const VersatileEncodingConfig* vec,const char* name) {
U_FILE* f;
f=u_fopen(vec,name,U_READ);
if (f==NULL) {
   return NULL;
}
vector_int* v=new_vector_int();
Ustring* line=new_Ustring();
int a,b,c;
while (EOF!=readline(line,f)) {
   /* Bug fix: the u_sscanf return value was ignored, so a malformed line
    * caused b and c to be used uninitialized */
   if (3!=u_sscanf(line->str,"%d%d%d",&a,&b,&c)) {
      fatal_error("Invalid line in file %s:\n%S\n",name,line->str);
   }
   vector_int_add(v,b);
   vector_int_add(v,c);
}
free_Ustring(line);
u_fclose(f);
return (vector_uima_offset*)v;
}
/**
 * Loads the tags of the given .fst2 file. Returns 0 in case of success; -1 otherwise.
 * Note that the position in the file is unchanged after a successful call to this
 * function (on the error paths that return -1, the position is NOT restored,
 * as in the original code).
 */
int load_elag_fst2_tags(Elag_fst_file_in* fst) {
/* We backup the position in the file, and we come back at the
 * beginning of the file */
long fpos=ftell(fst->f);
rewind(fst->f);
/* Now, we go to the tags section, skipping all the automata */
unichar buf[MAXBUF];
int i=0;
int len;
while (i<fst->nb_automata) {
   if ((len=u_fgets(buf,MAXBUF,fst->f))==EOF) {
      error("load_fst_tags: %s: unexpected EOF\n",fst->name);
      return -1;
   }
   /* An automaton ends with a line starting with 'f' followed by whitespace */
   if (buf[0]=='f' && isspace(buf[1])) {
      i++;
   }
   /* If we have read the beginning of a long line, we skip the rest of the line */
   while ((len==MAXBUF-1) && (buf[len-1]!='\n')) {
      len=u_fgets(buf,MAXBUF,fst->f);
   }
}
Ustring* ustr=new_Ustring(64);
while (readline(ustr,fst->f) && ustr->str[0]!='f') {
   if (ustr->str[0]!='%' && ustr->str[0]!='@') {
      error("load_fst_tags: %s: bad symbol line: '%S'\n",fst->name,ustr->str);
      /* Bug fix: 'ustr' was leaked on this error path */
      free_Ustring(ustr);
      return -1;
   }
   /* +1 because we ignore the % or @ at the beginning of the line */
   symbol_t* symbol=load_grammar_symbol(fst->language,ustr->str+1);
   /* If 'symbol' is NULL, then an error message has already
    * been printed. Moreover, we want to associate NULL to the
    * string, so that we don't exit the function. Whatever it is,
    * we add the symbol to the symbols of the .fst2 */
   get_value_index(ustr->str+1,fst->symbols,INSERT_IF_NEEDED,symbol);
}
if (*ustr->str==0) {
   fatal_error("load_fst_tags: unexpected EOF\n");
}
free_Ustring(ustr);
/* We set back the position in the file */
fseek(fst->f,fpos,SEEK_SET);
return 0;
}
/** * Loads the given DELA into the given DELA tree. */ void load_DELA(const VersatileEncodingConfig* vec,const char* name,struct DELA_tree* tree) { U_FILE* f=u_fopen(vec,name,U_READ); if (f==NULL) { error("Cannot load dictionary %s\n",name); return; } u_printf("Loading %s...\n",name); Ustring* line=new_Ustring(4096); while (EOF!=readline(line,f)) { struct dela_entry* entry=tokenize_DELAF_line(line->str,1); if (entry!=NULL) { add_entry(tree,entry); } /* We don't need to free the entry, since it's done (if needed) * in the insertion function */ } free_Ustring(line); u_fclose(f); }
/**
 * Adds a transition to 'automaton'. A SYMBOL_DEF label is stored as the
 * state's default transition (at most one allowed); otherwise, one
 * outgoing transition is added per symbol in the 'label' list, using
 * the index of the symbol's string form in 'symbols' as the tag.
 */
void add_transition(SingleGraph automaton,struct string_hash_ptr* symbols,int from,
                    symbol_t* label,int to) {
if (label==SYMBOL_DEF) {
   if (automaton->states[from]->default_state!=-1) {
      fatal_error("add_transition: more than one default transition\n");
   }
   automaton->states[from]->default_state=to;
   return;
}
for (;label!=NULL;label=label->next) {
   if (label==SYMBOL_DEF) {
      fatal_error("add_transition: unexpected default transition\n");
   }
   /* We build a string representation of the symbol to avoid
    * duplicates in the value array */
   Ustring* repr=new_Ustring();
   symbol_to_str(label,repr);
   int symbol_index=get_value_index(repr->str,symbols,INSERT_IF_NEEDED,label);
   free_Ustring(repr);
   add_outgoing_transition(automaton->states[from],symbol_index,to);
}
}
// // this function try to analyse an unknown russian word // int analyse_word(const unichar* mot,Dictionary* d,U_FILE* debug,U_FILE* result_file, const bool* prefix,const bool* suffix,const Alphabet* alphabet, struct utags UTAG,vector_ptr* rules,vector_ptr* entries) { #if DDEBUG > 0 { u_fprintf(debug,"\n %S\n",mot); } #endif unichar decomposition[MAX_DICT_LINE_LENGTH]; unichar dela_line[MAX_DICT_LINE_LENGTH]; unichar correct_word[MAX_DICT_LINE_LENGTH]; decomposition[0]='\0'; dela_line[0]='\0'; correct_word[0]='\0'; struct decomposed_word_list* l = 0; Ustring* ustr=new_Ustring(); explore_state(d->initial_state_offset,correct_word,0,mot,mot,0,decomposition,dela_line,&l,1,0,0,d, prefix,suffix,alphabet,debug,UTAG,rules,entries,ustr,0); free_Ustring(ustr); free_all_dic_entries(entries); free_all_rule_lists(rules); if ( l == 0 ) { return 0; } struct decomposed_word_list* tmp = l; while ( tmp != NULL ) { if (debug!=NULL) { u_fprintf(debug,"%S = %S\n",mot,tmp->element->decomposition); } u_fprintf(result_file,"%S\n",tmp->element->dela_line); tmp=tmp->suivant; } free_decomposed_word_list(l); return 1; }
/**
 * This function reads a file that contains a list of Elag grammar names,
 * and it compiles them into the file 'outname'. However, if the result
 * automaton is too big, it will be saved in several automata inside
 * the output file (one "<name>" reference line per saved .elg part).
 * Returns 0 (all failures are fatal).
 */
int compile_elag_rules(char* rulesname,char* outname,
                       const VersatileEncodingConfig* vec,language_t* language) {
u_printf("Compilation of %s\n",rulesname);
U_FILE* f=NULL;
U_FILE* frules=u_fopen(ASCII,rulesname,U_READ);
if (frules==NULL) {
   fatal_error("Cannot open file '%s'\n",rulesname);
}
U_FILE* out=u_fopen(ASCII,outname,U_WRITE);
if (out==NULL) {
   fatal_error("cannot open file '%s'\n",outname);
}
/* Name of the file that contains the result automaton */
char fstoutname[FILENAME_MAX];
int nbRules=0;
char buf[FILENAME_MAX];
time_t start_time=time(0);
/* 'res' accumulates the intersection of all rules compiled so far;
 * 'A' is the automaton of the rule being processed */
Fst2Automaton* res=NULL;
Fst2Automaton* A;
int fst_number=0;
Ustring* ustr=new_Ustring();
char buf2[FILENAME_MAX];
char directory[FILENAME_MAX];
get_path(rulesname,directory);
while (af_fgets(buf,FILENAME_MAX,frules->f)) {
   /* We read one by one the Elag grammar names in the .lst file */
   chomp(buf);
   if (*buf=='\0') {
      /* If we have an empty line */
      continue;
   }
   /* Relative names are resolved against the .lst file's directory */
   if (!is_absolute_path(buf)) {
      strcpy(buf2,buf);
      sprintf(buf,"%s%s",directory,buf2);
   }
   u_printf("\n%s...\n",buf);
   remove_extension(buf);
   strcat(buf,".elg");
   if ((f=u_fopen(ASCII,buf,U_READ))==NULL) {
      /* If the .elg file doesn't exist, we create one by precompiling
       * the corresponding .fst2 grammar */
      remove_extension(buf);
      u_printf("Precompiling %s.fst2\n",buf);
      strcat(buf,".fst2");
      elRule* rule=new_elRule(buf,vec,language);
      if (rule==NULL) {
         fatal_error("Unable to read grammar '%s'\n",buf);
      }
      if ((A=compile_elag_rule(rule,language))==NULL) {
         fatal_error("Unable to compile rule '%s'\n",buf);
      }
      free_elRule(rule);
   } else {
      /* If there is already a .elg, we use it */
      u_fclose(f);
      A=load_elag_grammar_automaton(vec,buf,language);
      if (A==NULL) {
         fatal_error("Unable to load '%s'\n",buf);
      }
   }
   if (A->automaton->number_of_states==0) {
      error("Grammar %s forbids everything!\n",buf);
   }
   if (res!=NULL) {
      /* If there is already an automaton, we intersect it with the new one */
      SingleGraph tmp=res->automaton;
      res->automaton=elag_intersection(language,tmp,A->automaton,GRAMMAR_GRAMMAR);
      free_SingleGraph(tmp,NULL);
      free_Fst2Automaton(A,NULL);
      trim(res->automaton,NULL);
   } else {
      res=A;
   }
   nbRules++;
   if (res->automaton->number_of_states>MAX_GRAM_SIZE) {
      /* If the automaton is too large, we will split the grammar
       * into several automata: the accumulated result is saved now
       * and a fresh accumulator is started for the remaining rules */
      elag_minimize(res->automaton,1);
      sprintf(fstoutname,"%s-%d.elg",outname,fst_number++);
      u_fprintf(out,"<%s>\n",fstoutname);
      u_printf("Splitting big grammar in '%s' (%d states)\n",fstoutname,res->automaton->number_of_states);
      u_sprintf(ustr,"%s: compiled elag grammar",fstoutname);
      free(res->name);
      res->name=u_strdup(ustr->str);
      save_automaton(res,fstoutname,vec,FST_GRAMMAR);
      free_Fst2Automaton(res,NULL);
      res=NULL;
   }
}
if (res!=NULL) {
   /* We save the last automaton, if any */
   sprintf(fstoutname,"%s-%d.elg",outname,fst_number++);
   u_fprintf(out,"<%s>\n",fstoutname);
   u_printf("Saving grammar in '%s'(%d states)\n",fstoutname,res->automaton->number_of_states);
   elag_minimize(res->automaton,1);
   u_sprintf(ustr,"%s: compiled elag grammar",fstoutname);
   free(res->name);
   res->name=u_strdup(ustr->str);
   save_automaton(res,fstoutname,vec,FST_GRAMMAR);
   free_Fst2Automaton(res,free_symbol);
}
time_t end_time=time(0);
u_fclose(frules);
u_fclose(out);
free_Ustring(ustr);
u_printf("\nDone.\nElapsed time: %.0f s\n",difftime(end_time,start_time));
u_printf("\n%d rule%s from %s compiled in %s (%d automat%s)\n",
         nbRules,(nbRules>1)?"s":"",rulesname,outname,fst_number,
         (fst_number>1)?"a":"on");
return 0;
}
///////////////////////////////////////////////////////////////////////////////// // Inflect a DELAS/DELAC into a DELAF/DELACF. // On error returns 1, 0 otherwise. int inflect(char* DLC, char* DLCF, MultiFlex_ctx* p_multiFlex_ctx, Alphabet* alph, int error_check_status) { U_FILE *dlc, *dlcf; //DELAS/DELAC and DELAF/DELACF files unichar output_line[DIC_LINE_SIZE]; //current DELAF/DELACF line int l; //length of the line scanned DLC_entry_T* dlc_entry; MU_forms_T MU_forms; //inflected forms of the MWU int err; //Open DELAS/DELAC dlc = u_fopen(p_multiFlex_ctx->vec, DLC, U_READ); if (!dlc) { return 1; } //Open DELAF/DELACF dlcf = u_fopen(p_multiFlex_ctx->vec, DLCF, U_WRITE); if (!dlcf) { error("Unable to open file: '%s' !\n", DLCF); return 1; } //Inflect one entry at a time Ustring* input_line=new_Ustring(DIC_LINE_SIZE); l = readline(input_line,dlc); //Omit the final newline int flag = 0; //If a line is empty the file is not necessarily finished. //If the last entry has no newline, we should not skip this entry struct dela_entry* DELAS_entry; int semitic = 0; int current_line=0; while (l != EOF) { current_line++; DELAS_entry = is_strict_DELAS_line(input_line->str, alph); if (DELAS_entry != NULL) { /* If we have a strict DELAS line, that is to say, one with * a simple word */ if (error_check_status==ONLY_COMPOUND_WORDS) { error("Unexpected simple word forbidden by -c:\n%S\n",input_line); free_dela_entry(DELAS_entry); goto next_line; } SU_forms_T forms; SU_init_forms(&forms); //Allocate the space for forms and initialize it to null values char inflection_code[1024]; unichar code_gramm[1024]; /* We take the first grammatical code, and we extract from it the name * of the inflection transducer to use */ get_inflection_code(DELAS_entry->semantic_codes[0], inflection_code, code_gramm, &semitic); /* And we inflect the word */ // Fix bug#8 - "Inflection with Semitic Mode is not working anymore" p_multiFlex_ctx->semitic = semitic; // 
err=SU_inflect(DELAS_entry->lemma,inflection_code,&forms,semitic); if (DELAS_entry->n_filter_codes != 0) { p_multiFlex_ctx->n_filter_codes = DELAS_entry->n_filter_codes; p_multiFlex_ctx->filter_polarity = DELAS_entry->filter_polarity; p_multiFlex_ctx->filter_codes = DELAS_entry->filter_codes; err = SU_inflect(p_multiFlex_ctx,DELAS_entry->lemma, inflection_code,&forms); p_multiFlex_ctx->n_filter_codes=0; } else err = SU_inflect(p_multiFlex_ctx,DELAS_entry->lemma, inflection_code,&forms); #ifdef REMINDER_WARNING #ifdef __GNUC__ #warning mettre toutes les entrees sur une meme ligne #elif ((defined(__VISUALC__)) || defined(_MSC_VER)) #pragma message("warning : mettre toutes les entrees sur une meme ligne") #endif #endif /* Then, we print its inflected forms to the output */ for (int i = 0; i < forms.no_forms; i++) { unichar foo[1024]; if (p_multiFlex_ctx->korean!=NULL) { Hanguls_to_Jamos(forms.forms[i].form,foo,p_multiFlex_ctx->korean,1); } else { u_strcpy(foo,forms.forms[i].form); } u_fprintf(dlcf, "%S,%S.%S", foo/*forms.forms[i].form*/, DELAS_entry->lemma, code_gramm); /* We add the semantic codes, if any */ for (int j = 1; j < DELAS_entry->n_semantic_codes; j++) { u_fprintf(dlcf, "+%S", DELAS_entry->semantic_codes[j]); } if (forms.forms[i].local_semantic_code != NULL) { u_fprintf(dlcf, "%S", forms.forms[i].local_semantic_code); } if (forms.forms[i].raw_features != NULL && forms.forms[i].raw_features[0] != '\0') { u_fprintf(dlcf, ":%S", forms.forms[i].raw_features); } u_fprintf(dlcf, "\n"); } SU_delete_inflection(&forms); free_dela_entry(DELAS_entry); /* End of simple word case */ } else { u_fprintf(U_STDERR,"we no have a strict DELAS line\n"); /* If we have not a simple word DELAS line, we try to analyse it * as a compound word DELAC line */ if (error_check_status==ONLY_SIMPLE_WORDS) { error("Unexpected compound word forbidden by -s:\n%S\n",input_line); goto next_line; } if (p_multiFlex_ctx->config_files_status != CONFIG_FILES_ERROR) { /* If this is a compound word, 
we process it if and only if the * configuration files have been correctly loaded */ dlc_entry = (DLC_entry_T*) malloc(sizeof(DLC_entry_T)); if (!dlc_entry) { fatal_alloc_error("inflect"); } /* Convert a DELAC entry into the internal multi-word format */ err = DLC_line2entry(alph,p_multiFlex_ctx->pL_MORPHO,input_line->str, dlc_entry, &(p_multiFlex_ctx->D_CLASS_EQUIV)); if (!err) { //Inflect the entry MU_init_forms(&MU_forms); err = MU_inflect(p_multiFlex_ctx,dlc_entry->lemma,&MU_forms); if (!err) { int f; //index of the current inflected form //Inform the user if no form generated if (MU_forms.no_forms == 0) { error("No inflected form could be generated for "); DLC_print_entry(U_STDERR,p_multiFlex_ctx->pL_MORPHO,dlc_entry); } //Print inflected forms for (f = 0; f < MU_forms.no_forms; f++) { //Format the inflected form to the DELACF format err = DLC_format_form(p_multiFlex_ctx->pL_MORPHO,output_line, DIC_LINE_SIZE - 1, MU_forms.forms[f], dlc_entry, &(p_multiFlex_ctx->D_CLASS_EQUIV)); if (!err) { //Print one inflected form at a time to the DELACF file u_fprintf(dlcf, "%S\n", output_line); } } } MU_delete_inflection(&MU_forms); DLC_delete_entry(dlc_entry); } } else { /* We try to inflect a compound word whereas the "Morphology.txt" and/or * "Equivalences.txt" file(s) has/have not been loaded */ if (!flag) { /* We use a flag to print the error message only once */ error( "WARNING: Compound words won't be inflected because configuration files\n"); error(" have not been correctly loaded.\n"); flag = 1; } } } next_line: //Get next entry l = readline(input_line,dlc); if (l!=EOF) { if (input_line->str[0]=='\0') { /* If we find an empty line, then we go on */ goto next_line; } } } free_Ustring(input_line); u_fclose(dlc); u_fclose(dlcf); return 0; }
/**
 * Loads and returns an automaton from the given .fst2.
 * Returns NULL if there is no more automaton to load.
 *
 * Expected format: a "-<num> <name>" header line, then one line per
 * state ("t ..." for final states, ": ..." otherwise) made of
 * (tag,destination) integer pairs, closed by a line starting with 'f'.
 */
Fst2Automaton* load_automaton(Elag_fst_file_in* fstf) {
if (fstf->pos>=fstf->nb_automata) {
   return NULL;
}
Ustring* ustr=new_Ustring();
readline(ustr,fstf->f);
const unichar* p=ustr->str;
if (p[0]!='-') {
   fatal_error("load_automaton: %s: bad file format\n",fstf->name);
}
p++;
int i=u_parse_int(p,&p);
if (i!=fstf->pos+1) {
   /* We make sure that the automaton number is what it should be */
   fatal_error("load_automaton: %s: parsing error with line '%S' ('-%d ...' expected)\n",fstf->name,ustr->str,fstf->pos+1);
}
/* Now p points on the automaton name (we skip the space after the number) */
p++;
Fst2Automaton* A=new_Fst2Automaton(p);
while (readline(ustr,fstf->f) && ustr->str[0]!='f') {
   /* If there is a state to read */
   p=ustr->str;
   SingleGraphState state=add_state(A->automaton);
   if (*p=='t') {
      /* If necessary, we set the state final */
      set_final_state(state);
   }
   /* We put p on the first digit */
   while (*p!='\0' && !u_is_digit(*p)) {
      p++;
   }
   while (*p!='\0') {
      /* If there is a transition to read: first the tag number... */
      int tag_number=u_parse_int(p,&p);
      if (fstf->renumber!=NULL) {
         /* Tag numbers may have to be remapped to the file's symbol table */
         tag_number=fstf->renumber[tag_number];
      }
      while (*p==' ') {
         p++;
      }
      if (!u_is_digit(*p)) {
         fatal_error("load_automaton: %s: bad file format (line='%S')\n",fstf->name,ustr->str);
      }
      /* ...then the destination state number */
      int state_number=u_parse_int(p,&p);
      symbol_t* tmp=(symbol_t*)fstf->symbols->value[tag_number];
      if (tmp!=NULL) {
         /* If it is a good symbol (successfully loaded), we add transition(s);
          * a NULL symbol (that failed to load) is silently skipped */
         if (fstf->type!=FST_TEXT) {
            add_all_outgoing_transitions(state,tmp,state_number);
         } else {
            /* In a text automaton, we add one transition per element of
             * the symbol list. For instance, if we have:
             *
             *    tmp = "{domestique,.N:fs}" => "{domestique,.N:ms}" => NULL
             *
             * then we add two transitions. */
            add_all_outgoing_transitions(state,tmp,state_number);
         }
      }
      while (*p==' ') {
         p++;
      }
   }
}
if (*ustr->str=='\0') {
   /* The state list must be terminated by an 'f' line, not EOF */
   fatal_error("load_automaton: unexpected end of file\n");
}
if (A->automaton->number_of_states==0) {
   error("load_automaton: automaton with no state\n");
} else {
   set_initial_state(A->automaton->states[0]);
}
fstf->pos++;
free_Ustring(ustr);
return A;
}
/**
 * Saves the given automaton into the given .fst2 file.
 *
 * The tag-rendering function is chosen from fstf->type (text, grammar
 * or locate labels). For a Locate .fst2, some special tags (<PNC>,
 * <CHFA>/<NB>, <.>) are expanded by dedicated writers instead of being
 * printed as plain transitions.
 */
void fst_file_write(Elag_fst_file_out* fstf,const Fst2Automaton* A) {
Ustring* tag=new_Ustring();
/* Function used to render a symbol as a tag string, selected by file type */
void (*symbol_to_tag)(const symbol_t*,Ustring*)=NULL;
switch (fstf->type) {
case FST_TEXT:
   symbol_to_tag=symbol_to_text_label;
   break;
case FST_GRAMMAR:
   symbol_to_tag=symbol_to_grammar_label;
   break;
case FST_LOCATE:
   symbol_to_tag=symbol_to_locate_label;
   break;
default:
   fatal_error("fst_file_write: invalid fstf->type: %d\n",fstf->type);
}
/* We save the graph number and name */
u_fprintf(fstf->f,"-%d %S\n",fstf->nb_automata+1,A->name);
int index;
unichar deflabel[]={'<','d','e','f','>',0};
for (int q=0;q<A->automaton->number_of_states;q++) {
   SingleGraphState state=A->automaton->states[q];
   /* 't' marks a final state, ':' a non-final one */
   u_fprintf(fstf->f,"%C ",is_final_state(state)?'t':':');
   for (Transition* t=state->outgoing_transitions;t!=NULL;t=t->next) {
      if (t->tag_number==-1) {
         /* If we are in the case of an "EMPTY" transition created because
          * the automaton was emptied as trim time */
         u_strcpy(tag,"EMPTY");
      } else {
         symbol_t* symbol=t->label;
         symbol_to_tag(symbol,tag);
      }
      if (fstf->type==FST_LOCATE) {
         /* If we are saving a Locate .fst2, we have to perform
          * some special things for a few meta tags; anything else
          * falls through to the normal output below */
         if (u_strcmp(tag->str, "<PNC>") == 0) {
            PNC_trans_write(fstf, t->state_number);
         } else if (u_strcmp(tag->str, "<CHFA>") == 0
               || u_strcmp(tag->str, "<NB>") == 0) {
            CHFA_trans_write(fstf, t->state_number);
         } else if (u_strcmp(tag->str, "<.>") == 0) {
            LEXIC_trans_write(fstf, t->state_number);
         } else {
            goto normal_output;
         }
      } else {
         /* If we have a normal transition to print */
         normal_output:
         index=get_value_index(tag->str,fstf->labels);
         u_fprintf(fstf->f,"%d %d ",index,t->state_number);
      }
   }
   if (state->default_state!=-1) {
      if (fstf->type!=FST_GRAMMAR) {
         /* Default transitions only make sense in grammar automata */
         error("Unexpected <def> label in text/locate automaton\n");
      }
      index=get_value_index(deflabel,fstf->labels);
      u_fprintf(fstf->f,"%d %d ",index,state->default_state);
   }
   u_fputc('\n',fstf->f);
}
/* End-of-automaton marker */
u_fprintf(fstf->f,"f \n");
free_Ustring(tag);
fstf->nb_automata++;
}
/**
 * Explores all the partial matches to produce outputs in MERGE or REPLACE mode.
 *
 * Walks the list of matched items ('items') starting at 'current_item',
 * accumulating the produced output in 's'. When the end of the item list is
 * reached, the current output is attached to 'element' and saved in
 * infos->matches. The recursion explores every combination of tfst text tags,
 * saving and restoring all shared state (output string length, match
 * positions, input/output/dictionary variables) around each recursive call.
 *
 * If *var_starts!=NULL, it means that there are pending $var_start( tags
 * that wait for being taken into account when a text dependent tag is found.
 *
 * last_text_dependent_tfst_tag is the tfst tag number of the last text
 * dependent tag already processed, or -1 if there is none.
 */
void explore_match_for_MERGE_or_REPLACE_mode(struct locate_tfst_infos* infos,
                                             struct tfst_simple_match_list* element,
                                             vector_ptr* items,int current_item,Ustring* s,
                                             int last_text_dependent_tfst_tag,
                                             struct list_pointer* *var_starts) {
if (current_item==items->nbelems) {
   /* If we have finished, we can save the current output.
    * 'element->output' only borrows s->str for the duration of the call;
    * it is reset to NULL right after so 's' keeps ownership of the buffer */
   element->output=s->str;
   infos->matches=add_element_to_list(infos,infos->matches,element);
   element->output=NULL;
   return;
}
/* We save the length because it will be modified by the recursive calls;
 * restoring s->len and re-terminating the string undoes any appended output */
int len=s->len;
struct tfst_match* item=(struct tfst_match*)(items->tab[current_item]);
if (item==NULL) {
   fatal_error("Unexpected NULL item in explore_match_for_MERGE_mode\n");
}
if (item->debug_output!=NULL) {
   /* If we have a debug output, we deal with it: append it verbatim,
    * explore the rest of the items, then restore the output string */
   u_strcat(s,item->debug_output);
   explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_text_dependent_tfst_tag,var_starts);
   s->len=len;
   s->str[len]='\0';
   return;
}
unichar* output=infos->fst2->tags[item->fst2_transition->tag_number]->output;
unichar name[MAX_TRANSDUCTION_VAR_LENGTH];
int capture;
struct dela_entry* old_value_dela=NULL;
capture=is_capture_variable(output,name);
if (capture) {
   /* If we have a capture variable $:X$, we must save the previous value
    * for this dictionary variable, so it can be restored at
    * 'restore_dic_variable' on every exit path of this function */
   old_value_dela=clone_dela_entry(get_dic_variable(name,infos->dic_variables));
}
/* Backup of the match positions, restored after each recursive exploration */
Match saved_element=element->m;
struct list_int* text_tags=item->text_tag_numbers;
int captured_chars=0;
/* We explore all the text tags */
while (text_tags!=NULL) {
   /* First, we restore the output string */
   s->len=len;
   s->str[len]='\0';
   captured_chars=0;
   /* We deal with the fst2 tag output, if any */
   if (item->first_time) {
      /* We only have to process the output once, since it will have the same
       * effect on all tfst tags.
       *
       * Example: the fst2 tag "cybercrime/ZZ" may match the two tfst tags "cyber" and
       * "crime", but we must process the "ZZ" output only before the first tfst tag "cyber" */
      if (capture) {
         /* If we have a capture variable, then we have to check whether the tfst tag
          * is a tagged token or not */
         int tfst_tag_number=text_tags->n;
         int fst2_tag_number=item->fst2_transition->tag_number;
         if (!do_variable_capture(tfst_tag_number,fst2_tag_number,infos,name)) {
            goto restore_dic_variable;
         }
      }
      else if (!deal_with_output_tfst(s,output,infos,&captured_chars)) {
         /* We do not take into account matches with variable errors if the
          * process_output_for_tfst_match function has decided that backtracking
          * was necessary, either because of a variable error or because of a
          * $a.SET$ or $a.UNSET$ test */
         goto restore_dic_variable;
      }
   }
   int last_tag=last_text_dependent_tfst_tag;
   TfstTag* current_tag=NULL;
   if (text_tags->n==-1) {
      /* We have a text independent match */
      Fst2Tag fst2_tag=infos->fst2->tags[item->fst2_transition->tag_number];
      if (fst2_tag->type==BEGIN_OUTPUT_VAR_TAG) {
         /* If we have an output variable start $|a( */
         int var_index=get_value_index(fst2_tag->variable,infos->output_variables->variable_index);
         Ustring* old_value = new_Ustring();
         swap_output_variable_content(infos->output_variables, var_index, old_value);
         /* now old_value contains the backup */
         set_output_variable_pending(infos->output_variables,fst2_tag->variable);
         explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
         unset_output_variable_pending(infos->output_variables,fst2_tag->variable);
         /* restore the good content from the backup */
         swap_output_variable_content(infos->output_variables, var_index, old_value);
         free_Ustring(old_value);
         goto restore_dic_variable;
      } else if (fst2_tag->type==END_OUTPUT_VAR_TAG) {
         /* If we have an output variable end $|a) */
         unset_output_variable_pending(infos->output_variables,fst2_tag->variable);
         explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
         set_output_variable_pending(infos->output_variables,fst2_tag->variable);
         goto restore_dic_variable;
      } else if (fst2_tag->type==BEGIN_VAR_TAG) {
         /* If we have a variable start tag $a(, we add it to our
          * variable tag list */
         struct transduction_variable* v=get_transduction_variable(infos->input_variables,fst2_tag->variable);
         int old_value=v->start_in_tokens;
         /* We add the address of the start field to our list.
          * NOTE(review): the right-hand side guards against var_starts==NULL,
          * but the assignment target (*var_starts) would dereference NULL in
          * that same case — looks inconsistent; all visible callers pass a
          * non-NULL pointer, so the guard seems dead. TODO confirm */
         (*var_starts)=new_list_pointer(&(v->start_in_tokens),(var_starts==NULL)?NULL:(*var_starts));
         /* Then, we go on the next item */
         explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
         /* After the exploration, there are 2 cases:
          * 1) *var_starts is NULL: nothing to do
          * 2) *var_starts is not NULL: we reached the end of the items without finding any
          *    text dependent match, so we can free the list */
         free_list_pointer(*var_starts);
         (*var_starts)=NULL;
         v->start_in_tokens=old_value;
         /* If we have a $a( tag, we know that we can only have just one text tag
          * with special value -1 */
         goto restore_dic_variable;
      } else if (fst2_tag->type==END_VAR_TAG) {
         /* If we have found a $a) tag */
         if (last_tag==-1) {
            /* If we have no tfst tag to use, then it's a variable definition error,
             * and we have nothing special to do */
            explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
            goto restore_dic_variable;
         } else {
            /* We can set the end of the variable, it's 'last_tag' */
            struct transduction_variable* v=get_transduction_variable(infos->input_variables,fst2_tag->variable);
            int old_value=v->end_in_tokens;
            v->end_in_tokens=last_tag;
            explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
            v->end_in_tokens=old_value;
            goto restore_dic_variable;
         }
      } else if (fst2_tag->type==LEFT_CONTEXT_TAG) {
         /* If we have found a $* tag, we must reset the stack string and the
          * start position, so we save them */
         unichar* old_stack=u_strdup(s->str);
         int old_pos_token=element->m.start_pos_in_token;
         int old_pos_char=element->m.start_pos_in_char;
         int old_pos_letter=element->m.start_pos_in_letter;
         /* We set the new values */
         empty(s);
         element->m.start_pos_in_token=LEFT_CONTEXT_PENDING;
         /* We must reset last_tag to -1, because if not, we will have an
          * extra space on the left of the match */
         explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,-1,var_starts);
         /* And we restore previous values */
         element->m.start_pos_in_token=old_pos_token;
         element->m.start_pos_in_char=old_pos_char;
         element->m.start_pos_in_letter=old_pos_letter;
         u_strcpy(s,old_stack);
         free(old_stack);
         /* If we have a $* tag, we know that we can only have just one text tag
          * with special value -1 */
         goto restore_dic_variable;
      } else if (fst2_tag->type==BEGIN_POSITIVE_CONTEXT_TAG) {
         fatal_error("problem $[\n");
      }
   } else {
      current_tag=(TfstTag*)(infos->tfst->tags->tab[text_tags->n]);
      /* We update the last tag */
      last_tag=text_tags->n;
      /* If the current text tag is not a text independent one */
      /* If there are some pending $a( tags, we set them to the current tag */
      if (var_starts!=NULL) {
         struct list_pointer* ptr=(*var_starts);
         while (ptr!=NULL) {
            int* start=(int*)(ptr->pointer);
            (*start)=text_tags->n;
            ptr=ptr->next;
         }
      }
      int previous_start_token,previous_start_char;
      if (last_text_dependent_tfst_tag!=-1) {
         /* If the item is not the first, we must insert the original text that is
          * between the end of the previous merged text and the beginning of the
          * current one, typically to insert spaces */
         TfstTag* previous_tag=(TfstTag*)(infos->tfst->tags->tab[last_text_dependent_tfst_tag]);
         previous_start_token=previous_tag->m.end_pos_in_token;
         previous_start_char=previous_tag->m.end_pos_in_char;
         /* We start just after the end of the previous match */
         if (infos->tfst->token_content[previous_start_token][previous_start_char+1]!='\0') {
            /* If we were not at the end of the previous text token, we just increase
             * the char position */
            previous_start_char++;
         } else {
            /* Otherwise, we go on the next token */
            previous_start_token++;
            previous_start_char=0;
         }
      } else {
         /* Otherwise, we start on the beginning of the current text tag */
         //error("current item=%d\n",text_tags->n);
         previous_start_token=current_tag->m.start_pos_in_token;
         previous_start_char=current_tag->m.start_pos_in_char;
      }
      /* Here we have to insert the text that is between current_start and current_end,
       * and then, the output of the fst2 transition */
      if (infos->output_policy==MERGE_OUTPUTS) {
         insert_text_interval_tfst(infos,s,previous_start_token,previous_start_char,
                 current_tag->m.end_pos_in_token,current_tag->m.end_pos_in_char);
      }
   }
   /* Then, we go on the next item */
   struct list_pointer* ptr2=NULL;
   if (element->m.start_pos_in_token==LEFT_CONTEXT_PENDING && current_tag!=NULL) {
      /* A pending $* left context is resolved here: the match now starts at
       * the current text tag's position */
      element->m.start_pos_in_token=infos->tfst->offset_in_tokens+current_tag->m.start_pos_in_token;
      element->m.start_pos_in_char=current_tag->m.start_pos_in_char;
      element->m.start_pos_in_letter=current_tag->m.start_pos_in_letter;
   }
   explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag
         ,&ptr2 /* We have encountered a text dependent tag, so there is no
                 * more pending start tag like $a( */
         );
   /* Restore the match positions modified above */
   element->m=saved_element;
   /* If there was a $* tag pending */
   free_list_pointer(ptr2);
   if (infos->ambiguous_output_policy==IGNORE_AMBIGUOUS_OUTPUTS) {
      /* If we don't want ambiguous outputs, then the first path is
       * enough for our purpose */
      goto restore_dic_variable;
   }
   text_tags=text_tags->next;
   remove_chars_from_output_variables(infos->output_variables,captured_chars);
   /* We reset to 0, because if we exit the while normally, we don't want to
    * modify output variables twice when reaching the 'restore_dic_variable'
    * label */
   captured_chars=0;
}
restore_dic_variable:
/* We redo this about output variables here, since we may have jumped here directly */
remove_chars_from_output_variables(infos->output_variables,captured_chars);
if (capture) {
   /* If we have a capture variable $:X$, we must restore the previous value
    * for this dictionary variable */
   set_dic_variable(name,old_value_dela,&(infos->dic_variables),0);
}
}