/**
 * Allocates and returns a compound word tree node, with every field
 * set to its empty default (no patterns, no transitions, one reference).
 */
struct DLC_tree_node* new_DLC_tree_node() {
struct DLC_tree_node* node=(struct DLC_tree_node*)malloc(sizeof(struct DLC_tree_node));
if (node==NULL) {
   fatal_alloc_error("new_DLC_tree_node");
}
node->patterns=NULL;
node->number_of_patterns=0;
node->array_of_patterns=NULL;
node->transitions=NULL;
node->number_of_transitions=0;
node->destination_tokens=NULL;
node->destination_nodes=NULL;
/* The creator holds the first reference */
node->count_reference=1;
return node;
}
/**
 * Returns a clone of the element 'e' whose 'next' field is set to 'next'.
 * The output string, if any, is duplicated so that the clone owns its own
 * copy, and the 'pointed_by' counter of the associated tfst_match, if any,
 * is incremented.
 */
struct tfst_simple_match_list* new_tfst_simple_match_list(struct tfst_simple_match_list* e, struct tfst_simple_match_list* next) {
struct tfst_simple_match_list* clone;
clone=(struct tfst_simple_match_list*)malloc(sizeof(struct tfst_simple_match_list));
if (clone==NULL) {
   fatal_alloc_error("new_tfst_simple_match_list");
}
/* Shallow copy of all the fields of 'e' */
*clone=*e;
if (clone->output!=NULL) {
   /* A deep copy of the output is required */
   clone->output=u_strdup(clone->output);
}
if (clone->match!=NULL) {
   (clone->match->pointed_by)++;
}
clone->next=next;
return clone;
}
/**
 * Allocates and returns a composition rule whose string fields are all
 * initialized to the empty string.
 */
struct composition_rule* new_composition_rule() {
struct composition_rule* rule=(struct composition_rule*)malloc(sizeof(struct composition_rule));
if (rule==NULL) {
   fatal_alloc_error("new_composition_rule");
}
rule->before[0].string[0]='\0';
rule->after[0].string[0]='\0';
rule->then.add[0]='\0';
rule->then.del[0]='\0';
rule->then.repl[0]='\0';
rule->then.substr_act[0]='\0';
rule->then.substr_next[0]='\0';
rule->then.undo_substr_act[0]='\0';
rule->then.undo_substr_next[0]='\0';
return rule;
}
/**
 * Adds the given DELA entry to the given tree. If the entry is already
 * present in the tree, then it is freed. Otherwise, the tree takes
 * ownership of it, so IT MUST NOT BE FREED by the caller!
 */
void add_entry(struct DELA_tree* tree,struct dela_entry* entry) {
int index=get_value_index(entry->inflected,tree->inflected_forms);
if (index==tree->size) {
   /* The inflected form is new: a fresh (empty) entry list must be
    * created for it */
   if (index==tree->capacity) {
      /* The entry list array is full: double its capacity */
      tree->capacity=2*tree->capacity;
      tree->dela_entries=(struct dela_entry_list**)realloc(tree->dela_entries,tree->capacity*sizeof(struct dela_entry_list*));
      if (tree->dela_entries==NULL) {
         fatal_alloc_error("add_entry");
      }
   }
   tree->dela_entries[index]=NULL;
   (tree->size)++;
}
tree->dela_entries[index]=insert_if_not_present(entry,tree->dela_entries[index]);
}
/**
 * Adds a value in the value array without associating it with a unicode string.
 * Returns the index of this value.
 */
int add_value(void* value,struct string_hash_ptr* hash) {
if (hash->capacity==DONT_USE_VALUES) {
   fatal_error("Value array doesn't exist in add_value\n");
}
/* The new value goes into the first free slot, i.e. at the CURRENT size.
 * The index must be captured BEFORE incrementing the size: the previous
 * code took it after the increment, which skipped a slot and returned an
 * index one past the element actually stored (inconsistent with
 * get_value_index, which stores at the pre-increment index). */
int index=hash->hash->size;
(hash->hash->size)++;
hash->size=hash->hash->size;
if (hash->hash->size==hash->capacity) {
   /* We enlarge the 'value' array, doubling its capacity */
   hash->capacity=2*hash->capacity;
   hash->value=(void**)realloc(hash->value,sizeof(void*)*hash->capacity);
   if (hash->value==NULL) {
      fatal_alloc_error("add_value");
   }
}
hash->value[index]=value;
return index;
}
/**
 * This function concatenates B at the end of A. A is modified; B is left
 * untouched (its states and transitions are cloned into A).
 */
void elag_concat(language_t* language,SingleGraph A,SingleGraph B) {
int oldnb=A->number_of_states;
/* renumber[q] will hold the index in A of the copy of B's state #q */
int* renumber=(int*)malloc(B->number_of_states*sizeof(int));
if (renumber==NULL) {
   fatal_alloc_error("elag_concat");
}
int q;
/* We copy the states of B into A */
for (q=0;q<B->number_of_states;q++) {
   renumber[q]=A->number_of_states;
   add_state(A);
}
/* Then we clone each state's transitions, remapping target state numbers
 * through 'renumber', and carry over default states and finality */
for (q=0;q<B->number_of_states;q++) {
   A->states[renumber[q]]->outgoing_transitions=clone_transition_list(B->states[q]->outgoing_transitions,renumber,dup_symbol);
   A->states[renumber[q]]->default_state=(B->states[q]->default_state!=-1)?renumber[B->states[q]->default_state]:-1;
   if (is_final_state(B->states[q])) {
      set_final_state(A->states[renumber[q]]);
   }
}
/* Then, we concatenate A and B.
 * 1) We replace default transitions that outgo from B's initial states
 *    by explicit transitions */
struct list_int* initials=get_initial_states(B);
for (struct list_int* tmp=initials;tmp!=NULL;tmp=tmp->next) {
   explicit_default_transition(language,A,renumber[tmp->n]);
}
/* 2) Only the ORIGINAL states of A are scanned here (bound is 'oldnb'),
 *    since the copied B states must keep their own finality */
for (q=0;q<oldnb;q++) {
   if (is_final_state(A->states[q])) {
      /* Each final state of A becomes non final. Moreover, we have
       * to explicit its default transition, because if not, the concatenation
       * algorithm will modify the recognized language. */
      unset_final_state(A->states[q]);
      explicit_default_transition(language,A,q);
      /* The former final state receives a copy of the outgoing transitions
       * of every initial state of B, and becomes final again if that
       * initial state was final */
      for (struct list_int* tmp=initials;tmp!=NULL;tmp=tmp->next) {
         concat(&(A->states[q]->outgoing_transitions),clone_transition_list(A->states[renumber[tmp->n]]->outgoing_transitions,NULL,dup_symbol));
         if (is_final_state(A->states[renumber[tmp->n]])) {
            set_final_state(A->states[q]);
         }
      }
   }
}
free(renumber);
free_list_int(initials);
}
static int get_value_index_for_string_colon_string(const unichar* str1,const unichar* str2,struct string_hash* hash) { int value; unichar*allocated_buffer = NULL; unichar tmp_default[DEFAULT_TMP_GET_VALUE_INDEX_BUFFER_SIZE]; unichar*tmp=tmp_default; int nb_unichar_buffer=u_strlen(str1)+u_strlen(str2)+2; if (nb_unichar_buffer>DEFAULT_TMP_GET_VALUE_INDEX_BUFFER_SIZE) { tmp=allocated_buffer=(unichar*)malloc(sizeof(unichar*)*nb_unichar_buffer); if (allocated_buffer==NULL) { fatal_alloc_error("get_value_index_for_string_colon_string"); } } u_sprintf(tmp,"%S,%S",str1,str2); value=get_value_index(tmp,hash); if (allocated_buffer != NULL) { free(allocated_buffer); } return value; }
/**
 * For each color, a state of this color is chosen to represent the color.
 * The chosen state number is >= its color number. Aborts if some color
 * has no state at all.
 */
int* choose_states(int* color,int nbColors,int nbStates) {
int* representative=(int*)malloc(nbColors*sizeof(int));
if (representative==NULL) {
   fatal_alloc_error("choose_states");
}
for (int c=0;c<nbColors;c++) {
   /* We take the first state numbered >= c that has color c */
   int s=c;
   while (s<nbStates && color[s]!=c) {
      s++;
   }
   if (s==nbStates) {
      fatal_error("choose_states: color %d not found!\n",c);
   }
   representative[c]=s;
}
return representative;
}
/**
 * Inserts the graph number 'n' in the given condition list. If the list
 * is empty, a single condition containing 'n' is created; otherwise 'n'
 * is inserted into every existing condition.
 */
void insert_graph_in_conditions(int n,ConditionList* l) {
if (*l==NULL) {
   /* Empty list: create one condition that contains only 'n' */
   ConditionList cell=(ConditionList)malloc(sizeof(struct condition_list));
   if (cell==NULL) {
      fatal_alloc_error("insert_graph_in_conditions");
   }
   cell->next=NULL;
   cell->condition=new_list_int(n);
   *l=cell;
   return;
}
/* Non-empty list: add 'n' to each condition, keeping them sorted */
for (ConditionList cur=*l;cur!=NULL;cur=cur->next) {
   cur->condition=sorted_insert(n,cur->condition);
}
}
/**
 * Allocates, initializes and returns an integer array that contains the
 * elements of the given list. '*size' is set to the length of the array.
 * Passing an empty list sets '*size' to 0 and returns NULL.
 */
int* dump(struct list_int* list,int *size,Abstract_allocator prv_alloc) {
*size=0;
if (list==NULL) return NULL;
/* First pass: count the elements */
for (struct list_int* cur=list;cur!=NULL;cur=cur->next) {
   (*size)++;
}
int* array=(int*)malloc_cb((*size)*sizeof(int),prv_alloc);
if (array==NULL) {
   fatal_alloc_error("dump");
}
/* Second pass: copy the values in list order */
int i=0;
for (struct list_int* cur=list;cur!=NULL;cur=cur->next) {
   array[i++]=cur->n;
}
return array;
}
/**
 * Allocates, initializes and returns a new tfst_match with a reference
 * count of zero and no successor.
 */
struct tfst_match* new_tfst_match(int source_state_text, int dest_state_text, Transition* fst2_transition, int pos_kr, int text_tag_number, int first_time) {
struct tfst_match* result;
result=(struct tfst_match*)malloc(sizeof(struct tfst_match));
if (result==NULL) {
   fatal_alloc_error("new_tfst_match");
}
result->source_state_text=source_state_text;
result->dest_state_text=dest_state_text;
result->fst2_transition=fst2_transition;
result->pos_kr=pos_kr;
/* The tag list initially contains the single given tag number */
result->text_tag_numbers=sorted_insert(text_tag_number,NULL);
result->first_time=first_time;
result->pointed_by=0;
result->next=NULL;
return result;
}
/**
 * This function adds the given token to the given token tree, if not already
 * present. Then, it prepends the given transition to its transition list.
 */
void add_tag(unichar* token,int tag_number,int dest_state,struct fst2txt_token_tree* tree, Abstract_allocator prv_alloc) {
int index=get_value_index(token,tree->hash);
if (index==tree->size) {
   /* The token was not in the tree: a new (empty) transition list
    * must be created for it */
   if (tree->size==tree->capacity) {
      /* The transition array is full: double its capacity */
      int old_capacity=tree->capacity;
      tree->capacity=old_capacity*2;
      tree->transition_array=(Transition**)realloc_cb(tree->transition_array,old_capacity*sizeof(Transition*),tree->capacity*sizeof(Transition*),prv_alloc);
      if (tree->transition_array==NULL) {
         fatal_alloc_error("add_tag");
      }
   }
   (tree->size)++;
   tree->transition_array[index]=NULL;
}
/* We prepend the new transition, assuming it is not already in the list,
 * since that would mean the fst2 is not deterministic */
tree->transition_array[index]=new_Transition(tag_number,dest_state,tree->transition_array[index],prv_alloc);
}
/**
 * Allocates, initializes and returns an array that associates a color
 * (0 or 1) to each state of 'A', depending on its finality, making sure
 * that state #0 is always colored with 0. '*nbColors' is set to the
 * number of colors actually used (1 if all states share the same
 * finality; 2 otherwise).
 */
int* init_colors(SingleGraph A,int *nbColors) {
int* color=(int*)calloc(A->number_of_states,sizeof(int));
if (color==NULL) {
   fatal_alloc_error("init_colors");
}
/* Whichever finality state #0 has becomes color 0, so that the initial
 * state is guaranteed to get color 0 */
int finality_of_color_0=is_final_state(A->states[0])?1:0;
bool bicolor=false;
for (int e=0;e<A->number_of_states;e++) {
   int finality=is_final_state(A->states[e])?1:0;
   if (finality==finality_of_color_0) {
      color[e]=0;
   } else {
      color[e]=1;
      bicolor=true;
   }
}
(*nbColors)=bicolor?2:1;
return color;
}
/**
 * This function takes a fst2 and returns an array containing the corresponding
 * optimized states, one per fst2 state, each annotated with the number of
 * the graph it belongs to and its position within that graph.
 */
OptimizedFst2State* build_optimized_fst2_states(Variables* v,OutputVariables* output,Fst2* fst2, Abstract_allocator prv_alloc) {
OptimizedFst2State* optimized_states=(OptimizedFst2State*)malloc_cb(fst2->number_of_states*sizeof(OptimizedFst2State),prv_alloc);
if (optimized_states==NULL) {
   fatal_alloc_error("build_optimized_fst2_states");
}
/* Graph numbering starts at 1 in a fst2 */
int num_current_graph=1;
int pos_in_current_graph=0;
for (int i=0;i<fst2->number_of_states;i++) {
   optimized_states[i]=optimize_state(v,output,fst2,fst2->states[i],fst2->tags,prv_alloc);
   optimized_states[i]->graph_number=num_current_graph;
   optimized_states[i]->pos_transition_in_fst2=i;
   optimized_states[i]->pos_transition_in_graph=pos_in_current_graph++;
   /* Once all the states of the current graph have been consumed,
    * we move on to the next graph */
   if (pos_in_current_graph >= *((fst2->number_of_states_per_graphs)+num_current_graph)) {
      num_current_graph++;
      pos_in_current_graph=0;
   }
}
#ifdef AGGRESSIVE_OPTIMIZATION
/* We repeatedly remove useless lexical transitions until a fixed point is
 * reached, since emptying one graph may make transitions of another useless */
int n_graphs_emptied;
do {
   n_graphs_emptied=0;
   for (int i=1;i<=fst2->number_of_graphs;i++) {
      n_graphs_emptied+=remove_useless_lexical_transitions(fst2,i,optimized_states,prv_alloc);
   }
} while (n_graphs_emptied!=0);
/* Finally, we convert token lists to sorted array suitable for binary search */
for (int i=0;i<fst2->number_of_states;i++) {
   token_list_2_token_array(optimized_states[i],prv_alloc);
}
#endif // AGGRESSIVE_OPTIMIZATION
return optimized_states;
}
/**
 * Takes a string containing .bin names separated with semi-colons and
 * loads the corresponding dictionaries (each .bin with its companion
 * .inf file) into the locate parameters. A .bin whose .inf cannot be
 * loaded is discarded.
 */
void load_morphological_dictionaries(const char* morpho_dic_list,struct locate_parameters* p) {
if (morpho_dic_list==NULL || morpho_dic_list[0]=='\0') {
   return;
}
/* n names are separated by n-1 semi-colons */
p->n_morpho_dics=1+count_semi_colons(morpho_dic_list);
p->morpho_dic_bin=(const unsigned char**)malloc(p->n_morpho_dics*sizeof(const unsigned char*));
p->morpho_dic_bin_free=(struct BIN_free_info*)malloc(p->n_morpho_dics*sizeof(struct BIN_free_info));
p->morpho_dic_inf=(const struct INF_codes**)malloc(p->n_morpho_dics*sizeof(struct INF_codes*));
p->morpho_dic_inf_free=(struct INF_free_info*)malloc(p->n_morpho_dics*sizeof(struct INF_free_info));
if (p->morpho_dic_bin==NULL || p->morpho_dic_inf==NULL || p->morpho_dic_bin_free==NULL || p->morpho_dic_inf_free==NULL) {
   fatal_alloc_error("load_morphological_dictionaries");
}
char bin[FILENAME_MAX];
int pos;
for (int i=0; i<p->n_morpho_dics; i++) {
   /* Copy the next ';'-delimited name into 'bin'.
    * NOTE(review): there is no bound check against FILENAME_MAX here —
    * a longer name would overflow 'bin'; confirm inputs are pre-validated */
   pos=0;
   while (*morpho_dic_list!='\0' && *morpho_dic_list!=';') {
      bin[pos++]=*morpho_dic_list;
      morpho_dic_list++;
   }
   bin[pos]='\0';
   if (*morpho_dic_list==';') {
      morpho_dic_list++;
   }
   p->morpho_dic_bin[i]=load_abstract_BIN_file(bin,&(p->morpho_dic_bin_free[i]));
   p->morpho_dic_inf[i]=NULL;
   if (p->morpho_dic_bin[i]!=NULL) {
      /* The .inf file shares the .bin file's base name */
      char inf[FILENAME_MAX];
      remove_extension(bin,inf);
      strcat(inf,".inf");
      p->morpho_dic_inf[i]=load_abstract_INF_file(inf,&(p->morpho_dic_inf_free[i]));
      if (p->morpho_dic_inf[i]==NULL) {
         /* A .bin without its .inf is unusable, so we discard it */
         free_abstract_BIN(p->morpho_dic_bin[i],&(p->morpho_dic_bin_free[i]));
         p->morpho_dic_bin[i]=NULL;
      }
   }
}
}
/**
 * Loads the morphological dictionaries listed in 'morpho_dic_list',
 * prepending 'local_morpho_dic' when that file exists, so that the local
 * dictionary is always tried first.
 */
void load_morphological_dictionaries(const char* morpho_dic_list,struct locate_parameters* p, const char* local_morpho_dic) {
if (!fexists(local_morpho_dic)) {
   /* No local dictionary: just load the given list */
   load_morphological_dictionaries(morpho_dic_list,p);
   return;
}
if (morpho_dic_list==NULL || morpho_dic_list[0]=='\0') {
   /* We only have the local dictionary */
   return load_morphological_dictionaries(local_morpho_dic,p);
}
/* Both local and non-local dictionaries: build "local;list"
 * (+2 for the ';' and the trailing '\0') */
char* combined=(char*)malloc(strlen(local_morpho_dic)+strlen(morpho_dic_list)+2);
if (combined==NULL) {
   fatal_alloc_error("load_morphological_dictionaries");
}
sprintf(combined,"%s;%s",local_morpho_dic,morpho_dic_list);
load_morphological_dictionaries(combined,p);
free(combined);
}
/**
 * Allocates, initializes and returns a new match list element. The output
 * string, if any, is duplicated with the given allocator.
 */
struct match_list* new_match(int start,int end,int start_char,int end_char, int start_letter,int end_letter,unichar* output, int weight,struct match_list* next,Abstract_allocator prv_alloc) {
struct match_list* item=(struct match_list*)malloc_cb(sizeof(struct match_list),prv_alloc);
if (item==NULL) {
   fatal_alloc_error("new_match");
}
item->m.start_pos_in_token=start;
item->m.end_pos_in_token=end;
item->m.start_pos_in_char=start_char;
item->m.end_pos_in_char=end_char;
item->m.start_pos_in_letter=start_letter;
item->m.end_pos_in_letter=end_letter;
item->weight=weight;
item->output=(output==NULL)?NULL:u_strdup(output,prv_alloc);
item->next=next;
return item;
}
/**
 * Resizes the token array of the given block so that it can hold
 * 'new_number_of_elements' elements. The capacity is doubled as many
 * times as needed. If the array already has a sufficient capacity,
 * the function does nothing.
 */
void realloc_tct_hash_block(struct tct_hash_block* block,int new_number_of_elements, int token_array_base_memory_nb_item_size) {
if (block->size >=new_number_of_elements) return;
int factor=2;
while (block->size*factor < new_number_of_elements) {
   factor*=2;
}
if ((block->size) == token_array_base_memory_nb_item_size) {
   /* The current array is still the base memory embedded in the block,
    * so it cannot be realloc'ed: we allocate a fresh array and copy the
    * tokens. The old embedded array is intentionally not freed, since it
    * is part of the block itself. */
   if (factor < 4) factor=4;
   int* new_array=(int*)malloc(block->size*sizeof(int)*factor);
   if (new_array==NULL) {
      /* Fix: check BEFORE copying — the previous code wrote through the
       * pointer first and only tested it afterwards */
      fatal_alloc_error("realloc_tct_hash_block");
   }
   for (int i=0;i<block->length;i++) {
      new_array[i]=block->token_array[i];
   }
   block->token_array=new_array;
} else {
   block->token_array=(int*)realloc(block->token_array,block->size*sizeof(int)*factor);
   if (block->token_array==NULL) {
      fatal_alloc_error("realloc_tct_hash_block");
   }
}
block->size*=factor;
}
/**
 * Returns the index value associated to the given key, or -1 if the key
 * was neither found nor inserted (depending on 'insert_policy'). 'value'
 * will be associated to the given key only if the key was not already
 * present in the string_hash_ptr.
 */
int get_value_index(const unichar* key,struct string_hash_ptr* hash,int insert_policy,void* value) {
/* We remember the size before the lookup so that we can detect whether
 * an insertion took place */
int size=hash->hash->size;
int index=get_value_index_(key,0,hash->hash->root,hash->hash,insert_policy,NULL);
if (index==-1) {
   /* If the key was neither found nor inserted, we return -1 */
   return -1;
}
if (hash->hash->size!=size) {
   /* The key was inserted: we mirror the new size and store the
    * corresponding value into the 'value' array */
   hash->size=hash->hash->size;
   if (hash->hash->size==hash->capacity) {
      /* We enlarge the 'value' array, doubling its capacity */
      hash->capacity=2*hash->capacity;
      hash->value=(void**)realloc(hash->value,sizeof(void*)*hash->capacity);
      if (hash->value==NULL) {
         fatal_alloc_error("get_value_index\n");
      }
   }
   hash->value[index]=value;
}
return index;
}
/**
 * Entry point of the CheckDic program: checks every line of a DELAF/DELAS
 * dictionary and writes a CHECK_DIC.TXT report (statistics, characters
 * used, grammatical/semantic and inflectional codes) next to it.
 * Returns 0 on success, 1 on invalid arguments.
 */
int main_CheckDic(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}
int is_a_DELAF=-1;
int strict_unprotected=0;
int skip_path=0;
char alph[FILENAME_MAX]="";
Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
int val,index=-1;
int space_warnings=1;
struct OptVars* vars=new_OptVars();
/* Command line option parsing */
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_CheckDic,lopts_CheckDic,&index,vars))) {
   switch(val) {
   case 'f': is_a_DELAF=1; break;
   case 's': is_a_DELAF=0; break;
   case 'h': usage(); return 0;
   case 'r': strict_unprotected=1; break;
   case 't': strict_unprotected=0; break;
   case 'n': space_warnings=0; break;
   case 'p': skip_path=1; break;
   case 'a': if (vars->optarg[0]=='\0') {
                fatal_error("Empty alphabet argument\n");
             }
             strcpy(alph,vars->optarg);
             break;
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   /* NOTE(review): no break after ':' — presumably fatal_error does not
    * return, so the fall-through into '?' is unreachable; confirm */
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_CheckDic[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   }
   index=-1;
}
/* Exactly one dictionary file is expected after the options, and the
 * dictionary type (-f or -s) is mandatory */
if (is_a_DELAF==-1 || vars->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return 1;
}
U_FILE* dic=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ);
if (dic==NULL) {
   fatal_error("Cannot open dictionary %s\n",argv[vars->optind]);
}
Alphabet* alphabet0=NULL;
if (alph[0]!='\0') {
   alphabet0=load_alphabet(alph,1);
}
/* The report is written in the same directory as the dictionary */
char output_filename[FILENAME_MAX];
get_path(argv[vars->optind],output_filename);
strcat(output_filename,"CHECK_DIC.TXT");
U_FILE* out=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,output_filename,U_WRITE);
if (out==NULL) {
   u_fclose(dic);
   fatal_error("Cannot create %s\n",output_filename);
}
u_printf("Checking %s...\n",argv[vars->optind]);
/* NOTE(review): the buffer is CHECKDIC_LINE_SIZE long but the read loop
 * below is bounded by DIC_LINE_SIZE — confirm that
 * CHECKDIC_LINE_SIZE >= DIC_LINE_SIZE */
unichar line[CHECKDIC_LINE_SIZE];
int line_number=1;
/*
 * We declare and initialize an array in order to know which
 * letters are used in the dictionary.
 */
int i;
char* alphabet=(char*)malloc(sizeof(char)*MAX_NUMBER_OF_UNICODE_CHARS);
if (alphabet==NULL) {
   fatal_alloc_error("CheckDic's main");
}
memset(alphabet,0,sizeof(char)*MAX_NUMBER_OF_UNICODE_CHARS);
/*
 * We use two structures for the storage of the codes found in the
 * dictionary. Note that 'semantic_codes' is used to store both grammatical and
 * semantic codes.
 */
struct string_hash* semantic_codes=new_string_hash();
struct string_hash* inflectional_codes=new_string_hash();
struct string_hash* simple_lemmas=new_string_hash(DONT_USE_VALUES);
struct string_hash* compound_lemmas=new_string_hash(DONT_USE_VALUES);
int n_simple_entries=0;
int n_compound_entries=0;
/*
 * We read all the lines and check them.
 */
while (EOF!=u_fgets_limit2(line,DIC_LINE_SIZE,dic)) {
   if (line[0]=='\0') {
      /* If we have an empty line, we print a unicode error message
       * into the output file */
      u_fprintf(out,"Line %d: empty line\n",line_number);
   }
   else if (line[0]=='/') {
      /* If a line starts with '/', it is a comment line, so
       * we ignore it */
   }
   else {
      /* If we have a line to check, we check it according to the
       * dictionary type */
      check_DELA_line(line,out,is_a_DELAF,line_number,alphabet,semantic_codes,
            inflectional_codes,simple_lemmas,compound_lemmas,
            &n_simple_entries,&n_compound_entries,alphabet0,strict_unprotected);
   }
   /* At regular intervals, we display a message on the standard
    * output to show that the program is working */
   if (line_number%10000==0) {
      u_printf("%d lines read...\r",line_number);
   }
   line_number++;
}
u_printf("%d lines read\n",line_number-1);
u_fclose(dic);
/*
 * Once we have checked all the lines, we print some statistics
 * in the output file.
 */
u_fprintf(out,"-----------------------------------\n");
u_fprintf(out,"------------- Stats -------------\n");
u_fprintf(out,"-----------------------------------\n");
if (skip_path != 0) {
   char filename_without_path[FILENAME_MAX];
   remove_path(argv[vars->optind],filename_without_path);
   u_fprintf(out,"File: %s\n",filename_without_path);
}
else {
   u_fprintf(out,"File: %s\n",argv[vars->optind]);
}
u_fprintf(out,"Type: %s\n",is_a_DELAF?"DELAF":"DELAS");
u_fprintf(out,"%d line%s read\n",line_number-1,(line_number-1>1)?"s":"");
u_fprintf(out,"%d simple entr%s ",n_simple_entries,(n_simple_entries>1)?"ies":"y");
u_fprintf(out,"for %d distinct lemma%s\n",simple_lemmas->size,(simple_lemmas->size>1)?"s":"");
u_fprintf(out,"%d compound entr%s ",n_compound_entries,(n_compound_entries>1)?"ies":"y");
u_fprintf(out,"for %d distinct lemma%s\n",compound_lemmas->size,(compound_lemmas->size>1)?"s":"");
/*
 * We print the list of the characters that are used, with
 * their unicode numbers shown in hexadecimal. This can be useful
 * to detect different characters that are graphically identical
 * like 'A' (upper of latin 'a' or upper of greek alpha ?).
 */
u_fprintf(out,"-----------------------------------\n");
u_fprintf(out,"---- All chars used in forms ----\n");
u_fprintf(out,"-----------------------------------\n");
/* NOTE(review): 'r' and 'r2' are initialized but never used below —
 * they look like leftovers from an older printing scheme */
unichar r[4];
unichar r2[7];
r[1]=' ';
r[2]='(';
r[3]='\0';
r2[5]='\n';
r2[6]='\0';
for (i=0;i<MAX_NUMBER_OF_UNICODE_CHARS;i++) {
   if (alphabet[i]) {
      u_fprintf(out,"%C (%04X)\n",i,i);
   }
}
/*
 * Then we print the list of all grammatical and semantic codes used in the
 * dictionary. If a code contains a non ASCII character, a space or a tabulation,
 * we print a warning.
 */
u_fprintf(out,"-------------------------------------------------------------\n");
u_fprintf(out,"---- %3d grammatical/semantic code%s",semantic_codes->size,(semantic_codes->size>1)?"s used in dictionary ----\n":" used in dictionary -----\n");
u_fprintf(out,"-------------------------------------------------------------\n");
unichar comment[2000];
for (i=0;i<semantic_codes->size;i++) {
   /* We print the code, followed if necessary by a warning */
   u_fprintf(out,"%S",semantic_codes->value[i]);
   if (warning_on_code(semantic_codes->value[i],comment,space_warnings)) {
      u_fprintf(out," %S",comment);
   }
   u_fprintf(out,"\n");
}
/*
 * Finally, we print the list of inflectional codes,
 * with warnings in the case of non ASCII letters, spaces
 * or tabulations.
 */
u_fprintf(out,"-----------------------------------------------------\n");
u_fprintf(out,"---- %3d inflectional code%s",inflectional_codes->size,(inflectional_codes->size>1)?"s used in dictionary ----\n":" used in dictionary -----\n");
u_fprintf(out,"-----------------------------------------------------\n");
for (i=0;i<inflectional_codes->size;i++) {
   u_fprintf(out,"%S",inflectional_codes->value[i]);
   if (warning_on_code(inflectional_codes->value[i],comment,space_warnings)) {
      u_fprintf(out," %S",comment);
   }
   u_fprintf(out,"\n");
}
u_fclose(out);
free_OptVars(vars);
u_printf("Done.\n");
free(alphabet);
if (alphabet0!=NULL) {
   free_alphabet(alphabet0);
}
#if (defined(UNITEX_LIBRARY) || defined(UNITEX_RELEASE_MEMORY_AT_EXIT))
/* cleanup for no leak on library */
free_string_hash(semantic_codes);
free_string_hash(inflectional_codes);
free_string_hash(simple_lemmas);
free_string_hash(compound_lemmas);
#endif
return 0;
}
/**
 * This function analyzes the given Elag rule automaton to find
 * where the rule and constraint parts are. As a side effect, it builds
 * a fst2 grammar ("foo.fst2" => "foo-conc.fst2") that can be used by
 * the Locate program to match the <!> .... <!> .... <!> part of the rule.
 */
void split_elag_rule(elRule* rule, const VersatileEncodingConfig* vec,language_t* language) {
int c;
/* This array contains the numbers of the states that are pointed to by
 * middle '<=>' of the constraints */
int constraints[ELAG_MAX_CONSTRAINTS];
int nbConstraints=count_constraints(rule->automaton,constraints);
/* +1 because we have to count the <!> .... <!> .... <!> part of the rule */
rule->nbContexts=nbConstraints+1;
rule->contexts=(elContext*)malloc(rule->nbContexts*sizeof(elContext));
if (rule->contexts==NULL) {
   fatal_alloc_error("split_elag_rule");
}
for (c=0;c<rule->nbContexts;c++) {
   rule->contexts[c].left=NULL;
   rule->contexts[c].right=NULL;
}
int endR1=ELAG_UNDEFINED;
int endR2=ELAG_UNDEFINED;
int endC2=ELAG_UNDEFINED;
/* Each transition leaving the initial state starts either the rule part
 * (S_EXCLAM) or the constraint part (S_EQUAL) */
for (Transition* t=rule->automaton->automaton->states[0]->outgoing_transitions;t!=NULL;t=t->next) {
   symbol_t* symbol=t->label;
   switch (symbol->type) {
   /* We split the unique <!> .... <!> .... <!> part */
   case S_EXCLAM:
      if (rule->contexts[0].left!=NULL) {
         fatal_error("Too much '<!>' tags\n",rule->name);
      }
      rule->contexts[0].left=new_SingleGraph(PTR_TAGS);
      /* We look for the end of the first part of the rule */
      endR1=get_sub_automaton(rule->automaton->automaton,rule->contexts[0].left,t->state_number,0,S_EXCLAM);
      rule->contexts[0].right=new_SingleGraph(PTR_TAGS);
      endR2=get_sub_automaton(rule->automaton->automaton,rule->contexts[0].right,endR1,0,S_EXCLAM);
      if (endR1==ELAG_UNDEFINED || endR2==ELAG_UNDEFINED || !is_final_state(rule->automaton->automaton->states[endR2])) {
         fatal_error("split_elag_rule: %s: parse error in <!> part\n",rule->name);
      }
      break;
   /* We split the nbConstraints <=> .... <=> .... <=> parts */
   case S_EQUAL:
      if (rule->contexts[1].left!=NULL) {
         fatal_error("Non deterministic .fst2 file\n");
      }
      for (c=0;c<nbConstraints;c++) {
         rule->contexts[c+1].left=new_SingleGraph(PTR_TAGS);
         get_sub_automaton(rule->automaton->automaton,rule->contexts[c+1].left,t->state_number,1,constraints[c]);
         rule->contexts[c+1].right=new_SingleGraph(PTR_TAGS);
         endC2=get_sub_automaton(rule->automaton->automaton,rule->contexts[c+1].right,constraints[c],0,S_EQUAL);
         if (endC2==ELAG_UNDEFINED || !is_final_state(rule->automaton->automaton->states[endC2])) {
            fatal_error("split_elag_rule: %s: parse error in <=> part\n",rule->name);
         }
      }
      break;
   default:
      fatal_error("Left delimitor '<!>' or '<=>' missing\n");
   }
}
if (rule->contexts[0].left==NULL) {
   fatal_error("In grammar '%s': symbol '<!>' not found.\n",rule->name);
}
/* The Locate grammar is named after the rule, with a "-conc" suffix */
char buf[FILENAME_MAX];
remove_extension(rule->name,buf);
strcat(buf,"-conc.fst2");
/* We create the .fst2 to be used by Locate */
Fst2Automaton* locate=make_locate_automaton(rule,language);
save_automaton(locate,buf,vec,FST_LOCATE);
free_Fst2Automaton(locate,free_symbol);
}
/**
 * Entry point of the ConcorDiff program: parses the command line options
 * (output file, font, font size, encodings) and then calls 'diff' on the
 * two concordance files given as remaining arguments.
 * Returns 0 on success, 1 on invalid arguments.
 */
int main_ConcorDiff(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}
int val,index=-1;
char* out=NULL;
char* font=NULL;
int size=0;
char foo;
Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
struct OptVars* vars=new_OptVars();
/* Command line option parsing */
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_ConcorDiff,lopts_ConcorDiff,&index,vars))) {
   switch(val) {
   case 'o': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty output file\n");
             }
             out=strdup(vars->optarg);
             if (out==NULL) {
                fatal_alloc_error("main_ConcorDiff");
             }
             break;
   case 'f': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty font name\n");
             }
             font=strdup(vars->optarg);
             if (font==NULL) {
                fatal_alloc_error("main_ConcorDiff");
             }
             break;
   case 's': if (1!=sscanf(vars->optarg,"%d%c",&size,&foo) || size<=0) {
                /* foo is used to check that the font size is not like "45gjh" */
                fatal_error("Invalid font size argument: %s\n",vars->optarg);
             }
             break;
   case 'h': usage(); return 0;
   /* NOTE(review): no break after ':' — presumably fatal_error does not
    * return, so the fall-through into '?' is unreachable; confirm */
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_ConcorDiff[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   }
   index=-1;
}
/* All three of output file, font and font size are mandatory */
if (out==NULL) {
   fatal_error("You must specify the output file\n");
}
if (font==NULL) {
   fatal_error("You must specify the font to use\n");
}
if (size==0) {
   fatal_error("You must specify the font size to use\n");
}
/* Exactly two concordance files are expected after the options */
if (vars->optind!=argc-2) {
   error("Invalid arguments: rerun with --help\n");
   return 1;
}
diff(encoding_output,bom_output,mask_encoding_compatibility_input,argv[vars->optind],argv[vars->optind+1],out,font,size);
free(out);
free(font);
free_OptVars(vars);
return 0;
}
/** * This function produces a normalized version of 'input' and stores it into 'ouput'. * The following rules are applied in the given order: * * 1) If there is a { at the current position, we try to read a {S}, a {STOP} or * a tag token like {today,.ADV}. If we fail, we replace the { and the }, if any, * according to the replacement rules. Otherwise, we let the token unchanged. * 2) If there is one or more replacement rules that can apply to the current * position in 'input', then we apply the longest one. * 3) If we we find a separator (space, tab, new line) sequence, we replace it: * - by a new line if the sequence contains one and if 'carriage_return_policy' is * set to KEEP_CARRIAGE_RETURN; * - by a space otherwise. * 4) We copy the character that was read to the output. * * Note that 'replacements' is supposed to contain replacement rules for { and } */ int normalize(const char *fin, const char *fout, Encoding encoding_output, int bom_output, int mask_encoding_compatibility_input, int carriage_return_policy, const char *rules) { U_FILE* input; input = u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,fin,U_READ); if (input == NULL) { error("Cannot open file %s\n", fin); return 1; } U_FILE* output; output = u_fopen_creating_versatile_encoding(encoding_output,bom_output,fout,U_WRITE); if (output == NULL) { error("Cannot create file %s\n", fout); u_fclose(input); return 1; } struct string_hash* replacements=NULL; if(rules != NULL && rules[0]!='\0') { replacements=load_key_value_list(rules,mask_encoding_compatibility_input,'\t'); if (replacements==NULL) { error("Cannot load replacement rules file %s\n", rules); replacements=new_string_hash(); } } /* If there is no replacement rules file, we simulate one */ else { replacements=new_string_hash(); } /* If there is a replacement rule file, we ensure that there are replacement * rules for { and }. 
If not, we add our default ones, so that in any case, * we are sure to have rules for { and } */ unichar key[2]; unichar value[2]; u_strcpy(key,"{"); u_strcpy(value,"["); get_value_index(key,replacements,INSERT_IF_NEEDED,value); u_strcpy(key,"}"); u_strcpy(value,"]"); get_value_index(key,replacements,INSERT_IF_NEEDED,value); struct OUTBUF OutBuf; OutBuf.pos=0; unichar tmp[MAX_TAG_LENGTH]; //struct buffer* buffer=new_buffer_for_file(UNICHAR_BUFFER,input); long save_pos=ftell(input); fseek(input,0,SEEK_END); long file_size_input=ftell(input); fseek(input,save_pos,SEEK_SET); int line_buffer_size = (int)(((file_size_input+1) < MAX_LINE_BUFFER_SIZE) ? (file_size_input+1) : MAX_LINE_BUFFER_SIZE); unichar *line_read; line_read=(unichar*)malloc((line_buffer_size+0x10)*sizeof(unichar)); if (line_read==NULL) { fatal_alloc_error("normalize"); } /* We define some things that will be used for parsing the buffer */ static const unichar stop_chars[]= { '{', '}', 0 }; static const unichar forbidden_chars[]= { '\n', 0 }; static const unichar open_bracket[]= { '{', 0 }; static const unichar close_bracket[]= { '}', 0 }; static const unichar empty_string[]= { 0 }; int corrupted_file=0; int eof_found=0; /* First, we fill the buffer */ int lastline_was_terminated=0; while (eof_found==0) { int current_start_pos=0; int found_null=0; const unichar*buff=line_read; int result_read = 0; result_read = u_fgets_treat_cr_as_lf(line_read,line_buffer_size,input,1,&found_null); if ((found_null != 0) && (corrupted_file==0)) { corrupted_file=1; error("Corrupted text file containing NULL characters!\n"); error("They have been ignored by Normalize, but you should clean your text\n"); } if (result_read>0) if (line_read[result_read-1]==0x0d) line_read[result_read-1]='\n'; if (result_read==EOF) break; if (lastline_was_terminated != 0) while (current_start_pos<result_read) { if (buff[current_start_pos]!=' ' && buff[current_start_pos]!='\t' && buff[current_start_pos]!=0x0d && buff[current_start_pos]!='\n') 
break; current_start_pos++; } lastline_was_terminated = 0; if (result_read > 0) if ((buff[result_read-1]=='\n') || (buff[result_read-1]==0x0d)) lastline_was_terminated = 1; while (current_start_pos<result_read) { if ((lastline_was_terminated == 0) && (eof_found == 0) && (current_start_pos + MINIMAL_CHAR_IN_BUFFER_BEFORE_CONTINUE_LINE >= result_read)) { int i; int nb_to_keep = result_read-current_start_pos; for (i=0;i<nb_to_keep;i++) line_read[i]=line_read[current_start_pos+i]; int found_null_read=0; int result_read_continue = u_fgets_treat_cr_as_lf(line_read+nb_to_keep,line_buffer_size-nb_to_keep,input,1,&found_null_read); if ((found_null_read != 0) && (corrupted_file==0)) { corrupted_file=1; error("Corrupted text file containing NULL characters!\n"); error("They have been ignored by Normalize, but you should clean your text\n"); } if (result_read_continue>0) if (line_read[(result_read_continue+nb_to_keep)-1]==0x0d) line_read[(result_read_continue+nb_to_keep)-1]='\n'; lastline_was_terminated = 0; if (result_read_continue==EOF) eof_found = lastline_was_terminated = 1; if (result_read_continue > 0) if ((buff[(result_read_continue+nb_to_keep)-1]=='\n') || (buff[(result_read_continue+nb_to_keep)-1]==0x0d)) lastline_was_terminated = 1; result_read = nb_to_keep; current_start_pos = 0; if (result_read_continue > 0) result_read += result_read_continue; } if (buff[current_start_pos]=='{') { /* If we have a {, we try to find a sequence like {....}, that does not contain * new lines. If the sequence contains protected character, we want to keep them * protected. 
*/ int old_position=current_start_pos; /* If we don't increase the position, the parse will stop on the initial { */ current_start_pos++; tmp[0]='{'; int code=parse_string(buff,¤t_start_pos,&(tmp[1]),stop_chars,forbidden_chars,NULL); if (code==P_FORBIDDEN_CHAR || code==P_BACKSLASH_AT_END || buff[current_start_pos]!='}') { /* If we have found a new line or a {, or if there is * a backslash at the end of the buffer, or if we have reached the end * of the buffer, we assume that the initial * { was not a tag beginning, so we print the substitute of { */ WriteOufBuf(&OutBuf,replacements->value[get_value_index(open_bracket,replacements)],output, 0); /* And we rewind the current position after the { */ current_start_pos=old_position+1; } else { /* If we have read a sequence like {....}, we assume that there won't be * a buffer overflow if we add the } */ u_strcat(tmp,close_bracket); if (!u_strcmp(tmp,"{S}") || !u_strcmp(tmp,"{STOP}") || check_tag_token(tmp)) { /* If this is a special tag or a valid tag token, we just print * it to the output */ WriteOufBuf(&OutBuf,tmp,output, 0); current_start_pos++; } else { /* If we have a non valid tag token, we print the equivalent of { * and we rewind the current position after the { */ WriteOufBuf(&OutBuf,replacements->value[get_value_index(open_bracket,replacements)],output, 0); current_start_pos=old_position+1; } } } else { /* If we have a character that is not {, first we try to look if there * is a replacement to do */ int key_length; int index=get_longest_key_index(&buff[current_start_pos],&key_length,replacements); if (index!=NO_VALUE_INDEX) { /* If there is something to replace */ WriteOufBuf(&OutBuf,replacements->value[index],output, 0); current_start_pos=current_start_pos+key_length; } else { if (buff[current_start_pos]==' ' || buff[current_start_pos]=='\t' || buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) { /* If we have a separator, we try to read the longest separator sequence * that we can read. 
By the way, we note if it contains a new line */ int new_line=0; while (buff[current_start_pos]==' ' || buff[current_start_pos]=='\t' || buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) { /* Note 1: no bound check is needed, since an unichar buffer is always * ended by a \0 * * Note 2: we don't take into account the case of a buffer ended by * separator while it's not the end of file: that would mean * that the text contains something like MARGIN_BEFORE_BUFFER_END * contiguous separators. Such a text would not be a reasonable one. */ if (buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) { new_line=1; } current_start_pos++; } if (new_line && (carriage_return_policy==KEEP_CARRIAGE_RETURN)) { /* We print a new line if the sequence contains one and if we are * allowed to; otherwise, we print a space. */ WriteOufBuf(&OutBuf,'\n',output, 0); } else { WriteOufBuf(&OutBuf,' ',output, 0); } } else { /* If, finally, we have a normal character to normalize, we just print it */ WriteOufBuf(&OutBuf,buff[current_start_pos++],output, 0); } } } } } WriteOufBuf(&OutBuf,empty_string,output, 1); free(line_read); free_string_hash(replacements); u_fclose(input); u_fclose(output); return 0; }
///////////////////////////////////////////////////////////////////////////////// // Scans a single unit from a DELAC entry. 'line' is non terminated by a newline. // Initially, 'u' has its space allocated but is empty. // Returns the length of the scanned sequence, -1 if a format error occurred, -2 if a memory allocation problem occured. int DLC_scan_unit(Alphabet* alph,struct l_morpho_t* pL_MORPHO,SU_id_T* u, unichar* line, d_class_equiv_T* D_CLASS_EQUIV) { int l; //length of the scanned sequence int pos; //index of the next caracter to be scanned unichar tmp[DIC_LINE_SIZE]; pos = 0; //Scan a unit l = SU_get_unit(tmp, line, DIC_LINE_SIZE - 1, alph, 0); //The single word module determines what is a word and what is a separator, etc. if (l <= 0) { return -1; } u->form = u_strdup(tmp); pos += l; //If no lemma indication if (line[pos] != (unichar) '(') { u->lemma = NULL; u->feat = NULL; } //Scan the unit's description contained between '(' and ')' else { pos++; //Omit the '(' //Scan the lemma if any u->lemma = (SU_lemma_T*) malloc(sizeof(SU_lemma_T)); if (!u->lemma) { fatal_alloc_error("DLC_scan_unit"); } l = SU_get_unit(tmp, &(line[pos]), DIC_LINE_SIZE - 1, alph, 0); //The single word module determines what is a word and what is a separator, etc. 
if (l < 0) { free(u->form); SU_delete_lemma(u->lemma); return l; } u->lemma->unit = u_strdup(tmp); pos += l; //Scan the lemma's inflection paradigm if (line[pos] != (unichar) '.') { error("Dot missing after a unit's lemma:\n%S\n", line); free(u->form); SU_delete_lemma(u->lemma); return -1; } pos++; //Omit the dot unichar u_para[DIC_LINE_SIZE]; l = u_scan_until_char(u_para, &(line[pos]), DIC_LINE_SIZE - 1, "+:\\", 1); if (!l) { error( "Unit's inflection paradigm non existent in DELAC line:\n%S\n", line); free(u->form); SU_delete_lemma(u->lemma); return -1; } u->lemma->paradigm = (char*) malloc((u_strlen(u_para) + 1) * sizeof(char)); if (!u->lemma->paradigm) { fatal_alloc_error("DLC_scan_unit"); } for (unsigned int c = 0; c <= u_strlen(u_para); c++) u->lemma->paradigm[c] = (char) u_para[c]; //Determine the lemma's inflection class (noun, adj, etc.) l_class_T* cl; cl = DLC_class_para(u_para, D_CLASS_EQUIV); if (!cl) { error( "Impossible to deduce the unit's inflection class (noun, adj, etc.):\n%S\n", line); free(u->form); SU_delete_lemma(u->lemma); return -1; } u->lemma->cl = cl; pos += l; //Scan the unit's inflection features unichar tmp_scan[DIC_LINE_SIZE]; if (line[pos] != (unichar) ':') { error("Colon missing after a unit's lemma:\n%S\n", line); free(u->form); SU_delete_lemma(u->lemma); return -1; } pos++; //Omit the colon l = u_scan_until_char(tmp_scan, &(line[pos]), DIC_LINE_SIZE - 1, ")", 1); if (l <= 0) { error("Inflection features missing after ':' for a unit:\n%S\n", line); free(u->form); SU_delete_lemma(u->lemma); return -1; } pos += l; if (line[pos] != (unichar) ')') { error("')' missing after a unit's inflection features:\n%S\n", line); free(u->form); SU_delete_lemma(u->lemma); return -1; } pos++; //Omit the ')' u->feat = d_get_feat_str(pL_MORPHO,tmp_scan); if (!u->feat) { error("Incorrect inflection features in a unit:\n%S\n", line); free(u->form); SU_delete_lemma(u->lemma); return -1; } } return pos; }
/* Recursively explores the fst2 automaton from state 'e' against the text at
 * offset pos+p->current_origin, building the normalized output on p->stack.
 * In the main graph (n_graph==0) the longest match replaces p->output; in a
 * subgraph, final states record their reachable positions in *liste_arrivee.
 *
 * NOTE(review): the recursive calls below pass 7 arguments while this
 * definition takes 8; this only compiles if the declaration in the header
 * gives 'prv_alloc_recycle' a default value (presumably NULL) — confirm. */
void scan_graph(int n_graph, // number of current graph
                int e,       // number of current state
                int pos,     //
                int depth,
                struct parsing_info** liste_arrivee,
                unichar* mot_token_buffer,
                struct fst2txt_parameters* p,Abstract_allocator prv_alloc_recycle) {
Fst2State etat_courant=p->fst2->states[e];
/* Recursion guard: beyond MAX_DEPTH we abandon this origin entirely. */
if (depth > MAX_DEPTH) {
  error( "\n" "Maximal stack size reached in graph %i!\n" "Recognized more than %i tokens starting from:\n" " ", n_graph, MAX_DEPTH);
  for (int i=0; i<60; i++) {
    error("%S",p->buffer[p->current_origin+i]);
  }
  error("\nSkipping match at this position, trying from next token!\n");
  p->output[0] = '\0';    // clear output
  p->input_length = 0;    // reset taille_entree
  empty(p->stack);        // clear output stack
  if (liste_arrivee != NULL) {
    while (*liste_arrivee != NULL) { // free list of subgraph matches
      struct parsing_info* la_tmp=*liste_arrivee;
      *liste_arrivee=(*liste_arrivee)->next;
      la_tmp->next=NULL; // to don't free the next item
      free_parsing_info(la_tmp, prv_alloc_recycle);
    }
  }
  return;
  //exit(1); // don't exit, try at next position
}
depth++;
if (is_final_state(etat_courant)) {
   // if we are in a final state
   p->stack->stack[p->stack->stack_pointer+1]='\0';
   if (n_graph == 0) { // in main graph
      if (pos>=p->input_length/*sommet>u_strlen(output)*/) {
         // and if the recognized input is longer than the current one, it replaces it
         u_strcpy(p->output,p->stack->stack);
         p->input_length=(pos);
      }
   } else { // in a subgraph
      (*liste_arrivee)=insert_if_absent(pos,-1,-1,(*liste_arrivee),p->stack->stack_pointer+1,
                                        p->stack->stack,p->variables,NULL,NULL,-1,-1,NULL,-1, prv_alloc_recycle);
   }
}
if (pos+p->current_origin==p->text_buffer->size) {
   // if we are at the end of the text, we return
   return;
}
int SOMMET=p->stack->stack_pointer+1;
int pos2;
/* If there are some letter sequence transitions like %hello, we process them */
if (p->token_tree[e]->transition_array!=NULL) {
   if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');}
   /* we don't keep this line because of problems occur in sentence tokenizing
    * if the return sequence is defautly considered as a separator like space
   else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);} */
   else pos2=pos;
   int position=0;
   unichar *token=mot_token_buffer;
   if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION
       || (is_letter(p->buffer[pos2+p->current_origin],p->alphabet) && (pos2+p->current_origin==0 || !is_letter(p->buffer[pos2+p->current_origin-1],p->alphabet)))) {
      /* If we are in character by character mode */
      while (pos2+p->current_origin<p->text_buffer->size && is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) {
         token[position++]=p->buffer[(pos2++)+p->current_origin];
         if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION) {
            break;
         }
      }
      token[position]='\0';
      if (position!=0 &&
          (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION ||
           !(is_letter(token[position-1],p->alphabet) && is_letter(p->buffer[pos2+p->current_origin],p->alphabet)))) {
         // we proceed only if we have exactly read the contenu sequence
         // in both modes MERGE and REPLACE, we process the transduction if any
         int SOMMET2=p->stack->stack_pointer;
         Transition* RES=get_matching_tags(token,p->token_tree[e],p->alphabet);
         Transition* TMP;
         unichar* mot_token_new_recurse_buffer=NULL;
         if (RES!=NULL) {
            // we allocate a new mot_token_buffer for the scan_graph recursin because we need preserve current
            // token=mot_token_buffer
            mot_token_new_recurse_buffer=(unichar*)malloc(MOT_BUFFER_TOKEN_SIZE*sizeof(unichar));
            if (mot_token_new_recurse_buffer==NULL) {
               fatal_alloc_error("scan_graph");
            }
         }
         while (RES!=NULL) {
            p->stack->stack_pointer=SOMMET2;
            Fst2Tag etiq=p->fst2->tags[RES->tag_number];
            traiter_transduction(p,etiq->output);
            int longueur=u_strlen(etiq->input);
            /* Temporarily truncate 'token' to the tag input's length, push it,
             * then restore the saved character. */
            unichar C=token[longueur];
            token[longueur]='\0';
            if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
               // if we are in MERGE mode, we add to ouput the char we have read
               push_input_string(p->stack,token,0);
            }
            token[longueur]=C;
            scan_graph(n_graph,RES->state_number,pos2-(position-longueur),depth,liste_arrivee,mot_token_new_recurse_buffer,p);
            TMP=RES;
            RES=RES->next;
            free(TMP);
         }
         if (mot_token_new_recurse_buffer!=NULL) {
            free(mot_token_new_recurse_buffer);
         }
      }
   }
}
Transition* t=etat_courant->transitions;
while (t!=NULL) {
   p->stack->stack_pointer=SOMMET-1;
   // we process the transition of the current state
   int n_etiq=t->tag_number;
   if (n_etiq<0) {
      // case of a sub-graph.
      // NOTE(review): (((unsigned)n_etiq)-1) wraps around and converts back to a
      // negative int here; any non-zero value marks "inside a subgraph" for the
      // recursive call (only n_graph==0 means main graph) — confirm intent.
      struct parsing_info* liste=NULL;
      unichar* pile_old;
      p->stack->stack[p->stack->stack_pointer+1]='\0';
      pile_old = u_strdup(p->stack->stack);
      scan_graph((((unsigned)n_etiq)-1),p->fst2->initial_states[-n_etiq],pos,depth,&liste,mot_token_buffer,p);
      while (liste!=NULL) {
         p->stack->stack_pointer=liste->stack_pointer-1;
         u_strcpy(p->stack->stack,liste->stack);
         scan_graph(n_graph,t->state_number,liste->position,depth,liste_arrivee,mot_token_buffer,p);
         struct parsing_info* l_tmp=liste;
         liste=liste->next;
         l_tmp->next=NULL; // to don't free the next item
         free_parsing_info(l_tmp, prv_alloc_recycle);
      }
      u_strcpy(p->stack->stack,pile_old);
      free(pile_old);
      p->stack->stack_pointer=SOMMET-1;
   }
   else {
      // case of a normal tag
      Fst2Tag etiq=p->fst2->tags[n_etiq];
      unichar* contenu=etiq->input;
      int contenu_len_possible_match=u_len_possible_match(contenu);
      if (etiq->type==BEGIN_OUTPUT_VAR_TAG) {
         fatal_error("Unsupported $|XXX( tags in Fst2Txt\n");
      }
      if (etiq->type==END_OUTPUT_VAR_TAG) {
         fatal_error("Unsupported $|XXX) tags in Fst2Txt\n");
      }
      if (etiq->type==BEGIN_VAR_TAG) {
         // case of a $a( variable tag
         //int old;
         struct transduction_variable* L=get_transduction_variable(p->variables,etiq->variable);
         if (L==NULL) {
            fatal_error("Unknown variable: %S\n",etiq->variable);
         }
         //old=L->start;
         if (p->buffer[pos+p->current_origin]==' ' && pos+p->current_origin+1<p->text_buffer->size) {
            pos2=pos+1;
            if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');
         }
         //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);}
         else pos2=pos;
         L->start_in_tokens=pos2;
         scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p);
         //L->start=old;
      }
      else if (etiq->type==END_VAR_TAG) {
         // case of a $a) variable tag
         //int old;
         struct transduction_variable* L=get_transduction_variable(p->variables,etiq->variable);
         if (L==NULL) {
            fatal_error("Unknown variable: %S\n",etiq->variable);
         }
         //old=L->end;
         if (pos>0)
            L->end_in_tokens=pos-1;
         else
            L->end_in_tokens=pos;
         // BUG: qd changement de buffer, penser au cas start dans ancien buffer et end dans nouveau
         scan_graph(n_graph,t->state_number,pos,depth,liste_arrivee,mot_token_buffer,p);
         //L->end=old;
      }
      else if ((contenu_len_possible_match==5) && (!u_trymatch_superfast5(contenu,ETIQ_MOT_LN5))) {
         // case of transition by any sequence of letters
         if (p->buffer[pos+p->current_origin]==' ' && pos+p->current_origin+1<p->text_buffer->size) {
            pos2=pos+1;
            if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');
         }
         //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);}
         else pos2=pos;
         unichar* mot=mot_token_buffer;
         int position=0;
         if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION ||
             ((pos2+p->current_origin)==0 || !is_letter(p->buffer[pos2+p->current_origin-1],p->alphabet))) {
            while (pos2+p->current_origin<p->text_buffer->size && is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) {
               mot[position++]=p->buffer[(pos2++)+p->current_origin];
            }
            mot[position]='\0';
            if (position!=0) {
               // we proceed only if we have read a letter sequence
               // in both modes MERGE and REPLACE, we process the transduction if any
               traiter_transduction(p,etiq->output);
               if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
                  // if we are in MERGE mode, we add to ouput the char we have read
                  push_output_string(p->stack,mot);
               }
               scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p);
            }
         }
      }
      else if ((contenu_len_possible_match==4) && (!u_trymatch_superfast4(contenu,ETIQ_NB_LN4))) {
         // case of transition by any sequence of digits
         if (p->buffer[pos+p->current_origin]==' ') {
            pos2=pos+1;
            if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');
         }
         //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);}
         else pos2=pos;
         unichar* mot=mot_token_buffer;
         int position=0;
         while (pos2+p->current_origin<p->text_buffer->size && (p->buffer[pos2+p->current_origin]>='0') && (p->buffer[pos2+p->current_origin]<='9')) {
            mot[position++]=p->buffer[(pos2++)+p->current_origin];
         }
         mot[position]='\0';
         if (position!=0) {
            // we proceed only if we have read a letter sequence
            // in both modes MERGE and REPLACE, we process the transduction if any
            traiter_transduction(p,etiq->output);
            if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
               // if we are in MERGE mode, we add to ouput the char we have read
               push_output_string(p->stack,mot);
            }
            scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p);
         }
      }
      else if ((contenu_len_possible_match==5) && (!u_trymatch_superfast5(contenu,ETIQ_MAJ_LN5))) {
         // case of upper case letter sequence
         if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');}
         //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);}
         else pos2=pos;
         unichar* mot=mot_token_buffer;
         int position=0;
         if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION ||
             ((pos2+p->current_origin)==0 || !is_letter(p->buffer[pos2+p->current_origin-1],p->alphabet))) {
            while (pos2+p->current_origin<p->text_buffer->size && is_upper(p->buffer[pos2+p->current_origin],p->alphabet)) {
               mot[position++]=p->buffer[(pos2++)+p->current_origin];
            }
            mot[position]='\0';
            if (position!=0 && !is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) {
               // we proceed only if we have read an upper case letter sequence
               // which is not followed by a lower case letter
               // in both modes MERGE and REPLACE, we process the transduction if any
               traiter_transduction(p,etiq->output);
               if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
                  // if we are in MERGE mode, we add to ouput the char we have read
                  push_input_string(p->stack,mot,0);
               }
               scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p);
            }
         }
      }
      else if ((contenu_len_possible_match==5) && (!u_trymatch_superfast5(contenu,ETIQ_MIN_LN5))) {
         // case of lower case letter sequence
         if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');}
         //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);}
         else pos2=pos;
         unichar* mot=mot_token_buffer;
         int position=0;
         if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION ||
             (pos2+p->current_origin==0 || !is_letter(p->buffer[pos2+p->current_origin-1],p->alphabet))) {
            while (pos2+p->current_origin<p->text_buffer->size && is_lower(p->buffer[pos2+p->current_origin],p->alphabet)) {
               mot[position++]=p->buffer[(pos2++)+p->current_origin];
            }
            mot[position]='\0';
            if (position!=0 && !is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) {
               // we proceed only if we have read a lower case letter sequence
               // which is not followed by an upper case letter
               // in both modes MERGE and REPLACE, we process the transduction if any
               traiter_transduction(p,etiq->output);
               if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
                  // if we are in MERGE mode, we add to ouput the char we have read
                  push_input_string(p->stack,mot,0);
               }
               scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p);
            }
         }
      }
      else if ((contenu_len_possible_match==5) && (!u_trymatch_superfast5(contenu,ETIQ_PRE_LN5))) {
         // case of a sequence beginning by an upper case letter
         if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');}
         //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);}
         else pos2=pos;
         unichar* mot=mot_token_buffer;
         int position=0;
         if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION ||
             (is_upper(p->buffer[pos2+p->current_origin],p->alphabet) && (pos2+p->current_origin==0 || !is_letter(p->buffer[pos2+p->current_origin-1],p->alphabet)))) {
            while (pos2+p->current_origin<p->text_buffer->size && is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) {
               mot[position++]=p->buffer[(pos2++)+p->current_origin];
            }
            mot[position]='\0';
            if (position!=0 && !is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) {
               // we proceed only if we have read a letter sequence
               // which is not followed by a letter
               // in both modes MERGE and REPLACE, we process the transduction if any
               traiter_transduction(p,etiq->output);
               if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
                  // if we are in MERGE mode, we add to ouput the char we have read
                  push_input_string(p->stack,mot,0);
               }
               scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p);
            }
         }
      }
      else if ((contenu_len_possible_match==5) && (!u_trymatch_superfast5(contenu,ETIQ_PNC_LN5))) {
         // case of a punctuation sequence
         if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');}
         //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);}
         else pos2=pos;
         unichar C=p->buffer[pos2+p->current_origin];
         if (C==';' || C=='!' || C=='?' || C==':' || C==0xbf || C==0xa1 || C==0x0e4f || C==0x0e5a || C==0x0e5b || C==0x3001 || C==0x3002 || C==0x30fb) {
            // in both modes MERGE and REPLACE, we process the transduction if any
            traiter_transduction(p,etiq->output);
            if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
               // if we are in MERGE mode, we add to ouput the char we have read
               push(p->stack,C);
            }
            scan_graph(n_graph,t->state_number,pos2+1,depth,liste_arrivee,mot_token_buffer,p);
         }
         else {
            // we consider the case of ...
            // BUG: if ... appears at the end of the buffer
            if (C=='.') {
               if ((pos2+p->current_origin+2)<p->text_buffer->size && p->buffer[pos2+p->current_origin+1]=='.' && p->buffer[pos2+p->current_origin+2]=='.') {
                  traiter_transduction(p,etiq->output);
                  if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
                     // if we are in MERGE mode, we add to ouput the ... we have read
                     push(p->stack,C);push(p->stack,C);push(p->stack,C);
                  }
                  scan_graph(n_graph,t->state_number,pos2+3,depth,liste_arrivee,mot_token_buffer,p);
               } else {
                  // we consider the . as a normal punctuation sign
                  traiter_transduction(p,etiq->output);
                  if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
                     // if we are in MERGE mode, we add to ouput the char we have read
                     push(p->stack,C);
                  }
                  scan_graph(n_graph,t->state_number,pos2+1,depth,liste_arrivee,mot_token_buffer,p);
               }
            }
         }
      }
      else if ((contenu_len_possible_match==3) && (!u_trymatch_superfast3(contenu,ETIQ_E_LN3))) {
         // case of an empty sequence
         // in both modes MERGE and REPLACE, we process the transduction if any
         traiter_transduction(p,etiq->output);
         scan_graph(n_graph,t->state_number,pos,depth,liste_arrivee,mot_token_buffer,p);
      }
      else if ((contenu_len_possible_match==3) && (!u_trymatch_superfast3(contenu,ETIQ_CIRC_LN3))) {
         // case of a new line sequence
         if (p->buffer[pos+p->current_origin]=='\n') {
            // in both modes MERGE and REPLACE, we process the transduction if any
            traiter_transduction(p,etiq->output);
            if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
               // if we are in MERGE mode, we add to ouput the char we have read
               push(p->stack,'\n');
            }
            scan_graph(n_graph,t->state_number,pos+1,depth,liste_arrivee,mot_token_buffer,p);
         }
      }
      else if ((contenu_len_possible_match==1) && (!u_trymatch_superfast1(contenu,'#')) && (!(etiq->control&RESPECT_CASE_TAG_BIT_MASK))) {
         // case of a no space condition
         if (p->buffer[pos+p->current_origin]!=' ') {
            // in both modes MERGE and REPLACE, we process the transduction if any
            traiter_transduction(p,etiq->output);
            scan_graph(n_graph,t->state_number,pos,depth,liste_arrivee,mot_token_buffer,p);
         }
      }
      else if ((contenu_len_possible_match==1) && (!u_trymatch_superfast1(contenu,' '))) {
         // case of an obligatory space
         if (p->buffer[pos+p->current_origin]==' ') {
            // in both modes MERGE and REPLACE, we process the transduction if any
            traiter_transduction(p,etiq->output);
            if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
               // if we are in MERGE mode, we add to ouput the char we have read
               push(p->stack,' ');
            }
            scan_graph(n_graph,t->state_number,pos+1,depth,liste_arrivee,mot_token_buffer,p);
         }
      }
      else if ((contenu_len_possible_match==3) && (!u_trymatch_superfast5(contenu,ETIQ_L_LN3))) {
         // case of a single letter
         // NOTE(review): this uses u_trymatch_superfast5 with a length-3 pattern,
         // unlike the other length-3 cases which use u_trymatch_superfast3 —
         // looks inconsistent; verify against the u_trymatch_superfast* contracts.
         if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');}
         //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);}
         else pos2=pos;
         if (is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) {
            // in both modes MERGE and REPLACE, we process the transduction if any
            traiter_transduction(p,etiq->output);
            if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
               // if we are in MERGE mode, we add to ouput the char we have read
               push(p->stack,p->buffer[pos2+p->current_origin]);
            }
            scan_graph(n_graph,t->state_number,pos2+1,depth,liste_arrivee,mot_token_buffer,p);
         }
      }
      else {
         // case of a normal letter sequence
         if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');}
         //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);}
         else pos2=pos;
         if (etiq->control&RESPECT_CASE_TAG_BIT_MASK) {
            // case of exact case match
            int position=0;
            while (pos2+p->current_origin<p->text_buffer->size && p->buffer[pos2+p->current_origin]==contenu[position]) {
               pos2++;
               position++;
            }
            if (contenu[position]=='\0' && position!=0 &&
                !(is_letter(contenu[position-1],p->alphabet) && is_letter(p->buffer[pos2+p->current_origin],p->alphabet))) {
               // we proceed only if we have exactly read the contenu sequence
               // in both modes MERGE and REPLACE, we process the transduction if any
               traiter_transduction(p,etiq->output);
               if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
                  // if we are in MERGE mode, we add to ouput the char we have read
                  push_input_string(p->stack,contenu,0);
               }
               scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p);
            }
         }
         else {
            // case of variable case match
            // the letter sequences may have been caught by the arbre_etiquette structure
            int position=0;
            unichar* mot=mot_token_buffer;
            while (pos2+p->current_origin<p->text_buffer->size && is_equal_or_uppercase(contenu[position],p->buffer[pos2+p->current_origin],p->alphabet)) {
               mot[position++]=p->buffer[(pos2++)+p->current_origin];
            }
            mot[position]='\0';
            if (contenu[position]=='\0' && position!=0 &&
                !(is_letter(contenu[position-1],p->alphabet) && is_letter(p->buffer[pos2+p->current_origin],p->alphabet))) {
               // we proceed only if we have exactly read the contenu sequence
               // in both modes MERGE and REPLACE, we process the transduction if any
               traiter_transduction(p,etiq->output);
               if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
                  // if we are in MERGE mode, we add to ouput the char we have read
                  push_input_string(p->stack,mot,0);
               }
               scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p);
            }
         }
      }
   }
   t=t->next;
}
}
/////////////////////////////////////////////////////////////////////////////////
// Inflect a DELAS/DELAC into a DELAF/DELACF.
// Reads 'DLC' line by line; each line is either a strict DELAS (simple word)
// entry or a DELAC (compound word) entry, and its inflected forms are written
// to 'DLCF'. Compound entries are only processed when the configuration files
// (Morphology.txt / Equivalences.txt) were loaded (config_files_status).
// On error returns 1, 0 otherwise.
int inflect(char* DLC, char* DLCF, MultiFlex_ctx* p_multiFlex_ctx, struct l_morpho_t* pL_MORPHO, Alphabet* alph, Encoding encoding_output, int bom_output, int mask_encoding_compatibility_input, int config_files_status, d_class_equiv_T* D_CLASS_EQUIV, int error_check_status, Korean* korean,const char* pkgdir) {
U_FILE *dlc, *dlcf; //DELAS/DELAC and DELAF/DELACF files
unichar input_line[DIC_LINE_SIZE]; //current DELAS/DELAC line
unichar output_line[DIC_LINE_SIZE]; //current DELAF/DELACF line
int l; //length of the line scanned
DLC_entry_T* dlc_entry;
MU_forms_T MU_forms; //inflected forms of the MWU
int err;
//Open DELAS/DELAC
dlc = u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input, DLC, U_READ);
if (!dlc) {
    return 1;
}
//Open DELAF/DELACF
dlcf = u_fopen_creating_versatile_encoding(encoding_output, bom_output, DLCF, U_WRITE);
if (!dlcf) {
    error("Unable to open file: '%s' !\n", DLCF);
    //NOTE(review): 'dlc' is not closed on this early return — resource leak to confirm/fix upstream.
    return 1;
}
//Inflect one entry at a time
l = u_fgets(input_line, DIC_LINE_SIZE - 1, dlc);
//Omit the final newline
u_chomp_new_line(input_line);
int flag = 0; //used to emit the "configuration files not loaded" warning only once
//If a line is empty the file is not necessarily finished.
//If the last entry has no newline, we should not skip this entry
struct dela_entry* DELAS_entry;
int semitic;
int current_line=0;
while (l != EOF) {
    current_line++;
    DELAS_entry = is_strict_DELAS_line(input_line, alph);
    if (DELAS_entry != NULL) {
        /* If we have a strict DELAS line, that is to say, one with
         * a simple word */
        if (error_check_status==ONLY_COMPOUND_WORDS) {
            error("Unexpected simple word forbidden by -c:\n%S\n",input_line);
            free_dela_entry(DELAS_entry);
            goto next_line;
        }
        SU_forms_T forms;
        SU_init_forms(&forms); //Allocate the space for forms and initialize it to null values
        char inflection_code[1024];
        unichar code_gramm[1024];
        /* We take the first grammatical code, and we extract from it the name
         * of the inflection transducer to use */
        get_inflection_code(DELAS_entry->semantic_codes[0], inflection_code, code_gramm, &semitic);
        /* And we inflect the word */
        //err=SU_inflect(DELAS_entry->lemma,inflection_code,&forms,semitic);
        err = SU_inflect(p_multiFlex_ctx,pL_MORPHO,encoding_output,bom_output,mask_encoding_compatibility_input,DELAS_entry->lemma, inflection_code, DELAS_entry->filters, &forms, semitic, korean,pkgdir);
#ifdef __GNUC__
#warning mettre toutes les entrees sur une meme ligne
#elif ((defined(__VISUALC__)) || defined(_MSC_VER))
#pragma message("warning : mettre toutes les entrees sur une meme ligne")
#endif
        /* Then, we print its inflected forms to the output */
        for (int i = 0; i < forms.no_forms; i++) {
            unichar foo[1024];
            if (korean!=NULL) {
                //Korean output is converted from Hangul syllables to Jamo letters
                Hanguls_to_Jamos(forms.forms[i].form,foo,korean,1);
            }
            else {
                u_strcpy(foo,forms.forms[i].form);
            }
            u_fprintf(dlcf, "%S,%S.%S", foo/*forms.forms[i].form*/, DELAS_entry->lemma, code_gramm);
            /* We add the semantic codes, if any */
            for (int j = 1; j < DELAS_entry->n_semantic_codes; j++) {
                u_fprintf(dlcf, "+%S", DELAS_entry->semantic_codes[j]);
            }
            if (forms.forms[i].local_semantic_code != NULL) {
                u_fprintf(dlcf, "%S", forms.forms[i].local_semantic_code);
            }
            if (forms.forms[i].raw_features != NULL && forms.forms[i].raw_features[0] != '\0') {
                u_fprintf(dlcf, ":%S", forms.forms[i].raw_features);
            }
            u_fprintf(dlcf, "\n");
        }
        SU_delete_inflection(&forms);
        free_dela_entry(DELAS_entry);
        /* End of simple word case */
    } else {
        /* If we have not a simple word DELAS line, we try to analyse it
         * as a compound word DELAC line */
        if (error_check_status==ONLY_SIMPLE_WORDS) {
            error("Unexpected compound word forbidden by -s:\n%S\n",input_line);
            goto next_line;
        }
        if (config_files_status != CONFIG_FILES_ERROR) {
            /* If this is a compound word, we process it if and only if the
             * configuration files have been correctly loaded */
            dlc_entry = (DLC_entry_T*) malloc(sizeof(DLC_entry_T));
            if (!dlc_entry) {
                fatal_alloc_error("inflect");
            }
            /* Convert a DELAC entry into the internal multi-word format */
            err = DLC_line2entry(alph,pL_MORPHO,input_line, dlc_entry, D_CLASS_EQUIV);
            //NOTE(review): on err, 'dlc_entry' itself is not freed here — possible leak; confirm DLC_line2entry's ownership contract.
            if (!err) {
                //Inflect the entry
                MU_init_forms(&MU_forms);
                err = MU_inflect(p_multiFlex_ctx,pL_MORPHO,encoding_output,bom_output, mask_encoding_compatibility_input,dlc_entry->lemma, &MU_forms,pkgdir);
                if (!err) {
                    int f; //index of the current inflected form
                    //Inform the user if no form generated
                    if (MU_forms.no_forms == 0) {
                        error("No inflected form could be generated for ");
                        DLC_print_entry(pL_MORPHO,dlc_entry);
                    }
                    //Print inflected forms
                    for (f = 0; f < MU_forms.no_forms; f++) {
                        //Format the inflected form to the DELACF format
                        err = DLC_format_form(pL_MORPHO,output_line, DIC_LINE_SIZE - 1, MU_forms.forms[f], dlc_entry, D_CLASS_EQUIV);
                        if (!err) {
                            //Print one inflected form at a time to the DELACF file
                            u_fprintf(dlcf, "%S\n", output_line);
                        }
                    }
                }
                MU_delete_inflection(&MU_forms);
                DLC_delete_entry(dlc_entry);
            }
        } else {
            /* We try to inflect a compound word whereas the "Morphology.txt" and/or
             * "Equivalences.txt" file(s) has/have not been loaded */
            if (!flag) {
                /* We use a flag to print the error message only once */
                error("WARNING: Compound words won't be inflected because configuration files\n");
                error(" have not been correctly loaded.\n");
                flag = 1;
            }
        }
    }
next_line:
    //Get next entry
    l = u_fgets(input_line, DIC_LINE_SIZE - 1, dlc);
    if (l!=EOF) {
        //Omit the final newline
        u_chomp_new_line(input_line);
        if (input_line[0]=='\0') {
            /* If we find an empty line, then we go on */
            goto next_line;
        }
    }
}
u_fclose(dlc);
u_fclose(dlcf);
return 0;
}
/////////////////////////////////////////////////////////////////////////////////
// Converts a DELAC line ('line') into a structured DELAC entry ('entry').
// 'line' is non terminated by a newline.
// Initially, entry has its space allocated but is empty; on success this
// function fills entry->lemma (units, paradigm, class), entry->codes and
// entry->comment, all heap-allocated and owned by the caller.
// Returns 1 if 'line' is empty, 2 if its format is incorrect, -1 if memory allocation problems, 0 otherwise.
int DLC_line2entry(Alphabet* alph,struct l_morpho_t* pL_MORPHO,unichar* line,
        DLC_entry_T* entry, d_class_equiv_T* D_CLASS_EQUIV) {
int l; //length of the scanned sequence
int pos; //index of the next character to be read
SU_id_T* unit;
pos = 0;
if (!line[pos]) //Empty line
    return 1;
//Initalize the lemma
entry->lemma = (MU_lemma_T*) malloc(sizeof(MU_lemma_T));
if (!entry->lemma) {
    fatal_alloc_error("DLC_line2entry");
}
entry->lemma->no_units = 0;
//Scan the single units composing the compound, up to the mandatory comma
while (line[pos] && line[pos] != (unichar) ',') {
    //Each DELAC line must contain a comma
    unit = (SU_id_T*) malloc(sizeof(SU_id_T));
    if (!unit) {
        fatal_alloc_error("DLC_line2entry");
    }
    l = DLC_scan_unit(alph,pL_MORPHO,unit, &(line[pos]), D_CLASS_EQUIV);
    if (l <= 0) {
        /* Scan failed: discard the unit shell and whatever units were
         * already attached to the lemma.
         * NOTE(review): lemma->paradigm and lemma->cl are still
         * uninitialized here — confirm MU_delete_lemma tolerates a
         * partially-filled lemma, otherwise this frees garbage pointers. */
        free(unit);
        MU_delete_lemma(entry->lemma);
        return 2;
    }
    entry->lemma->units[entry->lemma->no_units] = unit;
    entry->lemma->no_units++;
    pos += l;
}
if (line[pos] != (unichar) ',') {
    //Reached end of line without finding the unit/paradigm separator
    error("Comma missing in DELAC line:\n%S\n", line);
    MU_delete_lemma(entry->lemma);
    return 2;
}
//Scan the inflection paradigm
unichar tmp[DIC_LINE_SIZE];
pos++; //Omit the comma
l = u_scan_until_char(tmp, &(line[pos]), DIC_LINE_SIZE - 1, "+:)\\/", 1);
pos += l;
if (!l) {
    error("Inflection paradigm inexistent in line:\n%S\n", line);
    MU_delete_lemma(entry->lemma);
    return 2;
}
/* Store the paradigm name as a narrow string (u_strlen(tmp)+1 chars,
 * terminator included by the <= bound below) */
entry->lemma->paradigm = (char*) malloc((u_strlen(tmp) + 1) * sizeof(char));
if (!entry->lemma->paradigm) {
    fatal_alloc_error("DLC_line2entry");
}
for (unsigned int c = 0; c <= u_strlen(tmp); c++) //Convert to char and copy
    entry->lemma->paradigm[c] = (char) tmp[c];
//Determine the class (e.g. noun) from the paradigm name
l_class_T* cl;
cl = DLC_class_para(tmp, D_CLASS_EQUIV);
if (!cl) {
    error(
            "Impossible to deduce the compound's inflection class (noun, adj, etc.):\n%S\n",
            line);
    MU_delete_lemma(entry->lemma);
    return 2;
}
entry->lemma->cl = cl;
//Scan the semantic codes
l = DLC_scan_codes(entry->codes, &(line[pos]));
pos += l;
//Scan the comment
l = DLC_scan_comment(&(entry->comment), &(line[pos]));
pos += l;
if (line[pos]) {
    /* Trailing characters remain after codes and comment: malformed line,
     * release everything built so far */
    error("Bad format in DELAC line:\n%S\n", line);
    MU_delete_lemma(entry->lemma); //delete lemma
    for (int c = 0; entry->codes[c]; c++) //delete codes
        free(entry->codes[c]);
    free(entry->comment); //delete comment
    return 2;
}
return 0;
}
/**
 * Allocates, initializes and returns a new locate_parameters structure.
 * Every field is given an explicit default; pointer fields start NULL,
 * token indices start at -1 ("not present"), and policies get the most
 * conservative defaults. The caller owns the returned structure
 * (presumably released via free_locate_parameters — confirm).
 */
struct locate_parameters* new_locate_parameters() {
struct locate_parameters* p=(struct locate_parameters*)malloc(sizeof(struct locate_parameters));
if (p==NULL) {
   fatal_alloc_error("new_locate_parameters");
}
/* Operator and cache behavior */
p->tilde_negation_operator=1;
p->useLocateCache=1;
/* Token/pattern bookkeeping */
p->token_control=NULL;
p->matching_patterns=NULL;
p->current_compound_pattern=0;
p->pattern_tree_root=NULL;
/* We use -1 because there may be no space, {S} or {STOP} in the text */
p->SPACE=-1;
p->SENTENCE=-1;
p->STOP=-1;
p->tag_token_list=NULL;
#ifdef TRE_WCHAR
/* Regex filters are only available when built with TRE wide-char support */
p->filters=NULL;
p->filter_match_index=NULL;
#endif
p->DLC_tree=NULL;
p->optimized_states=NULL;
p->fst2=NULL;
p->tokens=NULL;
p->current_origin=-1;
p->max_count_call=0;
p->max_count_call_warning=0;
p->buffer=NULL;
/* Matching policies: conservative defaults, overridden by locate_pattern() */
p->tokenization_policy=WORD_BY_WORD_TOKENIZATION;
p->space_policy=DONT_START_WITH_SPACE;
p->matching_units=0;
p->match_policy=LONGEST_MATCHES;
p->output_policy=IGNORE_OUTPUTS;
p->ambiguous_output_policy=ALLOW_AMBIGUOUS_OUTPUTS;
p->variable_error_policy=IGNORE_VARIABLE_ERRORS;
/* Match results; -1 means "no match printed yet" */
p->match_list=NULL;
p->number_of_matches=0;
p->number_of_outputs=0;
p->start_position_last_printed_match=-1;
p->end_position_last_printed_match=-1;
p->search_limit=0;
/* Transduction variables and output stack */
p->input_variables=NULL;
p->output_variables=NULL;
p->nb_output_variables=0;
p->stack=new_stack_unichar(TRANSDUCTION_STACK_SIZE);
p->alphabet=NULL;
/* Morphological-mode dictionaries */
p->morpho_dic_inf=NULL;
p->morpho_dic_inf_free=NULL;
p->morpho_dic_bin=NULL;
p->morpho_dic_bin_free=NULL;
p->n_morpho_dics=0;
p->dic_variables=NULL;
p->left_ctx_shift=0;
p->left_ctx_base=0;
p->protect_dic_chars=0;
p->graph_depth=0;
/* Korean-specific resources */
p->korean=NULL;
p->jamo_tags=NULL;
p->mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
/* Scratch buffers reused across calls to avoid per-call allocation */
p->recyclable_wchart_buffer=(wchar_t*)malloc(sizeof(wchar_t)*SIZE_RECYCLABLE_WCHAR_T_BUFFER);
if (p->recyclable_wchart_buffer==NULL) {
   fatal_alloc_error("new_locate_parameters");
}
p->recyclable_unichar_buffer=(unichar*)malloc(sizeof(unichar)*SIZE_RECYCLABLE_UNICHAR_BUFFER);
if (p->recyclable_unichar_buffer==NULL) {
   fatal_alloc_error("new_locate_parameters");
}
p->size_recyclable_unichar_buffer = SIZE_RECYCLABLE_UNICHAR_BUFFER;
p->failfast=NULL;
/* Locate cache and allocators */
p->match_cache_first=NULL;
p->match_cache_last=NULL;
p->match_cache=NULL;
p->prv_alloc=NULL;
p->prv_alloc_recycle=NULL;
/* Error-reporting context for token-level problems */
p->token_error_ctx.last_length=0;
p->token_error_ctx.last_start=0;
p->token_error_ctx.n_errors=0;
p->token_error_ctx.n_matches_at_token_pos__locate=0;
p->token_error_ctx.n_matches_at_token_pos__morphological_locate=0;
/* Counters used to honor max_count_call / cancellation */
p->counting_step.count_call=0;
p->counting_step.count_cancel_trying=0;
p->explore_depth=0;
p->backup_memory_reserve=NULL;
p->cached_match_vector=new_vector_ptr(16);
/* Optional tracing hook */
p->fnc_locate_trace_step=NULL;
p->private_param_locate_trace=NULL;
memset(&(p->arabic),0,sizeof(ArabicTypoRules));
p->is_in_cancel_state = 0;
p->is_in_trace_state = 0;
p->counting_step_count_cancel_trying_real_in_debug_or_trace = 0;
return p;
}
int locate_pattern(const char* text_cod,const char* tokens,const char* fst2_name,const char* dlf,const char* dlc,const char* err, const char* alphabet,MatchPolicy match_policy,OutputPolicy output_policy, Encoding encoding_output,int bom_output,int mask_encoding_compatibility_input, const char* dynamicDir,TokenizationPolicy tokenization_policy, SpacePolicy space_policy,int search_limit,const char* morpho_dic_list, AmbiguousOutputPolicy ambiguous_output_policy, VariableErrorPolicy variable_error_policy,int protect_dic_chars, int is_korean,int max_count_call,int max_count_call_warning, char* arabic_rules,int tilde_negation_operator,int useLocateCache,int allow_trace) { U_FILE* out; U_FILE* info; struct locate_parameters* p=new_locate_parameters(); p->text_cod=af_open_mapfile(text_cod,MAPFILE_OPTION_READ,0); p->buffer=(int*)af_get_mapfile_pointer(p->text_cod); long text_size=(long)af_get_mapfile_size(p->text_cod)/sizeof(int); p->buffer_size=(int)text_size; p->tilde_negation_operator=tilde_negation_operator; p->useLocateCache=useLocateCache; if (max_count_call == -1) { max_count_call = (int)text_size; } if (max_count_call_warning == -1) { max_count_call_warning = (int)text_size; } p->match_policy=match_policy; p->tokenization_policy=tokenization_policy; p->space_policy=space_policy; p->output_policy=output_policy; p->search_limit=search_limit; p->ambiguous_output_policy=ambiguous_output_policy; p->variable_error_policy=variable_error_policy; p->protect_dic_chars=protect_dic_chars; p->mask_encoding_compatibility_input = mask_encoding_compatibility_input; p->max_count_call = max_count_call; p->max_count_call_warning = max_count_call_warning; p->token_filename = tokens; char concord[FILENAME_MAX]; char concord_info[FILENAME_MAX]; strcpy(concord,dynamicDir); strcat(concord,"concord.ind"); strcpy(concord_info,dynamicDir); strcat(concord_info,"concord.n"); char morpho_bin[FILENAME_MAX]; strcpy(morpho_bin,dynamicDir); strcat(morpho_bin,"morpho.bin"); if (arabic_rules!=NULL && 
arabic_rules[0]!='\0') { load_arabic_typo_rules(arabic_rules,&(p->arabic)); } out=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord,U_WRITE); if (out==NULL) { error("Cannot write %s\n",concord); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); free_stack_unichar(p->stack); free_locate_parameters(p); u_fclose(out); return 0; } info=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord_info,U_WRITE); if (info==NULL) { error("Cannot write %s\n",concord_info); } switch(output_policy) { case IGNORE_OUTPUTS: u_fprintf(out,"#I\n"); break; case MERGE_OUTPUTS: u_fprintf(out,"#M\n"); break; case REPLACE_OUTPUTS: u_fprintf(out,"#R\n"); break; } if (alphabet!=NULL && alphabet[0]!='\0') { u_printf("Loading alphabet...\n"); p->alphabet=load_alphabet(alphabet,is_korean); if (p->alphabet==NULL) { error("Cannot load alphabet file %s\n",alphabet); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); free_stack_unichar(p->stack); free_locate_parameters(p); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } } struct string_hash* semantic_codes=new_string_hash(); extract_semantic_codes(dlf,semantic_codes); extract_semantic_codes(dlc,semantic_codes); if (is_cancelling_requested() != 0) { error("user cancel request.\n"); free_alphabet(p->alphabet); free_string_hash(semantic_codes); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); free_stack_unichar(p->stack); free_locate_parameters(p); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } u_printf("Loading fst2...\n"); struct FST2_free_info fst2load_free; Fst2* fst2load=load_abstract_fst2(fst2_name,1,&fst2load_free); if (fst2load==NULL) { error("Cannot load grammar %s\n",fst2_name); free_alphabet(p->alphabet); free_string_hash(semantic_codes); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); 
free_stack_unichar(p->stack); free_locate_parameters(p); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } Abstract_allocator locate_abstract_allocator=create_abstract_allocator("locate_pattern",AllocatorCreationFlagAutoFreePrefered); p->fst2=new_Fst2_clone(fst2load,locate_abstract_allocator); free_abstract_Fst2(fst2load,&fst2load_free); if (is_cancelling_requested() != 0) { error("User cancel request..\n"); free_alphabet(p->alphabet); free_string_hash(semantic_codes); free_Fst2(p->fst2,locate_abstract_allocator); close_abstract_allocator(locate_abstract_allocator); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); free_stack_unichar(p->stack); free_locate_parameters(p); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } p->tags=p->fst2->tags; #ifdef TRE_WCHAR p->filters=new_FilterSet(p->fst2,p->alphabet); if (p->filters==NULL) { error("Cannot compile filter(s)\n"); free_alphabet(p->alphabet); free_string_hash(semantic_codes); free_Fst2(p->fst2,locate_abstract_allocator); close_abstract_allocator(locate_abstract_allocator); free_stack_unichar(p->stack); free_locate_parameters(p); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } #endif u_printf("Loading token list...\n"); int n_text_tokens=0; p->tokens=load_text_tokens_hash(tokens,mask_encoding_compatibility_input,&(p->SENTENCE),&(p->STOP),&n_text_tokens); if (p->tokens==NULL) { error("Cannot load token list %s\n",tokens); free_alphabet(p->alphabet); free_string_hash(semantic_codes); free_Fst2(p->fst2,locate_abstract_allocator); close_abstract_allocator(locate_abstract_allocator); free_locate_parameters(p); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } Abstract_allocator locate_work_abstract_allocator = locate_abstract_allocator; p->match_cache=(LocateCache*)malloc_cb(p->tokens->size 
* sizeof(LocateCache),locate_work_abstract_allocator); memset(p->match_cache,0,p->tokens->size * sizeof(LocateCache)); if (p->match_cache==NULL) { fatal_alloc_error("locate_pattern"); } #ifdef TRE_WCHAR p->filter_match_index=new_FilterMatchIndex(p->filters,p->tokens); if (p->filter_match_index==NULL) { error("Cannot optimize filter(s)\n"); free_alphabet(p->alphabet); free_string_hash(semantic_codes); free_string_hash(p->tokens); close_abstract_allocator(locate_abstract_allocator); free_locate_parameters(p); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } #endif if (allow_trace!=0) { open_locate_trace(p,&p->fnc_locate_trace_step,&p->private_param_locate_trace); } extract_semantic_codes_from_tokens(p->tokens,semantic_codes,locate_abstract_allocator); u_printf("Loading morphological dictionaries...\n"); load_morphological_dictionaries(morpho_dic_list,p,morpho_bin); extract_semantic_codes_from_morpho_dics(p->morpho_dic_inf,p->n_morpho_dics,semantic_codes,locate_abstract_allocator); p->token_control=(unsigned char*)malloc(n_text_tokens*sizeof(unsigned char)); if (p->token_control==NULL) { fatal_alloc_error("locate_pattern"); } p->matching_patterns=(struct bit_array**)malloc(n_text_tokens*sizeof(struct bit_array*)); if (p->matching_patterns==NULL) { fatal_alloc_error("locate_pattern"); } for (int i=0; i<n_text_tokens; i++) { p->token_control[i]=0; p->matching_patterns[i]=NULL; } compute_token_controls(p->alphabet,err,p); int number_of_patterns,is_DIC,is_CDIC,is_SDIC; p->pattern_tree_root=new_pattern_node(locate_abstract_allocator); u_printf("Computing fst2 tags...\n"); process_tags(&number_of_patterns,semantic_codes,&is_DIC,&is_CDIC,&is_SDIC,p,locate_abstract_allocator); p->current_compound_pattern=number_of_patterns; p->DLC_tree=new_DLC_tree(p->tokens->size); struct lemma_node* root=new_lemma_node(); u_printf("Loading dlf...\n"); 
load_dic_for_locate(dlf,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p); u_printf("Loading dlc...\n"); load_dic_for_locate(dlc,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p); /* We look if tag tokens like "{today,.ADV}" verify some patterns */ check_patterns_for_tag_tokens(p->alphabet,number_of_patterns,root,p,locate_abstract_allocator); u_printf("Optimizing fst2 pattern tags...\n"); optimize_pattern_tags(p->alphabet,root,p,locate_abstract_allocator); u_printf("Optimizing compound word dictionary...\n"); optimize_DLC(p->DLC_tree); free_string_hash(semantic_codes); int nb_input_variable=0; p->input_variables=new_Variables(p->fst2->input_variables,&nb_input_variable); p->output_variables=new_OutputVariables(p->fst2->output_variables,&p->nb_output_variables); Abstract_allocator locate_recycle_abstract_allocator=NULL; locate_recycle_abstract_allocator=create_abstract_allocator("locate_pattern_recycle", AllocatorFreeOnlyAtAllocatorDelete|AllocatorTipOftenRecycledObject, get_prefered_allocator_item_size_for_nb_variable(nb_input_variable)); u_printf("Optimizing fst2...\n"); p->optimized_states=build_optimized_fst2_states(p->input_variables,p->output_variables,p->fst2,locate_abstract_allocator); if (is_korean) { p->korean=new Korean(p->alphabet); p->jamo_tags=create_jamo_tags(p->korean,p->tokens); } p->failfast=new_bit_array(n_text_tokens,ONE_BIT); u_printf("Working...\n"); p->prv_alloc=locate_work_abstract_allocator; p->prv_alloc_recycle=locate_recycle_abstract_allocator; launch_locate(out,text_size,info,p); if (allow_trace!=0) { close_locate_trace(p,p->fnc_locate_trace_step,p->private_param_locate_trace); } free_bit_array(p->failfast); free_Variables(p->input_variables); free_OutputVariables(p->output_variables); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); if (info!=NULL) u_fclose(info); u_fclose(out); if (p->match_cache!=NULL) { for (int i=0; 
i<p->tokens->size; i++) { free_LocateCache(p->match_cache[i],locate_work_abstract_allocator); } free_cb(p->match_cache,locate_work_abstract_allocator); } int free_abstract_allocator_item=(get_allocator_cb_flag(locate_abstract_allocator) & AllocatorGetFlagAutoFreePresent) ? 0 : 1; if (free_abstract_allocator_item) { free_optimized_states(p->optimized_states,p->fst2->number_of_states,locate_abstract_allocator); } free_stack_unichar(p->stack); /** Too long to free the DLC tree if it is big * free_DLC_tree(p->DLC_tree); */ if (free_abstract_allocator_item) { free_pattern_node(p->pattern_tree_root,locate_abstract_allocator); free_Fst2(p->fst2,locate_abstract_allocator); free_list_int(p->tag_token_list,locate_abstract_allocator); } close_abstract_allocator(locate_abstract_allocator); close_abstract_allocator(locate_recycle_abstract_allocator); locate_recycle_abstract_allocator=locate_abstract_allocator=NULL; /* We don't free 'parameters->tags' because it was just a link on 'parameters->fst2->tags' */ free_alphabet(p->alphabet); if (p->korean!=NULL) { delete p->korean; } if (p->jamo_tags!=NULL) { /* jamo tags must be freed before tokens, because we need to know how * many jamo tags there are, and this number is the number of tokens */ for (int i=0; i<p->tokens->size; i++) { free(p->jamo_tags[i]); } free(p->jamo_tags); } free_string_hash(p->tokens); free_lemma_node(root); free(p->token_control); for (int i=0; i<n_text_tokens; i++) { free_bit_array(p->matching_patterns[i]); } free(p->matching_patterns); #ifdef TRE_WCHAR free_FilterSet(p->filters); free_FilterMatchIndex(p->filter_match_index); #endif for (int i=0; i<p->n_morpho_dics; i++) { free_abstract_INF(p->morpho_dic_inf[i],&(p->morpho_dic_inf_free[i])); free_abstract_BIN(p->morpho_dic_bin[i],&(p->morpho_dic_bin_free[i])); } free(p->morpho_dic_inf); free(p->morpho_dic_inf_free); free(p->morpho_dic_bin); free(p->morpho_dic_bin_free); #if (defined(UNITEX_LIBRARY) || defined(UNITEX_RELEASE_MEMORY_AT_EXIT)) 
free_DLC_tree(p->DLC_tree); #endif free_locate_parameters(p); u_printf("Done.\n"); return 1; }
/**
 * Minimizes the given automaton in place. The automaton is required to be
 * deterministic (at most one initial state). For more information, see the
 * comments in this library's .h file.
 *
 * Algorithm: iterative state-coloring refinement — states start with initial
 * "shades", and shades are split until the partition is stable; one
 * representative state is then kept per color.
 */
void elag_minimize(SingleGraph automaton,int level) {
struct list_int* initials=get_initial_states(automaton);
if (initials==NULL) {
   /* No initial state is only acceptable for an empty automaton */
   if (automaton->number_of_states!=0) {
      fatal_error("No initial state in non empty automaton in elag_minimize\n");
   }
   return;
}
if (initials->next!=NULL) {
   /* More than one initial state: determinism precondition violated */
   fatal_error("Non-deterministic automaton in elag_minimize\n");
}
free_list_int(initials);
if (level>0) {
   /* Optionally drop transitions subsumed by the default ones */
   compact_default_transitions(automaton);
}
/* Number the transitions via a temporary symbol alphabet */
SymbolAlphabet* alph=build_symbol_alphabet(automaton);
TransitionCollection** transitions=build_transition_collections(automaton,alph);
free_SymbolAlphabet(alph);
int color_count;
int shade_count;
int* state_color=(int*)calloc(automaton->number_of_states,sizeof(int));
if (state_color==NULL) {
   fatal_alloc_error("elag_minimize");
}
int* state_shade=init_colors(automaton,&shade_count);
/* Refine: promote shades to colors, recolor transition targets, then
 * recompute each state's shade by comparing it against lower-indexed
 * states; stop at the fixed point where no new shade appears */
do {
   for (int i=0;i<automaton->number_of_states;i++) {
      state_color[i]=state_shade[i];
   }
   color_count=shade_count;
   shade_count=0;
   update_colors(transitions,state_color,automaton->number_of_states);
   for (int i=0;i<automaton->number_of_states;i++) {
      state_shade[i]=get_shade(i,transitions,state_color,state_shade,&shade_count);
   }
} while (color_count!=shade_count);
/* Pick one representative state per color */
int* representative=choose_states(state_color,color_count,automaton->number_of_states);
for (int i=0;i<automaton->number_of_states;i++) {
   free_TransitionCollection(transitions[i]);
}
free(transitions);
free(state_shade);
/* Build the minimized automaton from the representatives */
SingleGraph minimized=new_SingleGraph(color_count,PTR_TAGS);
for (int c=0;c<color_count;c++) {
   SingleGraphState copy=add_state(minimized);
   SingleGraphState source=automaton->states[representative[c]];
   /* Carry over initiality/finality, then steal the transition list */
   copy->control=source->control;
   copy->outgoing_transitions=source->outgoing_transitions;
   source->outgoing_transitions=NULL;
   /* Redirect each transition to its color's representative */
   for (Transition* t=copy->outgoing_transitions;t!=NULL;t=t->next) {
      t->state_number=state_color[t->state_number];
   }
   copy->default_state=source->default_state;
}
/* Replace the old automaton by the minimized one */
move_SingleGraph(automaton,&minimized,free_symbol);
/* Working arrays are no longer needed */
free(state_color);
free(representative);
}