/**
 * This function takes a condition, i.e. a list of graph numbers. Then, it
 * tests if all the corresponding graphs match <E>. In that case, it sets
 * '*matches_E' to E_IS_MATCHED.
 *
 * The list is processed recursively from its tail. Cells that become useless
 * are freed and '*modification' is incremented each time the condition is
 * simplified. The return value is the (possibly reduced) condition list, or
 * NULL when the condition has been fully resolved or discarded.
 */
struct list_int* resolve_simple_condition(struct list_int* c,Fst2State* states,
                                          int* initial_states,int *modification,
                                          int *matches_E) {
struct list_int* tmp;
/* An empty condition trivially matches <E> */
*matches_E=E_IS_MATCHED;
if (c==NULL) return NULL;
/* First, we try to solve the rest of the condition */
tmp=resolve_simple_condition(c->next,states,initial_states,modification,matches_E);
if ((*matches_E)==E_IS_NOT_MATCHED) {
   /* If at least one element of the rest of the condition does not
    * match <E>, we can delete the current element (the rest has
    * already been freed). */
   free(c);
   (*modification)++;
   return NULL;
}
if ((*matches_E)==E_IS_MATCHED) {
   /* If all the elements of the rest of the condition match <E> */
   c->next=tmp;
   if (is_bit_mask_set(states[initial_states[c->n]]->control,UNCONDITIONAL_E_MATCH)) {
      /* If the current one also matches <E>, then we can return */
      return c;
   }
   if (is_bit_mask_set(states[initial_states[c->n]]->control,CONDITIONAL_E_MATCH)
       && !is_bit_mask_set(states[initial_states[c->n]]->control,DOES_NOT_MATCH_E)) {
      /* If we don't know if the current element matches <E> or not */
      *matches_E=DOES_NOT_KNOW_IF_E_IS_MATCHED;
      return c;
   }
   /* If the current element does not match <E>, we can free the rest of the condition */
   free_list_int(c);
   *matches_E=E_IS_NOT_MATCHED;
   (*modification)++;
   return NULL;
}
/* If we don't know whether the rest of the condition matches <E> */
c->next=tmp;
if (is_bit_mask_set(states[initial_states[c->n]]->control,UNCONDITIONAL_E_MATCH)) {
   /* If the current element matches <E>, we still cannot decide */
   *matches_E=DOES_NOT_KNOW_IF_E_IS_MATCHED;
   return c;
}
if (is_bit_mask_set(states[initial_states[c->n]]->control,DOES_NOT_MATCH_E)) {
   /* If the current element does not match <E>, the condition is not verified */
   free_list_int(c);
   *matches_E=E_IS_NOT_MATCHED;
   (*modification)++;
   return NULL;
}
/* If we still don't know, we do nothing */
*matches_E=DOES_NOT_KNOW_IF_E_IS_MATCHED;
return c;
}
/**
 * Performs one pass of condition resolution over all graphs whose <E>-matching
 * status is still unknown. Returns 1 if there is something more to do after
 * this call, or 0 if:
 * - no new information was found
 * - the main graph matches E
 */
static int resolve_all_conditions(GrfCheckInfo* chk,struct list_int* *list,int *unknown) {
*unknown=0;
struct list_int* resolved=NULL;
int n_graphs=chk->fst2->number_of_graphs;
for (int graph=1;graph<=n_graphs;graph++) {
   if (chk->graphs_matching_E[graph]!=CHK_DONT_KNOW) {
      /* Graphs whose status is already settled need no further work */
      continue;
   }
   resolve_conditions(chk,graph,*list);
   chk->graphs_matching_E[graph]=get_status(chk->condition_graphs[graph]);
   if (chk->graphs_matching_E[graph]==CHK_DONT_KNOW) {
      /* Still undecided after this pass */
      (*unknown)++;
   } else {
      /* We got an answer: graph #graph must be looked at on the next loop */
      resolved=new_list_int(graph,resolved);
   }
}
/* Replace the old work list by the freshly resolved one */
free_list_int(*list);
*list=resolved;
if (chk->graphs_matching_E[1]==CHK_MATCHES_E) {
   error("Main graph matches epsilon!\n");
   return 0;
}
return ((*list)!=NULL && (*unknown)!=0);
}
/**
 * Frees all the memory associated to the given reg2grf_info structure.
 */
void free_reg2grf_info(struct reg2grf_info* info) {
if (info==NULL) {
   return;
}
for (int i=0;i<info->n_states;i++) {
   /* free(NULL) is a harmless no-op, so no guard is needed here */
   free(info->states[i].content);
   free_list_int(info->states[i].transitions);
}
/* NOTE(review): if 'states' is a heap-allocated array rather than an inline
 * member of the struct, it would leak here -- confirm against the struct
 * definition of reg2grf_info */
free(info);
}
/** * Frees the given tfst_match. */ void free_tfst_match(struct tfst_match* match) { if (match==NULL) { fatal_error("NULL error in free_tfst_match"); } /* We MUST NOT free 'fst2_transition' since it is just a copy of an actual transition * in the grammar */ free_list_int(match->text_tag_numbers); free(match); }
/**
 * Frees all the memory associated to the given condition list.
 */
void free_ConditionList(ConditionList l) {
while (l!=NULL) {
   /* Save the successor before destroying the current cell */
   ConditionList next=l->next;
   free_list_int(l->condition);
   free(l);
   l=next;
}
}
int main() { list_int* l = make_list_int(); free_list_int(l); pair_vector_int_vector_float* s = make_pair_vector_int_vector_float(make_vector_int(5), make_vector_float(5)); free_pair_vector_int_vector_float(s); vector_vector_vector_int* w = make_vector_vector_vector_int(5); free_vector_vector_vector_int(w); return 0; }
/** * Frees all the dictionary graph whose root is 'a'. */ void free_dictionary_node(struct dictionary_node* a,Abstract_allocator prv_alloc) { if (a==NULL) return; if (get_allocator_cb_flag(prv_alloc) & AllocatorGetFlagAutoFreePresent) return; if (a->incoming>1) { /* We don't free a state that is still pointed by someone else * in order to avoid double freeing problems. */ (a->incoming)--; return; } free_list_int(a->single_INF_code_list,prv_alloc); free_dictionary_node_transition(a->trans,prv_alloc); free_cb(a,prv_alloc); }
/** * Decrement the counter usage of a pointer to a node * (a compound word tree whose root is 'node'.) * Frees the object from memory if counter became 0 * * WARNING: this function tries to free both 'transitions' and 'destination_nodes', * so, in order to avoid double freeing, the programmer must take care not * to have a same node referenced in both 'transitions' and 'destination_nodes'. */ void decrement_reference_DLC_tree_node(struct DLC_tree_node* node) { if (node==NULL) return; node->count_reference--; if (node->count_reference>0) return ; free_list_int(node->patterns); if (node->array_of_patterns!=NULL) free(node->array_of_patterns); free_DLC_tree_transitions(node->transitions); if (node->destination_tokens!=NULL) free(node->destination_tokens); if (node->destination_nodes!=NULL) { for (int i=0;i<node->number_of_transitions;i++) { decrement_reference_DLC_tree_node(node->destination_nodes[i]); } free(node->destination_nodes); } free(node); }
/**
 * This function takes a unicode string 'word' representing a compound word, and
 * tokenizes it into tokens. The output is an array 'tokens' that contains the
 * numbers of the tokens that constitute the word. If case variants are allowed,
 * a token can be replaced by a token list delimited by the special values
 * BEGIN_CASE_VARIANT_LIST and END_CASE_VARIANT_LIST. The token list is ended
 * by END_TOKEN_LIST.
 *
 * The array 'tokens' is supposed to be large enough. 'tok' represents the text tokens.
 * 'tokenization_mode' indicates if the word must be tokenized character by character
 * or not.
 */
void tokenize_compound_word(const unichar* word,int tokens[],const Alphabet* alphabet,
                            struct string_hash* tok,TokenizationPolicy tokenization_mode) {
int n_token,j;
struct list_ustring* list=tokenize(word,tokenization_mode,alphabet);
struct list_ustring* tmp;
struct list_int* ptr;
n_token=0;
/* The tokenizer's output list is consumed (and freed) element by element */
while (list!=NULL) {
   j=get_value_index(list->string,tok,DONT_INSERT);
   /* If a token of a compound word is not a token of the text,
    * we MUST NOT ignore it. For instance, if we have the compound
    * word "a priori" and if the text only contains "PRIORI", it is not
    * an error case. The error case is when there is no case equivalent of
    * "priori" in the text. In such a situation, we traduce it by an empty
    * list. We don't raise an error because if there is by accident a token
    * in a dictionary that is not in the text, it would block the Locate
    * without necessity. */
   if (is_letter(list->string[0],alphabet) || j==-1) {
      /* If the current token is made of letters, we look for all
       * its case variants. If we have a non letter token that is
       * not in the text tokens, we handle it here to produce an
       * empty case variant list. */
      tokens[n_token++]=BEGIN_CASE_VARIANT_LIST;
      ptr=get_token_list_for_sequence(list->string,alphabet,tok);
      /* We keep the list head so that it can be freed after the walk */
      struct list_int* ptr_copy=ptr; // s.n.
      while (ptr!=NULL) {
         j=ptr->n;
         tokens[n_token++]=j;
         ptr=ptr->next;
      }
      free_list_int(ptr_copy); // s.n.
      tokens[n_token++]=END_CASE_VARIANT_LIST;
   } else {
      /* If we have a non letter single character, we just add its number to
       * the token array */
      tokens[n_token++]=j;
   }
   tmp=list;
   list=list->next;
   free_list_ustring_element(tmp);
}
/* Finally, we end the token list. */
tokens[n_token]=END_TOKEN_LIST;
}
/**
 * This function concatenates B at the end of A. A is modified.
 *
 * The states of B are first copied into A (with their state numbers shifted
 * via the 'renumber' table); then every former final state of A is wired to
 * the copies of B's initial states.
 */
void elag_concat(language_t* language,SingleGraph A,SingleGraph B) {
int oldnb=A->number_of_states;
int* renumber=(int*)malloc(B->number_of_states*sizeof(int));
if (renumber==NULL) {
   fatal_alloc_error("elag_concat");
}
int q;
/* We copy the states of B into A */
for (q=0;q<B->number_of_states;q++) {
   /* renumber[q] = index in A of the copy of B's state #q */
   renumber[q]=A->number_of_states;
   add_state(A);
}
for (q=0;q<B->number_of_states;q++) {
   A->states[renumber[q]]->outgoing_transitions=clone_transition_list(B->states[q]->outgoing_transitions,renumber,dup_symbol);
   A->states[renumber[q]]->default_state=(B->states[q]->default_state!=-1)?renumber[B->states[q]->default_state]:-1;
   if (is_final_state(B->states[q])) {
      set_final_state(A->states[renumber[q]]);
   }
}
/* Then, we concatenate A and B.
 * 1) We replace default transitions that outgo from B's initial states
 *    by explicit transitions */
struct list_int* initials=get_initial_states(B);
for (struct list_int* tmp=initials;tmp!=NULL;tmp=tmp->next) {
   explicit_default_transition(language,A,renumber[tmp->n]);
}
/* 2) Only the states that were final in A BEFORE the copy (q<oldnb) are
 *    connected to B's entry points */
for (q=0;q<oldnb;q++) {
   if (is_final_state(A->states[q])) {
      /* Each final state of A becomes non final. Moreover, we have
       * to explicit its default transition, because if not, the concatenation
       * algorithm will modify the recognized language. */
      unset_final_state(A->states[q]);
      explicit_default_transition(language,A,q);
      for (struct list_int* tmp=initials;tmp!=NULL;tmp=tmp->next) {
         /* The outgoing transitions of B's (copied) initial state are
          * duplicated onto the former final state of A */
         concat(&(A->states[q]->outgoing_transitions),clone_transition_list(A->states[renumber[tmp->n]]->outgoing_transitions,NULL,dup_symbol));
         if (is_final_state(A->states[renumber[tmp->n]])) {
            /* If B's initial state is final, A's state stays final */
            set_final_state(A->states[q]);
         }
      }
   }
}
free(renumber);
free_list_int(initials);
}
/**
 * This function loads a DLF or a DLC. It computes information about tokens
 * that will be used during the Locate operation. For instance, if we have the
 * following line:
 *
 * extended,.A
 *
 * and if the .fst2 to be applied to the text contains the pattern <A> with,
 * number 456, then the function will mark the "extended" token to be matched
 * by the pattern 456. Moreover, all case variations will be taken into account,
 * so that the "Extended" and "EXTENDED" tokens will also be updated.
 *
 * The two parameters 'is_DIC_pattern' and 'is_CDIC_pattern'
 * indicate if the .fst2 contains the corresponding patterns. For instance, if
 * the pattern "<CDIC>" is used in the grammar, it means that any token sequence that is a
 * compound word must be marked as be matched by this pattern.
 */
void load_dic_for_locate(const char* dic_name,int mask_encoding_compatibility_input,Alphabet* alphabet,
                         int number_of_patterns,int is_DIC_pattern,
                         int is_CDIC_pattern,
                         struct lemma_node* root,struct locate_parameters* parameters) {
struct string_hash* tokens=parameters->tokens;
U_FILE* f;
unichar line[DIC_LINE_SIZE];
f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,dic_name,U_READ);
if (f==NULL) {
   error("Cannot open dictionary %s\n",dic_name);
   return;
}
/* We parse all the lines */
int lines=0;
char name[FILENAME_MAX];
remove_path(dic_name,name);
while (EOF!=u_fgets(line,f)) {
   lines++;
   if (lines%10000==0) {
      /* Periodic progress display (overwrites itself with \r) */
      u_printf("%s: %d lines loaded... \r",name,lines);
   }
   if (line[0]=='/') {
      /* NOTE: DLF and DLC files are not supposed to contain comment
       * lines, but we test them, just in the case */
      continue;
   }
   struct dela_entry* entry=tokenize_DELAF_line(line,1);
   if (entry==NULL) {
      /* This case should never happen */
      error("Invalid dictionary line in load_dic_for_locate\n");
      continue;
   }
   /* We add the inflected form to the list of forms associated to the lemma.
    * This will be used to replace patterns like "<be>" by the actual list of
    * forms that can be matched by it, for optimization reasons */
   add_inflected_form_for_lemma(entry->inflected,entry->lemma,root);
   /* We get the list of all tokens that can be matched by the inflected form of
    * this entry, with regards to case variations (see the "extended" example above). */
   struct list_int* ptr=get_token_list_for_sequence(entry->inflected,alphabet,tokens);
   /* We save the list pointer to free it later */
   struct list_int* ptr_copy=ptr;
   /* Here, we will deal with all simple words */
   while (ptr!=NULL) {
      int i=ptr->n;
      /* If the current token can be matched, then it can be recognized by the "<DIC>" pattern */
      parameters->token_control[i]=(unsigned char)(get_control_byte(tokens->value[i],alphabet,NULL,parameters->tokenization_policy)|DIC_TOKEN_BIT_MASK);
      if (number_of_patterns) {
         /* We look for matching patterns only if there are some */
         struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
         if (list!=NULL) {
            /* If we have some patterns to add */
            if (parameters->matching_patterns[i]==NULL) {
               /* We allocate the pattern bit array, if needed */
               parameters->matching_patterns[i]=new_bit_array(number_of_patterns,ONE_BIT);
            }
            struct list_pointer* tmp=list;
            while (tmp!=NULL) {
               /* Then we add all the pattern numbers to the bit array */
               set_value(parameters->matching_patterns[i],((struct constraint_list*)(tmp->pointer))->pattern_number,1);
               tmp=tmp->next;
            }
            /* Finally, we free the constraint list */
            free_list_pointer(list);
         }
      }
      ptr=ptr->next;
   }
   /* Finally, we free the token list */
   free_list_int(ptr_copy);
   if (!is_a_simple_word(entry->inflected,parameters->tokenization_policy,alphabet)) {
      /* If the inflected form is a compound word */
      if (is_DIC_pattern || is_CDIC_pattern) {
         /* If the .fst2 contains "<DIC>" and/or "<CDIC>", then we
          * must note that all compound words can be matched by them */
         add_compound_word_with_no_pattern(entry->inflected,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy);
      }
      if (number_of_patterns) {
         /* We look for matching patterns only if there are some */
         /* We look if the compound word can be matched by some patterns */
         struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
         struct list_pointer* tmp=list;
         while (tmp!=NULL) {
            /* If the word is matched by at least one pattern, we store it. */
            int pattern_number=((struct constraint_list*)(tmp->pointer))->pattern_number;
            add_compound_word_with_pattern(entry->inflected,pattern_number,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy);
            tmp=tmp->next;
         }
         free_list_pointer(list);
      }
   }
   free_dela_entry(entry);
}
if (lines>10000) {
   /* Terminate the \r progress line, if one was printed */
   u_printf("\n");
}
u_fclose(f);
}
int locate_pattern(const char* text_cod,const char* tokens,const char* fst2_name,const char* dlf,const char* dlc,const char* err, const char* alphabet,MatchPolicy match_policy,OutputPolicy output_policy, Encoding encoding_output,int bom_output,int mask_encoding_compatibility_input, const char* dynamicDir,TokenizationPolicy tokenization_policy, SpacePolicy space_policy,int search_limit,const char* morpho_dic_list, AmbiguousOutputPolicy ambiguous_output_policy, VariableErrorPolicy variable_error_policy,int protect_dic_chars, int is_korean,int max_count_call,int max_count_call_warning, char* arabic_rules,int tilde_negation_operator,int useLocateCache,int allow_trace) { U_FILE* out; U_FILE* info; struct locate_parameters* p=new_locate_parameters(); p->text_cod=af_open_mapfile(text_cod,MAPFILE_OPTION_READ,0); p->buffer=(int*)af_get_mapfile_pointer(p->text_cod); long text_size=(long)af_get_mapfile_size(p->text_cod)/sizeof(int); p->buffer_size=(int)text_size; p->tilde_negation_operator=tilde_negation_operator; p->useLocateCache=useLocateCache; if (max_count_call == -1) { max_count_call = (int)text_size; } if (max_count_call_warning == -1) { max_count_call_warning = (int)text_size; } p->match_policy=match_policy; p->tokenization_policy=tokenization_policy; p->space_policy=space_policy; p->output_policy=output_policy; p->search_limit=search_limit; p->ambiguous_output_policy=ambiguous_output_policy; p->variable_error_policy=variable_error_policy; p->protect_dic_chars=protect_dic_chars; p->mask_encoding_compatibility_input = mask_encoding_compatibility_input; p->max_count_call = max_count_call; p->max_count_call_warning = max_count_call_warning; p->token_filename = tokens; char concord[FILENAME_MAX]; char concord_info[FILENAME_MAX]; strcpy(concord,dynamicDir); strcat(concord,"concord.ind"); strcpy(concord_info,dynamicDir); strcat(concord_info,"concord.n"); char morpho_bin[FILENAME_MAX]; strcpy(morpho_bin,dynamicDir); strcat(morpho_bin,"morpho.bin"); if (arabic_rules!=NULL && 
arabic_rules[0]!='\0') { load_arabic_typo_rules(arabic_rules,&(p->arabic)); } out=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord,U_WRITE); if (out==NULL) { error("Cannot write %s\n",concord); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); free_stack_unichar(p->stack); free_locate_parameters(p); u_fclose(out); return 0; } info=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord_info,U_WRITE); if (info==NULL) { error("Cannot write %s\n",concord_info); } switch(output_policy) { case IGNORE_OUTPUTS: u_fprintf(out,"#I\n"); break; case MERGE_OUTPUTS: u_fprintf(out,"#M\n"); break; case REPLACE_OUTPUTS: u_fprintf(out,"#R\n"); break; } if (alphabet!=NULL && alphabet[0]!='\0') { u_printf("Loading alphabet...\n"); p->alphabet=load_alphabet(alphabet,is_korean); if (p->alphabet==NULL) { error("Cannot load alphabet file %s\n",alphabet); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); free_stack_unichar(p->stack); free_locate_parameters(p); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } } struct string_hash* semantic_codes=new_string_hash(); extract_semantic_codes(dlf,semantic_codes); extract_semantic_codes(dlc,semantic_codes); if (is_cancelling_requested() != 0) { error("user cancel request.\n"); free_alphabet(p->alphabet); free_string_hash(semantic_codes); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); free_stack_unichar(p->stack); free_locate_parameters(p); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } u_printf("Loading fst2...\n"); struct FST2_free_info fst2load_free; Fst2* fst2load=load_abstract_fst2(fst2_name,1,&fst2load_free); if (fst2load==NULL) { error("Cannot load grammar %s\n",fst2_name); free_alphabet(p->alphabet); free_string_hash(semantic_codes); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); 
free_stack_unichar(p->stack); free_locate_parameters(p); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } Abstract_allocator locate_abstract_allocator=create_abstract_allocator("locate_pattern",AllocatorCreationFlagAutoFreePrefered); p->fst2=new_Fst2_clone(fst2load,locate_abstract_allocator); free_abstract_Fst2(fst2load,&fst2load_free); if (is_cancelling_requested() != 0) { error("User cancel request..\n"); free_alphabet(p->alphabet); free_string_hash(semantic_codes); free_Fst2(p->fst2,locate_abstract_allocator); close_abstract_allocator(locate_abstract_allocator); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); free_stack_unichar(p->stack); free_locate_parameters(p); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } p->tags=p->fst2->tags; #ifdef TRE_WCHAR p->filters=new_FilterSet(p->fst2,p->alphabet); if (p->filters==NULL) { error("Cannot compile filter(s)\n"); free_alphabet(p->alphabet); free_string_hash(semantic_codes); free_Fst2(p->fst2,locate_abstract_allocator); close_abstract_allocator(locate_abstract_allocator); free_stack_unichar(p->stack); free_locate_parameters(p); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } #endif u_printf("Loading token list...\n"); int n_text_tokens=0; p->tokens=load_text_tokens_hash(tokens,mask_encoding_compatibility_input,&(p->SENTENCE),&(p->STOP),&n_text_tokens); if (p->tokens==NULL) { error("Cannot load token list %s\n",tokens); free_alphabet(p->alphabet); free_string_hash(semantic_codes); free_Fst2(p->fst2,locate_abstract_allocator); close_abstract_allocator(locate_abstract_allocator); free_locate_parameters(p); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } Abstract_allocator locate_work_abstract_allocator = locate_abstract_allocator; p->match_cache=(LocateCache*)malloc_cb(p->tokens->size 
* sizeof(LocateCache),locate_work_abstract_allocator); memset(p->match_cache,0,p->tokens->size * sizeof(LocateCache)); if (p->match_cache==NULL) { fatal_alloc_error("locate_pattern"); } #ifdef TRE_WCHAR p->filter_match_index=new_FilterMatchIndex(p->filters,p->tokens); if (p->filter_match_index==NULL) { error("Cannot optimize filter(s)\n"); free_alphabet(p->alphabet); free_string_hash(semantic_codes); free_string_hash(p->tokens); close_abstract_allocator(locate_abstract_allocator); free_locate_parameters(p); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } #endif if (allow_trace!=0) { open_locate_trace(p,&p->fnc_locate_trace_step,&p->private_param_locate_trace); } extract_semantic_codes_from_tokens(p->tokens,semantic_codes,locate_abstract_allocator); u_printf("Loading morphological dictionaries...\n"); load_morphological_dictionaries(morpho_dic_list,p,morpho_bin); extract_semantic_codes_from_morpho_dics(p->morpho_dic_inf,p->n_morpho_dics,semantic_codes,locate_abstract_allocator); p->token_control=(unsigned char*)malloc(n_text_tokens*sizeof(unsigned char)); if (p->token_control==NULL) { fatal_alloc_error("locate_pattern"); } p->matching_patterns=(struct bit_array**)malloc(n_text_tokens*sizeof(struct bit_array*)); if (p->matching_patterns==NULL) { fatal_alloc_error("locate_pattern"); } for (int i=0; i<n_text_tokens; i++) { p->token_control[i]=0; p->matching_patterns[i]=NULL; } compute_token_controls(p->alphabet,err,p); int number_of_patterns,is_DIC,is_CDIC,is_SDIC; p->pattern_tree_root=new_pattern_node(locate_abstract_allocator); u_printf("Computing fst2 tags...\n"); process_tags(&number_of_patterns,semantic_codes,&is_DIC,&is_CDIC,&is_SDIC,p,locate_abstract_allocator); p->current_compound_pattern=number_of_patterns; p->DLC_tree=new_DLC_tree(p->tokens->size); struct lemma_node* root=new_lemma_node(); u_printf("Loading dlf...\n"); 
load_dic_for_locate(dlf,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p); u_printf("Loading dlc...\n"); load_dic_for_locate(dlc,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p); /* We look if tag tokens like "{today,.ADV}" verify some patterns */ check_patterns_for_tag_tokens(p->alphabet,number_of_patterns,root,p,locate_abstract_allocator); u_printf("Optimizing fst2 pattern tags...\n"); optimize_pattern_tags(p->alphabet,root,p,locate_abstract_allocator); u_printf("Optimizing compound word dictionary...\n"); optimize_DLC(p->DLC_tree); free_string_hash(semantic_codes); int nb_input_variable=0; p->input_variables=new_Variables(p->fst2->input_variables,&nb_input_variable); p->output_variables=new_OutputVariables(p->fst2->output_variables,&p->nb_output_variables); Abstract_allocator locate_recycle_abstract_allocator=NULL; locate_recycle_abstract_allocator=create_abstract_allocator("locate_pattern_recycle", AllocatorFreeOnlyAtAllocatorDelete|AllocatorTipOftenRecycledObject, get_prefered_allocator_item_size_for_nb_variable(nb_input_variable)); u_printf("Optimizing fst2...\n"); p->optimized_states=build_optimized_fst2_states(p->input_variables,p->output_variables,p->fst2,locate_abstract_allocator); if (is_korean) { p->korean=new Korean(p->alphabet); p->jamo_tags=create_jamo_tags(p->korean,p->tokens); } p->failfast=new_bit_array(n_text_tokens,ONE_BIT); u_printf("Working...\n"); p->prv_alloc=locate_work_abstract_allocator; p->prv_alloc_recycle=locate_recycle_abstract_allocator; launch_locate(out,text_size,info,p); if (allow_trace!=0) { close_locate_trace(p,p->fnc_locate_trace_step,p->private_param_locate_trace); } free_bit_array(p->failfast); free_Variables(p->input_variables); free_OutputVariables(p->output_variables); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); if (info!=NULL) u_fclose(info); u_fclose(out); if (p->match_cache!=NULL) { for (int i=0; 
i<p->tokens->size; i++) { free_LocateCache(p->match_cache[i],locate_work_abstract_allocator); } free_cb(p->match_cache,locate_work_abstract_allocator); } int free_abstract_allocator_item=(get_allocator_cb_flag(locate_abstract_allocator) & AllocatorGetFlagAutoFreePresent) ? 0 : 1; if (free_abstract_allocator_item) { free_optimized_states(p->optimized_states,p->fst2->number_of_states,locate_abstract_allocator); } free_stack_unichar(p->stack); /** Too long to free the DLC tree if it is big * free_DLC_tree(p->DLC_tree); */ if (free_abstract_allocator_item) { free_pattern_node(p->pattern_tree_root,locate_abstract_allocator); free_Fst2(p->fst2,locate_abstract_allocator); free_list_int(p->tag_token_list,locate_abstract_allocator); } close_abstract_allocator(locate_abstract_allocator); close_abstract_allocator(locate_recycle_abstract_allocator); locate_recycle_abstract_allocator=locate_abstract_allocator=NULL; /* We don't free 'parameters->tags' because it was just a link on 'parameters->fst2->tags' */ free_alphabet(p->alphabet); if (p->korean!=NULL) { delete p->korean; } if (p->jamo_tags!=NULL) { /* jamo tags must be freed before tokens, because we need to know how * many jamo tags there are, and this number is the number of tokens */ for (int i=0; i<p->tokens->size; i++) { free(p->jamo_tags[i]); } free(p->jamo_tags); } free_string_hash(p->tokens); free_lemma_node(root); free(p->token_control); for (int i=0; i<n_text_tokens; i++) { free_bit_array(p->matching_patterns[i]); } free(p->matching_patterns); #ifdef TRE_WCHAR free_FilterSet(p->filters); free_FilterMatchIndex(p->filter_match_index); #endif for (int i=0; i<p->n_morpho_dics; i++) { free_abstract_INF(p->morpho_dic_inf[i],&(p->morpho_dic_inf_free[i])); free_abstract_BIN(p->morpho_dic_bin[i],&(p->morpho_dic_bin_free[i])); } free(p->morpho_dic_inf); free(p->morpho_dic_inf_free); free(p->morpho_dic_bin); free(p->morpho_dic_bin_free); #if (defined(UNITEX_LIBRARY) || defined(UNITEX_RELEASE_MEMORY_AT_EXIT)) 
free_DLC_tree(p->DLC_tree); #endif free_locate_parameters(p); u_printf("Done.\n"); return 1; }
/**
 * This function minimizes the given automaton. Note
 * that it must be deterministic. For more information,
 * see comments in this library's .h file.
 *
 * The minimization iteratively refines a partition of the states
 * ("colors"/"shades") until it is stable, then builds a new automaton with
 * one state per equivalence class and swaps it in place of the original.
 */
void elag_minimize(SingleGraph automaton,int level) {
struct list_int* initials=get_initial_states(automaton);
if (initials==NULL) {
   /* No initial state should mean 'empty automaton' */
   if (automaton->number_of_states!=0) {
      /* If not, we fail */
      fatal_error("No initial state in non empty automaton in elag_minimize\n");
   }
   return;
}
if (initials->next!=NULL) {
   /* More than one initial state: the automaton is not deterministic */
   fatal_error("Non-deterministic automaton in elag_minimize\n");
}
free_list_int(initials);
if (level>0) {
   /* If necessary, we remove transitions that are included in the
    * default ones */
   compact_default_transitions(automaton);
}
SymbolAlphabet* alph=build_symbol_alphabet(automaton);
TransitionCollection** transitions=build_transition_collections(automaton,alph);
/* Now that we have numbered transitions, we don't need the symbol
 * alphabet anymore */
free_SymbolAlphabet(alph);
int nbColors;
int nbShades;
int* color=(int*)calloc(automaton->number_of_states,sizeof(int));
if (color==NULL) {
   fatal_alloc_error("elag_minimize");
}
int* shade=init_colors(automaton,&nbShades);
do {
   int s;
   /* We copy the shades into the color array */
   for (s=0;s<automaton->number_of_states;s++) {
      color[s]=shade[s];
   }
   nbColors=nbShades;
   nbShades=0;
   /* We update the colors of the transitions' destination states */
   update_colors(transitions,color,automaton->number_of_states);
   /* Now, for each state #s, we look for its shade, comparing it with
    * all the states #i so that i<s */
   for (s=0;s<automaton->number_of_states;s++) {
      shade[s]=get_shade(s,transitions,color,shade,&nbShades);
   }
   /* We stop when no more shades have been introduced */
} while (nbColors!=nbShades);
/* One representative state is chosen per color */
int* chosen=choose_states(color,nbColors,automaton->number_of_states);
for (int i=0;i<automaton->number_of_states;i++) {
   free_TransitionCollection(transitions[i]);
}
free(transitions);
free(shade);
/* We allocate the resulting automaton */
SingleGraph result=new_SingleGraph(nbColors,PTR_TAGS);
for (int c=0;c<nbColors;c++) {
   SingleGraphState state=add_state(result);
   SingleGraphState original=automaton->states[chosen[c]];
   /* We set the initiality and finality of the state */
   state->control=original->control;
   /* Ownership of the transition list is transferred to the new state,
    * so the original must not keep a pointer to it */
   state->outgoing_transitions=original->outgoing_transitions;
   original->outgoing_transitions=NULL;
   /* We renumber the transitions' destination states */
   for (Transition* t1=state->outgoing_transitions;t1!=NULL;t1=t1->next) {
      t1->state_number=color[t1->state_number];
   }
   state->default_state=original->default_state;
}
/* Now we have to replace the old automaton by the new one */
move_SingleGraph(automaton,&result,free_symbol);
/* And we don't need these arrays anymore */
free(color);
free(chosen);
}
/**
 * Frees a whole int list.
 *
 * Convenience wrapper: forwards to the allocator-aware overload, using
 * the standard allocator.
 */
void free_list_int(struct list_int* head) {
free_list_int(head,STANDARD_ALLOCATOR);
}
/** * Returns 1 if the given .fst2 is OK to be used by the Locate program; 0 otherwise. * Conditions are: * * 1) no left recursion * 2) no loop that can recognize the empty word (<E> with an output or subgraph * that can match the empty word). */ int OK_for_Locate_write_error(const VersatileEncodingConfig* vec,const char* name,char no_empty_graph_warning,U_FILE* ferr) { int RESULT=1; struct FST2_free_info fst2_free; Fst2* fst2=load_abstract_fst2(vec,name,1,&fst2_free); if (fst2==NULL) { fatal_error("Cannot load graph %s\n",name); } u_printf("Creating condition sets...\n"); GrfCheckInfo* chk=new_GrfCheckInfo(fst2); /* Now, we look for a fix point in the condition graphs */ struct list_int* list=NULL; /* To do that, we start by creating a list of all the graphs we are sure about */ int unknown=0; for (int i=1;i<fst2->number_of_graphs+1;i++) { if (chk->graphs_matching_E[i]!=CHK_DONT_KNOW) { list=new_list_int(i,list); } else { unknown++; } } /* While there is something to do for E matching */ u_printf("Checking empty word matching...\n"); while (resolve_all_conditions(chk,&list,&unknown)) {} if (chk->graphs_matching_E[1]==CHK_MATCHES_E) { if (!no_empty_graph_warning) { error("ERROR: the main graph %S recognizes <E>\n",fst2->graph_names[1]); if (ferr!=NULL) { u_fprintf(ferr,"ERROR: the main graph %S recognizes <E>\n",fst2->graph_names[1]); } } goto evil_goto; } if (!no_empty_graph_warning) { for (int i=2;i<fst2->number_of_graphs+1;i++) { if (chk->graphs_matching_E[i]==CHK_MATCHES_E) { error("WARNING: the graph %S recognizes <E>\n",fst2->graph_names[i]); if (ferr!=NULL) { u_fprintf(ferr,"WARNING: the graph %S recognizes <E>\n",fst2->graph_names[i]); } } } } /* Now, we look for E loops and left recursions. And to do that, we need a new version * of the condition graphs, because a graph that does not match E would have been emptied. * And obviously, we can not deduce anything from an empty graph. 
*/ rebuild_condition_graphs(chk); u_printf("Checking E loops...\n"); if (is_any_E_loop(chk)) { /* Error messages have already been printed */ goto evil_goto; } u_printf("Checking left recursions...\n"); if (is_any_left_recursion(chk)) { /* Error messages have already been printed */ goto evil_goto; } evil_goto: /* There may be something unused in the list that we need to free */ free_list_int(list); free_GrfCheckInfo(chk); free_abstract_Fst2(fst2,&fst2_free); return RESULT; }
/**
 * Returns the number of the state that is pointed by the $] transition that
 * closes the current context, or -1 if not found.
 * Note that nested contexts are taken into account.
 */
static int get_end_of_context(Fst2* fst2,int state) {
struct list_int* seen=NULL;
int result=get_end_of_context__(fst2,state,&seen);
/* The visited-state list is only needed during the traversal */
free_list_int(seen);
return result;
}