Example #1
/**
 * This function takes a condition, i.e. a list of graph numbers, and tests
 * whether all the corresponding graphs match <E>. If so, it sets
 * '*matches_E' to E_IS_MATCHED.
 */
struct list_int* resolve_simple_condition(struct list_int* c,Fst2State* states,
                                    int* initial_states,int *modification,
                                    int *matches_E) {
struct list_int* tmp;
*matches_E=E_IS_MATCHED;
if (c==NULL) return NULL;
/* First, we try to solve the rest of the condition */
tmp=resolve_simple_condition(c->next,states,initial_states,modification,matches_E);
if ((*matches_E)==E_IS_NOT_MATCHED) {
   /* If at least one element of the rest of the condition does not
    * match <E>, we can delete the current element (the rest has
    * already been freed). */
   free(c);
   (*modification)++;
   return NULL;
}
if ((*matches_E)==E_IS_MATCHED) {
   /* If all the elements of the rest of the condition match <E> */
   c->next=tmp;
   if (is_bit_mask_set(states[initial_states[c->n]]->control,UNCONDITIONAL_E_MATCH)) {
      /* If the current one also matches <E>, then we can return */
      return c;
   }
   if (is_bit_mask_set(states[initial_states[c->n]]->control,CONDITIONAL_E_MATCH) && 
       !is_bit_mask_set(states[initial_states[c->n]]->control,DOES_NOT_MATCH_E)) {
      /* If we don't know if the current element matches <E> or not */
      *matches_E=DOES_NOT_KNOW_IF_E_IS_MATCHED;
      return c;
   }
   /* If the current element does not match <E>, we can free the rest of the condition */
   free_list_int(c);
   *matches_E=E_IS_NOT_MATCHED;
   (*modification)++;
   return NULL;
}
/* If we don't know whether the rest of the condition matches <E> */
c->next=tmp;
if (is_bit_mask_set(states[initial_states[c->n]]->control,UNCONDITIONAL_E_MATCH)) {
   /* If the current element matches <E>, we still cannot decide */
   *matches_E=DOES_NOT_KNOW_IF_E_IS_MATCHED;
   return c;
}
if (is_bit_mask_set(states[initial_states[c->n]]->control,DOES_NOT_MATCH_E)) {
   /* If the current element does not match <E>, the condition is not satisfied */
   free_list_int(c);
   *matches_E=E_IS_NOT_MATCHED;
   (*modification)++;
   return NULL;
}
/* If we still don't know, we do nothing */
*matches_E=DOES_NOT_KNOW_IF_E_IS_MATCHED;
return c;
}
/**
 * Returns 1 if there is something more to do after this call, or 0 if:
 * - no new information was found
 * - the main graph matches <E>
 */
static int resolve_all_conditions(GrfCheckInfo* chk,struct list_int* *list,int *unknown) {
*unknown=0;
struct list_int* new_list=NULL;
for (int i=1;i<chk->fst2->number_of_graphs+1;i++) {
	if (chk->graphs_matching_E[i]==CHK_DONT_KNOW) {
		/* We only need to look at the graphs we are not sure about yet */
		resolve_conditions(chk,i,*list);
		chk->graphs_matching_E[i]=get_status(chk->condition_graphs[i]);
		if (chk->graphs_matching_E[i]!=CHK_DONT_KNOW) {
			/* If we have found an answer, we note that graph #i must be
			 * looked at on the next loop */
			new_list=new_list_int(i,new_list);
		} else {
			/* The graph is still unknown */
			(*unknown)++;
		}
	}
}
/* Now we can use the new list */
free_list_int(*list);
*list=new_list;
if (chk->graphs_matching_E[1]==CHK_MATCHES_E) {
	error("Main graph matches epsilon!\n");
	return 0;
}
return ((*list)!=NULL && (*unknown)!=0);
}
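/*
 * Illustrative sketch (not part of the Unitex sources): resolve_all_conditions is
 * called in a loop until it reports that nothing changed, i.e. the per-graph
 * statuses are computed as a fixed point. The toy propagation rule and the names
 * below are assumptions made for this example only.
 */
#include <stdio.h>

#define UNKNOWN 0
#define YES     1
#define NO      2

/* Toy rule: graph i takes the status of graph i+1 once that one is known.
 * The point is the loop shape: iterate until a full pass changes nothing. */
static int resolve_once(int status[],int n) {
int changed=0;
for (int i=0;i<n-1;i++) {
   if (status[i]==UNKNOWN && status[i+1]!=UNKNOWN) {
      status[i]=status[i+1];
      changed=1;
   }
}
return changed;
}

int main() {
int status[]={UNKNOWN,UNKNOWN,UNKNOWN,YES};
/* The fixed point is reached when a whole pass introduces no change */
while (resolve_once(status,4)) {}
for (int i=0;i<4;i++) printf("graph %d: %d\n",i,status[i]);
return 0;
}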
/**
 * Frees all the memory associated with the given reg2grf_info structure.
 */
void free_reg2grf_info(struct reg2grf_info* info) {
if (info==NULL) return;
for (int i=0;i<info->n_states;i++) {
   if (info->states[i].content!=NULL) free(info->states[i].content);
   free_list_int(info->states[i].transitions);
}
free(info);
}
Example #4
/**
 * Frees the given tfst_match.
 */
void free_tfst_match(struct tfst_match* match) {
if (match==NULL) {
   fatal_error("NULL error in free_tfst_match");
}
/* We MUST NOT free 'fst2_transition' since it is just a copy of an actual transition
 * in the grammar */
free_list_int(match->text_tag_numbers);
free(match);
}
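/*
 * Illustrative sketch (not part of the Unitex sources): the ownership rule applied
 * in free_tfst_match. A structure may hold both borrowed pointers (owned by someone
 * else and never freed here) and owned data (freed here). The types below are
 * hypothetical stand-ins, not the actual tfst_match fields.
 */
#include <stdlib.h>

struct grammar_transition;   /* owned by the grammar, not by the match */

struct toy_match {
   const struct grammar_transition* fst2_transition;   /* borrowed: do not free */
   int* text_tag_numbers;                               /* owned: must be freed  */
};

/* Only the data the match owns is released; the borrowed transition pointer
 * is left alone because the grammar will free it itself. */
void free_toy_match(struct toy_match* m) {
if (m==NULL) return;
free(m->text_tag_numbers);
free(m);
}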
Example #5
/**
 * Frees all the memory associated with the given condition list.
 */
void free_ConditionList(ConditionList l) {
ConditionList tmp;
while (l!=NULL) {
   free_list_int(l->condition);
   tmp=l;
   l=l->next;
   free(tmp);
}
}
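/*
 * Illustrative sketch (not part of the Unitex sources): the "save the successor
 * before freeing" idiom used by free_ConditionList, shown on a plain stand-in
 * list type rather than the actual ConditionList.
 */
#include <stdlib.h>

struct toy_cell {
   int value;
   struct toy_cell* next;
};

/* Frees a whole singly linked list. The next pointer must be read
 * before the current cell is freed. */
void free_toy_list(struct toy_cell* l) {
struct toy_cell* tmp;
while (l!=NULL) {
   tmp=l;        /* remember the current cell */
   l=l->next;    /* advance first ...         */
   free(tmp);    /* ... then free             */
}
}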
Example #6
int main()
{
	list_int* l = make_list_int();
	free_list_int(l);

	pair_vector_int_vector_float* s = make_pair_vector_int_vector_float(make_vector_int(5), make_vector_float(5));
	free_pair_vector_int_vector_float(s);

	vector_vector_vector_int* w = make_vector_vector_vector_int(5);
	free_vector_vector_vector_int(w);

	return 0;
}
/**
 * Frees the whole dictionary graph whose root is 'a'.
 */
void free_dictionary_node(struct dictionary_node* a,Abstract_allocator prv_alloc) {
if (a==NULL) return;

if (get_allocator_cb_flag(prv_alloc) & AllocatorGetFlagAutoFreePresent) return;

if (a->incoming>1) {
   /* We don't free a state that is still pointed to by someone else,
    * in order to avoid double-free problems. */
   (a->incoming)--;
   return;
}
free_list_int(a->single_INF_code_list,prv_alloc);
free_dictionary_node_transition(a->trans,prv_alloc);
free_cb(a,prv_alloc);
}
/**
 * Decrements the usage counter of the given node (the root of a compound
 * word tree) and frees the object from memory when the counter reaches 0.
 *
 * WARNING: this function tries to free both 'transitions' and 'destination_nodes',
 *          so, in order to avoid double freeing, the programmer must take care not
 *          to have the same node referenced in both 'transitions' and 'destination_nodes'.
 */
void decrement_reference_DLC_tree_node(struct DLC_tree_node* node) {
if (node==NULL) return;
node->count_reference--;
if (node->count_reference>0) return ;

free_list_int(node->patterns);
if (node->array_of_patterns!=NULL) free(node->array_of_patterns);
free_DLC_tree_transitions(node->transitions);
if (node->destination_tokens!=NULL) free(node->destination_tokens);
if (node->destination_nodes!=NULL) {
   for (int i=0;i<node->number_of_transitions;i++) {
      decrement_reference_DLC_tree_node(node->destination_nodes[i]);
   }
   free(node->destination_nodes);
}
free(node);
}
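/*
 * Illustrative sketch (not part of the Unitex sources): the reference counting
 * scheme used by free_dictionary_node and decrement_reference_DLC_tree_node,
 * where a node shared by several parents is only released by its last owner.
 * The structure below is a hypothetical stand-in.
 */
#include <stdlib.h>

struct rc_node {
   int refcount;               /* how many parents point at this node */
   int n_children;
   struct rc_node** children;
};

/* Decrements the reference count and frees the node (recursively releasing
 * its children) only when nobody points at it anymore. */
void release_rc_node(struct rc_node* node) {
if (node==NULL) return;
node->refcount--;
if (node->refcount>0) return;   /* still owned by someone else */
for (int i=0;i<node->n_children;i++) {
   release_rc_node(node->children[i]);
}
free(node->children);
free(node);
}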
/**
 * This function takes a unicode string 'word' representing a compound word, and
 * tokenizes it into tokens. The output is an array 'tokens' that contains the
 * numbers of the tokens that constitute the word. If case variants are allowed,
 * a token can be replaced by a token list delimited by the special values
 * BEGIN_CASE_VARIANT_LIST and END_CASE_VARIANT_LIST. The token list is ended
 * by END_TOKEN_LIST.
 *
 * The array 'tokens' is supposed to be large enough. 'tok' represents the text tokens.
 * 'tokenization_mode' indicates if the word must be tokenized character by character
 * or not.
 */
void tokenize_compound_word(const unichar* word,int tokens[],const Alphabet* alphabet,
                            struct string_hash* tok,TokenizationPolicy tokenization_mode) {

int n_token,j;
struct list_ustring* list=tokenize(word,tokenization_mode,alphabet);
struct list_ustring* tmp;
struct list_int* ptr;
n_token=0;
while (list!=NULL) {
   j=get_value_index(list->string,tok,DONT_INSERT);
   /* If a token of a compound word is not a token of the text,
    * we MUST NOT ignore it. For instance, if we have the compound
    * word "a priori" and if the text only contains "PRIORI", it is not
    * an error case. The error case is when there is no case equivalent of
    * "priori" in the text. In such a situation, we translate it into an empty
    * list. We don't raise an error, because if a dictionary happened by accident
    * to contain a token that is not in the text, raising an error would needlessly
    * block the Locate program. */
   if (is_letter(list->string[0],alphabet) || j==-1) {
      /* If the current token is made of letters, we look for all
       * its case variants. If we have a non letter token that is
       * not in the text tokens, we handle it here to produce an
       * empty case variant list. */
      tokens[n_token++]=BEGIN_CASE_VARIANT_LIST;
      ptr=get_token_list_for_sequence(list->string,alphabet,tok);
      struct list_int* ptr_copy=ptr; // s.n.
      while (ptr!=NULL) {
         j=ptr->n;
         tokens[n_token++]=j;
         ptr=ptr->next;
      }
      free_list_int(ptr_copy); // s.n.
      tokens[n_token++]=END_CASE_VARIANT_LIST;
   } else {
      /* If we have a non letter single character, we just add its number to
       * the token array */
      tokens[n_token++]=j;
   }
   tmp=list;
   list=list->next;
   free_list_ustring_element(tmp);
}
/* Finally, we end the token list. */
tokens[n_token]=END_TOKEN_LIST;
}
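/*
 * Illustrative sketch (not part of the Unitex sources): how the flat 'tokens'
 * array produced by tokenize_compound_word can be read back. The sentinel values
 * and token numbers below are made up for this example; the real constants are
 * defined in the Unitex headers.
 */
#include <stdio.h>

#define BEGIN_CASE_VARIANT_LIST -2
#define END_CASE_VARIANT_LIST   -3
#define END_TOKEN_LIST          -1

/* Prints the token numbers, grouping the case variants of each word. */
void print_compound_tokens(const int tokens[]) {
for (int i=0;tokens[i]!=END_TOKEN_LIST;i++) {
   if (tokens[i]==BEGIN_CASE_VARIANT_LIST) {
      printf("[");
   } else if (tokens[i]==END_CASE_VARIANT_LIST) {
      printf(" ]\n");
   } else {
      printf(" %d",tokens[i]);
   }
}
}

int main() {
/* "a priori" with two case variants of "priori" (token numbers are arbitrary) */
int tokens[]={BEGIN_CASE_VARIANT_LIST,12,END_CASE_VARIANT_LIST,
              7, /* space token */
              BEGIN_CASE_VARIANT_LIST,45,97,END_CASE_VARIANT_LIST,
              END_TOKEN_LIST};
print_compound_tokens(tokens);
return 0;
}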
Example #10
/**
 * This function concatenates B at the end of A. A is modified.
 */
void elag_concat(language_t* language,SingleGraph A,SingleGraph B) {
int oldnb=A->number_of_states;
int* renumber=(int*)malloc(B->number_of_states*sizeof(int));
if (renumber==NULL) {
   fatal_alloc_error("elag_concat");
}
int q;
/* We copy the states of B into A */
for (q=0;q<B->number_of_states;q++) {
   renumber[q]=A->number_of_states;
   add_state(A);
}
for (q=0;q<B->number_of_states;q++) {
   A->states[renumber[q]]->outgoing_transitions=clone_transition_list(B->states[q]->outgoing_transitions,renumber,dup_symbol);
   A->states[renumber[q]]->default_state=(B->states[q]->default_state!=-1)?renumber[B->states[q]->default_state]:-1;
   if (is_final_state(B->states[q])) {
      set_final_state(A->states[renumber[q]]);
   }
}
/* Then, we concatenate A and B.
 * 1) We replace the default transitions leaving B's initial states
 *    with explicit transitions */
struct list_int* initials=get_initial_states(B);
for (struct list_int* tmp=initials;tmp!=NULL;tmp=tmp->next) {
   explicit_default_transition(language,A,renumber[tmp->n]);
}
for (q=0;q<oldnb;q++) {
   if (is_final_state(A->states[q])) {
      /* Each final state of A becomes non-final. Moreover, we have to make
       * its default transition explicit; otherwise, the concatenation
       * algorithm would modify the recognized language. */
      unset_final_state(A->states[q]);
      explicit_default_transition(language,A,q);
      for (struct list_int* tmp=initials;tmp!=NULL;tmp=tmp->next) {
         concat(&(A->states[q]->outgoing_transitions),clone_transition_list(A->states[renumber[tmp->n]]->outgoing_transitions,NULL,dup_symbol));
         if (is_final_state(A->states[renumber[tmp->n]])) {
            set_final_state(A->states[q]);
         }
      }
   }
}
free(renumber);
free_list_int(initials);
}
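/*
 * Illustrative sketch (not part of the Unitex sources): the renumbering step at
 * the heart of elag_concat. B's states are appended to A and every copied
 * transition destination is redirected through a 'renumber' table. The toy
 * automaton type below is an assumption made for this example only.
 */
#include <stdlib.h>

#define N_SYMBOLS 2

/* trans[q][s] is the destination of state q on symbol s, or -1 if none */
typedef struct {
   int n_states;
   int (*trans)[N_SYMBOLS];
} ToyFsm;

/* Copies B's states at the end of A, renumbering every destination.
 * Returns the renumber table (to be freed by the caller), or NULL on failure. */
int* append_states(ToyFsm* A,const ToyFsm* B) {
int* renumber=(int*)malloc(B->n_states*sizeof(int));
if (renumber==NULL) return NULL;
int (*grown)[N_SYMBOLS]=(int (*)[N_SYMBOLS])realloc(A->trans,
                        (A->n_states+B->n_states)*sizeof(*A->trans));
if (grown==NULL) {
   free(renumber);
   return NULL;
}
A->trans=grown;
/* B's state q becomes A's state renumber[q] */
for (int q=0;q<B->n_states;q++) {
   renumber[q]=A->n_states+q;
}
/* Copy B's transitions, redirecting each destination through the table */
for (int q=0;q<B->n_states;q++) {
   for (int s=0;s<N_SYMBOLS;s++) {
      int d=B->trans[q][s];
      A->trans[renumber[q]][s]=(d==-1)?-1:renumber[d];
   }
}
A->n_states=A->n_states+B->n_states;
return renumber;
}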
Example #11
/**
 * This function loads a DLF or a DLC. It computes information about tokens
 * that will be used during the Locate operation. For instance, if we have the
 * following line:
 *
 *   extended,.A
 *
 * and if the .fst2 to be applied to the text contains the pattern <A> with
 * number 456, then the function will mark the "extended" token to be matched
 * by the pattern 456. Moreover, all case variations will be taken into account,
 * so that the "Extended" and "EXTENDED" tokens will also be updated.
 *
 * The two parameters 'is_DIC_pattern' and 'is_CDIC_pattern'
 * indicate if the .fst2 contains the corresponding patterns. For instance, if
 * the pattern "<CDIC>" is used in the grammar, it means that any token sequence that is a
 * compound word must be marked as matched by this pattern.
 */
void load_dic_for_locate(const char* dic_name,int mask_encoding_compatibility_input,Alphabet* alphabet,
                         int number_of_patterns,int is_DIC_pattern,
                         int is_CDIC_pattern,
                         struct lemma_node* root,struct locate_parameters* parameters) {
    struct string_hash* tokens=parameters->tokens;
    U_FILE* f;
    unichar line[DIC_LINE_SIZE];
    f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,dic_name,U_READ);
    if (f==NULL) {
        error("Cannot open dictionary %s\n",dic_name);
        return;
    }
    /* We parse all the lines */
    int lines=0;
    char name[FILENAME_MAX];
    remove_path(dic_name,name);
    while (EOF!=u_fgets(line,f)) {
        lines++;
        if (lines%10000==0) {
            u_printf("%s: %d lines loaded...                          \r",name,lines);
        }
        if (line[0]=='/') {
            /* NOTE: DLF and DLC files are not supposed to contain comment
             *       lines, but we test for them, just in case */
            continue;
        }
        struct dela_entry* entry=tokenize_DELAF_line(line,1);
        if (entry==NULL) {
            /* This case should never happen */
            error("Invalid dictionary line in load_dic_for_locate\n");
            continue;
        }
        /* We add the inflected form to the list of forms associated to the lemma.
         * This will be used to replace patterns like "<be>" by the actual list of
         * forms that can be matched by it, for optimization reasons */
        add_inflected_form_for_lemma(entry->inflected,entry->lemma,root);
        /* We get the list of all tokens that can be matched by the inflected form
         * of this entry, with regard to case variations (see the "extended" example above). */
        struct list_int* ptr=get_token_list_for_sequence(entry->inflected,alphabet,tokens);
        /* We save the list pointer to free it later */
        struct list_int* ptr_copy=ptr;
        /* Here, we will deal with all simple words */
        while (ptr!=NULL) {
            int i=ptr->n;
            /* If the current token can be matched, then it can be recognized by the "<DIC>" pattern */
            parameters->token_control[i]=(unsigned char)(get_control_byte(tokens->value[i],alphabet,NULL,parameters->tokenization_policy)|DIC_TOKEN_BIT_MASK);
            if (number_of_patterns) {
                /* We look for matching patterns only if there are some */
                struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
                if (list!=NULL) {
                    /* If we have some patterns to add */
                    if (parameters->matching_patterns[i]==NULL) {
                        /* We allocate the pattern bit array, if needed */
                        parameters->matching_patterns[i]=new_bit_array(number_of_patterns,ONE_BIT);
                    }
                    struct list_pointer* tmp=list;
                    while (tmp!=NULL) {
                        /* Then we add all the pattern numbers to the bit array */
                        set_value(parameters->matching_patterns[i],((struct constraint_list*)(tmp->pointer))->pattern_number,1);
                        tmp=tmp->next;
                    }
                    /* Finally, we free the constraint list */
                    free_list_pointer(list);
                }
            }
            ptr=ptr->next;
        }
        /* Finally, we free the token list */
        free_list_int(ptr_copy);
        if (!is_a_simple_word(entry->inflected,parameters->tokenization_policy,alphabet)) {
            /* If the inflected form is a compound word */
            if (is_DIC_pattern || is_CDIC_pattern) {
                /* If the .fst2 contains "<DIC>" and/or "<CDIC>", then we
                 * must note that all compound words can be matched by them */
                add_compound_word_with_no_pattern(entry->inflected,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy);
            }
            if (number_of_patterns) {
                /* We look for matching patterns only if there are some */
                /* We look if the compound word can be matched by some patterns */
                struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
                struct list_pointer* tmp=list;
                while (tmp!=NULL) {
                    /* If the word is matched by at least one pattern, we store it. */
                    int pattern_number=((struct constraint_list*)(tmp->pointer))->pattern_number;
                    add_compound_word_with_pattern(entry->inflected,pattern_number,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy);
                    tmp=tmp->next;
                }
                free_list_pointer(list);
            }
        }
        free_dela_entry(entry);
    }
    if (lines>10000) {
        u_printf("\n");
    }
    u_fclose(f);
}
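/*
 * Illustrative sketch (not part of the Unitex sources): the per-token pattern
 * bit array that load_dic_for_locate fills through new_bit_array/set_value.
 * This is a minimal stand-in for that structure (one bit per pattern number),
 * not the actual Unitex API.
 */
#include <stdlib.h>

typedef struct {
   unsigned char* bits;
   int n_bits;
} ToyBitArray;

ToyBitArray* toy_bit_array_new(int n_bits) {
ToyBitArray* b=(ToyBitArray*)malloc(sizeof(ToyBitArray));
if (b==NULL) return NULL;
b->n_bits=n_bits;
b->bits=(unsigned char*)calloc((n_bits+7)/8,1);
if (b->bits==NULL) {
   free(b);
   return NULL;
}
return b;
}

/* Marks pattern #i as matching the token this array belongs to */
void toy_bit_array_set(ToyBitArray* b,int i) {
b->bits[i/8]=(unsigned char)(b->bits[i/8]|(1u<<(i%8)));
}

int toy_bit_array_get(const ToyBitArray* b,int i) {
return (b->bits[i/8]>>(i%8))&1;
}

void toy_bit_array_free(ToyBitArray* b) {
if (b==NULL) return;
free(b->bits);
free(b);
}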
Example #12
int locate_pattern(const char* text_cod,const char* tokens,const char* fst2_name,const char* dlf,const char* dlc,const char* err,
                   const char* alphabet,MatchPolicy match_policy,OutputPolicy output_policy,
                   Encoding encoding_output,int bom_output,int mask_encoding_compatibility_input,
                   const char* dynamicDir,TokenizationPolicy tokenization_policy,
                   SpacePolicy space_policy,int search_limit,const char* morpho_dic_list,
                   AmbiguousOutputPolicy ambiguous_output_policy,
                   VariableErrorPolicy variable_error_policy,int protect_dic_chars,
                   int is_korean,int max_count_call,int max_count_call_warning,
                   char* arabic_rules,int tilde_negation_operator,int useLocateCache,int allow_trace) {

    U_FILE* out;
    U_FILE* info;
    struct locate_parameters* p=new_locate_parameters();
    p->text_cod=af_open_mapfile(text_cod,MAPFILE_OPTION_READ,0);
    p->buffer=(int*)af_get_mapfile_pointer(p->text_cod);
    long text_size=(long)af_get_mapfile_size(p->text_cod)/sizeof(int);
    p->buffer_size=(int)text_size;
    p->tilde_negation_operator=tilde_negation_operator;
    p->useLocateCache=useLocateCache;
    if (max_count_call == -1) {
        max_count_call = (int)text_size;
    }
    if (max_count_call_warning == -1) {
        max_count_call_warning = (int)text_size;
    }
    p->match_policy=match_policy;
    p->tokenization_policy=tokenization_policy;
    p->space_policy=space_policy;
    p->output_policy=output_policy;
    p->search_limit=search_limit;
    p->ambiguous_output_policy=ambiguous_output_policy;
    p->variable_error_policy=variable_error_policy;
    p->protect_dic_chars=protect_dic_chars;
    p->mask_encoding_compatibility_input = mask_encoding_compatibility_input;
    p->max_count_call = max_count_call;
    p->max_count_call_warning = max_count_call_warning;
    p->token_filename = tokens;
    char concord[FILENAME_MAX];
    char concord_info[FILENAME_MAX];

    strcpy(concord,dynamicDir);
    strcat(concord,"concord.ind");

    strcpy(concord_info,dynamicDir);
    strcat(concord_info,"concord.n");

    char morpho_bin[FILENAME_MAX];
    strcpy(morpho_bin,dynamicDir);
    strcat(morpho_bin,"morpho.bin");
    if (arabic_rules!=NULL && arabic_rules[0]!='\0') {
        load_arabic_typo_rules(arabic_rules,&(p->arabic));
    }
    out=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord,U_WRITE);
    if (out==NULL) {
        error("Cannot write %s\n",concord);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        return 0;
    }
    info=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord_info,U_WRITE);
    if (info==NULL) {
        error("Cannot write %s\n",concord_info);
    }
    switch(output_policy) {
    case IGNORE_OUTPUTS:
        u_fprintf(out,"#I\n");
        break;
    case MERGE_OUTPUTS:
        u_fprintf(out,"#M\n");
        break;
    case REPLACE_OUTPUTS:
        u_fprintf(out,"#R\n");
        break;
    }
    if (alphabet!=NULL && alphabet[0]!='\0') {
        u_printf("Loading alphabet...\n");
        p->alphabet=load_alphabet(alphabet,is_korean);
        if (p->alphabet==NULL) {
            error("Cannot load alphabet file %s\n",alphabet);
            af_release_mapfile_pointer(p->text_cod,p->buffer);
            af_close_mapfile(p->text_cod);
            free_stack_unichar(p->stack);
            free_locate_parameters(p);
            if (info!=NULL) u_fclose(info);
            u_fclose(out);
            return 0;
        }
    }
    struct string_hash* semantic_codes=new_string_hash();
    extract_semantic_codes(dlf,semantic_codes);
    extract_semantic_codes(dlc,semantic_codes);

    if (is_cancelling_requested() != 0) {
        error("User cancel request.\n");
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }

    u_printf("Loading fst2...\n");
    struct FST2_free_info fst2load_free;
    Fst2* fst2load=load_abstract_fst2(fst2_name,1,&fst2load_free);
    if (fst2load==NULL) {
        error("Cannot load grammar %s\n",fst2_name);
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }

    Abstract_allocator locate_abstract_allocator=create_abstract_allocator("locate_pattern",AllocatorCreationFlagAutoFreePrefered);


    p->fst2=new_Fst2_clone(fst2load,locate_abstract_allocator);
    free_abstract_Fst2(fst2load,&fst2load_free);

    if (is_cancelling_requested() != 0) {
        error("User cancel request.\n");
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        free_Fst2(p->fst2,locate_abstract_allocator);
        close_abstract_allocator(locate_abstract_allocator);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }

    p->tags=p->fst2->tags;
#ifdef TRE_WCHAR
    p->filters=new_FilterSet(p->fst2,p->alphabet);
    if (p->filters==NULL) {
        error("Cannot compile filter(s)\n");
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        free_Fst2(p->fst2,locate_abstract_allocator);
        close_abstract_allocator(locate_abstract_allocator);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }
#endif
    u_printf("Loading token list...\n");
    int n_text_tokens=0;

    p->tokens=load_text_tokens_hash(tokens,mask_encoding_compatibility_input,&(p->SENTENCE),&(p->STOP),&n_text_tokens);
    if (p->tokens==NULL) {
        error("Cannot load token list %s\n",tokens);
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        free_Fst2(p->fst2,locate_abstract_allocator);
        close_abstract_allocator(locate_abstract_allocator);
        free_locate_parameters(p);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }
    Abstract_allocator locate_work_abstract_allocator = locate_abstract_allocator;

    p->match_cache=(LocateCache*)malloc_cb(p->tokens->size * sizeof(LocateCache),locate_work_abstract_allocator);
    if (p->match_cache==NULL) {
        fatal_alloc_error("locate_pattern");
    }
    memset(p->match_cache,0,p->tokens->size * sizeof(LocateCache));

#ifdef TRE_WCHAR
    p->filter_match_index=new_FilterMatchIndex(p->filters,p->tokens);
    if (p->filter_match_index==NULL) {
        error("Cannot optimize filter(s)\n");
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        free_string_hash(p->tokens);
        close_abstract_allocator(locate_abstract_allocator);
        free_locate_parameters(p);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }
#endif

    if (allow_trace!=0) {
        open_locate_trace(p,&p->fnc_locate_trace_step,&p->private_param_locate_trace);
    }
    extract_semantic_codes_from_tokens(p->tokens,semantic_codes,locate_abstract_allocator);
    u_printf("Loading morphological dictionaries...\n");
    load_morphological_dictionaries(morpho_dic_list,p,morpho_bin);
    extract_semantic_codes_from_morpho_dics(p->morpho_dic_inf,p->n_morpho_dics,semantic_codes,locate_abstract_allocator);
    p->token_control=(unsigned char*)malloc(n_text_tokens*sizeof(unsigned char));
    if (p->token_control==NULL) {
        fatal_alloc_error("locate_pattern");
    }
    p->matching_patterns=(struct bit_array**)malloc(n_text_tokens*sizeof(struct bit_array*));
    if (p->matching_patterns==NULL) {
        fatal_alloc_error("locate_pattern");
    }
    for (int i=0; i<n_text_tokens; i++) {
        p->token_control[i]=0;
        p->matching_patterns[i]=NULL;
    }
    compute_token_controls(p->alphabet,err,p);
    int number_of_patterns,is_DIC,is_CDIC,is_SDIC;
    p->pattern_tree_root=new_pattern_node(locate_abstract_allocator);
    u_printf("Computing fst2 tags...\n");
    process_tags(&number_of_patterns,semantic_codes,&is_DIC,&is_CDIC,&is_SDIC,p,locate_abstract_allocator);
    p->current_compound_pattern=number_of_patterns;
    p->DLC_tree=new_DLC_tree(p->tokens->size);
    struct lemma_node* root=new_lemma_node();
    u_printf("Loading dlf...\n");
    load_dic_for_locate(dlf,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p);
    u_printf("Loading dlc...\n");
    load_dic_for_locate(dlc,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p);
    /* We look if tag tokens like "{today,.ADV}" verify some patterns */
    check_patterns_for_tag_tokens(p->alphabet,number_of_patterns,root,p,locate_abstract_allocator);
    u_printf("Optimizing fst2 pattern tags...\n");
    optimize_pattern_tags(p->alphabet,root,p,locate_abstract_allocator);
    u_printf("Optimizing compound word dictionary...\n");
    optimize_DLC(p->DLC_tree);
    free_string_hash(semantic_codes);
    int nb_input_variable=0;
    p->input_variables=new_Variables(p->fst2->input_variables,&nb_input_variable);
    p->output_variables=new_OutputVariables(p->fst2->output_variables,&p->nb_output_variables);


    Abstract_allocator locate_recycle_abstract_allocator=NULL;
    locate_recycle_abstract_allocator=create_abstract_allocator("locate_pattern_recycle",
                                      AllocatorFreeOnlyAtAllocatorDelete|AllocatorTipOftenRecycledObject,
                                      get_prefered_allocator_item_size_for_nb_variable(nb_input_variable));

    u_printf("Optimizing fst2...\n");
    p->optimized_states=build_optimized_fst2_states(p->input_variables,p->output_variables,p->fst2,locate_abstract_allocator);
    if (is_korean) {
        p->korean=new Korean(p->alphabet);
        p->jamo_tags=create_jamo_tags(p->korean,p->tokens);
    }
    p->failfast=new_bit_array(n_text_tokens,ONE_BIT);

    u_printf("Working...\n");
    p->prv_alloc=locate_work_abstract_allocator;
    p->prv_alloc_recycle=locate_recycle_abstract_allocator;
    launch_locate(out,text_size,info,p);
    if (allow_trace!=0) {
        close_locate_trace(p,p->fnc_locate_trace_step,p->private_param_locate_trace);
    }
    free_bit_array(p->failfast);
    free_Variables(p->input_variables);
    free_OutputVariables(p->output_variables);
    af_release_mapfile_pointer(p->text_cod,p->buffer);
    af_close_mapfile(p->text_cod);
    if (info!=NULL) u_fclose(info);
    u_fclose(out);

    if (p->match_cache!=NULL) {
        for (int i=0; i<p->tokens->size; i++) {
            free_LocateCache(p->match_cache[i],locate_work_abstract_allocator);
        }
        free_cb(p->match_cache,locate_work_abstract_allocator);
    }
    int free_abstract_allocator_item=(get_allocator_cb_flag(locate_abstract_allocator) & AllocatorGetFlagAutoFreePresent) ? 0 : 1;

    if (free_abstract_allocator_item) {
        free_optimized_states(p->optimized_states,p->fst2->number_of_states,locate_abstract_allocator);
    }
    free_stack_unichar(p->stack);
    /* Freeing the DLC tree would take too long if it is big:
     * free_DLC_tree(p->DLC_tree);
     */
    if (free_abstract_allocator_item) {
        free_pattern_node(p->pattern_tree_root,locate_abstract_allocator);
        free_Fst2(p->fst2,locate_abstract_allocator);
        free_list_int(p->tag_token_list,locate_abstract_allocator);
    }
    close_abstract_allocator(locate_abstract_allocator);
    close_abstract_allocator(locate_recycle_abstract_allocator);
    locate_recycle_abstract_allocator=locate_abstract_allocator=NULL;

    /* We don't free 'parameters->tags' because it was just a link on 'parameters->fst2->tags' */
    free_alphabet(p->alphabet);
    if (p->korean!=NULL) {
        delete p->korean;
    }
    if (p->jamo_tags!=NULL) {
        /* jamo tags must be freed before tokens, because we need to know how
         * many jamo tags there are, and this number is the number of tokens */
        for (int i=0; i<p->tokens->size; i++) {
            free(p->jamo_tags[i]);
        }
        free(p->jamo_tags);
    }
    free_string_hash(p->tokens);
    free_lemma_node(root);
    free(p->token_control);
    for (int i=0; i<n_text_tokens; i++) {
        free_bit_array(p->matching_patterns[i]);
    }
    free(p->matching_patterns);
#ifdef TRE_WCHAR
    free_FilterSet(p->filters);
    free_FilterMatchIndex(p->filter_match_index);
#endif
    for (int i=0; i<p->n_morpho_dics; i++) {
        free_abstract_INF(p->morpho_dic_inf[i],&(p->morpho_dic_inf_free[i]));
        free_abstract_BIN(p->morpho_dic_bin[i],&(p->morpho_dic_bin_free[i]));
    }
    free(p->morpho_dic_inf);
    free(p->morpho_dic_inf_free);
    free(p->morpho_dic_bin);
    free(p->morpho_dic_bin_free);
#if (defined(UNITEX_LIBRARY) || defined(UNITEX_RELEASE_MEMORY_AT_EXIT))
    free_DLC_tree(p->DLC_tree);
#endif
    free_locate_parameters(p);
    u_printf("Done.\n");
    return 1;
}
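/*
 * Illustrative sketch (not part of the Unitex sources): every error path in
 * locate_pattern repeats the same release sequence by hand. A common C
 * alternative is a single cleanup label reached by goto, shown here on
 * hypothetical resources rather than on the actual Unitex objects.
 */
#include <stdio.h>
#include <stdlib.h>

int run_with_single_cleanup(void) {
int ret=0;
char* alphabet=NULL;
char* grammar=NULL;
FILE* out=NULL;
out=fopen("concord.ind","w");
if (out==NULL) goto cleanup;
alphabet=(char*)malloc(1024);
if (alphabet==NULL) goto cleanup;
grammar=(char*)malloc(1024);
if (grammar==NULL) goto cleanup;
/* ... the real work would go here ... */
ret=1;
cleanup:
/* Every resource is released exactly once, whatever branch we came from */
free(grammar);
free(alphabet);
if (out!=NULL) fclose(out);
return ret;
}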
Example #13
/**
 * This function minimizes the given automaton. Note
 * that it must be deterministic. For more information,
 * see comments in this library's .h file.
 */
void elag_minimize(SingleGraph automaton,int level) {
struct list_int* initials=get_initial_states(automaton);
if (initials==NULL) {
   /* No initial state should mean 'empty automaton' */
   if (automaton->number_of_states!=0) {
      /* If not, we fail */
      fatal_error("No initial state in non empty automaton in elag_minimize\n");
   }
   return;
}
if (initials->next!=NULL) {
   fatal_error("Non-deterministic automaton in elag_minimize\n");
}
free_list_int(initials);
if (level>0) {
   /* If necessary, we remove transitions that are included in the
    * default ones */
   compact_default_transitions(automaton);
}
SymbolAlphabet* alph=build_symbol_alphabet(automaton);
TransitionCollection** transitions=build_transition_collections(automaton,alph);
/* Now that we have numbered transitions, we don't need the symbol
 * alphabet anymore */
free_SymbolAlphabet(alph);
int nbColors;
int nbShades;
int* color=(int*)calloc(automaton->number_of_states,sizeof(int));
if (color==NULL) {
   fatal_alloc_error("elag_minimize");
}
int* shade=init_colors(automaton,&nbShades);
do {
   int s;
   /* We copy the shades into the color array */
   for (s=0;s<automaton->number_of_states;s++) {
      color[s]=shade[s];
   }
   nbColors=nbShades;
   nbShades=0;
   /* We update the colors of the transitions' destination states */
   update_colors(transitions,color,automaton->number_of_states);
   /* Now, for each state #s, we look for its shade, comparing it with
    * all the states #i such that i<s */
   for (s=0;s<automaton->number_of_states;s++) {
      shade[s]=get_shade(s,transitions,color,shade,&nbShades);
   }
   /* We stop when no more shades have been introduced */
} while (nbColors!=nbShades);
int* chosen=choose_states(color,nbColors,automaton->number_of_states);
for (int i=0;i<automaton->number_of_states;i++) {
   free_TransitionCollection(transitions[i]);
}
free(transitions);
free(shade);
/* We allocate the resulting automaton */
SingleGraph result=new_SingleGraph(nbColors,PTR_TAGS);
for (int c=0;c<nbColors;c++) {
   SingleGraphState state=add_state(result);
   SingleGraphState original=automaton->states[chosen[c]];
   /* We set the initiality and finality of the state */
   state->control=original->control;
   state->outgoing_transitions=original->outgoing_transitions;
   original->outgoing_transitions=NULL;
   /* We renumber the transitions' destination states */
   for (Transition* t1=state->outgoing_transitions;t1!=NULL;t1=t1->next) {
      t1->state_number=color[t1->state_number];
   }
   state->default_state=original->default_state;
}
/* Now we have to replace the old automaton by the new one */
move_SingleGraph(automaton,&result,free_symbol);
/* And we don't need these arrays anymore */
free(color);
free(chosen);
}
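/*
 * Illustrative sketch (not part of the Unitex sources): the color/shade
 * refinement loop of elag_minimize, i.e. Moore-style minimization. States start
 * in two classes (final / non-final) and a class is split whenever two of its
 * states reach different classes; the loop stops when a pass introduces no new
 * class. The small complete DFA below is an assumption made for this example.
 */
#include <stdio.h>
#include <string.h>

#define N_STATES  4
#define N_SYMBOLS 2

static const int trans[N_STATES][N_SYMBOLS]={ {1,2},{1,3},{1,2},{1,3} };
static const int is_final[N_STATES]={0,0,0,1};

/* One refinement pass: a state keeps the class of an earlier state only if they
 * had the same class and their transitions lead to the same classes. Writes the
 * new classes into next[] and returns their number. */
static int refine(const int color[],int next[]) {
int sig[N_STATES][1+N_SYMBOLS];
int n_classes=0;
for (int s=0;s<N_STATES;s++) {
   sig[s][0]=color[s];
   for (int a=0;a<N_SYMBOLS;a++) sig[s][1+a]=color[trans[s][a]];
   next[s]=-1;
   for (int t=0;t<s;t++) {
      if (memcmp(sig[s],sig[t],sizeof(sig[s]))==0) {
         next[s]=next[t];
         break;
      }
   }
   if (next[s]==-1) next[s]=n_classes++;
}
return n_classes;
}

int main() {
int color[N_STATES];
int next[N_STATES];
/* 2 initial classes: final and non-final (both are present in this example) */
for (int s=0;s<N_STATES;s++) color[s]=is_final[s]?1:0;
int nbColors=2;
int nbShades;
/* We stop when no more classes have been introduced, as in elag_minimize */
while ((nbShades=refine(color,next))!=nbColors) {
   memcpy(color,next,sizeof(color));
   nbColors=nbShades;
}
for (int s=0;s<N_STATES;s++) printf("state %d -> class %d\n",s,color[s]);
return 0;
}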
/**
 * Frees a whole int list.
 */
void free_list_int(struct list_int* head) {
free_list_int(head,STANDARD_ALLOCATOR);
}
/**
 * Returns 1 if the given .fst2 is OK to be used by the Locate program; 0 otherwise.
 * Conditions are:
 *
 * 1) no left recursion
 * 2) no loop that can recognize the empty word (<E> with an output or subgraph
 *    that can match the empty word).
 */
int OK_for_Locate_write_error(const VersatileEncodingConfig* vec,const char* name,char no_empty_graph_warning,U_FILE* ferr) {
int RESULT=1;
struct FST2_free_info fst2_free;
Fst2* fst2=load_abstract_fst2(vec,name,1,&fst2_free);
if (fst2==NULL) {
	fatal_error("Cannot load graph %s\n",name);
}
u_printf("Creating condition sets...\n");
GrfCheckInfo* chk=new_GrfCheckInfo(fst2);
/* Now, we look for a fix point in the condition graphs */
struct list_int* list=NULL;
/* To do that, we start by creating a list of all the graphs we are sure about */
int unknown=0;
for (int i=1;i<fst2->number_of_graphs+1;i++) {
	if (chk->graphs_matching_E[i]!=CHK_DONT_KNOW) {
		list=new_list_int(i,list);
	} else {
		unknown++;
	}
}
/* While there is something to do for E matching */
u_printf("Checking empty word matching...\n");
while (resolve_all_conditions(chk,&list,&unknown)) {}
if (chk->graphs_matching_E[1]==CHK_MATCHES_E) {
	if (!no_empty_graph_warning) {
       error("ERROR: the main graph %S recognizes <E>\n",fst2->graph_names[1]);
       if (ferr!=NULL) {
          u_fprintf(ferr,"ERROR: the main graph %S recognizes <E>\n",fst2->graph_names[1]);
       }
	}
	RESULT=0;
	goto evil_goto;
}
if (!no_empty_graph_warning) {
	for (int i=2;i<fst2->number_of_graphs+1;i++) {
		if (chk->graphs_matching_E[i]==CHK_MATCHES_E) {
			error("WARNING: the graph %S recognizes <E>\n",fst2->graph_names[i]);
			if (ferr!=NULL) {
				u_fprintf(ferr,"WARNING: the graph %S recognizes <E>\n",fst2->graph_names[i]);
			}
		}
	}
}
/* Now, we look for E loops and left recursions. To do that, we need a new version
 * of the condition graphs, because a graph that does not match <E> would have been
 * emptied, and obviously, we cannot deduce anything from an empty graph. */
rebuild_condition_graphs(chk);
u_printf("Checking E loops...\n");
if (is_any_E_loop(chk)) {
	/* Error messages have already been printed */
	RESULT=0;
	goto evil_goto;
}
u_printf("Checking left recursions...\n");
if (is_any_left_recursion(chk)) {
	/* Error messages have already been printed */
	RESULT=0;
	goto evil_goto;
}
evil_goto:
/* There may be something unused in the list that we need to free */
free_list_int(list);
free_GrfCheckInfo(chk);
free_abstract_Fst2(fst2,&fst2_free);
return RESULT;
}
/**
 * Returns the number of the state that is pointed to by the $] transition that
 * closes the current context or -1 if not found.
 * Note that nested contexts are taken into account.
 */
static int get_end_of_context(Fst2* fst2,int state) {
struct list_int* visited_states=NULL;
int res=get_end_of_context__(fst2,state,&visited_states);
free_list_int(visited_states);
return res;
}
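/*
 * Illustrative sketch (not part of the Unitex sources): the role of the
 * 'visited_states' list in get_end_of_context, i.e. a depth-first search that
 * never revisits a state and therefore terminates on cyclic graphs. The
 * adjacency matrix and the list type below are assumptions made for this example.
 */
#include <stdio.h>
#include <stdlib.h>

#define N_STATES 4

struct toy_int_list {
   int n;
   struct toy_int_list* next;
};

static int toy_contains(const struct toy_int_list* l,int n) {
for (;l!=NULL;l=l->next) if (l->n==n) return 1;
return 0;
}

static struct toy_int_list* toy_push(int n,struct toy_int_list* l) {
struct toy_int_list* cell=(struct toy_int_list*)malloc(sizeof(struct toy_int_list));
if (cell==NULL) return l;
cell->n=n;
cell->next=l;
return cell;
}

static void toy_free(struct toy_int_list* l) {
while (l!=NULL) {
   struct toy_int_list* tmp=l;
   l=l->next;
   free(tmp);
}
}

/* Returns 1 if 'target' is reachable from 'state'. The visited list plays the
 * same role as 'visited_states': it guarantees termination despite cycles. */
static int reaches(int state,int target,const int adj[N_STATES][N_STATES],
                   struct toy_int_list** visited) {
if (state==target) return 1;
if (toy_contains(*visited,state)) return 0;
*visited=toy_push(state,*visited);
for (int next=0;next<N_STATES;next++) {
   if (adj[state][next] && reaches(next,target,adj,visited)) return 1;
}
return 0;
}

int main() {
/* A small graph with a cycle 0 -> 1 -> 0 and a path 1 -> 2 -> 3 */
const int adj[N_STATES][N_STATES]={ {0,1,0,0},{1,0,1,0},{0,0,0,1},{0,0,0,0} };
struct toy_int_list* visited=NULL;
printf("3 reachable from 0: %d\n",reaches(0,3,adj,&visited));
toy_free(visited);
return 0;
}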