/** * Looks for a loop. To do that, we only follow transitions that can match E. * Every time we follow such a transition, we add it to the 'transitions' list. * This list is used to print the E loop if we find any. The function returns * 1 if a loop is found; 0 otherwise. */ static int find_an_E_loop(int* mark,int current_state,int graph,GrfCheckInfo* chk, struct list_pointer* transitions) { if (mark[current_state]==1) { /* The state has been visited, nothing to do */ return 0; } if (mark[current_state]==2) { /* The state is being visited, we have a loop */ error("E loop in graph %S, made of the following tags:\n",chk->fst2->graph_names[graph]); print_reversed_list(transitions,chk->fst2,current_state,0); return 1; } /* We start visiting the state */ mark[current_state]=2; SingleGraphState s=chk->condition_graphs[graph]->states[current_state]; Transition* t=s->outgoing_transitions; while (t!=NULL) { if (transition_can_match_E(t->tag_number,chk)) { struct list_pointer* new_head=new_list_pointer(t,transitions); int res=find_an_E_loop(mark,t->state_number,graph,chk,new_head); new_head->next=NULL; free_list_pointer(new_head); if (res==1) { return 1; } } t=t->next; } /* The state has been fully visited */ mark[current_state]=1; return 0; }
static int find_a_left_recursion(int* mark_graph,int* mark_state,int current_state,int graph, GrfCheckInfo* chk,struct list_pointer* transitions) { if (mark_state[current_state]==1) { /* The state has been visited, nothing to do */ return 0; } if (mark_state[current_state]==2) { /* The state is being visited, we have a loop, but it should have been detected before */ error("E loop in graph %S, made of the following tags:\n",chk->fst2->graph_names[graph]); print_reversed_list(transitions,chk->fst2,current_state,0); return 1; } /* We start visiting the state */ mark_state[current_state]=2; SingleGraphState s=chk->condition_graphs[graph]->states[current_state]; Transition* t=s->outgoing_transitions; while (t!=NULL) { if (t->tag_number<0) { /* As we look for left recursions, we always test recursively * graph calls, regardless the fact that they may match E */ struct list_pointer* new_head=new_list_pointer(t,transitions); int res=is_left_recursion(chk,-(t->tag_number),mark_graph,new_head); new_head->next=NULL; free_list_pointer(new_head); if (res==1) { return 1; } } if (transition_can_match_E(t->tag_number,chk)) { struct list_pointer* new_head=new_list_pointer(t,transitions); int res=find_a_left_recursion(mark_graph,mark_state,t->state_number,graph,chk,new_head); new_head->next=NULL; free_list_pointer(new_head); if (res==1) { return 1; } } t=t->next; } /* The state has been fully visited */ mark_state[current_state]=1; return 0; }
/** * This function explores the partial matches that constitute the given match in order to produce * one or all possible outputs, depending on infos->ambiguous_output_policy. * The output(s) is(are) then used to add matches to the infos->matches list. */ void explore_match_to_get_outputs(struct locate_tfst_infos* infos,struct tfst_match* m, struct tfst_simple_match_list* element) { /* As m is a reversed list, we first need to get its elements in the right order */ vector_ptr* items=new_vector_ptr(16); fill_vector(items,m); Ustring* s=new_Ustring(1024); /* In MERGE/REPLACE mode, we have to explore the combination of partial matches */ struct list_pointer* ptr=NULL; explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,0,s,-1,&ptr); free_list_pointer(ptr); free_Ustring(s); free_vector_ptr(items); }
/** * This function checks for each tag token like "{extended,extend.V:K}" * if it verifies some patterns. Its behaviour is very similar to the one * of the load_dic_for_locate function. However, as a side effect, this * function fills 'tag_token_list' with the list of tag token numbers. * This list is later used during Locate preprocessings. */ void check_patterns_for_tag_tokens(Alphabet* alphabet,int number_of_patterns, struct lemma_node* root,struct locate_parameters* parameters,Abstract_allocator prv_alloc) { struct string_hash* tokens=parameters->tokens; for (int i=0; i<tokens->size; i++) { if (tokens->value[i][0]=='{' && u_strcmp(tokens->value[i],"{S}") && u_strcmp(tokens->value[i],"{STOP}")) { /* If the token is tag like "{today,.ADV}", we add its number to the tag token list */ parameters->tag_token_list=head_insert(i,parameters->tag_token_list,prv_alloc); /* And we look for the patterns that can match it */ struct dela_entry* entry=tokenize_tag_token(tokens->value[i]); if (entry==NULL) { /* This should never happen */ fatal_error("Invalid tag token in function check_patterns_for_tag_tokens\n"); } /* We add the inflected form to the list of forms associated to the lemma. 
* This will be used to replace patterns like "<be>" by the actual list of * forms that can be matched by it, for optimization reasons */ add_inflected_form_for_lemma(tokens->value[i],entry->lemma,root); parameters->token_control[i]=(unsigned char)(get_control_byte(tokens->value[i],alphabet,NULL,parameters->tokenization_policy)|DIC_TOKEN_BIT_MASK); if (number_of_patterns) { /* We look for matching patterns only if there are some */ struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root); if (list!=NULL) { if (parameters->matching_patterns[i]==NULL) { /* We allocate the bit array if needed */ parameters->matching_patterns[i]=new_bit_array(number_of_patterns,ONE_BIT); } struct list_pointer* tmp=list; while (tmp!=NULL) { set_value(parameters->matching_patterns[i],((struct constraint_list*)(tmp->pointer))->pattern_number,1); tmp=tmp->next; } free_list_pointer(list); } } /* At the opposite of DLC lines, a compound word tag like "{all around,.ADV}" * does not need to be put in the compound word tree, since the tag is already * characterized by its token number. */ free_dela_entry(entry); } } }
/** * Explores all the partial matches to produce outputs in MERGE or REPLACE mode. * * If *var_starts!=NULL, it means that there are pending $var_start( tags * that wait for being taken into account when a text dependent tag is found. */ void explore_match_for_MERGE_or_REPLACE_mode(struct locate_tfst_infos* infos, struct tfst_simple_match_list* element, vector_ptr* items,int current_item,Ustring* s, int last_text_dependent_tfst_tag, struct list_pointer* *var_starts) { if (current_item==items->nbelems) { /* If we have finished, we can save the current output */ element->output=s->str; infos->matches=add_element_to_list(infos,infos->matches,element); element->output=NULL; return; } /* We save the length because it will be modified */ int len=s->len; struct tfst_match* item=(struct tfst_match*)(items->tab[current_item]); if (item==NULL) { fatal_error("Unexpected NULL item in explore_match_for_MERGE_mode\n"); } unichar* output=infos->fst2->tags[item->fst2_transition->tag_number]->output; unichar name[MAX_TRANSDUCTION_VAR_LENGTH]; int capture; struct dela_entry* old_value_dela=NULL; capture=is_capture_variable(output,name); if (capture) { /* If we have a capture variable $:X$, we must save the previous value * for this dictionary variable */ old_value_dela=clone_dela_entry(get_dic_variable(name,infos->dic_variables)); } Match saved_element=element->m; struct list_int* text_tags=item->text_tag_numbers; int captured_chars=0; /* We explore all the text tags */ while (text_tags!=NULL) { /* First, we restore the output string */ s->len=len; s->str[len]='\0'; captured_chars=0; /* We deal with the fst2 tag output, if any */ if (item->first_time) { /* We only have to process the output only once, * since it will have the same effect on all tfst tags. 
* * Example: the fst2 tag "cybercrime/ZZ" may match the two tfst tags "cyber" and * "crime", but we must process the "ZZ" output only before the first tfst tag "cyber" */ if (capture) { /* If we have a capture variable, then we have to check whether the tfst tag * is a tagged token or not */ int tfst_tag_number=text_tags->n; int fst2_tag_number=item->fst2_transition->tag_number; if (!do_variable_capture(tfst_tag_number,fst2_tag_number,infos,name)) { goto restore_dic_variable; } } else if (!deal_with_output_tfst(s,output,infos,&captured_chars)) { /* We do not take into account matches with variable errors if the * process_output_for_tfst_match function has decided that backtracking * was necessary, either because of a variable error of because of a * $a.SET$ or $a.UNSET$ test */ goto restore_dic_variable; } } int last_tag=last_text_dependent_tfst_tag; TfstTag* current_tag=NULL; if (text_tags->n==-1) { /* We have a text independent match */ Fst2Tag fst2_tag=infos->fst2->tags[item->fst2_transition->tag_number]; if (fst2_tag->type==BEGIN_OUTPUT_VAR_TAG) { /* If we an output variable start $|a( */ set_output_variable_pending(infos->output_variables,fst2_tag->variable); explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts); unset_output_variable_pending(infos->output_variables,fst2_tag->variable); goto restore_dic_variable; } else if (fst2_tag->type==END_OUTPUT_VAR_TAG) { /* If we an output variable end $|a) */ unset_output_variable_pending(infos->output_variables,fst2_tag->variable); explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts); set_output_variable_pending(infos->output_variables,fst2_tag->variable); goto restore_dic_variable; } else if (fst2_tag->type==BEGIN_VAR_TAG) { /* If we have a variable start tag $a(, we add it to our * variable tag list */ struct transduction_variable* v=get_transduction_variable(infos->input_variables,fst2_tag->variable); int 
old_value=v->start_in_tokens; /* We add the address of the start field to our list */ (*var_starts)=new_list_pointer(&(v->start_in_tokens),(var_starts==NULL)?NULL:(*var_starts)); /* Then, we go on the next item */ explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts); /* After the exploration, there are 2 cases: * 1) *var_starts is NULL: nothing to do * 2) *var_starts is not NULL: we reached the end of the items without findind any * text dependent match, so we can free the list */ free_list_pointer(*var_starts); (*var_starts)=NULL; v->start_in_tokens=old_value; /* If we have a $a( tag, we know that we can only have just one text tag * with special value -1 */ goto restore_dic_variable; } else if (fst2_tag->type==END_VAR_TAG) { /* If we have found a $a) tag */ if (last_tag==-1) { /* If we have no tfst tag to use, then it's a variable definition error, * and we have nothing special to do */ explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts); goto restore_dic_variable; } else { /* We can set the end of the variable, it's 'last_tag' */ struct transduction_variable* v=get_transduction_variable(infos->input_variables,fst2_tag->variable); int old_value=v->end_in_tokens; v->end_in_tokens=last_tag; explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts); v->end_in_tokens=old_value; goto restore_dic_variable; } } else if (fst2_tag->type==LEFT_CONTEXT_TAG) { /* If we have found a $* tag, we must reset the stack string and the * start position, so we save them */ unichar* old_stack=u_strdup(s->str); int old_pos_token=element->m.start_pos_in_token; int old_pos_char=element->m.start_pos_in_char; int old_pos_letter=element->m.start_pos_in_letter; /* We set the new values */ empty(s); element->m.start_pos_in_token=LEFT_CONTEXT_PENDING; /* We must reset last_tag to -1, because is not, we will have an * extra space on the left of the match */ 
explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,-1,var_starts); /* And we restore previous values */ element->m.start_pos_in_token=old_pos_token; element->m.start_pos_in_char=old_pos_char; element->m.start_pos_in_letter=old_pos_letter; u_strcpy(s,old_stack); free(old_stack); /* If we have a $* tag, we know that we can only have just one text tag * with special value -1 */ goto restore_dic_variable; } } else { current_tag=(TfstTag*)(infos->tfst->tags->tab[text_tags->n]); /* We update the last tag */ last_tag=text_tags->n; /* If the current text tag is not a text independent one */ /* If there are some pending $a( tags, we set them to the current tag */ if (var_starts!=NULL) { struct list_pointer* ptr=(*var_starts); while (ptr!=NULL) { int* start=(int*)(ptr->pointer); (*start)=text_tags->n; ptr=ptr->next; } } int previous_start_token,previous_start_char; if (last_text_dependent_tfst_tag!=-1) { /* If the item is not the first, we must insert the original text that is * between the end of the previous merged text and the beginning of the * current one, typically to insert spaces */ TfstTag* previous_tag=(TfstTag*)(infos->tfst->tags->tab[last_text_dependent_tfst_tag]); previous_start_token=previous_tag->m.end_pos_in_token; previous_start_char=previous_tag->m.end_pos_in_char; /* We start just after the end of the previous match */ if (infos->tfst->token_content[previous_start_token][previous_start_char+1]!='\0') { /* If we were not at the end of the previous text token, we just inscrease * the char position */ previous_start_char++; } else { /* Otherwise, we go on the next token */ previous_start_token++; previous_start_char=0; } } else { /* Otherwise, we start on the beginning of the current text tag */ //error("current item=%d\n",text_tags->n); previous_start_token=current_tag->m.start_pos_in_token; previous_start_char=current_tag->m.start_pos_in_char; } /* Here we have to insert the text that is between current_start and current_end, * and 
then, the ouput of the fst2 transition */ if (infos->output_policy==MERGE_OUTPUTS) { insert_text_interval_tfst(infos,s,previous_start_token,previous_start_char, current_tag->m.end_pos_in_token,current_tag->m.end_pos_in_char); } } /* Then, we go on the next item */ struct list_pointer* ptr2=NULL; if (element->m.start_pos_in_token==LEFT_CONTEXT_PENDING && current_tag!=NULL) { element->m.start_pos_in_token=infos->tfst->offset_in_tokens+current_tag->m.start_pos_in_token; element->m.start_pos_in_char=current_tag->m.start_pos_in_char; element->m.start_pos_in_letter=current_tag->m.start_pos_in_letter; } explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag ,&ptr2 /* We have encountered a text dependent tag, so there is no * more pending start tag like $a( */ ); element->m=saved_element; /* If there was a $* tag pending */ free_list_pointer(ptr2); if (infos->ambiguous_output_policy==IGNORE_AMBIGUOUS_OUTPUTS) { /* If we don't want ambiguous outputs, then the first path is * enough for our purpose */ goto restore_dic_variable; } text_tags=text_tags->next; remove_chars_from_output_variables(infos->output_variables,captured_chars); /* We reset to 0, because if we exit the while normally, we don't want to * modify output variables twice when reaching the 'restore_dic_variable' * label */ captured_chars=0; } restore_dic_variable: /* We redo this about output variables here, since we may have jumped here directly */ remove_chars_from_output_variables(infos->output_variables,captured_chars); if (capture) { /* If we have a capture variable $:X$, we must restore the previous value * for this dictionary variable */ set_dic_variable(name,old_value_dela,&(infos->dic_variables),0); } }
/** * This function loads a DLF or a DLC. It computes information about tokens * that will be used during the Locate operation. For instance, if we have the * following line: * * extended,.A * * and if the .fst2 to be applied to the text contains the pattern <A> with, * number 456, then the function will mark the "extended" token to be matched * by the pattern 456. Moreover, all case variations will be taken into account, * so that the "Extended" and "EXTENDED" tokens will also be updated. * * The two parameters 'is_DIC_pattern' and 'is_CDIC_pattern' * indicate if the .fst2 contains the corresponding patterns. For instance, if * the pattern "<CDIC>" is used in the grammar, it means that any token sequence that is a * compound word must be marked as be matched by this pattern. */ void load_dic_for_locate(const char* dic_name,int mask_encoding_compatibility_input,Alphabet* alphabet, int number_of_patterns,int is_DIC_pattern, int is_CDIC_pattern, struct lemma_node* root,struct locate_parameters* parameters) { struct string_hash* tokens=parameters->tokens; U_FILE* f; unichar line[DIC_LINE_SIZE]; f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,dic_name,U_READ); if (f==NULL) { error("Cannot open dictionary %s\n",dic_name); return; } /* We parse all the lines */ int lines=0; char name[FILENAME_MAX]; remove_path(dic_name,name); while (EOF!=u_fgets(line,f)) { lines++; if (lines%10000==0) { u_printf("%s: %d lines loaded... \r",name,lines); } if (line[0]=='/') { /* NOTE: DLF and DLC files are not supposed to contain comment * lines, but we test them, just in the case */ continue; } struct dela_entry* entry=tokenize_DELAF_line(line,1); if (entry==NULL) { /* This case should never happen */ error("Invalid dictionary line in load_dic_for_locate\n"); continue; } /* We add the inflected form to the list of forms associated to the lemma. 
* This will be used to replace patterns like "<be>" by the actual list of * forms that can be matched by it, for optimization reasons */ add_inflected_form_for_lemma(entry->inflected,entry->lemma,root); /* We get the list of all tokens that can be matched by the inflected form of this * this entry, with regards to case variations (see the "extended" example above). */ struct list_int* ptr=get_token_list_for_sequence(entry->inflected,alphabet,tokens); /* We save the list pointer to free it later */ struct list_int* ptr_copy=ptr; /* Here, we will deal with all simple words */ while (ptr!=NULL) { int i=ptr->n; /* If the current token can be matched, then it can be recognized by the "<DIC>" pattern */ parameters->token_control[i]=(unsigned char)(get_control_byte(tokens->value[i],alphabet,NULL,parameters->tokenization_policy)|DIC_TOKEN_BIT_MASK); if (number_of_patterns) { /* We look for matching patterns only if there are some */ struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root); if (list!=NULL) { /* If we have some patterns to add */ if (parameters->matching_patterns[i]==NULL) { /* We allocate the pattern bit array, if needed */ parameters->matching_patterns[i]=new_bit_array(number_of_patterns,ONE_BIT); } struct list_pointer* tmp=list; while (tmp!=NULL) { /* Then we add all the pattern numbers to the bit array */ set_value(parameters->matching_patterns[i],((struct constraint_list*)(tmp->pointer))->pattern_number,1); tmp=tmp->next; } /* Finally, we free the constraint list */ free_list_pointer(list); } } ptr=ptr->next; } /* Finally, we free the token list */ free_list_int(ptr_copy); if (!is_a_simple_word(entry->inflected,parameters->tokenization_policy,alphabet)) { /* If the inflected form is a compound word */ if (is_DIC_pattern || is_CDIC_pattern) { /* If the .fst2 contains "<DIC>" and/or "<CDIC>", then we * must note that all compound words can be matched by them */ 
add_compound_word_with_no_pattern(entry->inflected,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy); } if (number_of_patterns) { /* We look for matching patterns only if there are some */ /* We look if the compound word can be matched by some patterns */ struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root); struct list_pointer* tmp=list; while (tmp!=NULL) { /* If the word is matched by at least one pattern, we store it. */ int pattern_number=((struct constraint_list*)(tmp->pointer))->pattern_number; add_compound_word_with_pattern(entry->inflected,pattern_number,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy); tmp=tmp->next; } free_list_pointer(list); } } free_dela_entry(entry); } if (lines>10000) { u_printf("\n"); } u_fclose(f); }
/**
 * Frees the memory associated to the given list, but not the pointers it
 * contains.
 *
 * This is a convenience overload: it forwards to the three-argument version
 * of free_list_pointer, passing NULL as second argument and 'prv_alloc' as
 * third one.
 * NOTE(review): the second argument of the three-argument version is
 * presumably an optional function used to free each stored pointer; confirm
 * against its declaration.
 */
void free_list_pointer(struct list_pointer* list,Abstract_allocator prv_alloc) { free_list_pointer(list,NULL,prv_alloc); }