/**
 * Looks for a loop. To do that, we only follow transitions that can match E.
 * Every time we follow such a transition, we add it to the 'transitions' list.
 * This list is used to print the E loop if we find any. The function returns
 * 1 if a loop is found; 0 otherwise.
 */
static int find_an_E_loop(int* mark,int current_state,int graph,GrfCheckInfo* chk,
		struct list_pointer* transitions) {
if (mark[current_state]==1) {
	/* The state has been visited, nothing to do */
	return 0;
}
if (mark[current_state]==2) {
	/* The state is being visited, we have a loop */
	error("E loop in graph %S, made of the following tags:\n",chk->fst2->graph_names[graph]);
	print_reversed_list(transitions,chk->fst2,current_state,0);
	return 1;
}
/* We start visiting the state */
mark[current_state]=2;
SingleGraphState s=chk->condition_graphs[graph]->states[current_state];
Transition* t=s->outgoing_transitions;
while (t!=NULL) {
	if (transition_can_match_E(t->tag_number,chk)) {
		struct list_pointer* new_head=new_list_pointer(t,transitions);
		int res=find_an_E_loop(mark,t->state_number,graph,chk,new_head);
		new_head->next=NULL;
		free_list_pointer(new_head);
		if (res==1) {
			return 1;
		}
	}
	t=t->next;
}
/* The state has been fully visited */
mark[current_state]=1;
return 0;
}
static int find_a_left_recursion(int* mark_graph,int* mark_state,int current_state,int graph,
									GrfCheckInfo* chk,struct list_pointer* transitions) {
if (mark_state[current_state]==1) {
	/* The state has been visited, nothing to do */
	return 0;
}
if (mark_state[current_state]==2) {
	/* The state is being visited, we have a loop, but it should have been detected before */
	error("E loop in graph %S, made of the following tags:\n",chk->fst2->graph_names[graph]);
	print_reversed_list(transitions,chk->fst2,current_state,0);
	return 1;
}
/* We start visiting the state */
mark_state[current_state]=2;
SingleGraphState s=chk->condition_graphs[graph]->states[current_state];
Transition* t=s->outgoing_transitions;
while (t!=NULL) {
	if (t->tag_number<0) {
		/* As we look for left recursions, we always test recursively
		 * graph calls, regardless the fact that they may match E
		 */
		struct list_pointer* new_head=new_list_pointer(t,transitions);
		int res=is_left_recursion(chk,-(t->tag_number),mark_graph,new_head);
		new_head->next=NULL;
		free_list_pointer(new_head);
		if (res==1) {
			return 1;
		}
	}
	if (transition_can_match_E(t->tag_number,chk)) {
		struct list_pointer* new_head=new_list_pointer(t,transitions);
		int res=find_a_left_recursion(mark_graph,mark_state,t->state_number,graph,chk,new_head);
		new_head->next=NULL;
		free_list_pointer(new_head);
		if (res==1) {
			return 1;
		}
	}
	t=t->next;
}
/* The state has been fully visited */
mark_state[current_state]=1;
return 0;
}
示例#3
0
/**
 * This function explores the partial matches that constitute the given match in order to produce
 * one or all possible outputs, depending on infos->ambiguous_output_policy.
 * The output(s) is(are) then used to add matches to the infos->matches list.
 */
void explore_match_to_get_outputs(struct locate_tfst_infos* infos,struct tfst_match* m,
		                          struct tfst_simple_match_list* element) {
/* As m is a reversed list, we first need to get its elements in the right order */
vector_ptr* items=new_vector_ptr(16);
fill_vector(items,m);
Ustring* s=new_Ustring(1024);
/* In MERGE/REPLACE mode, we have to explore the combination of partial matches */
struct list_pointer* ptr=NULL;
explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,0,s,-1,&ptr);
free_list_pointer(ptr);
free_Ustring(s);
free_vector_ptr(items);
}
示例#4
0
/**
 * Looks at every token of parameters->tokens that is a tag token like
 * "{extended,extend.V:K}" (i.e. that starts with '{' and is neither "{S}"
 * nor "{STOP}") and checks which patterns it verifies. The behaviour is
 * very similar to the one of the load_dic_for_locate function.
 *
 * For each tag token number i:
 * - i is inserted at the head of parameters->tag_token_list (using
 *   'prv_alloc'); this list is later used during Locate preprocessings;
 * - the tag token is associated to its lemma in the tree 'root';
 * - parameters->token_control[i] receives the control byte of the token,
 *   with DIC_TOKEN_BIT_MASK set;
 * - if 'number_of_patterns' is not null, the numbers of the matching
 *   patterns are set in the bit array parameters->matching_patterns[i].
 *
 * A tag token that cannot be tokenized raises a fatal error.
 */
void check_patterns_for_tag_tokens(Alphabet* alphabet,int number_of_patterns,
                                   struct lemma_node* root,struct locate_parameters* parameters,Abstract_allocator prv_alloc) {
    struct string_hash* tokens=parameters->tokens;
    for (int i=0; i<tokens->size; i++) {
        if (tokens->value[i][0]!='{' || u_strcmp(tokens->value[i],"{S}")==0 || u_strcmp(tokens->value[i],"{STOP}")==0) {
            /* Not a tag token like "{today,.ADV}": nothing to do */
            continue;
        }
        /* We note the token number in the tag token list */
        parameters->tag_token_list=head_insert(i,parameters->tag_token_list,prv_alloc);
        /* We need the entry described by the tag to look for patterns */
        struct dela_entry* entry=tokenize_tag_token(tokens->value[i]);
        if (entry==NULL) {
            /* This should never happen */
            fatal_error("Invalid tag token in function check_patterns_for_tag_tokens\n");
        }
        /* The tag token itself is recorded as a form of its lemma. This
         * will be used to replace patterns like "<be>" by the actual list
         * of forms that can be matched by it, for optimization reasons */
        add_inflected_form_for_lemma(tokens->value[i],entry->lemma,root);
        parameters->token_control[i]=(unsigned char)(get_control_byte(tokens->value[i],alphabet,NULL,parameters->tokenization_policy)|DIC_TOKEN_BIT_MASK);
        if (number_of_patterns) {
            /* There are some patterns, so we look for the matching ones */
            struct list_pointer* patterns=get_matching_patterns(entry,parameters->pattern_tree_root);
            if (patterns!=NULL) {
                if (parameters->matching_patterns[i]==NULL) {
                    /* The bit array is only allocated on first need */
                    parameters->matching_patterns[i]=new_bit_array(number_of_patterns,ONE_BIT);
                }
                for (struct list_pointer* cell=patterns;cell!=NULL;cell=cell->next) {
                    set_value(parameters->matching_patterns[i],((struct constraint_list*)(cell->pointer))->pattern_number,1);
                }
                free_list_pointer(patterns);
            }
        }
        /* Unlike DLC lines, a compound word tag like "{all around,.ADV}"
         * does not need to be put in the compound word tree, since the tag
         * is already characterized by its token number. */
        free_dela_entry(entry);
    }
}
示例#5
0
/**
 * Explores all the partial matches to produce outputs in MERGE or REPLACE mode.
 *
 * The function is recursive: each call deals with items->tab[current_item]
 * and then calls itself on current_item+1. When current_item reaches
 * items->nbelems, the content of 's' is used as the output of 'element',
 * which is added to infos->matches. Everything that is modified for the
 * exploration (length of 's', variable bounds, dictionary variable,
 * element->m, pending output variables) is restored on the way back.
 *
 * infos:        the locate context (grammar, text automaton, variables,
 *               output policies and resulting match list)
 * element:      the match being built
 * items:        the partial matches (struct tfst_match*), in the right order
 * current_item: index in 'items' of the partial match to process
 * s:            the output built so far
 * last_text_dependent_tfst_tag: number of the last text dependent tfst tag
 *               that was met, or -1 if there is none
 * var_starts:   address of the list of pending variable starts (see below)
 *
 * If *var_starts!=NULL, it means that there are pending $var_start( tags
 * that wait for being taken into account when a text dependent tag is found.
 */
void explore_match_for_MERGE_or_REPLACE_mode(struct locate_tfst_infos* infos,
                                  struct tfst_simple_match_list* element,
                                  vector_ptr* items,int current_item,Ustring* s,
                                  int last_text_dependent_tfst_tag,
                                  struct list_pointer* *var_starts) {
if (current_item==items->nbelems) {
   /* If we have finished, we can save the current output. The element only
    * points to s->str during the call: its output field is reset to NULL
    * just after */
   element->output=s->str;
   infos->matches=add_element_to_list(infos,infos->matches,element);
   element->output=NULL;
   return;
}
/* We save the length because it will be modified */
int len=s->len;
struct tfst_match* item=(struct tfst_match*)(items->tab[current_item]);
if (item==NULL) {
   fatal_error("Unexpected NULL item in explore_match_for_MERGE_mode\n");
}

/* The output of the fst2 tag that corresponds to the current item */
unichar* output=infos->fst2->tags[item->fst2_transition->tag_number]->output;

unichar name[MAX_TRANSDUCTION_VAR_LENGTH];
int capture;
struct dela_entry* old_value_dela=NULL;
capture=is_capture_variable(output,name);
if (capture) {
	/* If we have a capture variable $:X$, we must save the previous value
	 * for this dictionary variable. It is set back at the
	 * 'restore_dic_variable' label */
	old_value_dela=clone_dela_entry(get_dic_variable(name,infos->dic_variables));
}

/* We save the match description, since it may be modified when a $* tag is pending */
Match saved_element=element->m;
struct list_int* text_tags=item->text_tag_numbers;
/* Value to give to remove_chars_from_output_variables in order to undo
 * what deal_with_output_tfst did for the current text tag */
int captured_chars=0;
/* We explore all the text tags */
while (text_tags!=NULL) {
   /* First, we restore the output string */
   s->len=len;
   s->str[len]='\0';
   captured_chars=0;
   /* We deal with the fst2 tag output, if any */
   if (item->first_time) {
	   /* We have to process the output only once,
	    * since it will have the same effect on all tfst tags.
	    *
	    * Example: the fst2 tag "cybercrime/ZZ" may match the two tfst tags "cyber" and
	    * "crime", but we must process the "ZZ" output only before the first tfst tag "cyber" */
	   if (capture) {
		   /* If we have a capture variable, then we have to check whether the tfst tag
	   	    * is a tagged token or not */
	   	   int tfst_tag_number=text_tags->n;
	   	   int fst2_tag_number=item->fst2_transition->tag_number;
	   	   if (!do_variable_capture(tfst_tag_number,fst2_tag_number,infos,name)) {
	   		   goto restore_dic_variable;
	   	   }
	   } else if (!deal_with_output_tfst(s,output,infos,&captured_chars)) {
         /* We do not take into account matches for which deal_with_output_tfst
          * has failed. According to the original comment, this happens when
          * backtracking is necessary, either because of a variable error or
          * because of a $a.SET$ or $a.UNSET$ test */
		  goto restore_dic_variable;
      }
   }
   int last_tag=last_text_dependent_tfst_tag;
   TfstTag* current_tag=NULL;
   if (text_tags->n==-1) {
      /* We have a text independent match */
      Fst2Tag fst2_tag=infos->fst2->tags[item->fst2_transition->tag_number];
      if (fst2_tag->type==BEGIN_OUTPUT_VAR_TAG) {
    	  /* If we have an output variable start $|a(, the variable is pending
    	   * during the exploration of the next items only */
    	  set_output_variable_pending(infos->output_variables,fst2_tag->variable);
          explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
          unset_output_variable_pending(infos->output_variables,fst2_tag->variable);
          goto restore_dic_variable;
      } else if (fst2_tag->type==END_OUTPUT_VAR_TAG) {
    	  /* If we have an output variable end $|a), the variable is no more
    	   * pending during the exploration of the next items, and we set
    	   * it pending again afterwards */
    	  unset_output_variable_pending(infos->output_variables,fst2_tag->variable);
    	  explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
          set_output_variable_pending(infos->output_variables,fst2_tag->variable);
          goto restore_dic_variable;
      } else if (fst2_tag->type==BEGIN_VAR_TAG) {
         /* If we have a variable start tag $a(, we add it to our 
          * variable tag list */
         struct transduction_variable* v=get_transduction_variable(infos->input_variables,fst2_tag->variable);
         int old_value=v->start_in_tokens;
         /* We add the address of the start field to our list.
          * NOTE(review): the (var_starts==NULL) test cannot protect anything,
          * since *var_starts is assigned unconditionally on the same line;
          * var_starts looks like it is expected to be always non NULL here,
          * to be confirmed against the callers */
         (*var_starts)=new_list_pointer(&(v->start_in_tokens),(var_starts==NULL)?NULL:(*var_starts));
         /* Then, we go on the next item */
         explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
         /* After the exploration, there are 2 cases:
          * 1) *var_starts is NULL: nothing to do
          * 2) *var_starts is not NULL: we reached the end of the items without finding any
          *                             text dependent match, so we can free the list */
         free_list_pointer(*var_starts);
         (*var_starts)=NULL;
         v->start_in_tokens=old_value;
         /* If we have a $a( tag, we know that we can only have just one text tag 
          * with special value -1 */
         goto restore_dic_variable;
      } else if (fst2_tag->type==END_VAR_TAG) {
         /* If we have found a $a) tag */
         if (last_tag==-1) {
            /* If we have no tfst tag to use, then it's a variable definition error,
             * and we have nothing special to do */
            explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
            goto restore_dic_variable;
         } else {
            /* We can set the end of the variable, it's 'last_tag'. The
             * previous end is restored after the exploration */
            struct transduction_variable* v=get_transduction_variable(infos->input_variables,fst2_tag->variable);
            int old_value=v->end_in_tokens;
            v->end_in_tokens=last_tag;
            explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
            v->end_in_tokens=old_value;
            goto restore_dic_variable;
         }
      } else if (fst2_tag->type==LEFT_CONTEXT_TAG) {
         /* If we have found a $* tag, we must reset the stack string and the 
          * start position, so we save them */
         unichar* old_stack=u_strdup(s->str);
         int old_pos_token=element->m.start_pos_in_token;
         int old_pos_char=element->m.start_pos_in_char;
         int old_pos_letter=element->m.start_pos_in_letter;
         /* We set the new values */
         empty(s);
         element->m.start_pos_in_token=LEFT_CONTEXT_PENDING;
         /* We must reset last_tag to -1, because if not, we will have an 
          * extra space on the left of the match */
         explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,-1,var_starts);
         
         /* And we restore previous values */
         element->m.start_pos_in_token=old_pos_token;
         element->m.start_pos_in_char=old_pos_char;
         element->m.start_pos_in_letter=old_pos_letter;
         u_strcpy(s,old_stack);
         free(old_stack);
         /* If we have a $* tag, we know that we can only have just one text tag 
          * with special value -1 */
         goto restore_dic_variable;
      }
      /* Any other text independent tag is handled by the generic code
       * below, with current_tag still NULL */
   } else {
      /* The current text tag is a text dependent one */
      current_tag=(TfstTag*)(infos->tfst->tags->tab[text_tags->n]);
      /* We update the last tag */
      last_tag=text_tags->n;
      
      /* If there are some pending $a( tags, we set them to the current tag */
      if (var_starts!=NULL) {
         struct list_pointer* ptr=(*var_starts);
         while (ptr!=NULL) {
            int* start=(int*)(ptr->pointer);
            (*start)=text_tags->n;
            ptr=ptr->next;
         }
      }
      int previous_start_token,previous_start_char; 
      if (last_text_dependent_tfst_tag!=-1) {
         /* If the item is not the first, we must insert the original text that is
          * between the end of the previous merged text and the beginning of the
          * current one, typically to insert spaces */
         TfstTag* previous_tag=(TfstTag*)(infos->tfst->tags->tab[last_text_dependent_tfst_tag]);
         previous_start_token=previous_tag->m.end_pos_in_token;
         previous_start_char=previous_tag->m.end_pos_in_char;
         /* We start just after the end of the previous match */
         if (infos->tfst->token_content[previous_start_token][previous_start_char+1]!='\0') {
            /* If we were not at the end of the previous text token, we just increase
             * the char position */
            previous_start_char++;
         } else {
            /* Otherwise, we go on the next token */
            previous_start_token++;
            previous_start_char=0;
         }
      } else {
         /* Otherwise, we start on the beginning of the current text tag */
         //error("current item=%d\n",text_tags->n);
         previous_start_token=current_tag->m.start_pos_in_token;
         previous_start_char=current_tag->m.start_pos_in_char;
      }
      /* Here we have to insert the text that is between the start position
       * computed above and the end of the current text tag. This is only
       * done in MERGE mode */
      if (infos->output_policy==MERGE_OUTPUTS) {
    	  insert_text_interval_tfst(infos,s,previous_start_token,previous_start_char,
                 current_tag->m.end_pos_in_token,current_tag->m.end_pos_in_char);
      }
   }
   /* Then, we go on the next item */
   struct list_pointer* ptr2=NULL;
   if (element->m.start_pos_in_token==LEFT_CONTEXT_PENDING && current_tag!=NULL) {
      /* A $* tag was pending and we have found a text dependent tag: the
       * match now starts at the beginning of this tag */
      element->m.start_pos_in_token=infos->tfst->offset_in_tokens+current_tag->m.start_pos_in_token;
      element->m.start_pos_in_char=current_tag->m.start_pos_in_char;
      element->m.start_pos_in_letter=current_tag->m.start_pos_in_letter;
   }
   explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag
         ,&ptr2 /* We have encountered a text dependent tag, so there is no
                 * more pending start tag like $a( */
         );
   /* We undo the modification that may have been done because of a pending $* tag */
   element->m=saved_element;
   /* We free what may remain in the list that was given to the recursive call */
   free_list_pointer(ptr2);
   if (infos->ambiguous_output_policy==IGNORE_AMBIGUOUS_OUTPUTS) {
      /* If we don't want ambiguous outputs, then the first path is
       * enough for our purpose */ 
      goto restore_dic_variable;
   }
   text_tags=text_tags->next;
   remove_chars_from_output_variables(infos->output_variables,captured_chars);
   /* We reset to 0, because if we exit the while normally, we don't want to
    * modify output variables twice when reaching the 'restore_dic_variable'
    * label */
   captured_chars=0;
}
restore_dic_variable:
/* We redo this about output variables here, since we may have jumped here directly */
remove_chars_from_output_variables(infos->output_variables,captured_chars);
if (capture) {
	/* If we have a capture variable $:X$, we must restore the previous value
	 * for this dictionary variable */
	set_dic_variable(name,old_value_dela,&(infos->dic_variables),0);
}
}
示例#6
0
/**
 * This function loads a DLF or a DLC. It computes information about tokens
 * that will be used during the Locate operation. For instance, if we have the
 * following line:
 *
 *   extended,.A
 *
 * and if the .fst2 to be applied to the text contains the pattern <A> with
 * number 456, then the function will mark the "extended" token to be matched
 * by the pattern 456. Moreover, all case variations will be taken into account,
 * so that the "Extended" and "EXTENDED" tokens will also be updated.
 *
 * The two parameters 'is_DIC_pattern' and 'is_CDIC_pattern'
 * indicate if the .fst2 contains the corresponding patterns. For instance, if
 * the pattern "<CDIC>" is used in the grammar, it means that any token sequence that is a
 * compound word must be marked as being matched by this pattern.
 *
 * dic_name:           name of the dictionary file to load
 * mask_encoding_compatibility_input: encoding mask used to open the file
 * alphabet:           alphabet used for case variations and control bytes
 * number_of_patterns: number of patterns of the grammar; if 0, no pattern
 *                     matching is attempted
 * root:               lemma tree in which the inflected forms are recorded
 * parameters:         locate parameters whose token_control, matching_patterns
 *                     and DLC_tree fields are updated
 *
 * If the dictionary cannot be opened, an error message is printed and the
 * function returns without modifying anything. Lines starting with '/' are
 * skipped, as well as lines that cannot be parsed (with an error message).
 */
void load_dic_for_locate(const char* dic_name,int mask_encoding_compatibility_input,Alphabet* alphabet,
                         int number_of_patterns,int is_DIC_pattern,
                         int is_CDIC_pattern,
                         struct lemma_node* root,struct locate_parameters* parameters) {
    struct string_hash* tokens=parameters->tokens;
    U_FILE* f;
    unichar line[DIC_LINE_SIZE];
    f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,dic_name,U_READ);
    if (f==NULL) {
        error("Cannot open dictionary %s\n",dic_name);
        return;
    }
    /* We parse all the lines */
    int lines=0;
    char name[FILENAME_MAX];
    remove_path(dic_name,name);
    while (EOF!=u_fgets(line,f)) {
        lines++;
        if (lines%10000==0) {
            /* Progress message; the final \r keeps the cursor on the same
             * line, so that the next message overwrites this one */
            u_printf("%s: %d lines loaded...                          \r",name,lines);
        }
        if (line[0]=='/') {
            /* NOTE: DLF and DLC files are not supposed to contain comment
             *       lines, but we test them, just in case */
            continue;
        }
        struct dela_entry* entry=tokenize_DELAF_line(line,1);
        if (entry==NULL) {
            /* This case should never happen */
            error("Invalid dictionary line in load_dic_for_locate\n");
            continue;
        }
        /* We add the inflected form to the list of forms associated to the lemma.
         * This will be used to replace patterns like "<be>" by the actual list of
         * forms that can be matched by it, for optimization reasons */
        add_inflected_form_for_lemma(entry->inflected,entry->lemma,root);
        /* We get the list of all tokens that can be matched by the inflected form of
         * this entry, with regards to case variations (see the "extended" example above). */
        struct list_int* ptr=get_token_list_for_sequence(entry->inflected,alphabet,tokens);
        /* We save the list pointer to free it later */
        struct list_int* ptr_copy=ptr;
        /* Here, we will deal with all simple words */
        while (ptr!=NULL) {
            int i=ptr->n;
            /* If the current token can be matched, then it can be recognized by the "<DIC>" pattern */
            parameters->token_control[i]=(unsigned char)(get_control_byte(tokens->value[i],alphabet,NULL,parameters->tokenization_policy)|DIC_TOKEN_BIT_MASK);
            if (number_of_patterns) {
                /* We look for matching patterns only if there are some */
                struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
                if (list!=NULL) {
                    /* If we have some patterns to add */
                    if (parameters->matching_patterns[i]==NULL) {
                        /* We allocate the pattern bit array, if needed */
                        parameters->matching_patterns[i]=new_bit_array(number_of_patterns,ONE_BIT);
                    }
                    struct list_pointer* tmp=list;
                    while (tmp!=NULL) {
                        /* Then we add all the pattern numbers to the bit array */
                        set_value(parameters->matching_patterns[i],((struct constraint_list*)(tmp->pointer))->pattern_number,1);
                        tmp=tmp->next;
                    }
                    /* Finally, we free the constraint list */
                    free_list_pointer(list);
                }
            }
            ptr=ptr->next;
        }
        /* Finally, we free the token list */
        free_list_int(ptr_copy);
        if (!is_a_simple_word(entry->inflected,parameters->tokenization_policy,alphabet)) {
            /* If the inflected form is a compound word */
            if (is_DIC_pattern || is_CDIC_pattern) {
                /* If the .fst2 contains "<DIC>" and/or "<CDIC>", then we
                 * must note that all compound words can be matched by them */
                add_compound_word_with_no_pattern(entry->inflected,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy);
            }
            if (number_of_patterns) {
                /* We look for matching patterns only if there are some */
                /* We look if the compound word can be matched by some patterns */
                struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
                struct list_pointer* tmp=list;
                while (tmp!=NULL) {
                    /* If the word is matched by at least one pattern, we store it. */
                    int pattern_number=((struct constraint_list*)(tmp->pointer))->pattern_number;
                    add_compound_word_with_pattern(entry->inflected,pattern_number,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy);
                    tmp=tmp->next;
                }
                free_list_pointer(list);
            }
        }
        free_dela_entry(entry);
    }
    /* The first progress message is printed when lines==10000, so this is
     * the threshold from which a progress line ending with \r has been
     * written and must be terminated by a newline */
    if (lines>=10000) {
        u_printf("\n");
    }
    u_fclose(f);
}
示例#7
0
/**
 * Frees the memory associated to the given list, but not the pointers it
 * contains.
 *
 * This version only forwards the call to the three-argument
 * free_list_pointer, with NULL as second argument and 'prv_alloc' as
 * allocator.
 * NOTE(review): the NULL argument presumably means that no function is
 * applied to the stored pointers; to be confirmed against the
 * three-argument version, which is not visible here.
 */
void free_list_pointer(struct list_pointer* list,Abstract_allocator prv_alloc) {
free_list_pointer(list,NULL,prv_alloc);
}