/**
 * Builds a one-cell DELA entry list around 'entry' and returns it.
 *
 * @param entry the DELA entry to store in the new cell
 * @param clone if non-zero, the cell stores the result of
 *              clone_dela_entry(entry); if zero, the cell stores the
 *              'entry' pointer itself, without copying
 * @return the freshly allocated cell, whose 'next' field is NULL
 *
 * fatal_alloc_error is invoked if the cell cannot be allocated.
 */
struct dela_entry_list* new_dela_entry_list(struct dela_entry* entry,int clone) {
    struct dela_entry_list* cell=(struct dela_entry_list*)malloc(sizeof(struct dela_entry_list));
    if (cell==NULL) {
        fatal_alloc_error("new_dela_entry_list");
    }
    cell->entry=clone?clone_dela_entry(entry):entry;
    cell->next=NULL;
    return cell;
}
/**
 * Creates a dictionary variable cell and returns it.
 *
 * @param name       variable name; it is duplicated with u_strdup
 * @param dic_entry  value to associate with the variable
 * @param next       cell to chain after the new one
 * @param must_clone if non-zero, the variable stores
 *                   clone_dela_entry(dic_entry); otherwise it stores the
 *                   'dic_entry' pointer itself
 * @return the freshly allocated variable
 *
 * fatal_alloc_error is invoked if the cell cannot be allocated.
 */
struct dic_variable* new_dic_variable(const unichar* name,struct dela_entry* dic_entry,
                                      struct dic_variable* next,int must_clone) {
    struct dic_variable* var=(struct dic_variable*)malloc(sizeof(struct dic_variable));
    if (var==NULL) {
        fatal_alloc_error("new_dic_variable");
    }
    var->name=u_strdup(name);
    var->dic_entry=must_clone?clone_dela_entry(dic_entry):dic_entry;
    var->next=next;
    return var;
}
/**
 * Sets the dictionary variable called 'name' to 'dic_entry'. If no variable
 * with that name exists in '*list', a new one is appended at the end of the
 * list.
 *
 * @param name       name of the variable to set
 * @param dic_entry  new value for the variable
 * @param list       address of the head of the variable list; it is updated
 *                   when the first variable is created
 * @param must_clone if non-zero, the variable stores
 *                   clone_dela_entry(dic_entry); otherwise it stores the
 *                   'dic_entry' pointer itself
 *
 * When an existing variable is overwritten, its previous value is released
 * with free_dela_entry. The new value is installed BEFORE the previous one is
 * released, so that passing the variable's own current entry as 'dic_entry'
 * is safe:
 *  - with must_clone, the clone is taken from a still-valid entry;
 *  - without must_clone, the entry is kept as is instead of being freed and
 *    left dangling in the variable.
 */
void set_dic_variable(const unichar* name,struct dela_entry* dic_entry,struct dic_variable* *list,int must_clone) {
    for (;*list!=NULL;list=&((*list)->next)) {
        if (u_strcmp((*list)->name,name)) {
            /* Not the variable we are looking for */
            continue;
        }
        struct dela_entry* previous=(*list)->dic_entry;
        (*list)->dic_entry=must_clone?clone_dela_entry(dic_entry):dic_entry;
        if (previous!=(*list)->dic_entry) {
            /* The old value is no longer referenced by the variable */
            free_dela_entry(previous);
        }
        return;
    }
    /* 'list' now points to the 'next' field of the last cell (or to the
     * list head if the list was empty) */
    *list=new_dic_variable(name,dic_entry,NULL,must_clone);
}
/**
 * Explores all the partial matches to produce outputs in MERGE or REPLACE mode.
 *
 * The function processes items[current_item] and calls itself recursively on
 * current_item+1. When current_item reaches items->nbelems, the content of 's'
 * is used as the output of 'element', which is added to infos->matches.
 *
 * @param infos        locate context (fst2, tfst, variables, output policies,
 *                     match list)
 * @param element      the match being built; its 'm' field is restored after
 *                     each explored path
 * @param items        the sequence of tfst_match items to explore
 * @param current_item index in 'items' of the item to process
 * @param s            the output string being built; it is truncated back to
 *                     its length on entry before each text tag is explored
 * @param last_text_dependent_tfst_tag
 *                     number of the last text dependent tfst tag that was
 *                     met, or -1 if there is none
 * @param var_starts   address of the list of pending $var_start( tags. If
 *                     *var_starts!=NULL, it means that there are pending
 *                     $var_start( tags that wait for being taken into account
 *                     when a text dependent tag is found. May be NULL, in
 *                     which case a list local to this call is used when a
 *                     $a( tag is met.
 */
void explore_match_for_MERGE_or_REPLACE_mode(struct locate_tfst_infos* infos,
                                  struct tfst_simple_match_list* element,
                                  vector_ptr* items,int current_item,Ustring* s,
                                  int last_text_dependent_tfst_tag,
                                  struct list_pointer* *var_starts) {
if (current_item==items->nbelems) {
   /* If we have finished, we can save the current output. 'element' only
    * borrows the string for the duration of the call */
   element->output=s->str;
   infos->matches=add_element_to_list(infos,infos->matches,element);
   element->output=NULL;
   return;
}
/* We save the length because it will be modified */
int len=s->len;
struct tfst_match* item=(struct tfst_match*)(items->tab[current_item]);
if (item==NULL) {
   fatal_error("Unexpected NULL item in explore_match_for_MERGE_mode\n");
}
unichar* output=infos->fst2->tags[item->fst2_transition->tag_number]->output;
unichar name[MAX_TRANSDUCTION_VAR_LENGTH];
int capture;
struct dela_entry* old_value_dela=NULL;
capture=is_capture_variable(output,name);
if (capture) {
   /* If we have a capture variable $:X$, we must save the previous value
    * for this dictionary variable */
   old_value_dela=clone_dela_entry(get_dic_variable(name,infos->dic_variables));
}
Match saved_element=element->m;
struct list_int* text_tags=item->text_tag_numbers;
int captured_chars=0;
/* NOTE: no variable may be declared at function scope below this point,
 * because of the jumps to the 'restore_dic_variable' label */

/* We explore all the text tags */
while (text_tags!=NULL) {
   /* First, we restore the output string */
   s->len=len;
   s->str[len]='\0';
   captured_chars=0;
   /* We deal with the fst2 tag output, if any */
   if (item->first_time) {
      /* We only have to process the output once, since it will have the
       * same effect on all tfst tags.
       *
       * Example: the fst2 tag "cybercrime/ZZ" may match the two tfst tags "cyber" and
       * "crime", but we must process the "ZZ" output only before the first tfst tag "cyber" */
      if (capture) {
         /* If we have a capture variable, then we have to check whether the tfst tag
          * is a tagged token or not */
         int tfst_tag_number=text_tags->n;
         int fst2_tag_number=item->fst2_transition->tag_number;
         if (!do_variable_capture(tfst_tag_number,fst2_tag_number,infos,name)) {
            goto restore_dic_variable;
         }
      } else if (!deal_with_output_tfst(s,output,infos,&captured_chars)) {
         /* deal_with_output_tfst reported a failure, so we do not take this
          * match into account and we backtrack */
         goto restore_dic_variable;
      }
   }
   int last_tag=last_text_dependent_tfst_tag;
   TfstTag* current_tag=NULL;
   if (text_tags->n==-1) {
      /* We have a text independent match */
      Fst2Tag fst2_tag=infos->fst2->tags[item->fst2_transition->tag_number];
      if (fst2_tag->type==BEGIN_OUTPUT_VAR_TAG) {
         /* If we have an output variable start $|a( */
         set_output_variable_pending(infos->output_variables,fst2_tag->variable);
         explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
         unset_output_variable_pending(infos->output_variables,fst2_tag->variable);
         goto restore_dic_variable;
      } else if (fst2_tag->type==END_OUTPUT_VAR_TAG) {
         /* If we have an output variable end $|a) */
         unset_output_variable_pending(infos->output_variables,fst2_tag->variable);
         explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
         set_output_variable_pending(infos->output_variables,fst2_tag->variable);
         goto restore_dic_variable;
      } else if (fst2_tag->type==BEGIN_VAR_TAG) {
         /* If we have a variable start tag $a(, we add it to our
          * variable tag list */
         struct transduction_variable* v=get_transduction_variable(infos->input_variables,fst2_tag->variable);
         int old_value=v->start_in_tokens;
         /* If the caller gave us no list, we use one that is local to this
          * call, so that we never dereference a NULL 'var_starts' */
         struct list_pointer* local_starts=NULL;
         struct list_pointer* *starts=(var_starts!=NULL)?var_starts:&local_starts;
         /* We add the address of the start field to our list */
         (*starts)=new_list_pointer(&(v->start_in_tokens),(*starts));
         /* Then, we go on the next item */
         explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,starts);
         /* After the exploration, there are 2 cases:
          * 1) *starts is NULL: nothing to do
          * 2) *starts is not NULL: we reached the end of the items without finding any
          *                        text dependent match, so we can free the list */
         free_list_pointer(*starts);
         (*starts)=NULL;
         v->start_in_tokens=old_value;
         /* If we have a $a( tag, we know that we can only have just one text tag
          * with special value -1 */
         goto restore_dic_variable;
      } else if (fst2_tag->type==END_VAR_TAG) {
         /* If we have found a $a) tag */
         if (last_tag==-1) {
            /* If we have no tfst tag to use, then it's a variable definition error,
             * and we have nothing special to do */
            explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
            goto restore_dic_variable;
         } else {
            /* We can set the end of the variable, it's 'last_tag' */
            struct transduction_variable* v=get_transduction_variable(infos->input_variables,fst2_tag->variable);
            int old_value=v->end_in_tokens;
            v->end_in_tokens=last_tag;
            explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,var_starts);
            v->end_in_tokens=old_value;
            goto restore_dic_variable;
         }
      } else if (fst2_tag->type==LEFT_CONTEXT_TAG) {
         /* If we have found a $* tag, we must reset the stack string and the
          * start position, so we save them */
         unichar* old_stack=u_strdup(s->str);
         int old_pos_token=element->m.start_pos_in_token;
         int old_pos_char=element->m.start_pos_in_char;
         int old_pos_letter=element->m.start_pos_in_letter;
         /* We set the new values */
         empty(s);
         element->m.start_pos_in_token=LEFT_CONTEXT_PENDING;
         /* We must reset last_tag to -1, because if not, we will have an
          * extra space on the left of the match */
         explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,-1,var_starts);
         /* And we restore previous values */
         element->m.start_pos_in_token=old_pos_token;
         element->m.start_pos_in_char=old_pos_char;
         element->m.start_pos_in_letter=old_pos_letter;
         u_strcpy(s,old_stack);
         free(old_stack);
         /* If we have a $* tag, we know that we can only have just one text tag
          * with special value -1 */
         goto restore_dic_variable;
      }
      /* For any other kind of text independent tag, we just go on the
       * next item below */
   } else {
      /* The current text tag is a text dependent one */
      current_tag=(TfstTag*)(infos->tfst->tags->tab[text_tags->n]);
      /* We update the last tag */
      last_tag=text_tags->n;
      /* If there are some pending $a( tags, we set them to the current tag */
      if (var_starts!=NULL) {
         struct list_pointer* ptr=(*var_starts);
         while (ptr!=NULL) {
            int* start=(int*)(ptr->pointer);
            (*start)=text_tags->n;
            ptr=ptr->next;
         }
      }
      int previous_start_token,previous_start_char;
      if (last_text_dependent_tfst_tag!=-1) {
         /* If the item is not the first, we must insert the original text that is
          * between the end of the previous merged text and the beginning of the
          * current one, typically to insert spaces */
         TfstTag* previous_tag=(TfstTag*)(infos->tfst->tags->tab[last_text_dependent_tfst_tag]);
         previous_start_token=previous_tag->m.end_pos_in_token;
         previous_start_char=previous_tag->m.end_pos_in_char;
         /* We start just after the end of the previous match */
         if (infos->tfst->token_content[previous_start_token][previous_start_char+1]!='\0') {
            /* If we were not at the end of the previous text token, we just increase
             * the char position */
            previous_start_char++;
         } else {
            /* Otherwise, we go on the next token */
            previous_start_token++;
            previous_start_char=0;
         }
      } else {
         /* Otherwise, we start on the beginning of the current text tag */
         previous_start_token=current_tag->m.start_pos_in_token;
         previous_start_char=current_tag->m.start_pos_in_char;
      }
      /* In MERGE mode, we insert the original text that goes from the
       * position computed above to the end of the current tag */
      if (infos->output_policy==MERGE_OUTPUTS) {
         insert_text_interval_tfst(infos,s,previous_start_token,previous_start_char,
                  current_tag->m.end_pos_in_token,current_tag->m.end_pos_in_char);
      }
   }
   /* Then, we go on the next item */
   struct list_pointer* ptr2=NULL;
   if (element->m.start_pos_in_token==LEFT_CONTEXT_PENDING && current_tag!=NULL) {
      /* If there was a $* tag pending, the match starts on the current
       * text dependent tag */
      element->m.start_pos_in_token=infos->tfst->offset_in_tokens+current_tag->m.start_pos_in_token;
      element->m.start_pos_in_char=current_tag->m.start_pos_in_char;
      element->m.start_pos_in_letter=current_tag->m.start_pos_in_letter;
   }
   /* The recursive call is given a fresh empty list (ptr2) rather than
    * 'var_starts' */
   explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,current_item+1,s,last_tag,&ptr2);
   element->m=saved_element;
   free_list_pointer(ptr2);
   if (infos->ambiguous_output_policy==IGNORE_AMBIGUOUS_OUTPUTS) {
      /* If we don't want ambiguous outputs, then the first path is
       * enough for our purpose */
      goto restore_dic_variable;
   }
   text_tags=text_tags->next;
   remove_chars_from_output_variables(infos->output_variables,captured_chars);
   /* We reset to 0, because if we exit the while normally, we don't want to
    * modify output variables twice when reaching the 'restore_dic_variable'
    * label */
   captured_chars=0;
}
restore_dic_variable:
/* We redo this about output variables here, since we may have jumped here directly */
remove_chars_from_output_variables(infos->output_variables,captured_chars);
if (capture) {
   /* If we have a capture variable $:X$, we must restore the previous value
    * for this dictionary variable. No clone is requested: the variable
    * stores the 'old_value_dela' pointer itself */
   set_dic_variable(name,old_value_dela,&(infos->dic_variables),0);
}
}