/** * Performs the variable capture. Returns 1 in case of success; 0 otherwise. * Note that 1 is also returned if there is an error while the variable error * policy is IGNORE_VARIABLE_ERROR. */ int do_variable_capture(int tfst_tag_number, int fst2_tag_number, struct locate_tfst_infos* infos, unichar* name) { if (tfst_tag_number == -1) { /* If we have a text independent match like <E>/$:X$, it's an error case */ switch (infos->variable_error_policy) { case EXIT_ON_VARIABLE_ERRORS: fatal_error( "Should not have capture variable $:%S$ associated to text independent input %S\n", name, infos->fst2->tags[fst2_tag_number]->input); case IGNORE_VARIABLE_ERRORS: return 1; case BACKTRACK_ON_VARIABLE_ERRORS: return 0; } } TfstTag* tag = (TfstTag*) (infos->tfst->tags->tab[tfst_tag_number]); if (tag->content[0] != '{' || tag->content[1] == '\0') { /* If we have a non tagged token like "foo" */ switch (infos->variable_error_policy) { case EXIT_ON_VARIABLE_ERRORS: fatal_error( "Should not have capture variable $:%S$ associated to a tag that may capture untagged tokens: %S\n", name, infos->fst2->tags[fst2_tag_number]->input); case IGNORE_VARIABLE_ERRORS: return 1; case BACKTRACK_ON_VARIABLE_ERRORS: return 0; } } /* We can capture the tag */ struct dela_entry* e=tokenize_tag_token(tag->content); if (e==NULL) { /* Should not happen */ fatal_error("Unexpected tag tokenization error in do_variable_capture for tag:\n%S\n",tag->content); } set_dic_variable(name,e,&(infos->dic_variables),0); return 1; }
/** * This function optimizes a pattern of the form "eat". */ void optimize_token_pattern(int i,Fst2Tag* tag,Alphabet* alph, struct locate_parameters* p,Abstract_allocator prv_alloc) { /* Whatever happens, this pattern will be turned into a token list */ tag[i]->type=TOKEN_LIST_TAG; unichar* opt_token=tag[i]->pattern->inflected; /* First, we check if this token pattern can recognize some tag tokens */ struct list_int* list=p->tag_token_list; while (list!=NULL) { struct dela_entry* entry=tokenize_tag_token(p->tokens->value[list->n],1); if ((!is_bit_mask_set(tag[i]->control,RESPECT_CASE_TAG_BIT_MASK) && is_equal_or_uppercase(opt_token,entry->inflected,alph)) || !u_strcmp(opt_token,entry->inflected)) { tag[i]->matching_tokens=sorted_insert(list->n,tag[i]->matching_tokens,prv_alloc); } free_dela_entry(entry); list=list->next; } /* Then, we look for normal tokens */ if (is_bit_mask_set(tag[i]->control,RESPECT_CASE_TAG_BIT_MASK)) { /* If no case variants are allowed, then we just have to insert the number * of the token, but only if this token in the text ones. */ int token_number; if (-1!=(token_number=get_value_index(opt_token,p->tokens,DONT_INSERT))) { tag[i]->matching_tokens=sorted_insert(token_number,tag[i]->matching_tokens,prv_alloc); } return; } /* Here, we have to get all the case variants of the token. */ tag[i]->matching_tokens=destructive_sorted_merge(get_token_list_for_sequence(opt_token,alph,p->tokens,prv_alloc),tag[i]->matching_tokens,prv_alloc); }
/** * This function checks for each tag token like "{extended,extend.V:K}" * if it verifies some patterns. Its behaviour is very similar to the one * of the load_dic_for_locate function. However, as a side effect, this * function fills 'tag_token_list' with the list of tag token numbers. * This list is later used during Locate preprocessings. */ void check_patterns_for_tag_tokens(Alphabet* alphabet,int number_of_patterns, struct lemma_node* root,struct locate_parameters* parameters,Abstract_allocator prv_alloc) { struct string_hash* tokens=parameters->tokens; for (int i=0; i<tokens->size; i++) { if (tokens->value[i][0]=='{' && u_strcmp(tokens->value[i],"{S}") && u_strcmp(tokens->value[i],"{STOP}")) { /* If the token is tag like "{today,.ADV}", we add its number to the tag token list */ parameters->tag_token_list=head_insert(i,parameters->tag_token_list,prv_alloc); /* And we look for the patterns that can match it */ struct dela_entry* entry=tokenize_tag_token(tokens->value[i]); if (entry==NULL) { /* This should never happen */ fatal_error("Invalid tag token in function check_patterns_for_tag_tokens\n"); } /* We add the inflected form to the list of forms associated to the lemma. * This will be used to replace patterns like "<be>" by the actual list of * forms that can be matched by it, for optimization reasons */ add_inflected_form_for_lemma(tokens->value[i],entry->lemma,root); parameters->token_control[i]=(unsigned char)(get_control_byte(tokens->value[i],alphabet,NULL,parameters->tokenization_policy)|DIC_TOKEN_BIT_MASK); if (number_of_patterns) { /* We look for matching patterns only if there are some */ struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root); if (list!=NULL) { if (parameters->matching_patterns[i]==NULL) { /* We allocate the bit array if needed */ parameters->matching_patterns[i]=new_bit_array(number_of_patterns,ONE_BIT); } struct list_pointer* tmp=list; while (tmp!=NULL) { set_value(parameters->matching_patterns[i],((struct constraint_list*)(tmp->pointer))->pattern_number,1); tmp=tmp->next; } free_list_pointer(list); } } /* At the opposite of DLC lines, a compound word tag like "{all around,.ADV}" * does not need to be put in the compound word tree, since the tag is already * characterized by its token number. */ free_dela_entry(entry); } } }
/** * Returns a control byte that represents the characteristics of the given token. */ unsigned char get_control_byte(const unichar* token,const Alphabet* alph,struct string_hash* err,TokenizationPolicy tokenization_policy) { int i; int tmp; unsigned char c=0; if (token==NULL || token[0]=='\0') { fatal_error("NULL or empty token in get_control_byte\n"); } /* We consider that a token starting with a letter is a word */ if (is_letter(token[0],alph)) { set_bit_mask(&c,MOT_TOKEN_BIT_MASK); /* If a token is a word, we check if it is in the 'err' word list * in order to answer the question <!DIC>. We perform this test in order * to avoid taking "priori" as an unknown word if the compound "a priori" * is in the text. */ if (err!=NULL && get_value_index(token,err,DONT_INSERT)!=-1) { set_bit_mask(&c,NOT_DIC_TOKEN_BIT_MASK); } if (is_upper(token[0],alph)) { set_bit_mask(&c,PRE_TOKEN_BIT_MASK); i=0; tmp=0; while (token[i]!='\0') { if (is_lower(token[i],alph)) { tmp=1; break; } i++; } if (!tmp) { set_bit_mask(&c,MAJ_TOKEN_BIT_MASK); } return c; } i=0; tmp=0; while (token[i]!='\0') { if (is_upper(token[i],alph)) { tmp=1; break; } i++; } if (!tmp) { set_bit_mask(&c,MIN_TOKEN_BIT_MASK); } return c; } /* If the token doesn't start with a letter, we start with * checking if it is a tag like {today,.ADV} */ if (token[0]=='{' && u_strcmp(token,"{S}") && u_strcmp(token,"{STOP}")) { /* Anyway, such a tag is classed as verifying <MOT> and <DIC> */ set_bit_mask(&c,MOT_TOKEN_BIT_MASK|DIC_TOKEN_BIT_MASK|TDIC_TOKEN_BIT_MASK); struct dela_entry* temp=tokenize_tag_token(token); if (is_upper(temp->inflected[0],alph)) { set_bit_mask(&c,PRE_TOKEN_BIT_MASK); i=0; tmp=0; while (temp->inflected[i]!='\0') { if (is_letter(temp->inflected[i],alph) && is_lower(temp->inflected[i],alph)) { tmp=1; break; } i++; } if (!tmp) { set_bit_mask(&c,MAJ_TOKEN_BIT_MASK); } } else { i=0; tmp=0; while (temp->inflected[i]!='\0') { if (is_letter(temp->inflected[i],alph) && is_upper(temp->inflected[i],alph)) { tmp=1; break; } i++; } if (!tmp) { set_bit_mask(&c,MIN_TOKEN_BIT_MASK); } } if (!is_a_simple_word(temp->inflected,tokenization_policy,alph)) { /* If the tag is a compound word, we say that it verifies the <CDIC> pattern */ set_bit_mask(&c,CDIC_TOKEN_BIT_MASK); } free_dela_entry(temp); } return c; }
return; } /* Here, we have to get all the case variants of the token. */ tag[i]->matching_tokens=destructive_sorted_merge(get_token_list_for_sequence(opt_token,alph,p->tokens,prv_alloc),tag[i]->matching_tokens,prv_alloc); } /** * This function checks if a pattern of the form "<eat>", "<eat.V>" or "<eaten,eat.V>" * can match the given tag token like "{today,.ADV}". */ void optimize_full_pattern_for_tag(unichar* tag_token,int i,Fst2Tag* tag,Alphabet* alph, struct locate_parameters* p,Abstract_allocator prv_alloc) { DISCARD_UNUSED_PARAMETER(alph) int token_number=get_value_index(tag_token,p->tokens); struct dela_entry* entry=tokenize_tag_token(tag_token,1); struct pattern* pattern=tag[i]->pattern; if ((pattern->type==LEMMA_PATTERN) || (pattern->type==INFLECTED_AND_LEMMA_PATTERN)) { /* If the pattern has a constraint on the lemma, we check it */ if (u_strcmp(entry->lemma,pattern->lemma)) { free_dela_entry(entry,prv_alloc); return; } } if ((pattern->type==LEMMA_AND_CODE_PATTERN) || (pattern->type==FULL_PATTERN)) { /* If the pattern contains a constraint on grammatical/semantic/inflectional * codes, then it has been put in the pattern tree, and so, this pattern * was tried on the current tag token in the 'check_patterns_for_tag_tokens' * function. Then, we just have to test if the tag token matches this pattern. */ if (p->matching_patterns==NULL || p->matching_patterns[token_number]==NULL || 0==get_value(p->matching_patterns[token_number],tag[i]->pattern_number)) {