Example #1
/**
 * Performs the variable capture. Returns 1 in case of success; 0 otherwise.
 * Note that 1 is also returned when an error occurs while the variable error
 * policy is IGNORE_VARIABLE_ERRORS.
 */
int do_variable_capture(int tfst_tag_number, int fst2_tag_number,
		struct locate_tfst_infos* infos, unichar* name) {
if (tfst_tag_number == -1) {
	/* If we have a text independent match like <E>/$:X$, it's an error case */
	switch (infos->variable_error_policy) {
		case EXIT_ON_VARIABLE_ERRORS:
			fatal_error(
				"Should not have capture variable $:%S$ associated to text independent input %S\n",
				name, infos->fst2->tags[fst2_tag_number]->input);
		case IGNORE_VARIABLE_ERRORS: return 1;
		case BACKTRACK_ON_VARIABLE_ERRORS: return 0;
	}
}
TfstTag* tag = (TfstTag*) (infos->tfst->tags->tab[tfst_tag_number]);
if (tag->content[0] != '{' || tag->content[1] == '\0') {
	/* If we have an untagged token like "foo" */
	switch (infos->variable_error_policy) {
		case EXIT_ON_VARIABLE_ERRORS:
			fatal_error(
				"Should not have capture variable $:%S$ associated to a tag that may capture untagged tokens: %S\n",
				name, infos->fst2->tags[fst2_tag_number]->input);
		case IGNORE_VARIABLE_ERRORS: return 1;
		case BACKTRACK_ON_VARIABLE_ERRORS: return 0;
	}
}
/* We can capture the tag */
struct dela_entry* e=tokenize_tag_token(tag->content);
if (e==NULL) {
	/* Should not happen */
	fatal_error("Unexpected tag tokenization error in do_variable_capture for tag:\n%S\n",tag->content);
}
set_dic_variable(name,e,&(infos->dic_variables),0);
return 1;
}
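/*
 * Illustrative sketch only (assumed, not part of the library): how a
 * hypothetical caller could react to the return value of do_variable_capture.
 * The parameter names simply mirror the ones used above.
 */
static void capture_or_backtrack_sketch(int tfst_tag_number, int fst2_tag_number,
		struct locate_tfst_infos* infos, unichar* name) {
if (!do_variable_capture(tfst_tag_number, fst2_tag_number, infos, name)) {
	/* 0 is only returned under BACKTRACK_ON_VARIABLE_ERRORS: abandon this path */
	return;
}
/* 1 means the capture succeeded (or the error was ignored): keep on matching */
}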
/**
 * This function optimizes a pattern of the form "eat".
 */
void optimize_token_pattern(int i,Fst2Tag* tag,Alphabet* alph,
               struct locate_parameters* p,Abstract_allocator prv_alloc) {
/* Whatever happens, this pattern will be turned into a token list */
tag[i]->type=TOKEN_LIST_TAG;
unichar* opt_token=tag[i]->pattern->inflected;
/* First, we check if this token pattern can recognize some tag tokens */
struct list_int* list=p->tag_token_list;
while (list!=NULL) {
   struct dela_entry* entry=tokenize_tag_token(p->tokens->value[list->n],1);
   if ((!is_bit_mask_set(tag[i]->control,RESPECT_CASE_TAG_BIT_MASK) && is_equal_or_uppercase(opt_token,entry->inflected,alph)) ||
       !u_strcmp(opt_token,entry->inflected)) {
      tag[i]->matching_tokens=sorted_insert(list->n,tag[i]->matching_tokens,prv_alloc);
   }
   free_dela_entry(entry);
   list=list->next;
}
/* Then, we look for normal tokens */
if (is_bit_mask_set(tag[i]->control,RESPECT_CASE_TAG_BIT_MASK)) {
   /* If no case variants are allowed, then we just have to insert the number
    * of the token, but only if this token is one of the text tokens. */
   int token_number;
   if (-1!=(token_number=get_value_index(opt_token,p->tokens,DONT_INSERT))) {
      tag[i]->matching_tokens=sorted_insert(token_number,tag[i]->matching_tokens,prv_alloc);
   }
   return;
}
/* Here, we have to get all the case variants of the token. */
tag[i]->matching_tokens=destructive_sorted_merge(get_token_list_for_sequence(opt_token,alph,p->tokens,prv_alloc),tag[i]->matching_tokens,prv_alloc);
}
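/*
 * Illustrative sketch only (assumed, not from the library): after
 * optimize_token_pattern has run, tag[i]->matching_tokens is a sorted list of
 * the numbers of every token (plain or tagged) that the literal pattern can
 * match. Counting them just walks that list.
 */
static int count_matching_tokens_sketch(Fst2Tag* tag,int i) {
int n=0;
struct list_int* m=tag[i]->matching_tokens;
while (m!=NULL) {
   /* m->n is the number of one token that this pattern can match */
   n++;
   m=m->next;
}
return n;
}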
Example #3
/**
 * This function checks for each tag token like "{extended,extend.V:K}"
 * if it verifies some patterns. Its behaviour is very similar to the one
 * of the load_dic_for_locate function. However, as a side effect, this
 * function fills 'tag_token_list' with the list of tag token numbers.
 * This list is later used during the Locate preprocessing steps.
 */
void check_patterns_for_tag_tokens(Alphabet* alphabet,int number_of_patterns,
                                   struct lemma_node* root,struct locate_parameters* parameters,Abstract_allocator prv_alloc) {
    struct string_hash* tokens=parameters->tokens;
    for (int i=0; i<tokens->size; i++) {
        if (tokens->value[i][0]=='{' && u_strcmp(tokens->value[i],"{S}") && u_strcmp(tokens->value[i],"{STOP}")) {
            /* If the token is a tag like "{today,.ADV}", we add its number to the tag token list */
            parameters->tag_token_list=head_insert(i,parameters->tag_token_list,prv_alloc);
            /* And we look for the patterns that can match it */
            struct dela_entry* entry=tokenize_tag_token(tokens->value[i]);
            if (entry==NULL) {
                /* This should never happen */
                fatal_error("Invalid tag token in function check_patterns_for_tag_tokens\n");
            }
            /* We add the inflected form to the list of forms associated with the lemma.
             * This will be used to replace patterns like "<be>" by the actual list of
             * forms that can be matched by it, for optimization reasons */
            add_inflected_form_for_lemma(tokens->value[i],entry->lemma,root);
            parameters->token_control[i]=(unsigned char)(get_control_byte(tokens->value[i],alphabet,NULL,parameters->tokenization_policy)|DIC_TOKEN_BIT_MASK);
            if (number_of_patterns) {
                /* We look for matching patterns only if there are some */
                struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
                if (list!=NULL) {
                    if (parameters->matching_patterns[i]==NULL) {
                        /* We allocate the bit array if needed */
                        parameters->matching_patterns[i]=new_bit_array(number_of_patterns,ONE_BIT);
                    }
                    struct list_pointer* tmp=list;
                    while (tmp!=NULL) {
                        set_value(parameters->matching_patterns[i],((struct constraint_list*)(tmp->pointer))->pattern_number,1);
                        tmp=tmp->next;
                    }
                    free_list_pointer(list);
                }
            }
            /* Unlike DLC lines, a compound word tag like "{all around,.ADV}"
             * does not need to be put in the compound word tree, since the tag is already
             * characterized by its token number. */
            free_dela_entry(entry);
        }
    }
}
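/*
 * Illustrative sketch only (assumed helper, not from the library): once
 * check_patterns_for_tag_tokens has filled the 'matching_patterns' bit arrays,
 * testing whether tag token number 'i' verifies pattern number 'k' reduces to
 * a single bit lookup, the same test performed in optimize_full_pattern_for_tag.
 */
static int tag_token_matches_pattern_sketch(struct locate_parameters* parameters,int i,int k) {
    if (parameters->matching_patterns==NULL || parameters->matching_patterns[i]==NULL) {
        /* No pattern with grammatical/semantic/inflectional codes matched this tag token */
        return 0;
    }
    return get_value(parameters->matching_patterns[i],k);
}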
Example #4
/**
 * Returns a control byte that represents the characteristics of the given token.
 */
unsigned char get_control_byte(const unichar* token,const Alphabet* alph,struct string_hash* err,TokenizationPolicy tokenization_policy) {
    int i;
    int tmp;
    unsigned char c=0;
    if (token==NULL || token[0]=='\0') {
        fatal_error("NULL or empty token in get_control_byte\n");
    }
    /* We consider that a token starting with a letter is a word */
    if (is_letter(token[0],alph)) {
        set_bit_mask(&c,MOT_TOKEN_BIT_MASK);
        /* If a token is a word, we check if it is in the 'err' word list
         * in order to answer the question <!DIC>. We perform this test in order
         * to avoid taking "priori" as an unknown word if the compound "a priori"
         * is in the text. */
        if (err!=NULL && get_value_index(token,err,DONT_INSERT)!=-1) {
            set_bit_mask(&c,NOT_DIC_TOKEN_BIT_MASK);
        }
        if (is_upper(token[0],alph)) {
            set_bit_mask(&c,PRE_TOKEN_BIT_MASK);
            i=0;
            tmp=0;
            while (token[i]!='\0') {
                if (is_lower(token[i],alph)) {
                    tmp=1;
                    break;
                }
                i++;
            }
            if (!tmp) {
                set_bit_mask(&c,MAJ_TOKEN_BIT_MASK);
            }
            return c;
        }
        i=0;
        tmp=0;
        while (token[i]!='\0') {
            if (is_upper(token[i],alph)) {
                tmp=1;
                break;
            }
            i++;
        }
        if (!tmp) {
            set_bit_mask(&c,MIN_TOKEN_BIT_MASK);
        }
        return c;
    }
    /* If the token doesn't start with a letter, we start by
     * checking whether it is a tag like {today,.ADV} */
    if (token[0]=='{' && u_strcmp(token,"{S}") && u_strcmp(token,"{STOP}")) {
        /* In any case, such a tag is considered to verify <MOT> and <DIC> */
        set_bit_mask(&c,MOT_TOKEN_BIT_MASK|DIC_TOKEN_BIT_MASK|TDIC_TOKEN_BIT_MASK);
        struct dela_entry* temp=tokenize_tag_token(token);
        if (is_upper(temp->inflected[0],alph)) {
            set_bit_mask(&c,PRE_TOKEN_BIT_MASK);
            i=0;
            tmp=0;
            while (temp->inflected[i]!='\0') {
                if (is_letter(temp->inflected[i],alph) && is_lower(temp->inflected[i],alph)) {
                    tmp=1;
                    break;
                }
                i++;
            }
            if (!tmp) {
                set_bit_mask(&c,MAJ_TOKEN_BIT_MASK);
            }
        }
        else {
            i=0;
            tmp=0;
            while (temp->inflected[i]!='\0') {
                if (is_letter(temp->inflected[i],alph) && is_upper(temp->inflected[i],alph)) {
                    tmp=1;
                    break;
                }
                i++;
            }
            if (!tmp) {
                set_bit_mask(&c,MIN_TOKEN_BIT_MASK);
            }
        }
        if (!is_a_simple_word(temp->inflected,tokenization_policy,alph)) {
            /* If the tag is a compound word, we say that it verifies the <CDIC> pattern */
            set_bit_mask(&c,CDIC_TOKEN_BIT_MASK);
        }
        free_dela_entry(temp);
    }
    return c;
}
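/*
 * Illustrative sketch only (assumed, not from the library): a caller typically
 * stores the returned byte in token_control (as check_patterns_for_tag_tokens
 * does above) and later tests individual properties with the same bit masks.
 */
static int can_match_MOT_sketch(const unichar* token,const Alphabet* alph,
                                TokenizationPolicy tokenization_policy) {
    /* NULL is passed for the 'err' word list, as check_patterns_for_tag_tokens does above */
    unsigned char c=get_control_byte(token,alph,NULL,tokenization_policy);
    /* A set MOT bit means the token can be matched by the <MOT> pattern */
    return is_bit_mask_set(c,MOT_TOKEN_BIT_MASK);
}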


/**
 * This function checks if a pattern of the form "<eat>", "<eat.V>" or "<eaten,eat.V>"
 * can match the given tag token like "{today,.ADV}".
 */
void optimize_full_pattern_for_tag(unichar* tag_token,int i,Fst2Tag* tag,Alphabet* alph,
               struct locate_parameters* p,Abstract_allocator prv_alloc) {
DISCARD_UNUSED_PARAMETER(alph)
int token_number=get_value_index(tag_token,p->tokens);
struct dela_entry* entry=tokenize_tag_token(tag_token,1);
struct pattern* pattern=tag[i]->pattern;
if ((pattern->type==LEMMA_PATTERN) || (pattern->type==INFLECTED_AND_LEMMA_PATTERN)) {
   /* If the pattern has a constraint on the lemma, we check it */
   if (u_strcmp(entry->lemma,pattern->lemma)) {
      free_dela_entry(entry,prv_alloc);
      return;
   }
}
if ((pattern->type==LEMMA_AND_CODE_PATTERN) || (pattern->type==FULL_PATTERN)) {
   /* If the pattern contains a constraint on grammatical/semantic/inflectional
    * codes, then it has been put in the pattern tree, and so, this pattern
    * was tried on the current tag token in the 'check_patterns_for_tag_tokens'
    * function. Then, we just have to test if the tag token matches this pattern. */
   if (p->matching_patterns==NULL || p->matching_patterns[token_number]==NULL ||
       0==get_value(p->matching_patterns[token_number],tag[i]->pattern_number)) {