예제 #1
0
/**
 * This function loads a DLF or a DLC. It computes information about tokens
 * that will be used during the Locate operation. For instance, if we have the
 * following line:
 *
 *   extended,.A
 *
 * and if the .fst2 to be applied to the text contains the pattern <A> with
 * number 456, then the function will mark the "extended" token as matched
 * by the pattern 456. Moreover, all case variations will be taken into account,
 * so that the "Extended" and "EXTENDED" tokens will also be updated.
 *
 * Parameters:
 *  - dic_name: name of the dictionary file to read; if it cannot be opened,
 *    an error is reported and the function returns without doing anything.
 *  - mask_encoding_compatibility_input: forwarded as is to
 *    u_fopen_existing_versatile_encoding.
 *  - alphabet: forwarded to the helpers that compute token lists, control
 *    bytes and the simple/compound word test.
 *  - number_of_patterns: if zero, no pattern lookup is performed at all;
 *    otherwise it is also the size used when a pattern bit array must be
 *    created for a token.
 *  - is_DIC_pattern, is_CDIC_pattern: indicate if the .fst2 contains the
 *    corresponding patterns. For instance, if the pattern "<CDIC>" is used in
 *    the grammar, it means that any token sequence that is a compound word
 *    must be marked as being matched by this pattern.
 *  - root: lemma tree that receives every (inflected form, lemma) pair.
 *  - parameters: locate parameters; this function reads 'tokens',
 *    'tokenization_policy' and 'pattern_tree_root', and updates
 *    'token_control', 'matching_patterns' and 'DLC_tree'.
 *
 * Lines starting with '/' are skipped. Lines that cannot be parsed are
 * reported with error() and skipped.
 */
void load_dic_for_locate(const char* dic_name,int mask_encoding_compatibility_input,Alphabet* alphabet,
                         int number_of_patterns,int is_DIC_pattern,
                         int is_CDIC_pattern,
                         struct lemma_node* root,struct locate_parameters* parameters) {
    struct string_hash* tokens=parameters->tokens;
    U_FILE* f;
    unichar line[DIC_LINE_SIZE];
    f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,dic_name,U_READ);
    if (f==NULL) {
        error("Cannot open dictionary %s\n",dic_name);
        return;
    }
    /* We parse all the lines */
    int lines=0;
    /* Dictionary name without its path, only used for the progress message */
    char name[FILENAME_MAX];
    remove_path(dic_name,name);
    while (EOF!=u_fgets(line,f)) {
        lines++;
        if (lines%10000==0) {
            /* Progress message; it ends with '\r' so that the next one overwrites it */
            u_printf("%s: %d lines loaded...                          \r",name,lines);
        }
        if (line[0]=='/') {
            /* NOTE: DLF and DLC files are not supposed to contain comment
             *       lines, but we test them, just in case */
            continue;
        }
        struct dela_entry* entry=tokenize_DELAF_line(line,1);
        if (entry==NULL) {
            /* This case should never happen */
            error("Invalid dictionary line in load_dic_for_locate\n");
            continue;
        }
        /* We add the inflected form to the list of forms associated to the lemma.
         * This will be used to replace patterns like "<be>" by the actual list of
         * forms that can be matched by it, for optimization reasons */
        add_inflected_form_for_lemma(entry->inflected,entry->lemma,root);
        /* We get the list of all tokens that can be matched by the inflected form of
         * this entry, with regards to case variations (see the "extended" example above). */
        struct list_int* ptr=get_token_list_for_sequence(entry->inflected,alphabet,tokens);
        /* We save the list pointer to free it later, since 'ptr' is used as the iterator */
        struct list_int* ptr_copy=ptr;
        /* Here, we will deal with all simple words */
        while (ptr!=NULL) {
            int i=ptr->n;
            /* If the current token can be matched, then it can be recognized by the "<DIC>" pattern.
             * The control byte is recomputed from scratch (with no 'err' word list) and
             * replaces the previous value for this token. */
            parameters->token_control[i]=(unsigned char)(get_control_byte(tokens->value[i],alphabet,NULL,parameters->tokenization_policy)|DIC_TOKEN_BIT_MASK);
            if (number_of_patterns) {
                /* We look for matching patterns only if there are some */
                struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
                if (list!=NULL) {
                    /* If we have some patterns to add */
                    if (parameters->matching_patterns[i]==NULL) {
                        /* We allocate the pattern bit array, if needed */
                        parameters->matching_patterns[i]=new_bit_array(number_of_patterns,ONE_BIT);
                    }
                    struct list_pointer* tmp=list;
                    while (tmp!=NULL) {
                        /* Then we add all the pattern numbers to the bit array */
                        set_value(parameters->matching_patterns[i],((struct constraint_list*)(tmp->pointer))->pattern_number,1);
                        tmp=tmp->next;
                    }
                    /* Finally, we free the constraint list */
                    free_list_pointer(list);
                }
            }
            ptr=ptr->next;
        }
        /* Finally, we free the token list */
        free_list_int(ptr_copy);
        if (!is_a_simple_word(entry->inflected,parameters->tokenization_policy,alphabet)) {
            /* If the inflected form is a compound word */
            if (is_DIC_pattern || is_CDIC_pattern) {
                /* If the .fst2 contains "<DIC>" and/or "<CDIC>", then we
                 * must note that all compound words can be matched by them */
                add_compound_word_with_no_pattern(entry->inflected,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy);
            }
            if (number_of_patterns) {
                /* We look for matching patterns only if there are some */
                /* We look if the compound word can be matched by some patterns */
                struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
                struct list_pointer* tmp=list;
                while (tmp!=NULL) {
                    /* If the word is matched by at least one pattern, we store it. */
                    int pattern_number=((struct constraint_list*)(tmp->pointer))->pattern_number;
                    add_compound_word_with_pattern(entry->inflected,pattern_number,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy);
                    tmp=tmp->next;
                }
                free_list_pointer(list);
            }
        }
        free_dela_entry(entry);
    }
    /* The first progress message is printed when 'lines' reaches exactly 10000,
     * so we must terminate the progress line from that value on (>= and not >),
     * otherwise a dictionary of exactly 10000 lines leaves the line unterminated. */
    if (lines>=10000) {
        u_printf("\n");
    }
    u_fclose(f);
}
예제 #2
0
/**
 * Tells whether the given string is made of a single token.
 *
 * A string is accepted when is_a_simple_word() accepts it for the given
 * tokenization policy and alphabet, or, failing that, when it is exactly
 * one character long.
 *
 * Returns 1 if the string is a single token; 0 otherwise.
 */
int is_a_simple_token(const unichar* string,TokenizationPolicy tokenization_policy,const Alphabet* alph) {
    /* The word test comes first; the length is only computed if it fails */
    if (is_a_simple_word(string,tokenization_policy,alph)) {
        return 1;
    }
    /* Not a simple word: the only other accepted case is a single character */
    return u_strlen(string)==1;
}
예제 #3
0
/**
 * Returns a control byte that represents the characteristics of the given token.
 *
 * Parameters:
 *  - token: the token to analyze; a NULL or empty token raises a fatal error.
 *  - alph: alphabet used for the letter/uppercase/lowercase tests.
 *  - err: optional word list (may be NULL); a word token found in it gets
 *    the NOT_DIC_TOKEN_BIT_MASK bit.
 *  - tokenization_policy: forwarded to is_a_simple_word() for tag tokens.
 *
 * Bits that can be set in the result:
 *  - Token starting with a letter: MOT_TOKEN_BIT_MASK, plus
 *      NOT_DIC_TOKEN_BIT_MASK if the token is found in 'err';
 *      PRE_TOKEN_BIT_MASK if the first character is uppercase, and also
 *      MAJ_TOKEN_BIT_MASK if no character of the token is lowercase;
 *      MIN_TOKEN_BIT_MASK if the first character is not uppercase and no
 *      character of the token is uppercase.
 *  - Token starting with '{' other than "{S}" and "{STOP}" (a tag like
 *    {today,.ADV}): MOT, DIC and TDIC bits, then PRE/MAJ/MIN computed as above
 *    on the inflected form of the tag (only letters are taken into account
 *    in the scan), and CDIC_TOKEN_BIT_MASK if the inflected form is not a
 *    simple word.
 *  - Any other token: 0.
 */
unsigned char get_control_byte(const unichar* token,const Alphabet* alph,struct string_hash* err,TokenizationPolicy tokenization_policy) {
    int i;
    int tmp;
    unsigned char c=0;
    if (token==NULL || token[0]=='\0') {
        /* NOTE(review): fatal_error presumably does not return, since the code
         * below dereferences 'token' unconditionally */
        fatal_error("NULL or empty token in get_control_byte\n");
    }
    /* We consider that a token starting with a letter is a word */
    if (is_letter(token[0],alph)) {
        set_bit_mask(&c,MOT_TOKEN_BIT_MASK);
        /* If a token is a word, we check if it is in the 'err' word list
         * in order to answer the question <!DIC>. We perform this test in order
         * to avoid taking "priori" as an unknown word if the compound "a priori"
         * is in the text. */
        if (err!=NULL && get_value_index(token,err,DONT_INSERT)!=-1) {
            set_bit_mask(&c,NOT_DIC_TOKEN_BIT_MASK);
        }
        if (is_upper(token[0],alph)) {
            /* First letter in uppercase */
            set_bit_mask(&c,PRE_TOKEN_BIT_MASK);
            /* 'tmp' becomes 1 as soon as a lowercase character is found */
            i=0;
            tmp=0;
            while (token[i]!='\0') {
                if (is_lower(token[i],alph)) {
                    tmp=1;
                    break;
                }
                i++;
            }
            if (!tmp) {
                /* No lowercase character at all */
                set_bit_mask(&c,MAJ_TOKEN_BIT_MASK);
            }
            return c;
        }
        /* 'tmp' becomes 1 as soon as an uppercase character is found */
        i=0;
        tmp=0;
        while (token[i]!='\0') {
            if (is_upper(token[i],alph)) {
                tmp=1;
                break;
            }
            i++;
        }
        if (!tmp) {
            /* No uppercase character at all */
            set_bit_mask(&c,MIN_TOKEN_BIT_MASK);
        }
        return c;
    }
    /* If the token doesn't start with a letter, we start with
     * checking if it is a tag like {today,.ADV}. u_strcmp returns a non-zero
     * value when the strings differ, so "{S}" and "{STOP}" are excluded here. */
    if (token[0]=='{' && u_strcmp(token,"{S}") && u_strcmp(token,"{STOP}")) {
        /* Anyway, such a tag is classed as verifying <MOT> and <DIC> */
        set_bit_mask(&c,MOT_TOKEN_BIT_MASK|DIC_TOKEN_BIT_MASK|TDIC_TOKEN_BIT_MASK);
        struct dela_entry* temp=tokenize_tag_token(token);
        if (temp==NULL) {
            /* Defensive check: we must not dereference a NULL entry if the tag
             * could not be parsed. We report it the same way as an invalid
             * token at the beginning of this function. */
            fatal_error("Invalid tag token in get_control_byte\n");
        }
        if (is_upper(temp->inflected[0],alph)) {
            set_bit_mask(&c,PRE_TOKEN_BIT_MASK);
            /* 'tmp' becomes 1 as soon as a lowercase letter is found */
            i=0;
            tmp=0;
            while (temp->inflected[i]!='\0') {
                if (is_letter(temp->inflected[i],alph) && is_lower(temp->inflected[i],alph)) {
                    tmp=1;
                    break;
                }
                i++;
            }
            if (!tmp) {
                set_bit_mask(&c,MAJ_TOKEN_BIT_MASK);
            }
        }
        else {
            /* 'tmp' becomes 1 as soon as an uppercase letter is found */
            i=0;
            tmp=0;
            while (temp->inflected[i]!='\0') {
                if (is_letter(temp->inflected[i],alph) && is_upper(temp->inflected[i],alph)) {
                    tmp=1;
                    break;
                }
                i++;
            }
            if (!tmp) {
                set_bit_mask(&c,MIN_TOKEN_BIT_MASK);
            }
        }
        if (!is_a_simple_word(temp->inflected,tokenization_policy,alph)) {
            /* If the tag is a compound word, we say that it verifies the <CDIC> pattern */
            set_bit_mask(&c,CDIC_TOKEN_BIT_MASK);
        }
        free_dela_entry(temp);
    }
    return c;
}