コード例 #1
0
ファイル: LocatePattern.cpp プロジェクト: adri87/Q-A
/**
 * This function loads a DLF or a DLC. It computes information about tokens
 * that will be used during the Locate operation. For instance, if we have the
 * following line:
 *
 *   extended,.A
 *
 * and if the .fst2 to be applied to the text contains the pattern <A> with
 * number 456, then the function will mark the "extended" token to be matched
 * by the pattern 456. Moreover, all case variations will be taken into account,
 * so that the "Extended" and "EXTENDED" tokens will also be updated.
 *
 * The two parameters 'is_DIC_pattern' and 'is_CDIC_pattern'
 * indicate if the .fst2 contains the corresponding patterns. For instance, if
 * the pattern "<CDIC>" is used in the grammar, it means that any token sequence that is a
 * compound word must be marked as being matched by this pattern.
 *
 * Parameters:
 *   dic_name: name of the dictionary file to open for reading
 *   mask_encoding_compatibility_input: encoding mask given to the file opener
 *   alphabet: alphabet used to compute case variations and control bytes
 *   number_of_patterns: number of patterns; when 0, no pattern lookup is done
 *   is_DIC_pattern, is_CDIC_pattern: see above
 *   root: lemma tree that receives every (inflected form,lemma) pair
 *   parameters: Locate state; this function reads 'tokens', 'pattern_tree_root'
 *               and 'tokenization_policy', and updates 'token_control',
 *               'matching_patterns' and 'DLC_tree'
 *
 * If the dictionary cannot be opened, an error message is emitted and the
 * function returns without modifying anything. Lines that cannot be parsed
 * are reported and skipped.
 */
void load_dic_for_locate(const char* dic_name,int mask_encoding_compatibility_input,Alphabet* alphabet,
                         int number_of_patterns,int is_DIC_pattern,
                         int is_CDIC_pattern,
                         struct lemma_node* root,struct locate_parameters* parameters) {
    struct string_hash* tokens=parameters->tokens;
    U_FILE* f;
    unichar line[DIC_LINE_SIZE];
    f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,dic_name,U_READ);
    if (f==NULL) {
        error("Cannot open dictionary %s\n",dic_name);
        return;
    }
    /* We parse all the lines */
    int lines=0;
    /* We only keep the file name, without its path, for the progress messages */
    char name[FILENAME_MAX];
    remove_path(dic_name,name);
    while (EOF!=u_fgets(line,f)) {
        lines++;
        if (lines%10000==0) {
            /* The message ends with \r so that the next one overwrites it */
            u_printf("%s: %d lines loaded...                          \r",name,lines);
        }
        if (line[0]=='/') {
            /* NOTE: DLF and DLC files are not supposed to contain comment
             *       lines, but we test them, just in case */
            continue;
        }
        struct dela_entry* entry=tokenize_DELAF_line(line,1);
        if (entry==NULL) {
            /* This case should never happen */
            error("Invalid dictionary line in load_dic_for_locate\n");
            continue;
        }
        /* We add the inflected form to the list of forms associated to the lemma.
         * This will be used to replace patterns like "<be>" by the actual list of
         * forms that can be matched by it, for optimization reasons */
        add_inflected_form_for_lemma(entry->inflected,entry->lemma,root);
        /* We get the list of all tokens that can be matched by the inflected form of
         * this entry, with regards to case variations (see the "extended" example above). */
        struct list_int* ptr=get_token_list_for_sequence(entry->inflected,alphabet,tokens);
        /* We save the list pointer to free it later */
        struct list_int* ptr_copy=ptr;
        /* Here, we will deal with all simple words */
        while (ptr!=NULL) {
            int i=ptr->n;
            /* If the current token can be matched, then it can be recognized by the "<DIC>" pattern */
            parameters->token_control[i]=(unsigned char)(get_control_byte(tokens->value[i],alphabet,NULL,parameters->tokenization_policy)|DIC_TOKEN_BIT_MASK);
            if (number_of_patterns) {
                /* We look for matching patterns only if there are some */
                struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
                if (list!=NULL) {
                    /* If we have some patterns to add */
                    if (parameters->matching_patterns[i]==NULL) {
                        /* We allocate the pattern bit array, if needed */
                        parameters->matching_patterns[i]=new_bit_array(number_of_patterns,ONE_BIT);
                    }
                    struct list_pointer* tmp=list;
                    while (tmp!=NULL) {
                        /* Then we add all the pattern numbers to the bit array */
                        set_value(parameters->matching_patterns[i],((struct constraint_list*)(tmp->pointer))->pattern_number,1);
                        tmp=tmp->next;
                    }
                    /* Finally, we free the constraint list */
                    free_list_pointer(list);
                }
            }
            ptr=ptr->next;
        }
        /* Finally, we free the token list */
        free_list_int(ptr_copy);
        if (!is_a_simple_word(entry->inflected,parameters->tokenization_policy,alphabet)) {
            /* If the inflected form is a compound word */
            if (is_DIC_pattern || is_CDIC_pattern) {
                /* If the .fst2 contains "<DIC>" and/or "<CDIC>", then we
                 * must note that all compound words can be matched by them */
                add_compound_word_with_no_pattern(entry->inflected,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy);
            }
            if (number_of_patterns) {
                /* We look for matching patterns only if there are some */
                /* We look if the compound word can be matched by some patterns */
                struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
                struct list_pointer* tmp=list;
                while (tmp!=NULL) {
                    /* If the word is matched by at least one pattern, we store it. */
                    int pattern_number=((struct constraint_list*)(tmp->pointer))->pattern_number;
                    add_compound_word_with_pattern(entry->inflected,pattern_number,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy);
                    tmp=tmp->next;
                }
                free_list_pointer(list);
            }
        }
        free_dela_entry(entry);
    }
    /* The first progress message is printed when exactly 10000 lines have been
     * read, so from that count on a \r-terminated message is pending on the
     * output and must be closed by a newline. */
    if (lines>=10000) {
        u_printf("\n");
    }
    u_fclose(f);
}
コード例 #2
0
/**
 * Registers the compound word 'word' in the tree 'DLC_tree', using the value
 * COMPOUND_WORD_PATTERN in place of a pattern number. This is what is needed
 * when the user looks for any compound word, regardless of the pattern,
 * by means of <DIC> or <CDIC>.
 *
 * All the other arguments are forwarded unchanged to
 * add_compound_word_with_pattern().
 */
void add_compound_word_with_no_pattern(const unichar* word,const Alphabet* alph,struct string_hash* tok,
							struct DLC_tree_info* DLC_tree,TokenizationPolicy tokenization_mode) {
	add_compound_word_with_pattern(
		word,
		COMPOUND_WORD_PATTERN,
		alph,
		tok,
		DLC_tree,
		tokenization_mode);
}