/**
 * Loads a DLF or a DLC dictionary and computes the information about tokens
 * that will be used during the Locate operation. For instance, if we have
 * the following line:
 *
 *   extended,.A
 *
 * and if the .fst2 to be applied to the text contains the pattern <A> with
 * number 456, then the function marks the "extended" token as matched by
 * pattern 456. All case variations are taken into account, so the
 * "Extended" and "EXTENDED" tokens are updated as well.
 *
 * @param dic_name            name of the dictionary file to load
 * @param mask_encoding_compatibility_input
 *                            encoding mask forwarded to the file opener
 * @param alphabet            alphabet used to compute case variants and to
 *                            decide whether a form is a simple word
 * @param number_of_patterns  number of patterns; when 0, no pattern lookup is
 *                            done. Also used as the size of the per-token
 *                            pattern bit arrays.
 * @param is_DIC_pattern      non-zero if the .fst2 contains "<DIC>"
 * @param is_CDIC_pattern     non-zero if the .fst2 contains "<CDIC>"; in that
 *                            case (or with "<DIC>") every compound word must
 *                            be marked as matched by this pattern
 * @param root                lemma tree that receives the inflected forms
 * @param parameters          locate parameters; this function reads
 *                            'tokens', 'pattern_tree_root' and
 *                            'tokenization_policy', and updates
 *                            'token_control', 'matching_patterns' and
 *                            'DLC_tree'
 *
 * If the dictionary cannot be opened, an error is reported and the function
 * returns without modifying anything. Lines that cannot be parsed are
 * reported and skipped.
 */
void load_dic_for_locate(const char* dic_name,int mask_encoding_compatibility_input,Alphabet* alphabet,
                         int number_of_patterns,int is_DIC_pattern,
                         int is_CDIC_pattern,
                         struct lemma_node* root,struct locate_parameters* parameters) {
    struct string_hash* tokens=parameters->tokens;
    U_FILE* f;
    unichar line[DIC_LINE_SIZE];
    f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,dic_name,U_READ);
    if (f==NULL) {
        error("Cannot open dictionary %s\n",dic_name);
        return;
    }
    /* We parse all the lines */
    int lines=0;
    char name[FILENAME_MAX];
    remove_path(dic_name,name);
    /* NOTE(review): no size is passed to u_fgets here; confirm that it cannot
     * write more than DIC_LINE_SIZE characters into 'line'. */
    while (EOF!=u_fgets(line,f)) {
        lines++;
        if (lines%10000==0) {
            u_printf("%s: %d lines loaded...          \r",name,lines);
        }
        if (line[0]=='/') {
            /* NOTE: DLF and DLC files are not supposed to contain comment
             *       lines, but we test for them, just in case */
            continue;
        }
        struct dela_entry* entry=tokenize_DELAF_line(line,1);
        if (entry==NULL) {
            /* This case should never happen */
            error("Invalid dictionary line in load_dic_for_locate\n");
            continue;
        }
        /* We add the inflected form to the list of forms associated to the lemma.
         * This will be used to replace patterns like "<be>" by the actual list of
         * forms that can be matched by it, for optimization reasons */
        add_inflected_form_for_lemma(entry->inflected,entry->lemma,root);
        /* We get the list of all tokens that can be matched by the inflected form
         * of this entry, with regards to case variations (see the "extended"
         * example above). */
        struct list_int* token_list=get_token_list_for_sequence(entry->inflected,alphabet,tokens);
        /* The pattern lookup takes only the entry and the pattern tree, and
         * neither changes while we walk the token list, so we do it once per
         * entry instead of once per token. We skip it entirely when there are
         * no patterns or no token to update. */
        struct list_pointer* patterns=NULL;
        if (number_of_patterns && token_list!=NULL) {
            patterns=get_matching_patterns(entry,parameters->pattern_tree_root);
        }
        /* Here, we deal with all simple words */
        for (struct list_int* ptr=token_list;ptr!=NULL;ptr=ptr->next) {
            int i=ptr->n;
            /* If the current token can be matched, then it can be recognized by
             * the "<DIC>" pattern */
            parameters->token_control[i]=(unsigned char)(get_control_byte(tokens->value[i],alphabet,NULL,parameters->tokenization_policy)|DIC_TOKEN_BIT_MASK);
            if (patterns==NULL) {
                /* No pattern to add for this entry */
                continue;
            }
            if (parameters->matching_patterns[i]==NULL) {
                /* We allocate the pattern bit array, if needed */
                parameters->matching_patterns[i]=new_bit_array(number_of_patterns,ONE_BIT);
            }
            /* Then we add all the pattern numbers to the bit array */
            for (struct list_pointer* tmp=patterns;tmp!=NULL;tmp=tmp->next) {
                set_value(parameters->matching_patterns[i],((struct constraint_list*)(tmp->pointer))->pattern_number,1);
            }
        }
        /* We free the constraint list (a NULL list is accepted, as in the
         * compound word case below) and the token list */
        free_list_pointer(patterns);
        free_list_int(token_list);
        if (!is_a_simple_word(entry->inflected,parameters->tokenization_policy,alphabet)) {
            /* If the inflected form is a compound word */
            if (is_DIC_pattern || is_CDIC_pattern) {
                /* If the .fst2 contains "<DIC>" and/or "<CDIC>", then we
                 * must note that all compound words can be matched by them */
                add_compound_word_with_no_pattern(entry->inflected,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy);
            }
            if (number_of_patterns) {
                /* We look for matching patterns only if there are some.
                 * If the compound word is matched by at least one pattern,
                 * we store it once per matching pattern. */
                struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
                for (struct list_pointer* tmp=list;tmp!=NULL;tmp=tmp->next) {
                    int pattern_number=((struct constraint_list*)(tmp->pointer))->pattern_number;
                    add_compound_word_with_pattern(entry->inflected,pattern_number,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy);
                }
                free_list_pointer(list);
            }
        }
        free_dela_entry(entry);
    }
    /* The progress message ends with '\r', so as soon as it has been printed
     * at least once (i.e. from 10000 lines on) we must finish the line. */
    if (lines>=10000) {
        u_printf("\n");
    }
    u_fclose(f);
}
/**
 * Registers the compound word 'word' in the tree 'DLC_tree' under the
 * special pattern number COMPOUND_WORD_PATTERN. This is used when the
 * grammar looks for any compound word, whatever its pattern, by means of
 * <DIC> or <CDIC>.
 *
 * All other arguments ('alph', 'tok', 'tokenization_mode') are forwarded
 * unchanged to add_compound_word_with_pattern.
 */
void add_compound_word_with_no_pattern(const unichar* word,const Alphabet* alph,struct string_hash* tok,
                        struct DLC_tree_info* DLC_tree,TokenizationPolicy tokenization_mode) {
    /* Pattern number standing for "any compound word" */
    const int pattern_number=COMPOUND_WORD_PATTERN;
    add_compound_word_with_pattern(word,
                                   pattern_number,
                                   alph,
                                   tok,
                                   DLC_tree,
                                   tokenization_mode);
}