/**
 * This function loads a DLF or a DLC. It computes information about tokens
 * that will be used during the Locate operation. For instance, if we have the
 * following line:
 *
 *    extended,.A
 *
 * and if the .fst2 to be applied to the text contains the pattern <A> with
 * number 456, then the function will mark the "extended" token to be matched
 * by the pattern 456. Moreover, all case variations will be taken into account,
 * so that the "Extended" and "EXTENDED" tokens will also be updated.
 *
 * The two parameters 'is_DIC_pattern' and 'is_CDIC_pattern'
 * indicate if the .fst2 contains the corresponding patterns. For instance, if
 * the pattern "<CDIC>" is used in the grammar, it means that any token sequence that is a
 * compound word must be marked as to be matched by this pattern.
 *
 * Results are accumulated into 'parameters' (token_control bytes,
 * matching_patterns bit arrays, DLC_tree) and into the lemma tree 'root'.
 * On an unreadable file the function reports an error and returns without
 * modifying anything.
 */
void load_dic_for_locate(const char* dic_name,int mask_encoding_compatibility_input,Alphabet* alphabet,
                         int number_of_patterns,int is_DIC_pattern,int is_CDIC_pattern,
                         struct lemma_node* root,struct locate_parameters* parameters) {
struct string_hash* tokens=parameters->tokens;
U_FILE* f;
unichar line[DIC_LINE_SIZE];
f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,dic_name,U_READ);
if (f==NULL) {
   error("Cannot open dictionary %s\n",dic_name);
   return;
}
/* We parse all the lines */
int lines=0;
char name[FILENAME_MAX];
remove_path(dic_name,name);
while (EOF!=u_fgets(line,f)) {
   lines++;
   if (lines%10000==0) {
      /* Progress feedback, overwritten in place with \r */
      u_printf("%s: %d lines loaded... \r",name,lines);
   }
   if (line[0]=='/') {
      /* NOTE: DLF and DLC files are not supposed to contain comment
       * lines, but we test them, just in case */
      continue;
   }
   struct dela_entry* entry=tokenize_DELAF_line(line,1);
   if (entry==NULL) {
      /* This case should never happen */
      error("Invalid dictionary line in load_dic_for_locate\n");
      continue;
   }
   /* We add the inflected form to the list of forms associated to the lemma.
    * This will be used to replace patterns like "<be>" by the actual list of
    * forms that can be matched by it, for optimization reasons */
   add_inflected_form_for_lemma(entry->inflected,entry->lemma,root);
   /* We get the list of all tokens that can be matched by the inflected form of
    * this entry, with regards to case variations (see the "extended" example above). */
   struct list_int* ptr=get_token_list_for_sequence(entry->inflected,alphabet,tokens);
   /* We save the list pointer to free it later */
   struct list_int* ptr_copy=ptr;
   /* Here, we will deal with all simple words */
   while (ptr!=NULL) {
      int i=ptr->n;
      /* If the current token can be matched, then it can be recognized by the "<DIC>" pattern */
      parameters->token_control[i]=(unsigned char)(get_control_byte(tokens->value[i],alphabet,NULL,parameters->tokenization_policy)|DIC_TOKEN_BIT_MASK);
      if (number_of_patterns) {
         /* We look for matching patterns only if there are some */
         struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
         if (list!=NULL) {
            /* If we have some patterns to add */
            if (parameters->matching_patterns[i]==NULL) {
               /* We allocate the pattern bit array, if needed (lazily, since
                * most tokens match no pattern at all) */
               parameters->matching_patterns[i]=new_bit_array(number_of_patterns,ONE_BIT);
            }
            struct list_pointer* tmp=list;
            while (tmp!=NULL) {
               /* Then we add all the pattern numbers to the bit array */
               set_value(parameters->matching_patterns[i],((struct constraint_list*)(tmp->pointer))->pattern_number,1);
               tmp=tmp->next;
            }
            /* Finally, we free the constraint list */
            free_list_pointer(list);
         }
      }
      ptr=ptr->next;
   }
   /* Finally, we free the token list */
   free_list_int(ptr_copy);
   if (!is_a_simple_word(entry->inflected,parameters->tokenization_policy,alphabet)) {
      /* If the inflected form is a compound word */
      if (is_DIC_pattern || is_CDIC_pattern) {
         /* If the .fst2 contains "<DIC>" and/or "<CDIC>", then we
          * must note that all compound words can be matched by them */
         add_compound_word_with_no_pattern(entry->inflected,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy);
      }
      if (number_of_patterns) {
         /* We look for matching patterns only if there are some */
         /* We look if the compound word can be matched by some patterns */
         struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
         struct list_pointer* tmp=list;
         while (tmp!=NULL) {
            /* If the word is matched by at least one pattern, we store it. */
            int pattern_number=((struct constraint_list*)(tmp->pointer))->pattern_number;
            add_compound_word_with_pattern(entry->inflected,pattern_number,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy);
            tmp=tmp->next;
         }
         free_list_pointer(list);
      }
   }
   free_dela_entry(entry);
}
if (lines>10000) {
   /* Terminate the \r progress line only if we actually printed one */
   u_printf("\n");
}
u_fclose(f);
}
/**
 * Returns 1 if the given string contains only one token; 0 otherwise.
 * A string qualifies when it is a simple word for the given tokenization
 * policy, or when it is a single character.
 */
int is_a_simple_token(const unichar* string,TokenizationPolicy tokenization_policy,const Alphabet* alph) {
return (is_a_simple_word(string,tokenization_policy,alph) || u_strlen(string)==1) ? 1 : 0;
}
/**
 * Returns a control byte that represents the characteristics of the given token:
 * word/tag status and case profile (first-uppercase, all-uppercase, all-lowercase),
 * encoded with the *_TOKEN_BIT_MASK flags.
 *
 * 'err' is an optional list of forbidden words used to answer <!DIC>;
 * it may be NULL. A NULL or empty token is a fatal error.
 */
unsigned char get_control_byte(const unichar* token,const Alphabet* alph,struct string_hash* err,TokenizationPolicy tokenization_policy) {
unsigned char control=0;
if (token==NULL || token[0]=='\0') {
   fatal_error("NULL or empty token in get_control_byte\n");
}
if (is_letter(token[0],alph)) {
   /* We consider that a token starting with a letter is a word */
   set_bit_mask(&control,MOT_TOKEN_BIT_MASK);
   /* If a token is a word, we check if it is in the 'err' word list
    * in order to answer the question <!DIC>. We perform this test in order
    * to avoid taking "priori" as an unknown word if the compound "a priori"
    * is in the text. */
   if (err!=NULL && get_value_index(token,err,DONT_INSERT)!=-1) {
      set_bit_mask(&control,NOT_DIC_TOKEN_BIT_MASK);
   }
   if (is_upper(token[0],alph)) {
      /* First letter uppercase: the token can match <PRE> */
      set_bit_mask(&control,PRE_TOKEN_BIT_MASK);
      /* If no character at all is lowercase, the token is all-uppercase */
      int has_lower=0;
      for (int pos=0;token[pos]!='\0';pos++) {
         if (is_lower(token[pos],alph)) {
            has_lower=1;
            break;
         }
      }
      if (!has_lower) {
         set_bit_mask(&control,MAJ_TOKEN_BIT_MASK);
      }
      return control;
   }
   /* First letter lowercase: if no character at all is uppercase,
    * the token is all-lowercase */
   int has_upper=0;
   for (int pos=0;token[pos]!='\0';pos++) {
      if (is_upper(token[pos],alph)) {
         has_upper=1;
         break;
      }
   }
   if (!has_upper) {
      set_bit_mask(&control,MIN_TOKEN_BIT_MASK);
   }
   return control;
}
/* If the token doesn't start with a letter, we start with
 * checking if it is a tag like {today,.ADV}; the special tags
 * {S} and {STOP} are excluded */
if (token[0]=='{' && u_strcmp(token,"{S}") && u_strcmp(token,"{STOP}")) {
   /* Anyway, such a tag is classed as verifying <MOT> and <DIC> */
   set_bit_mask(&control,MOT_TOKEN_BIT_MASK|DIC_TOKEN_BIT_MASK|TDIC_TOKEN_BIT_MASK);
   struct dela_entry* tag_entry=tokenize_tag_token(token);
   if (is_upper(tag_entry->inflected[0],alph)) {
      set_bit_mask(&control,PRE_TOKEN_BIT_MASK);
      /* Only letters count here: a tag may contain non-letter characters */
      int has_lower_letter=0;
      for (int pos=0;tag_entry->inflected[pos]!='\0';pos++) {
         if (is_letter(tag_entry->inflected[pos],alph) && is_lower(tag_entry->inflected[pos],alph)) {
            has_lower_letter=1;
            break;
         }
      }
      if (!has_lower_letter) {
         set_bit_mask(&control,MAJ_TOKEN_BIT_MASK);
      }
   } else {
      int has_upper_letter=0;
      for (int pos=0;tag_entry->inflected[pos]!='\0';pos++) {
         if (is_letter(tag_entry->inflected[pos],alph) && is_upper(tag_entry->inflected[pos],alph)) {
            has_upper_letter=1;
            break;
         }
      }
      if (!has_upper_letter) {
         set_bit_mask(&control,MIN_TOKEN_BIT_MASK);
      }
   }
   if (!is_a_simple_word(tag_entry->inflected,tokenization_policy,alph)) {
      /* If the tag is a compound word, we say that it verifies the <CDIC> pattern */
      set_bit_mask(&control,CDIC_TOKEN_BIT_MASK);
   }
   free_dela_entry(tag_entry);
}
return control;
}