/**
 * This function optimizes a pattern of the form "eat".
 * The pattern tag #i is turned into a TOKEN_LIST_TAG whose 'matching_tokens'
 * list contains every token number that the pattern can match:
 * 1) tag tokens like "{eats,eat.V:P3s}" whose inflected form matches,
 * 2) the token itself (case-sensitive mode) or all its case variants.
 */
void optimize_token_pattern(int i,Fst2Tag* tag,Alphabet* alph, struct locate_parameters* p,Abstract_allocator prv_alloc) {
/* Whatever happens, this pattern will be turned into a token list */
tag[i]->type=TOKEN_LIST_TAG;
unichar* opt_token=tag[i]->pattern->inflected;
/* First, we check if this token pattern can recognize some tag tokens */
struct list_int* list=p->tag_token_list;
while (list!=NULL) {
   struct dela_entry* entry=tokenize_tag_token(p->tokens->value[list->n],1);
   /* The tag token matches either through a case-tolerant comparison (when the
    * pattern does not require exact case) or through strict equality */
   if ((!is_bit_mask_set(tag[i]->control,RESPECT_CASE_TAG_BIT_MASK)
         && is_equal_or_uppercase(opt_token,entry->inflected,alph))
       || !u_strcmp(opt_token,entry->inflected)) {
      tag[i]->matching_tokens=sorted_insert(list->n,tag[i]->matching_tokens,prv_alloc);
   }
   free_dela_entry(entry);
   list=list->next;
}
/* Then, we look for normal tokens */
if (is_bit_mask_set(tag[i]->control,RESPECT_CASE_TAG_BIT_MASK)) {
   /* If no case variants are allowed, then we just have to insert the number
    * of the token, but only if this token is in the text ones. */
   int token_number;
   if (-1!=(token_number=get_value_index(opt_token,p->tokens,DONT_INSERT))) {
      tag[i]->matching_tokens=sorted_insert(token_number,tag[i]->matching_tokens,prv_alloc);
   }
   return;
}
/* Here, we have to get all the case variants of the token. */
tag[i]->matching_tokens=destructive_sorted_merge(get_token_list_for_sequence(opt_token,alph,p->tokens,prv_alloc),tag[i]->matching_tokens,prv_alloc);
}
/**
 * Prints the given hypotheses to the output, and if needed,
 * prints the word to the modified input file.
 * Each hypothesis is emitted as a DELAF line whose inflected form is the
 * original (misspelled) word, augmented with a "SP_ERR" semantic code and a
 * "SP_INF=xxx" code recording the corrected inflected form, followed by
 * "/score=N".
 */
static void display_hypotheses(unichar* word,SpellCheckHypothesis* list,SpellCheckConfig* cfg) {
Ustring* line=new_Ustring(128);
/* Remembers whether at least one hypothesis was printed for this word */
int printed=0;
while (list!=NULL) {
   printed=1;
   struct dela_entry* entry=tokenize_DELAF_line(list->entry);
   if (entry==NULL) {
      fatal_error("Internal error in display_hypotheses; cannot tokenize entry:\n%S\n",list->entry);
   }
   /* We swap the entry's inflected form for the original word, keeping the
    * corrected form aside so we can emit it as a SP_INF semantic code */
   unichar* inflected=entry->inflected;
   entry->inflected=u_strdup(word);
   entry->semantic_codes[entry->n_semantic_codes++]=u_strdup("SP_ERR");
   u_sprintf(line,"SP_INF=%S",inflected);
   entry->semantic_codes[entry->n_semantic_codes++]=u_strdup(line->str);
   dela_entry_to_string(line,entry);
   u_fprintf(cfg->out,"%S/score=%d\n",line->str,list->score);
   free(inflected);
   free_dela_entry(entry);
   list=list->next;
}
free_Ustring(line);
/* Now, we may have to print the word to the modified input file */
if (cfg->input_op=='M') {
   /* If we must keep matched words, then we print the word if it had matched */
   if (printed) u_fprintf(cfg->modified_input,"%S\n",word);
} else if (cfg->input_op=='U') {
   /* If we must keep unmatched words, then we print the word if it had NOT matched */
   if (!printed) u_fprintf(cfg->modified_input,"%S\n",word);
}
}
/**
 * Frees all the memory associated to the given DELA entry list,
 * including the entries themselves.
 */
void free_dela_entry_list(struct dela_entry_list* l) {
while (l!=NULL) {
   /* Save the successor before destroying the current cell */
   struct dela_entry_list* next=l->next;
   free_dela_entry(l->entry);
   free(l);
   l=next;
}
}
/**
 * Frees every dela_entry stored in the given vector and empties it,
 * without freeing the vector itself.
 */
void free_all_dic_entries (vector_ptr* entry_collection) {
int n=entry_collection->nbelems;
for (int pos=0;pos<n;pos++) {
   free_dela_entry((struct dela_entry*)entry_collection->tab[pos]);
   /* We don't want the vector to be freed now, and we don't want
    * free_vector_ptr to crash on a dangling pointer later */
   entry_collection->tab[pos]=NULL;
}
entry_collection->nbelems=0;
}
// // returns 1 if the INF code refers to a valid right component, 0 else // char check_valid_right_component_for_one_INF_code_german(const unichar* s) { unichar temp[2000]; u_strcpy(temp,"x,"); u_strcat(temp,s); struct dela_entry* d=tokenize_DELAF_line(temp,0); char res=check_N_not_FF(d); free_dela_entry(d); return res; }
/**
 * Sorts and saves the lines.
 * 'last' is initialized to the sentinel value (struct dela_entry*)-1, which
 * means that no line at all was already printed; explore_node is expected to
 * update it while printing (NOTE(review): explore_node's exact contract for
 * 'last' is not visible here — presumably it holds the last printed entry;
 * verify against its definition). If the traversal succeeded and something
 * was actually printed, we terminate the output with a final newline and
 * free the last entry.
 */
void save(struct sort_infos* inf) {
u_printf("Sorting and saving...\n");
/* -1 means that no line at all was already printed */
struct dela_entry* last = (struct dela_entry*)-1;
int return_value = explore_node(inf->root, inf, &last);
if (return_value == SUCCESS_RETURN_CODE && last != NULL && last!=(struct dela_entry*)-1) {
   u_fprintf(inf->f_out, "\n");
   free_dela_entry(last);
}
}
/**
 * Returns 1 if the dictionary line refers to a verb with more than 4
 * letters and 0 otherwise.
 */
char verb_of_more_than_4_letters(unichar* line) {
/* We tokenize the dictionary line in order to get grammatical and
 * inflectional codes in a structured way */
struct dela_entry* d=tokenize_DELAF_line(line,0);
if (d==NULL) {
   /* A line that cannot be tokenized cannot describe such a verb.
    * This guard avoids a NULL dereference on malformed entries
    * (the German decomposition code performs the same check). */
   return 0;
}
char res=check_V_but_not_Y(d) && u_strlen(d->inflected)>4;
/* We free the dictionary entry */
free_dela_entry(d);
return res;
}
/**
 * Inserts the given entry in the given entry list, if not already present.
 * If the entry is already present, then it is freed.
 * Returns the (possibly unchanged) head of the list.
 */
struct dela_entry_list* insert_if_not_present(struct dela_entry* entry, struct dela_entry_list* l) {
/* Walk the list with a pointer-to-link so that appending at the end
 * needs no special case for the empty list */
struct dela_entry_list** link=&l;
while ((*link)!=NULL) {
   if (equal((*link)->entry,entry)) {
      /* Already there: the caller's entry becomes useless */
      free_dela_entry(entry);
      return l;
   }
   link=&((*link)->next);
}
*link=new_dela_entry_list(entry,0);
return l;
}
int check_is_valid_for_one_INF_code(const unichar* t, const unichar* s) { unichar temp[MAX_DICT_LINE_LENGTH]; u_strcpy(temp,"x,"); u_strcat(temp,s); struct dela_entry* d = tokenize_DELAF_line(temp,0); int res = check_is_valid(t, d); free_dela_entry(d); return res; }
/**
 * Returns 1 if the given INF code is a ":a" one.
 */
char check_a(unichar* INF_code) {
/* We build an artificial DELAF line "x,<INF code>" and tokenize it in order
 * to get grammatical and inflectional codes in a structured way */
unichar artificial[2000];
u_strcpy(artificial,"x,");
u_strcat(artificial,INF_code);
struct dela_entry* entry=tokenize_DELAF_line(artificial,0);
/* Delegate to the dela_entry overload of check_a */
char result=check_a(entry);
free_dela_entry(entry);
return result;
}
/**
 * Returns 1 if the INF code refers to a valid RIGHT component, 0 otherwise.
 * (The original comment said "left component", which contradicts the
 * function name and the test performed below.)
 */
char check_valid_right_component_for_one_INF_code(const unichar* INF_code) {
/* We produce an artifical dictionary entry with the given INF code,
 * and then, we tokenize it in order to get grammatical and inflectional
 * codes in a structured way. */
unichar temp[2000];
u_strcpy(temp,"x,");
u_strcat(temp,INF_code);
struct dela_entry* d=tokenize_DELAF_line(temp,0);
/* A valid right component is a noun or an adjective that does not carry
 * the "sie" code */
char res=(check_N(d)||check_A(d)/*||check_V_but_not_Y(d)*/)&&(!check_Nsie(d));
/* We free the artifical dictionary entry */
free_dela_entry(d);
return res;
}
/** * Returns 1 if the INF code refers to a valid left component, 0 otherwise. */ char check_valid_left_component_for_one_INF_code(const unichar* INF_code) { /* We produce an artifical dictionary entry with the given INF code, * and then, we tokenize it in order to get grammatical and inflectional * codes in a structured way. */ unichar temp[2000]; u_strcpy(temp,"x,"); u_strcat(temp,INF_code); struct dela_entry* d=tokenize_DELAF_line(temp,0); /* Now, we can use this structured representation to check if the INF code * corresponds to a valid left component. */ char res=check_Nsia(d)||check_Nsie(d)||check_Nsig(d)||check_Asio(d)||check_Asie(d)||check_VW(d)||check_ADV(d); /* Finally, we free the artificial dictionary entry */ free_dela_entry(d); return res; }
/**
 * Returns 1 if the line is a valid right "A" component, i.e. an entry with
 * the grammatical code "A" and without the inflectional code "sie".
 */
char check_A_right_component(unichar* s) {
/* We build an artificial DELAF line "x,<INF code>" and tokenize it in order
 * to get grammatical and inflectional codes in a structured way */
unichar artificial[2000];
u_strcpy(artificial,"x,");
u_strcat(artificial,s);
struct dela_entry* entry=tokenize_DELAF_line(artificial,0);
unichar gram_code[2];
u_strcpy(gram_code,"A");
unichar infl_code[4];
u_strcpy(infl_code,"sie");
char result=dic_entry_contain_gram_code(entry,gram_code)
            && !dic_entry_contain_inflectional_code(entry,infl_code);
free_dela_entry(entry);
return result;
}
/** * Loads the given DELAF and modifies the given keywords accordingly by * replacing any non removed token that appear in a DELAF entry * by its lemma. If there are ambiguities, several keywords are * generated. Doing that may merge keywords by adding their weights: * eats/2 + eaten/3 => eat/5 */ void filter_keywords_with_dic(struct string_hash_ptr* keywords,char* name, VersatileEncodingConfig* vec,Alphabet* alphabet) { U_FILE* f=u_fopen(vec,name,U_READ); if (f==NULL) { error("Cannot load file %s\n",name); return; } Ustring* line=new_Ustring(128); while (EOF!=readline(line,f)) { struct dela_entry* e=tokenize_DELAF_line(line->str); if (e==NULL) continue; lemmatize(e,keywords,alphabet); free_dela_entry(e); } free_Ustring(line); u_fclose(f); }
/**
 * Sets the given dic variable, inserting it in the variable list if absent.
 * If 'must_clone' is non-zero, the variable stores a copy of 'dic_entry';
 * otherwise it takes ownership of the given pointer.
 */
void set_dic_variable(const unichar* name,struct dela_entry* dic_entry,struct dic_variable* *list,int must_clone) {
/* We walk the list through a pointer-to-link, so that appending a new
 * variable at the end needs no special case */
while (*list!=NULL) {
   if (!u_strcmp((*list)->name,name)) {
      /* If we have found the variable we were looking for */
      /* We have to free the previous value */
      free_dela_entry((*list)->dic_entry);
      if (must_clone) {
         (*list)->dic_entry=clone_dela_entry(dic_entry);
      } else {
         (*list)->dic_entry=dic_entry;
      }
      return;
   }
   list=&((*list)->next);
}
/* Not found: append a new variable cell at the end of the list */
*list=new_dic_variable(name,dic_entry,NULL,must_clone);
}
/**
 * This function checks for each tag token like "{extended,extend.V:K}"
 * if it verifies some patterns. Its behaviour is very similar to the one
 * of the load_dic_for_locate function. However, as a side effect, this
 * function fills 'tag_token_list' with the list of tag token numbers.
 * This list is later used during Locate preprocessings.
 */
void check_patterns_for_tag_tokens(Alphabet* alphabet,int number_of_patterns,
                                   struct lemma_node* root,struct locate_parameters* parameters,Abstract_allocator prv_alloc) {
struct string_hash* tokens=parameters->tokens;
for (int i=0; i<tokens->size; i++) {
   /* Tag tokens start with '{' but we must exclude the special markers
    * "{S}" (sentence delimiter) and "{STOP}" */
   if (tokens->value[i][0]=='{' && u_strcmp(tokens->value[i],"{S}") && u_strcmp(tokens->value[i],"{STOP}")) {
      /* If the token is a tag like "{today,.ADV}", we add its number to the tag token list */
      parameters->tag_token_list=head_insert(i,parameters->tag_token_list,prv_alloc);
      /* And we look for the patterns that can match it */
      struct dela_entry* entry=tokenize_tag_token(tokens->value[i]);
      if (entry==NULL) {
         /* This should never happen */
         fatal_error("Invalid tag token in function check_patterns_for_tag_tokens\n");
      }
      /* We add the inflected form to the list of forms associated to the lemma.
       * This will be used to replace patterns like "<be>" by the actual list of
       * forms that can be matched by it, for optimization reasons */
      add_inflected_form_for_lemma(tokens->value[i],entry->lemma,root);
      /* A tag token always behaves like a dictionary word */
      parameters->token_control[i]=(unsigned char)(get_control_byte(tokens->value[i],alphabet,NULL,parameters->tokenization_policy)|DIC_TOKEN_BIT_MASK);
      if (number_of_patterns) {
         /* We look for matching patterns only if there are some */
         struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
         if (list!=NULL) {
            if (parameters->matching_patterns[i]==NULL) {
               /* We allocate the bit array if needed */
               parameters->matching_patterns[i]=new_bit_array(number_of_patterns,ONE_BIT);
            }
            struct list_pointer* tmp=list;
            while (tmp!=NULL) {
               set_value(parameters->matching_patterns[i],((struct constraint_list*)(tmp->pointer))->pattern_number,1);
               tmp=tmp->next;
            }
            free_list_pointer(list);
         }
      }
      /* At the opposite of DLC lines, a compound word tag like "{all around,.ADV}"
       * does not need to be put in the compound word tree, since the tag is already
       * characterized by its token number. */
      free_dela_entry(entry);
   }
}
}
/** * This function analyzes an INF code and returns a value that indicates * if it is a valid left component or not. */ int get_valid_left_component_type_for_one_INF_code(const unichar* INF_code) { /* We produce an artifical dictionary entry with the given INF code, * and then, we tokenize it in order to get grammatical and inflectional * codes in a structured way. */ unichar temp[2000]; u_strcpy(temp,"x,"); u_strcat(temp,INF_code); struct dela_entry* d=tokenize_DELAF_line(temp,0); int res; /* Now we can test if the INF code corresponds to a valid left component */ if (check_Nsia(d)) res=N_SIA; else if (check_Nsie(d)) res=N_SIE; else if (check_Nsig(d)) res=N_SIG; else if (check_Asio(d)) res=A_SIO; else if (check_Asie(d)) res=A_SIE; else if (check_VW(d)) res=V_W; else if (check_ADV(d)) res=ADV; else res=INVALID_LEFT_COMPONENT; /* Finally we free the artifical dictionary entry */ free_dela_entry(d); return res; }
/**
 * This explores the dictionary in order to decompose the given word into a valid
 * sequence of simple words. For instance, if we have the word "Sommervarmt", we
 * will first explore the dictionary and find that "sommer" is a valid left
 * component that corresponds to the dictionary entry "sommer,.N:msia". Then we
 * will look if the following word "varmt" is in the dictionary. It is
 * the case, with the entry "varmt,varm.A:nsio". As we are at the end of the word
 * to analyze and as "varmt" is a valid rightmost component, we will generate an
 * entry according to the following things:
 *
 * 'output_dela_line'="sommervarmt,sommervarm.A:nsio"
 * 'analysis'="sommer,.N:msia +++ varmt,varm.A:nsio"
 * 'number_of_components'=2
 *
 * Note that the initial "S" was put in lowercase, because the dictionary
 * contains "sommer" and not "Sommer". The lemma is obtained with
 * the lemma of the rightmost component (here "varm"), and the word inherits
 * from the grammatical information of its rightmost component.
 *
 * 'offset': offset of the current node in the binary array 'infos->bin'
 * 'current_component': string that represents the current simple word
 * 'pos_in_current_component': position in the string 'current_component'
 * 'word_to_analyze': the word to analyze
 * 'pos_in_word_to_analyze': position in the string 'word_to_analyze'
 * 'analysis': string that represents the analysis as a concatenation like
 *             "sommer,.N:msia +++ varmt,varm.A:nsio"
 * 'output_dela_line': string that contains the final DELA line. The lemma is
 *                     obtained by replacing the rightmost term of
 *                     the word to analyze by its lemma.
 * 'L': list of all analysis for the given word
 * 'number_of_components': number of components that compose the word.
 * 'infos': global settings.
 */
void explore_state(int offset,unichar* current_component,int pos_in_current_component,
                   const unichar* word_to_analyze,int pos_in_word_to_analyze,const unichar* analysis,
                   const unichar* output_dela_line,struct word_decomposition_list** L,
                   int number_of_components,struct norwegian_infos* infos) {
int c;
int index,t;
/* Each node starts with a 2-byte cell; bit 15 tells whether it is final */
c=infos->bin[offset]*256+infos->bin[offset+1];
if (!(c&32768)) {
   /* If we are in a final state, we compute the index of the
    * corresponding INF line */
   index=infos->bin[offset+2]*256*256+infos->bin[offset+3]*256+infos->bin[offset+4];
   /* We can set the end of our current component */
   current_component[pos_in_current_component]='\0';
   /* We do not consider words of length 1 */
   if (pos_in_current_component>1) {
      /* We don't consider components with a length of 1 */
      if (word_to_analyze[pos_in_word_to_analyze]=='\0') {
         /* If we have explored the entire original word */
         if (get_value_index(current_component,infos->forbidden_words,DONT_INSERT)==NO_VALUE_INDEX) {
            /* And if we do not have forbidden word in last position */
            struct list_ustring* l=infos->inf->codes[index];
            /* We will look at all the INF codes of the last component in order
             * to produce analysis */
            while (l!=NULL) {
               unichar dec[2000];
               u_strcpy(dec,analysis);
               if (dec[0]!='\0') {
                  /* If we have already something in the analysis (i.e. if
                   * we have not a simple word), we insert the concatenation
                   * mark before the entry to come */
                  u_strcat(dec," +++ ");
               }
               unichar entry[2000];
               /* We get the dictionary line that corresponds to the current INF code */
               uncompress_entry(current_component,l->string,entry);
               /* And we add it to the analysis */
               u_strcat(dec,entry);
               unichar new_dela_line[2000];
               /* We copy the current output DELA line that contains
                * the concatenation of the previous components */
               u_strcpy(new_dela_line,output_dela_line);
               /* Then we tokenize the DELA line that corresponds the current INF
                * code in order to obtain its lemma and grammatical/inflectional
                * information */
               struct dela_entry* tmp_entry=tokenize_DELAF_line(entry,1);
               /* We concatenate the inflected form of the last component to
                * the output DELA line */
               u_strcat(new_dela_line,tmp_entry->inflected);
               /* We put the comma that separates the inflected form and the lemma */
               u_strcat(new_dela_line,",");
               /* And we build the lemma in the same way than the inflected form */
               u_strcat(new_dela_line,output_dela_line);
               u_strcat(new_dela_line,tmp_entry->lemma);
               /* We put the dot that separates the lemma and the grammatical/inflectional
                * information */
               u_strcat(new_dela_line,".");
               /* And finally we put the grammatical/inflectional information */
               u_strcat(new_dela_line,tmp_entry->semantic_codes[0]);
               int k;
               for (k=1;k<tmp_entry->n_semantic_codes;k++) {
                  u_strcat(new_dela_line,"+");
                  u_strcat(new_dela_line,tmp_entry->semantic_codes[k]);
               }
               for (k=0;k<tmp_entry->n_inflectional_codes;k++) {
                  u_strcat(new_dela_line,":");
                  u_strcat(new_dela_line,tmp_entry->inflectional_codes[k]);
               }
               free_dela_entry(tmp_entry);
               /*
                * Now we can build an analysis in the form of a word decomposition
                * structure, but only if the last component is a valid
                * right one or if it is a verb long enough, or if we find out
                * that the word to analyze was in fact a simple word
                * in the dictionary */
               if (verb_of_more_than_4_letters(entry)
                   || check_valid_right_component_for_one_INF_code(l->string)
                   || number_of_components==1) {
                  /*
                   * We set the number of components, the analysis, the actual
                   * DELA line and information about */
                  struct word_decomposition* wd=new_word_decomposition();
                  wd->n_parts=number_of_components;
                  u_strcpy(wd->decomposition,dec);
                  u_strcpy(wd->dela_line,new_dela_line);
                  wd->is_a_valid_right_N=check_N_right_component(l->string);
                  wd->is_a_valid_right_A=check_A_right_component(l->string);
                  /* Then we add the decomposition word structure to the list that
                   * contains all the analysis for the word to analyze */
                  struct word_decomposition_list* wdl=new_word_decomposition_list();
                  wdl->element=wd;
                  wdl->next=(*L);
                  (*L)=wdl;
               }
               /* We go on with the next INF code of the last component */
               l=l->next;
            }
         }
         /* If we are at the end of the word to analyze, we have nothing more to do */
         return;
      } else {
         /* If we are not at the end of the word to analyze, we must
          * 1) look if the current component is a valid left one
          * 2) look if it is not a forbidden component and
          * 3) explore the rest of the original word */
         if (infos->valid_left_component[index]
             && (get_value_index(current_component,infos->forbidden_words,DONT_INSERT)==NO_VALUE_INDEX)) {
            /* If we have a valid component, we look first if we are
             * in the case of a word ending by a double letter like "kupp" */
            if (pos_in_current_component>2
                && (current_component[pos_in_current_component-1]==current_component[pos_in_current_component-2])) {
               /* If we have such a word, we add it to the current analysis,
                * putting "+++" if the current component is not the first one */
               unichar dec[2000];
               u_strcpy(dec,analysis);
               if (dec[0]!='\0') {
                  u_strcat(dec," +++ ");
               }
               /* In order to print the component in the analysis, we arbitrary
                * take a valid left component among all those that are available
                * for the current component */
               unichar sia_code[2000];
               unichar entry[2000];
               unichar line[2000];
               get_first_valid_left_component(infos->inf->codes[index],sia_code);
               uncompress_entry(current_component,sia_code,entry);
               u_strcat(dec,entry);
               u_strcpy(line,output_dela_line);
               u_strcat(line,current_component);
               /* As we have a double letter at the end of the word,
                * we must remove a character */
               line[u_strlen(line)-1]='\0';
               unichar temp[2000];
               unichar dec_temp[2000];
               u_strcpy(dec_temp,dec);
               /* Then, we explore the dictionary in order to analyze the
                * next component. We start at the root of the dictionary
                * (offset=4) and we go back one position in the word to analyze.
                * For instance, if we have "kupplaner", we read "kupp" and then
                * we try to analyze "planner". */
               explore_state(4,temp,0,word_to_analyze,pos_in_word_to_analyze-1,
                             dec_temp,line,L,number_of_components+1,infos);
            }
            /* Now, we try to analyze the component normally, even if
             * it was ended by double letter, because we can have things
             * like "oppbrent = opp,.ADV +++ brent,brenne.V:K" */
            unichar dec[2000];
            unichar line[2000];
            u_strcpy(dec,analysis);
            if (dec[0]!='\0') {
               /* We add the "+++" mark if the current component is not the first one */
               u_strcat(dec," +++ ");
            }
            unichar sia_code[2000];
            unichar entry[2000];
            /* In order to print the component in the analysis, we arbitrary
             * take a valid left component among all those that are available
             * for the current component */
            get_first_valid_left_component(infos->inf->codes[index],sia_code);
            uncompress_entry(current_component,sia_code,entry);
            u_strcat(dec,entry);
            u_strcpy(line,output_dela_line);
            u_strcat(line,current_component);
            unichar temp[2000];
            unichar dec_temp[2000];
            u_strcpy(dec_temp,dec);
            /* Then, we explore the dictionary in order to analyze the
             * next component. We start at the root of the dictionary
             * (offset=4). */
            explore_state(4,temp,0,word_to_analyze,pos_in_word_to_analyze,
                          dec_temp,line,L,number_of_components+1,infos);
         }
      }
   }
   /* Once we have finished to deal with the current final dictionary node,
    * we go on because we may match a longer word */
   t=offset+5;
} else {
   /* If the node is not a final one, we get compute the number of transitions by
    * removing the highest bit */
   c=c-32768;
   t=offset+2;
}
/* We examine each transition that goes out from the node */
for (int i=0;i<c;i++) {
   if (is_equal_or_uppercase((unichar)(infos->bin[t]*256+infos->bin[t+1]),word_to_analyze[pos_in_word_to_analyze],infos->alphabet)) {
      /* If the transition's letter is case compatible with the current letter of the
       * word to analyze, we follow it */
      index=infos->bin[t+2]*256*256+infos->bin[t+3]*256+infos->bin[t+4];
      current_component[pos_in_current_component]=(unichar)(infos->bin[t]*256+infos->bin[t+1]);
      explore_state(index,current_component,pos_in_current_component+1,word_to_analyze,pos_in_word_to_analyze+1,
                    analysis,output_dela_line,L,number_of_components,infos);
   }
   /* We move the offset to the next transition */
   t=t+5;
}
}
/**
 * Returns a control byte that represents the characteristics of the given
 * token: word/dictionary status and capitalization (first-uppercase,
 * all-uppercase, all-lowercase), both for plain words and for tag tokens
 * like "{today,.ADV}".
 */
unsigned char get_control_byte(const unichar* token,const Alphabet* alph,struct string_hash* err,TokenizationPolicy tokenization_policy) {
int i;
int tmp;
unsigned char c=0;
if (token==NULL || token[0]=='\0') {
   fatal_error("NULL or empty token in get_control_byte\n");
}
/* We consider that a token starting with a letter is a word */
if (is_letter(token[0],alph)) {
   set_bit_mask(&c,MOT_TOKEN_BIT_MASK);
   /* If a token is a word, we check if it is in the 'err' word list
    * in order to answer the question <!DIC>. We perform this test in order
    * to avoid taking "priori" as an unknown word if the compound "a priori"
    * is in the text. */
   if (err!=NULL && get_value_index(token,err,DONT_INSERT)!=-1) {
      set_bit_mask(&c,NOT_DIC_TOKEN_BIT_MASK);
   }
   if (is_upper(token[0],alph)) {
      /* First letter uppercase: the token verifies <PRE> */
      set_bit_mask(&c,PRE_TOKEN_BIT_MASK);
      /* If no lowercase letter occurs, the token is all-uppercase (<MAJ>) */
      i=0;
      tmp=0;
      while (token[i]!='\0') {
         if (is_lower(token[i],alph)) {
            tmp=1;
            break;
         }
         i++;
      }
      if (!tmp) {
         set_bit_mask(&c,MAJ_TOKEN_BIT_MASK);
      }
      return c;
   }
   /* First letter lowercase: if no uppercase letter occurs, the token is
    * all-lowercase (<MIN>) */
   i=0;
   tmp=0;
   while (token[i]!='\0') {
      if (is_upper(token[i],alph)) {
         tmp=1;
         break;
      }
      i++;
   }
   if (!tmp) {
      set_bit_mask(&c,MIN_TOKEN_BIT_MASK);
   }
   return c;
}
/* If the token doesn't start with a letter, we start with
 * checking if it is a tag like {today,.ADV} */
if (token[0]=='{' && u_strcmp(token,"{S}") && u_strcmp(token,"{STOP}")) {
   /* Anyway, such a tag is classed as verifying <MOT> and <DIC> */
   set_bit_mask(&c,MOT_TOKEN_BIT_MASK|DIC_TOKEN_BIT_MASK|TDIC_TOKEN_BIT_MASK);
   struct dela_entry* temp=tokenize_tag_token(token);
   /* The capitalization analysis is performed on the tag's inflected form */
   if (is_upper(temp->inflected[0],alph)) {
      set_bit_mask(&c,PRE_TOKEN_BIT_MASK);
      /* All-uppercase test: only letters are considered, since the inflected
       * form of a tag may contain spaces or other non-letter characters */
      i=0;
      tmp=0;
      while (temp->inflected[i]!='\0') {
         if (is_letter(temp->inflected[i],alph) && is_lower(temp->inflected[i],alph)) {
            tmp=1;
            break;
         }
         i++;
      }
      if (!tmp) {
         set_bit_mask(&c,MAJ_TOKEN_BIT_MASK);
      }
   } else {
      /* All-lowercase test, same letter-only restriction as above */
      i=0;
      tmp=0;
      while (temp->inflected[i]!='\0') {
         if (is_letter(temp->inflected[i],alph) && is_upper(temp->inflected[i],alph)) {
            tmp=1;
            break;
         }
         i++;
      }
      if (!tmp) {
         set_bit_mask(&c,MIN_TOKEN_BIT_MASK);
      }
   }
   if (!is_a_simple_word(temp->inflected,tokenization_policy,alph)) {
      /* If the tag is a compound word, we say that it verifies the <CDIC> pattern */
      set_bit_mask(&c,CDIC_TOKEN_BIT_MASK);
   }
   free_dela_entry(temp);
}
return c;
}
/**
 * This function explores the dictionary to decompose the given word into
 * valid German compounds (left components followed by a valid right
 * component). Matching decompositions are prepended to the list 'L'.
 *
 * 'adresse': offset of the current node in the binary array 'tableau_bin'
 * 'current_component': buffer for the simple word being read
 * 'pos_in_current_component': write position in 'current_component'
 * 'original_word' / 'pos_in_original_word': the word to analyze and the
 *                                           current read position in it
 * 'decomposition': analysis built so far ("a,.X +++ b,.Y" style)
 * 'dela_line': concatenation of the previous components' inflected forms
 * 'L': output list of decompositions
 * 'n_decomp': number of components read so far, plus one
 * 'left' / 'right': per-INF-index flags telling whether an INF code is a
 *                   valid left/right component
 */
void explore_state_german(int adresse,unichar* current_component,int pos_in_current_component,
                          const unichar* original_word,int pos_in_original_word,const unichar* decomposition,
                          unichar* dela_line,struct german_word_decomposition_list** L,int n_decomp,
                          const char* left,const char* right,
                          const struct INF_codes* inf_codes,const Alphabet* alphabet,
                          const unsigned char* tableau_bin) {
int c;
int index,t;
/* Each node starts with a 2-byte cell; bit 15 tells whether it is final */
c=tableau_bin[adresse]*256+tableau_bin[adresse+1];
if (!(c&32768)) {
   // if we are in a terminal state
   index=tableau_bin[adresse+2]*256*256+tableau_bin[adresse+3]*256+tableau_bin[adresse+4];
   current_component[pos_in_current_component]='\0';
   if (pos_in_current_component>1) {
      // we don't consider words with a length of 1
      if (original_word[pos_in_original_word]=='\0') {
         // if we have explored the entire original word
         if (right[index]) {
            // and if we have a valid right component
            struct list_ustring* l=inf_codes->codes[index];
            /* We examine every INF code of the last component */
            while (l!=NULL) {
               unichar dec[500];
               u_strcpy(dec,decomposition);
               if (dec[0]!='\0') {u_strcat(dec," +++ ");}
               unichar entry[500];
               uncompress_entry(current_component,l->string,entry);
               u_strcat(dec,entry);
               unichar new_dela_line[500];
               struct dela_entry* tmp_entry=tokenize_DELAF_line(entry,1);
               if (tmp_entry==NULL) {
                  /* If there was an error in the dictionary, we skip the entry */
                  l=l->next;
                  continue;
               }
               // change case if there is a prefix
               // prefixes are downcase, nouns (=suffixes) uppercase:
               // "investitionsObjekte" -> "Investitionsobjekte"
               if ( u_strlen(dela_line) != 0 ) {
                  // capitalize dela_line
                  dela_line[0] = u_toupper((unichar) dela_line[0]);
                  // downcase lemma and inflected
                  tmp_entry->inflected[0] = u_tolower(tmp_entry->inflected[0]);
                  tmp_entry->lemma[0] = u_tolower(tmp_entry->lemma[0]);
               }
               /* Build "prefix+inflected,prefix+lemma.codes" for the compound */
               u_strcpy(new_dela_line,dela_line);
               u_strcat(new_dela_line,tmp_entry->inflected);
               u_strcat(new_dela_line,",");
               u_strcat(new_dela_line,dela_line);
               u_strcat(new_dela_line,tmp_entry->lemma);
               u_strcat(new_dela_line,".");
               u_strcat(new_dela_line,tmp_entry->semantic_codes[0]);
               int k;
               for (k=1;k<tmp_entry->n_semantic_codes;k++) {
                  u_strcat(new_dela_line,"+");
                  u_strcat(new_dela_line,tmp_entry->semantic_codes[k]);
               }
               for (k=0;k<tmp_entry->n_inflectional_codes;k++) {
                  u_strcat(new_dela_line,":");
                  u_strcat(new_dela_line,tmp_entry->inflectional_codes[k]);
               }
               free_dela_entry(tmp_entry);
               struct german_word_decomposition* wd=new_german_word_decomposition();
               wd->n_parts=n_decomp;
               u_strcpy(wd->decomposition,dec);
               u_strcpy(wd->dela_line,new_dela_line);
               if (check_valid_right_component_for_one_INF_code_german(l->string)) {
                  // if we got a correct right component (N-FF)
                  struct german_word_decomposition_list* wdl=new_german_word_decomposition_list();
                  wdl->element=wd;
                  wdl->suivant=(*L);
                  (*L)=wdl;
               } else {
                  free_german_word_decomposition(wd);
               }
               l=l->next;
            }
         }
      } else {
         // else, we must explore the rest of the original word
         if (left[index]) {
            // but only if the current component was a valid left one
            // we go on with the next component
            unichar dec[2000];
            unichar line[500];
            u_strcpy(dec,decomposition);
            if (dec[0]!='\0') {u_strcat(dec," +++ ");}
            /* We arbitrarily pick one valid code to print this component
             * in the analysis */
            unichar sia_code[500];
            unichar entry[500];
            get_first_sia_code_german(index,sia_code,inf_codes);
            uncompress_entry(current_component,sia_code,entry);
            u_strcat(dec,entry);
            u_strcpy(line,dela_line);
            u_strcat(line,current_component);
            unichar temp[500];
            /* Restart at the root of the dictionary (offset=4) for the
             * next component */
            explore_state_german(4,temp,0,original_word,pos_in_original_word,
                                 dec,line,L,n_decomp+1,left,right,inf_codes,alphabet,tableau_bin);
         }
      }
   }
   t=adresse+5;
} else {
   /* Non-final node: clear the flag bit to get the transition count */
   c=c-32768;
   t=adresse+2;
}
if (original_word[pos_in_original_word]=='\0') {
   // if we have finished, we return
   return;
}
// if not, we go on with the next letter
for (int i=0;i<c;i++) {
   /* The comparison is done in both directions so that case differences on
    * either side are tolerated */
   if (is_equal_or_uppercase((unichar)(tableau_bin[t]*256+tableau_bin[t+1]),original_word[pos_in_original_word],alphabet)
       || is_equal_or_uppercase(original_word[pos_in_original_word],(unichar)(tableau_bin[t]*256+tableau_bin[t+1]),alphabet)) {
      index=tableau_bin[t+2]*256*256+tableau_bin[t+3]*256+tableau_bin[t+4];
      current_component[pos_in_current_component]=(unichar)(tableau_bin[t]*256+tableau_bin[t+1]);
      explore_state_german(index,current_component,pos_in_current_component+1,original_word,pos_in_original_word+1,
                           decomposition,dela_line,L,n_decomp,left,right,inf_codes,alphabet,tableau_bin);
   }
   t=t+5;
}
}
/**
 * Frees a single dic_variable, including its name and its entry.
 * Does nothing on NULL.
 */
void free_dic_variable(struct dic_variable* v) {
if (v==NULL) {
   return;
}
free_dela_entry(v->dic_entry);
free(v->name);
free(v);
}
/////////////////////////////////////////////////////////////////////////////////
// Inflect a DELAS/DELAC into a DELAF/DELACF.
// Reads 'DLC' line by line; each line is treated either as a simple-word
// DELAS entry (inflected through SU_inflect) or as a compound-word DELAC
// entry (inflected through MU_inflect), and the resulting inflected forms
// are written to 'DLCF'.
// On error returns 1, 0 otherwise.
int inflect(char* DLC, char* DLCF, MultiFlex_ctx* p_multiFlex_ctx, struct l_morpho_t* pL_MORPHO, Alphabet* alph, Encoding encoding_output, int bom_output, int mask_encoding_compatibility_input, int config_files_status, d_class_equiv_T* D_CLASS_EQUIV, int error_check_status, Korean* korean,const char* pkgdir) {
U_FILE *dlc, *dlcf; //DELAS/DELAC and DELAF/DELACF files
unichar input_line[DIC_LINE_SIZE]; //current DELAS/DELAC line
unichar output_line[DIC_LINE_SIZE]; //current DELAF/DELACF line
int l; //length of the line scanned
DLC_entry_T* dlc_entry;
MU_forms_T MU_forms; //inflected forms of the MWU
int err;
//Open DELAS/DELAC
dlc = u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input, DLC, U_READ);
if (!dlc) {
return 1;
}
//Open DELAF/DELACF
dlcf = u_fopen_creating_versatile_encoding(encoding_output, bom_output, DLCF, U_WRITE);
if (!dlcf) {
error("Unable to open file: '%s' !\n", DLCF);
return 1;
}
//Inflect one entry at a time
l = u_fgets(input_line, DIC_LINE_SIZE - 1, dlc);
//Omit the final newline
u_chomp_new_line(input_line);
//flag is used so that the "configuration files not loaded" warning below
//is printed only once, however many compound entries are encountered
int flag = 0;
//If a line is empty the file is not necessarily finished.
//If the last entry has no newline, we should not skip this entry
struct dela_entry* DELAS_entry;
int semitic;
int current_line=0;
while (l != EOF) {
current_line++;
DELAS_entry = is_strict_DELAS_line(input_line, alph);
if (DELAS_entry != NULL) {
/* If we have a strict DELAS line, that is to say, one with
 * a simple word */
if (error_check_status==ONLY_COMPOUND_WORDS) {
error("Unexpected simple word forbidden by -c:\n%S\n",input_line);
free_dela_entry(DELAS_entry);
goto next_line;
}
SU_forms_T forms;
SU_init_forms(&forms); //Allocate the space for forms and initialize it to null values
char inflection_code[1024];
unichar code_gramm[1024];
/* We take the first grammatical code, and we extract from it the name
 * of the inflection transducer to use */
get_inflection_code(DELAS_entry->semantic_codes[0], inflection_code, code_gramm, &semitic);
/* And we inflect the word */
//	err=SU_inflect(DELAS_entry->lemma,inflection_code,&forms,semitic);
err = SU_inflect(p_multiFlex_ctx,pL_MORPHO,encoding_output,bom_output,mask_encoding_compatibility_input,DELAS_entry->lemma, inflection_code, DELAS_entry->filters, &forms, semitic, korean,pkgdir);
#ifdef __GNUC__
#warning mettre toutes les entrees sur une meme ligne
#elif ((defined(__VISUALC__)) || defined(_MSC_VER))
#pragma message("warning : mettre toutes les entrees sur une meme ligne")
#endif
/* Then, we print its inflected forms to the output */
for (int i = 0; i < forms.no_forms; i++) {
unichar foo[1024];
if (korean!=NULL) {
/* In Korean mode, inflected forms are converted from Hangul
 * syllables to Jamo letters before being written */
Hanguls_to_Jamos(forms.forms[i].form,foo,korean,1);
} else {
u_strcpy(foo,forms.forms[i].form);
}
u_fprintf(dlcf, "%S,%S.%S", foo/*forms.forms[i].form*/, DELAS_entry->lemma, code_gramm);
/* We add the semantic codes, if any */
for (int j = 1; j < DELAS_entry->n_semantic_codes; j++) {
u_fprintf(dlcf, "+%S", DELAS_entry->semantic_codes[j]);
}
if (forms.forms[i].local_semantic_code != NULL) {
u_fprintf(dlcf, "%S", forms.forms[i].local_semantic_code);
}
if (forms.forms[i].raw_features != NULL &&
forms.forms[i].raw_features[0] != '\0') {
u_fprintf(dlcf, ":%S", forms.forms[i].raw_features);
}
u_fprintf(dlcf, "\n");
}
SU_delete_inflection(&forms);
free_dela_entry(DELAS_entry);
/* End of simple word case */
} else {
/* If we have not a simple word DELAS line, we try to analyse it
 * as a compound word DELAC line */
if (error_check_status==ONLY_SIMPLE_WORDS) {
error("Unexpected compound word forbidden by -s:\n%S\n",input_line);
goto next_line;
}
if (config_files_status != CONFIG_FILES_ERROR) {
/* If this is a compound word, we process it if and only if the
 * configuration files have been correctly loaded */
dlc_entry = (DLC_entry_T*) malloc(sizeof(DLC_entry_T));
if (!dlc_entry) {
fatal_alloc_error("inflect");
}
/* Convert a DELAC entry into the internal multi-word format */
err = DLC_line2entry(alph,pL_MORPHO,input_line, dlc_entry, D_CLASS_EQUIV);
/* NOTE(review): if DLC_line2entry fails, dlc_entry is never freed on
 * this path — looks like a leak; confirm whether DLC_line2entry
 * releases it itself on error before changing this */
if (!err) {
//Inflect the entry
MU_init_forms(&MU_forms);
err = MU_inflect(p_multiFlex_ctx,pL_MORPHO,encoding_output,bom_output,
mask_encoding_compatibility_input,dlc_entry->lemma, &MU_forms,pkgdir);
if (!err) {
int f; //index of the current inflected form
//Inform the user if no form generated
if (MU_forms.no_forms == 0) {
error("No inflected form could be generated for ");
DLC_print_entry(pL_MORPHO,dlc_entry);
}
//Print inflected forms
for (f = 0; f < MU_forms.no_forms; f++) {
//Format the inflected form to the DELACF format
err = DLC_format_form(pL_MORPHO,output_line, DIC_LINE_SIZE - 1,
MU_forms.forms[f], dlc_entry, D_CLASS_EQUIV);
if (!err) {
//Print one inflected form at a time to the DELACF file
u_fprintf(dlcf, "%S\n", output_line);
}
}
}
MU_delete_inflection(&MU_forms);
DLC_delete_entry(dlc_entry);
}
} else {
/* We try to inflect a compound word whereas the "Morphology.txt" and/or
 * "Equivalences.txt" file(s) has/have not been loaded */
if (!flag) {
/* We use a flag to print the error message only once */
error(
"WARNING: Compound words won't be inflected because configuration files\n");
error("         have not been correctly loaded.\n");
flag = 1;
}
}
}
next_line:
//Get next entry
l = u_fgets(input_line, DIC_LINE_SIZE - 1, dlc);
if (l!=EOF) {
//Omit the final newline
u_chomp_new_line(input_line);
if (input_line[0]=='\0') {
/* If we find an empty line, then we go on */
goto next_line;
}
}
}
u_fclose(dlc);
u_fclose(dlcf);
return 0;
}
/** * Explores the node n, dumps the corresponding lines to the output file, * and then frees the node. 'pos' is the current position in the string 's'. */ int explore_node(struct sort_tree_node* n, struct sort_infos* inf, struct dela_entry* *last) { int i, N; struct sort_tree_transition* t = NULL; struct couple* couple = NULL; struct couple* tmp = NULL; if (n == NULL) { error("Internal error in explore_node\n"); return DEFAULT_ERROR_CODE; } if (n->couples != NULL) { /* If the node is a final one, we print the corresponding lines */ couple = n->couples; while (couple != NULL) { if (inf->factorize_inflectional_codes) { /* We look if the previously printed line, if any, did share * the same information. If so, we just append the new inflectional codes. * Otherwise, we print the new line. * * NOTE: in factorize mode, we always ignore duplicates */ int err; struct dela_entry* entry = tokenize_DELAF_line(couple->s,1,&err,0); if (entry==NULL) { /* We have a non DELAF entry line, like for instance a comment one */ if (*last!=NULL && *last!=(struct dela_entry*)-1) { /* If there was at least one line already printed, then this line * awaits for its \n */ u_fprintf(inf->f_out, "\n"); } /* Then we print the line */ u_fprintf(inf->f_out, "%S\n",couple->s); /* And we reset *last */ if (*last==(struct dela_entry*)-1) { *last=NULL; } else if (*last!=NULL) { free_dela_entry(*last); *last=NULL; } } else { /* So, we have a dic entry. Was there a previous one ? */ if (*last==NULL || *last==(struct dela_entry*)-1) { /* No ? So we print the line, and the current entry becomes *last */ u_fputs(couple->s, inf->f_out); *last=entry; } else { /* Yes ? 
We must compare if the codes are compatible */ if (are_compatible(*last,entry)) { /* We look for any code of entry if it was already in *last */ for (int j=0;j<entry->n_inflectional_codes;j++) { if (!dic_entry_contain_inflectional_code(*last,entry->inflectional_codes[j])) { u_fprintf(inf->f_out, ":%S",entry->inflectional_codes[j]); /* We also have to add the newly printed code to *last */ (*last)->inflectional_codes[((*last)->n_inflectional_codes)++]=u_strdup(entry->inflectional_codes[j]); } } /* And we must free entry */ free_dela_entry(entry); } else { /* If codes are not compatible, we print the \n for the previous * line, then the current line that becomes *last */ u_fprintf(inf->f_out, "\n%S",couple->s); free_dela_entry(*last); *last=entry; } } } } else { /* Normal way: we print each line one after the other */ for (i = 0; i < couple->n; i++) { u_fprintf(inf->f_out, "%S\n", couple->s); (inf->resulting_line_number)++; } } tmp = couple; couple = couple->next; free(tmp->s); free(tmp); } n->couples = NULL; } /* We convert the transition list into a sorted array */ t = n->transitions; N = 0; while (t != NULL && N < 0x10000) { inf->transitions[N++] = t; t = t->next; } if (N == 0x10000) { error("Internal error in explore_node: more than 0x10000 nodes\n"); free_sort_tree_node(n); return DEFAULT_ERROR_CODE; } if (N > 1) quicksort(inf->transitions, 0, N - 1, inf); /* After sorting, we copy the result into the transitions of n */ for (int j = 0; j < N - 1; j++) { inf->transitions[j]->next = inf->transitions[j + 1]; } if (N > 0) { inf->transitions[N - 1]->next = NULL; n->transitions = inf->transitions[0]; } /* Finally, we explore the outgoing transitions */ t = n->transitions; int explore_return_value = SUCCESS_RETURN_CODE; while (t != NULL && explore_return_value == SUCCESS_RETURN_CODE) { explore_return_value = explore_node(t->node, inf, last); if(explore_return_value == SUCCESS_RETURN_CODE) { t = t->next; } } /* And we free the node */ free_sort_tree_node(n); return 
explore_return_value; }
/** * This function loads a DLF or a DLC. It computes information about tokens * that will be used during the Locate operation. For instance, if we have the * following line: * * extended,.A * * and if the .fst2 to be applied to the text contains the pattern <A> with, * number 456, then the function will mark the "extended" token to be matched * by the pattern 456. Moreover, all case variations will be taken into account, * so that the "Extended" and "EXTENDED" tokens will also be updated. * * The two parameters 'is_DIC_pattern' and 'is_CDIC_pattern' * indicate if the .fst2 contains the corresponding patterns. For instance, if * the pattern "<CDIC>" is used in the grammar, it means that any token sequence that is a * compound word must be marked as be matched by this pattern. */ void load_dic_for_locate(const char* dic_name,int mask_encoding_compatibility_input,Alphabet* alphabet, int number_of_patterns,int is_DIC_pattern, int is_CDIC_pattern, struct lemma_node* root,struct locate_parameters* parameters) { struct string_hash* tokens=parameters->tokens; U_FILE* f; unichar line[DIC_LINE_SIZE]; f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,dic_name,U_READ); if (f==NULL) { error("Cannot open dictionary %s\n",dic_name); return; } /* We parse all the lines */ int lines=0; char name[FILENAME_MAX]; remove_path(dic_name,name); while (EOF!=u_fgets(line,f)) { lines++; if (lines%10000==0) { u_printf("%s: %d lines loaded... \r",name,lines); } if (line[0]=='/') { /* NOTE: DLF and DLC files are not supposed to contain comment * lines, but we test them, just in the case */ continue; } struct dela_entry* entry=tokenize_DELAF_line(line,1); if (entry==NULL) { /* This case should never happen */ error("Invalid dictionary line in load_dic_for_locate\n"); continue; } /* We add the inflected form to the list of forms associated to the lemma. 
* This will be used to replace patterns like "<be>" by the actual list of * forms that can be matched by it, for optimization reasons */ add_inflected_form_for_lemma(entry->inflected,entry->lemma,root); /* We get the list of all tokens that can be matched by the inflected form of this * this entry, with regards to case variations (see the "extended" example above). */ struct list_int* ptr=get_token_list_for_sequence(entry->inflected,alphabet,tokens); /* We save the list pointer to free it later */ struct list_int* ptr_copy=ptr; /* Here, we will deal with all simple words */ while (ptr!=NULL) { int i=ptr->n; /* If the current token can be matched, then it can be recognized by the "<DIC>" pattern */ parameters->token_control[i]=(unsigned char)(get_control_byte(tokens->value[i],alphabet,NULL,parameters->tokenization_policy)|DIC_TOKEN_BIT_MASK); if (number_of_patterns) { /* We look for matching patterns only if there are some */ struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root); if (list!=NULL) { /* If we have some patterns to add */ if (parameters->matching_patterns[i]==NULL) { /* We allocate the pattern bit array, if needed */ parameters->matching_patterns[i]=new_bit_array(number_of_patterns,ONE_BIT); } struct list_pointer* tmp=list; while (tmp!=NULL) { /* Then we add all the pattern numbers to the bit array */ set_value(parameters->matching_patterns[i],((struct constraint_list*)(tmp->pointer))->pattern_number,1); tmp=tmp->next; } /* Finally, we free the constraint list */ free_list_pointer(list); } } ptr=ptr->next; } /* Finally, we free the token list */ free_list_int(ptr_copy); if (!is_a_simple_word(entry->inflected,parameters->tokenization_policy,alphabet)) { /* If the inflected form is a compound word */ if (is_DIC_pattern || is_CDIC_pattern) { /* If the .fst2 contains "<DIC>" and/or "<CDIC>", then we * must note that all compound words can be matched by them */ 
add_compound_word_with_no_pattern(entry->inflected,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy); } if (number_of_patterns) { /* We look for matching patterns only if there are some */ /* We look if the compound word can be matched by some patterns */ struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root); struct list_pointer* tmp=list; while (tmp!=NULL) { /* If the word is matched by at least one pattern, we store it. */ int pattern_number=((struct constraint_list*)(tmp->pointer))->pattern_number; add_compound_word_with_pattern(entry->inflected,pattern_number,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy); tmp=tmp->next; } free_list_pointer(list); } } free_dela_entry(entry); } if (lines>10000) { u_printf("\n"); } u_fclose(f); }