/**
 * This function optimizes a token pattern of the form "eat": the tag is
 * turned into a TOKEN_LIST_TAG whose 'matching_tokens' field receives the
 * numbers of all the text tokens that the pattern can match.
 *
 * 'i' is the index of the tag to optimize in 'tag'; 'alph' is used for
 * case-variant comparisons; 'p' provides the token table and the tag token
 * list; 'prv_alloc' is the allocator used for the sorted token lists.
 */
void optimize_token_pattern(int i,Fst2Tag* tag,Alphabet* alph,
               struct locate_parameters* p,Abstract_allocator prv_alloc) {
/* Whatever happens, this pattern will be turned into a token list */
tag[i]->type=TOKEN_LIST_TAG;
unichar* opt_token=tag[i]->pattern->inflected;
/* First, we check if this token pattern can recognize some tag tokens,
 * comparing the pattern with the inflected form of each tag token */
struct list_int* list=p->tag_token_list;
while (list!=NULL) {
   struct dela_entry* entry=tokenize_tag_token(p->tokens->value[list->n],1);
   /* The tag token matches either case-insensitively (when the tag does not
    * require exact case) or by strict equality */
   if ((!is_bit_mask_set(tag[i]->control,RESPECT_CASE_TAG_BIT_MASK) && is_equal_or_uppercase(opt_token,entry->inflected,alph)) ||
       !u_strcmp(opt_token,entry->inflected)) {
      tag[i]->matching_tokens=sorted_insert(list->n,tag[i]->matching_tokens,prv_alloc);
   }
   free_dela_entry(entry);
   list=list->next;
}
/* Then, we look for normal tokens */
if (is_bit_mask_set(tag[i]->control,RESPECT_CASE_TAG_BIT_MASK)) {
   /* If no case variants are allowed, then we just have to insert the number
    * of the token, but only if this token is among the text ones. */
   int token_number;
   if (-1!=(token_number=get_value_index(opt_token,p->tokens,DONT_INSERT))) {
      tag[i]->matching_tokens=sorted_insert(token_number,tag[i]->matching_tokens,prv_alloc);
   }
   return;
}
/* Here, we have to get all the case variants of the token. */
tag[i]->matching_tokens=destructive_sorted_merge(get_token_list_for_sequence(opt_token,alph,p->tokens,prv_alloc),tag[i]->matching_tokens,prv_alloc);
}
/**
 * Prints the given spell-check hypotheses to the output and, if needed,
 * prints the word to the modified input file. Each hypothesis entry is
 * rewritten so that its inflected form becomes the misspelled word, and
 * two semantic codes are appended: "SP_ERR" and "SP_INF=<original form>".
 *
 * Depending on cfg->input_op, the word is also written to the modified
 * input file: 'M' keeps words that matched, 'U' keeps words that did not.
 */
static void display_hypotheses(unichar* word,SpellCheckHypothesis* list,SpellCheckConfig* cfg) {
Ustring* line=new_Ustring(128);
int printed=0;
while (list!=NULL) {
	printed=1;
	struct dela_entry* entry=tokenize_DELAF_line(list->entry);
	if (entry==NULL) {
		fatal_error("Internal error in display_hypotheses; cannot tokenize entry:\n%S\n",list->entry);
	}
	/* We detach the original inflected form and replace it by the misspelled
	 * word, so that the original form can be reported as SP_INF=... */
	unichar* inflected=entry->inflected;
	entry->inflected=u_strdup(word);
	entry->semantic_codes[entry->n_semantic_codes++]=u_strdup("SP_ERR");
	u_sprintf(line,"SP_INF=%S",inflected);
	entry->semantic_codes[entry->n_semantic_codes++]=u_strdup(line->str);
	dela_entry_to_string(line,entry);
	u_fprintf(cfg->out,"%S/score=%d\n",line->str,list->score);
	/* 'inflected' was detached from the entry above, so free_dela_entry
	 * won't release it; we must do it ourselves */
	free(inflected);
	free_dela_entry(entry);
	list=list->next;
}
free_Ustring(line);
/* Now, we may have to print the word to the modified input file */
if (cfg->input_op=='M') {
	/* If we must keep matched words, then we print the word if it had matched */
	if (printed) u_fprintf(cfg->modified_input,"%S\n",word);
} else if (cfg->input_op=='U') {
	/* If we must keep unmatched words, then we print the word if it had NOT matched */
	if (!printed) u_fprintf(cfg->modified_input,"%S\n",word);
}
}
/**
 * Releases a whole DELA entry list: each node's entry is freed with
 * free_dela_entry, then the node itself is freed.
 */
void free_dela_entry_list(struct dela_entry_list* l) {
while (l!=NULL) {
   struct dela_entry_list* next=l->next;
   free_dela_entry(l->entry);
   free(l);
   l=next;
}
}
/**
 * Frees every dela_entry stored in the given vector and empties it,
 * while keeping the vector itself alive for reuse.
 */
void free_all_dic_entries (vector_ptr* entry_collection) {
   int count=entry_collection->nbelems;
   for (int pos=0;pos<count;pos++) {
      free_dela_entry((struct dela_entry*)entry_collection->tab[pos]);
      /* Clear the slot: the vector must not be freed now, and a later
       * free_vector_ptr must not see a dangling pointer */
      entry_collection->tab[pos]=NULL;
   }
   entry_collection->nbelems=0;
}
/* Beispiel #5 */
//
// returns 1 if the INF code refers to a valid right component, 0 else
//
char check_valid_right_component_for_one_INF_code_german(const unichar* s) {
unichar temp[2000];
u_strcpy(temp,"x,");
u_strcat(temp,s);
struct dela_entry* d=tokenize_DELAF_line(temp,0);
char res=check_N_not_FF(d);
free_dela_entry(d);
return res;
}
/* Beispiel #6 */
/**
 * Sorts and saves the lines by exploring the sorted tree starting at
 * inf->root.
 *
 * 'last' lets explore_node remember the previously emitted entry across
 * recursive calls; the sentinel value (struct dela_entry*)-1 means that no
 * line at all was printed yet. If exploration succeeded and at least one
 * line was printed (last is neither NULL nor the sentinel), a final newline
 * is written and the still-pending entry is freed.
 */
void save(struct sort_infos* inf) {
  u_printf("Sorting and saving...\n");
  /* -1 means that no line at all was already printed */
  struct dela_entry* last = (struct dela_entry*)-1;
  int return_value = explore_node(inf->root, inf, &last);
  if (return_value == SUCCESS_RETURN_CODE && last != NULL && last!=(struct dela_entry*)-1) {
    u_fprintf(inf->f_out, "\n");
    free_dela_entry(last);
  }
}
/* Beispiel #7 */
/**
 * Returns 1 if the dictionary line refers to a verb (but not a ":Y" one)
 * whose inflected form has more than 4 letters, and 0 otherwise.
 */
char verb_of_more_than_4_letters(unichar* line) {
/* We tokenize the full DELAF line in order to get grammatical and
 * inflectional codes in a structured way. */
struct dela_entry* d=tokenize_DELAF_line(line,0);
if (d==NULL) {
   /* A line that cannot be tokenized cannot be a valid verb entry */
   return 0;
}
char res=check_V_but_not_Y(d) && u_strlen(d->inflected)>4;
/* We free the dictionary entry */
free_dela_entry(d);
return res;
}
/**
 * Inserts the given entry at the end of the given entry list, unless an
 * equal entry is already present; in that case the entry is freed and the
 * list is returned unchanged.
 */
struct dela_entry_list* insert_if_not_present(struct dela_entry* entry,
                                              struct dela_entry_list* l) {
struct dela_entry_list** cursor=&l;
while (*cursor!=NULL) {
   if (equal((*cursor)->entry,entry)) {
      /* Already present: the caller's entry is ours to dispose of */
      free_dela_entry(entry);
      return l;
   }
   cursor=&((*cursor)->next);
}
/* Not found: append a new cell at the tail */
*cursor=new_dela_entry_list(entry,0);
return l;
}
int check_is_valid_for_one_INF_code(const unichar* t, const unichar* s)
{
  unichar temp[MAX_DICT_LINE_LENGTH];
  u_strcpy(temp,"x,");
  u_strcat(temp,s);
  struct dela_entry* d = tokenize_DELAF_line(temp,0);
  int res = check_is_valid(t, d);
  free_dela_entry(d);
  return res;
}
/* Beispiel #10 */
/**
 * Returns 1 if the given INF code is a ":a" one.
 */
char check_a(unichar* INF_code) {
/* An artificial DELAF line "x,<INF code>" is built and tokenized so that
 * the grammatical and inflectional codes can be inspected in a
 * structured way */
unichar artificial_line[2000];
u_strcpy(artificial_line,"x,");
u_strcat(artificial_line,INF_code);
struct dela_entry* entry=tokenize_DELAF_line(artificial_line,0);
/* The overload of check_a that takes a dela_entry performs the real test */
char result=check_a(entry);
/* We free the artificial dictionary entry */
free_dela_entry(entry);
return result;
}
/* Beispiel #11 */
/**
 * Returns 1 if the INF code refers to a valid right component, 0 otherwise.
 * (A valid right component is a noun or an adjective, but not an "Nsie" one.)
 */
char check_valid_right_component_for_one_INF_code(const unichar* INF_code) {
/* We produce an artificial dictionary entry with the given INF code,
 * and then, we tokenize it in order to get grammatical and inflectional
 * codes in a structured way. */
unichar temp[2000];
u_strcpy(temp,"x,");
u_strcat(temp,INF_code);
struct dela_entry* d=tokenize_DELAF_line(temp,0);
char res=(check_N(d)||check_A(d)/*||check_V_but_not_Y(d)*/)&&(!check_Nsie(d));
/* We free the artificial dictionary entry */
free_dela_entry(d);
return res;
}
/* Beispiel #12 */
/**
 * Returns 1 if the INF code refers to a valid left component, 0 otherwise.
 */
char check_valid_left_component_for_one_INF_code(const unichar* INF_code) {
/* We produce an artifical dictionary entry with the given INF code,
 * and then, we tokenize it in order to get grammatical and inflectional
 * codes in a structured way. */
unichar temp[2000];
u_strcpy(temp,"x,");
u_strcat(temp,INF_code);
struct dela_entry* d=tokenize_DELAF_line(temp,0);
/* Now, we can use this structured representation to check if the INF code
 * corresponds to a valid left component. */
char res=check_Nsia(d)||check_Nsie(d)||check_Nsig(d)||check_Asio(d)||check_Asie(d)||check_VW(d)||check_ADV(d);
/* Finally, we free the artificial dictionary entry */
free_dela_entry(d);
return res;
}
/* Beispiel #13 */
/**
 * Returns 1 if the line is a valid right "A" component, i.e. an adjective
 * that does not carry the "sie" inflectional code.
 */
char check_A_right_component(unichar* s) {
/* An artificial DELAF line "x,<INF code>" is built and tokenized so that
 * the grammatical and inflectional codes can be inspected in a
 * structured way */
unichar artificial[2000];
u_strcpy(artificial,"x,");
u_strcat(artificial,s);
struct dela_entry* entry=tokenize_DELAF_line(artificial,0);
/* The codes we look for, as unichar strings */
unichar gram_A[2];
u_strcpy(gram_A,"A");
unichar infl_sie[4];
u_strcpy(infl_sie,"sie");
char result=dic_entry_contain_gram_code(entry,gram_A) && !dic_entry_contain_inflectional_code(entry,infl_sie);
/* We free the artificial dictionary entry */
free_dela_entry(entry);
return result;
}
/**
 * Loads the given DELAF and modifies the given keywords accordingly by
 * replacing any non removed token that appear in a DELAF entry
 * by its lemma. If there are ambiguities, several keywords are
 * generated. Doing that may merge keywords by adding their weights:
 * eats/2 + eaten/3 => eat/5
 */
void filter_keywords_with_dic(struct string_hash_ptr* keywords,char* name,
						VersatileEncodingConfig* vec,Alphabet* alphabet) {
U_FILE* f=u_fopen(vec,name,U_READ);
if (f==NULL) {
	error("Cannot load file %s\n",name);
	return;
}
Ustring* line=new_Ustring(128);
while (EOF!=readline(line,f)) {
	struct dela_entry* e=tokenize_DELAF_line(line->str);
	if (e==NULL) continue;
	lemmatize(e,keywords,alphabet);
	free_dela_entry(e);
}
free_Ustring(line);
u_fclose(f);
}
/* Beispiel #15 */
/**
 * Sets the given dic variable, inserting it at the end of the variable
 * list if absent. When the variable already exists, its previous entry is
 * freed and replaced (cloned first if 'must_clone' is non null).
 */
void set_dic_variable(const unichar* name,struct dela_entry* dic_entry,struct dic_variable* *list,int must_clone) {
struct dic_variable** current=list;
for (;*current!=NULL;current=&((*current)->next)) {
   if (u_strcmp((*current)->name,name)!=0) {
      continue;
   }
   /* We have found the variable we were looking for:
    * its previous value must be freed before being replaced */
   free_dela_entry((*current)->dic_entry);
   (*current)->dic_entry = must_clone ? clone_dela_entry(dic_entry) : dic_entry;
   return;
}
/* Not found: append a new variable cell at the tail of the list */
*current=new_dic_variable(name,dic_entry,NULL,must_clone);
}
/* Beispiel #16 */
/**
 * This function checks for each tag token like "{extended,extend.V:K}"
 * if it verifies some patterns. Its behaviour is very similar to the one
 * of the load_dic_for_locate function. However, as a side effect, this
 * function fills 'tag_token_list' with the list of tag token numbers.
 * This list is later used during Locate preprocessings.
 */
void check_patterns_for_tag_tokens(Alphabet* alphabet,int number_of_patterns,
                                   struct lemma_node* root,struct locate_parameters* parameters,Abstract_allocator prv_alloc) {
    struct string_hash* tokens=parameters->tokens;
    for (int i=0; i<tokens->size; i++) {
        /* A tag token starts with '{' but is neither the sentence delimiter
         * "{S}" nor the special "{STOP}" token */
        if (tokens->value[i][0]=='{' && u_strcmp(tokens->value[i],"{S}")  && u_strcmp(tokens->value[i],"{STOP}")) {
            /* If the token is tag like "{today,.ADV}", we add its number to the tag token list */
            parameters->tag_token_list=head_insert(i,parameters->tag_token_list,prv_alloc);
            /* And we look for the patterns that can match it */
            struct dela_entry* entry=tokenize_tag_token(tokens->value[i]);
            if (entry==NULL) {
                /* This should never happen */
                fatal_error("Invalid tag token in function check_patterns_for_tag_tokens\n");
            }
            /* We add the inflected form to the list of forms associated to the lemma.
            * This will be used to replace patterns like "<be>" by the actual list of
            * forms that can be matched by it, for optimization reasons */
            add_inflected_form_for_lemma(tokens->value[i],entry->lemma,root);
            /* A tag token always verifies <DIC>, so the DIC bit is OR-ed into
             * its control byte */
            parameters->token_control[i]=(unsigned char)(get_control_byte(tokens->value[i],alphabet,NULL,parameters->tokenization_policy)|DIC_TOKEN_BIT_MASK);
            if (number_of_patterns) {
                /* We look for matching patterns only if there are some */
                struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
                if (list!=NULL) {
                    if (parameters->matching_patterns[i]==NULL) {
                        /* We allocate the bit array if needed */
                        parameters->matching_patterns[i]=new_bit_array(number_of_patterns,ONE_BIT);
                    }
                    /* Each matching pattern number gets its bit set for this token */
                    struct list_pointer* tmp=list;
                    while (tmp!=NULL) {
                        set_value(parameters->matching_patterns[i],((struct constraint_list*)(tmp->pointer))->pattern_number,1);
                        tmp=tmp->next;
                    }
                    free_list_pointer(list);
                }
            }
            /* At the opposite of DLC lines, a compound word tag like "{all around,.ADV}"
             * does not need to be put in the compound word tree, since the tag is already
             * characterized by its token number. */
            free_dela_entry(entry);
        }
    }
}
/* Beispiel #17 */
/**
 * This function analyzes an INF code and returns a value that indicates
 * if it is a valid left component or not.
 */
int get_valid_left_component_type_for_one_INF_code(const unichar* INF_code) {
/* We produce an artifical dictionary entry with the given INF code,
 * and then, we tokenize it in order to get grammatical and inflectional
 * codes in a structured way. */
unichar temp[2000];
u_strcpy(temp,"x,");
u_strcat(temp,INF_code);
struct dela_entry* d=tokenize_DELAF_line(temp,0);
int res;
/* Now we can test if the INF code corresponds to a valid left component */
if (check_Nsia(d)) res=N_SIA;
else if (check_Nsie(d)) res=N_SIE;
else if (check_Nsig(d)) res=N_SIG;
else if (check_Asio(d)) res=A_SIO;
else if (check_Asie(d)) res=A_SIE;
else if (check_VW(d)) res=V_W;
else if (check_ADV(d)) res=ADV;
else res=INVALID_LEFT_COMPONENT;
/* Finally we free the artifical dictionary entry */
free_dela_entry(d);
return res;
}
/* Beispiel #18 */
/**
 * This explores the dictionary in order to decompose the given word into a valid
 * sequence of simple words. For instance, if we have the word "Sommervarmt", we will first
 * explore the dictionary and find that "sommer" is a valid left component that
 * corresponds to the dictionary entry "sommer,.N:msia". Then we will
 * look if the following word "varmt" is in the dictionary. It is
 * the case, with the entry "varmt,varm.A:nsio". As we are at the end of the word to
 * analyze and as "varmt" is a valid rightmost component, we will generate an entry
 * according to the following things:
 *
 * 'output_dela_line'="sommervarmt,sommervarm.A:nsio"
 * 'analysis'="sommer,.N:msia +++ varmt,varm.A:nsio"
 * 'number_of_components'=2
 *
 * Note that the initial "S" was put in lowercase, because the dictionary
 * contains "sommer" and not "Sommer". The lemma is obtained with
 * the lemma of the rightmost component (here "varm"), and the word inherits
 * from the grammatical information of its rightmost component.
 *
 * 'offset': offset of the current node in the binary array 'infos->bin'
 * 'current_component': string that represents the current simple word
 * 'pos_in_current_component': position in the string 'current_component'
 * 'word_to_analyze': the word to analyze
 * 'pos_in_word_to_analyze': position in the string 'word_to_analyze'
 * 'analysis': string that represents the analysis as a concatenation like
 *             "sommer,.N:msia +++ varmt,varm.A:nsio"
 * 'output_dela_line': string that contains the final DELA line. The lemma is
 *                     obtained by replacing the rightmost term of
 *                     the word to analyze by its lemma.
 * 'L': list of all analysis for the given word
 * 'number_of_components': number of components that compose the word.
 * 'infos': global settings.
 */
void explore_state(int offset,unichar* current_component,int pos_in_current_component,
                   const unichar* word_to_analyze,int pos_in_word_to_analyze,const unichar* analysis,
                   const unichar* output_dela_line,struct word_decomposition_list** L,
                   int number_of_components,struct norwegian_infos* infos) {
int c;
int index,t;
/* We read the 2-byte value of the current node; its highest bit is clear
 * for a final state and set otherwise */
c=infos->bin[offset]*256+infos->bin[offset+1];
if (!(c&32768)) {
	/* If we are in a final state, we compute the index of the
	 * corresponding INF line */
	index=infos->bin[offset+2]*256*256+infos->bin[offset+3]*256+infos->bin[offset+4];
	/* We can set the end of our current component */
	current_component[pos_in_current_component]='\0';
	/* We do not consider words of length 1 */
	if (pos_in_current_component>1) {
		/* We don't consider components with a length of 1 */
		if (word_to_analyze[pos_in_word_to_analyze]=='\0') {
			/* If we have explored the entire original word */
			if (get_value_index(current_component,infos->forbidden_words,DONT_INSERT)==NO_VALUE_INDEX) {
				/* And if we do not have forbidden word in last position */
				struct list_ustring* l=infos->inf->codes[index];
				/* We will look at all the INF codes of the last component in order
				 * to produce analysis */
				while (l!=NULL) {
					unichar dec[2000];
					u_strcpy(dec,analysis);
					if (dec[0]!='\0') {
						/* If we have already something in the analysis (i.e. if
						 * we have not a simple word), we insert the concatenation
						 * mark before the entry to come */
						u_strcat(dec," +++ ");
					}
					unichar entry[2000];
					/* We get the dictionary line that corresponds to the current INF code */
					uncompress_entry(current_component,l->string,entry);
					/* And we add it to the analysis */
					u_strcat(dec,entry);
					unichar new_dela_line[2000];
					/* We copy the current output DELA line that contains
					 * the concatenation of the previous components */
					u_strcpy(new_dela_line,output_dela_line);
					/* Then we tokenize the DELA line that corresponds the current INF
					 * code in order to obtain its lemma and grammatical/inflectional
					 * information */
					struct dela_entry* tmp_entry=tokenize_DELAF_line(entry,1);
					/* We concatenate the inflected form of the last component to
					 * the output DELA line */
					u_strcat(new_dela_line,tmp_entry->inflected);
					/* We put the comma that separates the inflected form and the lemma */
					u_strcat(new_dela_line,",");
					/* And we build the lemma in the same way than the inflected form */
					u_strcat(new_dela_line,output_dela_line);
					u_strcat(new_dela_line,tmp_entry->lemma);
					/* We put the dot that separates the lemma and the grammatical/inflectional
					 * information */
					u_strcat(new_dela_line,".");
					/* And finally we put the grammatical/inflectional information */
					u_strcat(new_dela_line,tmp_entry->semantic_codes[0]);
               /* Additional semantic codes are appended with '+', and
                * inflectional codes with ':' */
               int k;
               for (k=1;k<tmp_entry->n_semantic_codes;k++) {
                  u_strcat(new_dela_line,"+");
                  u_strcat(new_dela_line,tmp_entry->semantic_codes[k]);
               }
               for (k=0;k<tmp_entry->n_inflectional_codes;k++) {
                  u_strcat(new_dela_line,":");
                  u_strcat(new_dela_line,tmp_entry->inflectional_codes[k]);
               }
					free_dela_entry(tmp_entry);
					/*
					 * Now we can build an analysis in the form of a word decomposition
					 * structure, but only if the last component is a valid
					 * right one or if it is a verb long enough, or if we find out
					 * that the word to analyze was in fact a simple word
					 * in the dictionary */
					if (verb_of_more_than_4_letters(entry)
						|| check_valid_right_component_for_one_INF_code(l->string)
						|| number_of_components==1) {
						/*
						 * We set the number of components, the analysis, the actual
						 * DELA line and information about
						 */
						struct word_decomposition* wd=new_word_decomposition();
						wd->n_parts=number_of_components;
						u_strcpy(wd->decomposition,dec);
						u_strcpy(wd->dela_line,new_dela_line);
						wd->is_a_valid_right_N=check_N_right_component(l->string);
						wd->is_a_valid_right_A=check_A_right_component(l->string);
						/* Then we add the decomposition word structure to the list that
						 * contains all the analysis for the word to analyze */
						struct word_decomposition_list* wdl=new_word_decomposition_list();
						wdl->element=wd;
						wdl->next=(*L);
						(*L)=wdl;
					}
					/* We go on with the next INF code of the last component */
					l=l->next;
				}
			}
			/* If are at the end of the word to analyze, we have nothing more to do */
			return;
		} else {
			/* If we are not at the end of the word to analyze, we must
			 * 1) look if the current component is a valid left one
			 * 2) look if it is not a forbidden component and
			 * 3) explore the rest of the original word
			 */
			if (infos->valid_left_component[index] &&
				(get_value_index(current_component,infos->forbidden_words,DONT_INSERT)==NO_VALUE_INDEX)) {
				/* If we have a valid component, we look first if we are
				 * in the case of a word ending by a double letter like "kupp" */
				if (pos_in_current_component>2 &&
					(current_component[pos_in_current_component-1]==current_component[pos_in_current_component-2])) {
					/* If we have such a word, we add it to the current analysis,
					 * putting "+++" if the current component is not the first one */
					unichar dec[2000];
					u_strcpy(dec,analysis);
					if (dec[0]!='\0') {
						u_strcat(dec," +++ ");
					}
					/* In order to print the component in the analysis, we arbitrarily
					 * take a valid left component among all those that are available
					 * for the current component */
					unichar sia_code[2000];
					unichar entry[2000];
					unichar line[2000];
					get_first_valid_left_component(infos->inf->codes[index],sia_code);
					uncompress_entry(current_component,sia_code,entry);
					u_strcat(dec,entry);
					u_strcpy(line,output_dela_line);
					u_strcat(line,current_component);
					/* As we have a double letter at the end of the word,
					 * we must remove a character */
					line[u_strlen(line)-1]='\0';
					unichar temp[2000];
					unichar dec_temp[2000];
					u_strcpy(dec_temp,dec);
					/* Then, we explore the dictionary in order to analyze the
					 * next component. We start at the root of the dictionary
					 * (offset=4) and we go back one position in the word to analyze.
					 * For instance, if we have "kupplaner", we read "kupp" and then
					 * we try to analyze "planer". */
					explore_state(4,temp,0,word_to_analyze,pos_in_word_to_analyze-1,
						dec_temp,line,L,number_of_components+1,infos);
				}
				/* Now, we try to analyze the component normally, even if
				 * it was ended by double letter, because we can have things
				 * like "oppbrent = opp,.ADV +++ brent,brenne.V:K" */
				unichar dec[2000];
				unichar line[2000];
				u_strcpy(dec,analysis);
				if (dec[0]!='\0') {
					/* We add the "+++" mark if the current component is not the first one */
					u_strcat(dec," +++ ");
				}
				unichar sia_code[2000];
				unichar entry[2000];
				/* In order to print the component in the analysis, we arbitrarily
				 * take a valid left component among all those that are available
				 * for the current component */
				get_first_valid_left_component(infos->inf->codes[index],sia_code);
				uncompress_entry(current_component,sia_code,entry);
				u_strcat(dec,entry);
				u_strcpy(line,output_dela_line);
				u_strcat(line,current_component);
				unichar temp[2000];
				unichar dec_temp[2000];
				u_strcpy(dec_temp,dec);
				/* Then, we explore the dictionary in order to analyze the
				 * next component. We start at the root of the dictionary
				 * (offset=4). */
				explore_state(4,temp,0,word_to_analyze,pos_in_word_to_analyze,
					dec_temp,line,L,number_of_components+1,infos);
			}
		}
	}
	/* Once we have finished to deal with the current final dictionary node,
	 * we go on because we may match a longer word */
	t=offset+5;
}
else {
	/* If the node is not a final one, we get compute the number of transitions by
	 * removing the highest bit */
	c=c-32768;
	t=offset+2;
}
/* We examine each transition that goes out from the node */
for (int i=0;i<c;i++) {
	if (is_equal_or_uppercase((unichar)(infos->bin[t]*256+infos->bin[t+1]),word_to_analyze[pos_in_word_to_analyze],infos->alphabet)) {
		/* If the transition's letter is case compatible with the current letter of the
		 * word to analyze, we follow it */
		index=infos->bin[t+2]*256*256+infos->bin[t+3]*256+infos->bin[t+4];
		current_component[pos_in_current_component]=(unichar)(infos->bin[t]*256+infos->bin[t+1]);
		explore_state(index,current_component,pos_in_current_component+1,word_to_analyze,pos_in_word_to_analyze+1,
			analysis,output_dela_line,L,number_of_components,infos);
	}
	/* We move the offset to the next transition */
	t=t+5;
}
}
/* Beispiel #19 */
/**
 * Returns a control byte that represents the characteristics of the given token:
 * word/uppercase/lowercase/first-capital bits for plain words, and the
 * corresponding bits (plus DIC/TDIC/CDIC) for tag tokens like "{today,.ADV}".
 *
 * 'err' is an optional word list used to answer <!DIC> (may be NULL).
 * Raises a fatal error on a NULL or empty token, or on an invalid tag token.
 */
unsigned char get_control_byte(const unichar* token,const Alphabet* alph,struct string_hash* err,TokenizationPolicy tokenization_policy) {
    int i;
    int tmp;
    unsigned char c=0;
    if (token==NULL || token[0]=='\0') {
        fatal_error("NULL or empty token in get_control_byte\n");
    }
    /* We consider that a token starting with a letter is a word */
    if (is_letter(token[0],alph)) {
        set_bit_mask(&c,MOT_TOKEN_BIT_MASK);
        /* If a token is a word, we check if it is in the 'err' word list
         * in order to answer the question <!DIC>. We perform this test in order
         * to avoid taking "priori" as an unknown word if the compound "a priori"
         * is in the text. */
        if (err!=NULL && get_value_index(token,err,DONT_INSERT)!=-1) {
            set_bit_mask(&c,NOT_DIC_TOKEN_BIT_MASK);
        }
        if (is_upper(token[0],alph)) {
            set_bit_mask(&c,PRE_TOKEN_BIT_MASK);
            /* The token starts with a capital: if it contains no lowercase
             * letter at all, it is an all-uppercase token */
            i=0;
            tmp=0;
            while (token[i]!='\0') {
                if (is_lower(token[i],alph)) {
                    tmp=1;
                    break;
                }
                i++;
            }
            if (!tmp) {
                set_bit_mask(&c,MAJ_TOKEN_BIT_MASK);
            }
            return c;
        }
        /* The token starts with a lowercase letter: if it contains no
         * uppercase letter at all, it is an all-lowercase token */
        i=0;
        tmp=0;
        while (token[i]!='\0') {
            if (is_upper(token[i],alph)) {
                tmp=1;
                break;
            }
            i++;
        }
        if (!tmp) {
            set_bit_mask(&c,MIN_TOKEN_BIT_MASK);
        }
        return c;
    }
    /* If the token doesn't start with a letter, we start with
     * checking if it is a tag like {today,.ADV} */
    if (token[0]=='{' && u_strcmp(token,"{S}") && u_strcmp(token,"{STOP}")) {
        /* Anyway, such a tag is classed as verifying <MOT> and <DIC> */
        set_bit_mask(&c,MOT_TOKEN_BIT_MASK|DIC_TOKEN_BIT_MASK|TDIC_TOKEN_BIT_MASK);
        struct dela_entry* temp=tokenize_tag_token(token);
        if (temp==NULL) {
            /* This should never happen for a well-formed tag token; the check
             * mirrors the one done in check_patterns_for_tag_tokens */
            fatal_error("Invalid tag token in get_control_byte\n");
        }
        /* The case bits are computed on the tag's inflected form */
        if (is_upper(temp->inflected[0],alph)) {
            set_bit_mask(&c,PRE_TOKEN_BIT_MASK);
            i=0;
            tmp=0;
            while (temp->inflected[i]!='\0') {
                if (is_letter(temp->inflected[i],alph) && is_lower(temp->inflected[i],alph)) {
                    tmp=1;
                    break;
                }
                i++;
            }
            if (!tmp) {
                set_bit_mask(&c,MAJ_TOKEN_BIT_MASK);
            }
        }
        else {
            i=0;
            tmp=0;
            while (temp->inflected[i]!='\0') {
                if (is_letter(temp->inflected[i],alph) && is_upper(temp->inflected[i],alph)) {
                    tmp=1;
                    break;
                }
                i++;
            }
            if (!tmp) {
                set_bit_mask(&c,MIN_TOKEN_BIT_MASK);
            }
        }
        if (!is_a_simple_word(temp->inflected,tokenization_policy,alph)) {
            /* If the tag is a compound word, we say that it verifies the <CDIC> pattern */
            set_bit_mask(&c,CDIC_TOKEN_BIT_MASK);
        }
        free_dela_entry(temp);
    }
    return c;
}
/* Beispiel #20 */
//
// This function explores the German dictionary in order to decompose the word
// 'original_word' into a valid sequence of simple words. 'left' and 'right'
// flag, for each INF index, whether it is a valid left/right component.
// Found decompositions are prepended to the list *L.
//
void explore_state_german(int adresse,unichar* current_component,int pos_in_current_component,
                   const unichar* original_word,int pos_in_original_word,const unichar* decomposition,
                   unichar* dela_line,struct german_word_decomposition_list** L,int n_decomp,
                   const char* left,const char* right,
                   const struct INF_codes* inf_codes,const Alphabet* alphabet,
                   const unsigned char* tableau_bin) {
int c;
int index,t;
// we read the 2-byte value of the current node; its highest bit is clear
// for a terminal state and set otherwise
c=tableau_bin[adresse]*256+tableau_bin[adresse+1];
if (!(c&32768)) {
  // if we are in a terminal state
  index=tableau_bin[adresse+2]*256*256+tableau_bin[adresse+3]*256+tableau_bin[adresse+4];
  current_component[pos_in_current_component]='\0';
  if (pos_in_current_component>1) {
    // we don't consider words with a length of 1
    if (original_word[pos_in_original_word]=='\0') {
      // if we have explored the entire original word
      if (right[index]) {
         // and if we have a valid right component
         struct list_ustring* l=inf_codes->codes[index];
         while (l!=NULL) {
            unichar dec[500];
            u_strcpy(dec,decomposition);
            if (dec[0]!='\0') {u_strcat(dec," +++ ");}
            unichar entry[500];
            uncompress_entry(current_component,l->string,entry);
            u_strcat(dec,entry);
            unichar new_dela_line[500];
            struct dela_entry* tmp_entry=tokenize_DELAF_line(entry,1);
            if (tmp_entry==NULL) {
               /* If there was an error in the dictionary, we skip the entry */
               l=l->next;
               continue;
            }
            // change case if there is a prefix
            // prefixes are downcase, nouns (=suffixes) uppercase:
            // "investitionsObjekte" -> "Investitionsobjekte"
            if ( u_strlen(dela_line) != 0 ) {
              // capitalize dela_line
              dela_line[0] = u_toupper((unichar) dela_line[0]);
              // downcase lemma and inflected
              tmp_entry->inflected[0] = u_tolower(tmp_entry->inflected[0]);
              tmp_entry->lemma[0] = u_tolower(tmp_entry->lemma[0]);
            }
            // the new DELA line is built as:
            // <prefix><inflected>,<prefix><lemma>.<codes>
            u_strcpy(new_dela_line,dela_line);
            u_strcat(new_dela_line,tmp_entry->inflected);
            u_strcat(new_dela_line,",");
            u_strcat(new_dela_line,dela_line);
            u_strcat(new_dela_line,tmp_entry->lemma);
            u_strcat(new_dela_line,".");
            u_strcat(new_dela_line,tmp_entry->semantic_codes[0]);
            // additional semantic codes are appended with '+',
            // inflectional codes with ':'
            int k;
            for (k=1;k<tmp_entry->n_semantic_codes;k++) {
               u_strcat(new_dela_line,"+");
               u_strcat(new_dela_line,tmp_entry->semantic_codes[k]);
            }
            for (k=0;k<tmp_entry->n_inflectional_codes;k++) {
               u_strcat(new_dela_line,":");
               u_strcat(new_dela_line,tmp_entry->inflectional_codes[k]);
            }
            free_dela_entry(tmp_entry);
            struct german_word_decomposition* wd=new_german_word_decomposition();
            wd->n_parts=n_decomp;
            u_strcpy(wd->decomposition,dec);
            u_strcpy(wd->dela_line,new_dela_line);
            if (check_valid_right_component_for_one_INF_code_german(l->string)) {
               // if we got a correct right component (N-FF)
               struct german_word_decomposition_list* wdl=new_german_word_decomposition_list();
               wdl->element=wd;
               wdl->suivant=(*L);
               (*L)=wdl;
            } else {
               free_german_word_decomposition(wd);
            }
            l=l->next;
         }
      }
    }
    else {
      // else, we must explore the rest of the original word
      if (left[index]) {
         // but only if the current component was a valid left one
         // we go on with the next component
         unichar dec[2000];
         unichar line[500];
         u_strcpy(dec,decomposition);
         if (dec[0]!='\0') {u_strcat(dec," +++ ");}
         unichar sia_code[500];
         unichar entry[500];
         // we arbitrarily take one valid left component code to print
         // the current component in the analysis
         get_first_sia_code_german(index,sia_code,inf_codes);
         uncompress_entry(current_component,sia_code,entry);
         u_strcat(dec,entry);
         u_strcpy(line,dela_line);
         u_strcat(line,current_component);
         unichar temp[500];
         // we restart at the root of the dictionary (offset=4)
         explore_state_german(4,temp,0,original_word,pos_in_original_word,
                  dec,line,L,n_decomp+1,left,right,inf_codes,alphabet,tableau_bin);
      }
    }
  }
  t=adresse+5;
}
else {
  // not a terminal state: removing the highest bit gives the number of transitions
  c=c-32768;
  t=adresse+2;
}
if (original_word[pos_in_original_word]=='\0') {
   // if we have finished, we return
   return;
}
// if not, we go on with the next letter
for (int i=0;i<c;i++) {
  if (is_equal_or_uppercase((unichar)(tableau_bin[t]*256+tableau_bin[t+1]),original_word[pos_in_original_word],alphabet)
      || is_equal_or_uppercase(original_word[pos_in_original_word],(unichar)(tableau_bin[t]*256+tableau_bin[t+1]),alphabet)) {
    // the transition letter is case-compatible (in either direction) with
    // the current letter of the original word, so we follow it
    index=tableau_bin[t+2]*256*256+tableau_bin[t+3]*256+tableau_bin[t+4];
    current_component[pos_in_current_component]=(unichar)(tableau_bin[t]*256+tableau_bin[t+1]);
    explore_state_german(index,current_component,pos_in_current_component+1,original_word,pos_in_original_word+1,
                  decomposition,dela_line,L,n_decomp,left,right,inf_codes,alphabet,tableau_bin);
  }
  t=t+5;
}
}
Beispiel #21
0
/**
 * Releases a single dic_variable and everything it owns.
 * Passing NULL is a safe no-op.
 */
void free_dic_variable(struct dic_variable* v) {
if (v==NULL) {
   return;
}
/* The entry and the name are owned by the variable, so we release
 * them before freeing the structure itself */
free_dela_entry(v->dic_entry);
free(v->name);
free(v);
}
Beispiel #22
0
/////////////////////////////////////////////////////////////////////////////////
// Inflect a DELAS/DELAC into a DELAF/DELACF.
// On error returns 1, 0 otherwise.
//
// DLC  = name of the DELAS/DELAC file to read
// DLCF = name of the DELAF/DELACF file to produce
// error_check_status: ONLY_SIMPLE_WORDS / ONLY_COMPOUND_WORDS turn entries of
//   the other kind into reported-and-skipped errors
// config_files_status: when equal to CONFIG_FILES_ERROR, compound entries are
//   skipped with a one-time warning instead of being inflected
int inflect(char* DLC, char* DLCF, 
		    MultiFlex_ctx* p_multiFlex_ctx, struct l_morpho_t* pL_MORPHO, Alphabet* alph,
		    Encoding encoding_output, int bom_output, int mask_encoding_compatibility_input,
		    int config_files_status,
		    d_class_equiv_T* D_CLASS_EQUIV, int error_check_status,
		    Korean* korean,const char* pkgdir) {
	U_FILE *dlc, *dlcf; //DELAS/DELAC and DELAF/DELACF files
	unichar input_line[DIC_LINE_SIZE]; //current DELAS/DELAC line
	unichar output_line[DIC_LINE_SIZE]; //current DELAF/DELACF line
	int l; //length of the line scanned
	DLC_entry_T* dlc_entry;
	MU_forms_T MU_forms; //inflected forms of the MWU
	int err;

	//Open DELAS/DELAC
	dlc = u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input, DLC, U_READ);
	if (!dlc) {
		return 1;
	}
	//Open DELAF/DELACF
	dlcf = u_fopen_creating_versatile_encoding(encoding_output, bom_output, DLCF, U_WRITE);
	if (!dlcf) {
		error("Unable to open file: '%s' !\n", DLCF);
		// NOTE(review): 'dlc' is not closed on this error path - possible handle leak
		return 1;
	}
	//Inflect one entry at a time
	l = u_fgets(input_line, DIC_LINE_SIZE - 1, dlc);
	//Omit the final newline
	u_chomp_new_line(input_line);
	int flag = 0;
	//If a line is empty the file is not necessarily finished.
	//If the last entry has no newline, we should not skip this entry
	struct dela_entry* DELAS_entry;
	int semitic;
	int current_line=0;
	while (l != EOF) {
	   current_line++;
		DELAS_entry = is_strict_DELAS_line(input_line, alph);
		if (DELAS_entry != NULL) {
			/* If we have a strict DELAS line, that is to say, one with
			 * a simple word */
			if (error_check_status==ONLY_COMPOUND_WORDS) {
				error("Unexpected simple word forbidden by -c:\n%S\n",input_line);
				free_dela_entry(DELAS_entry);
				goto next_line;
			}
			SU_forms_T forms;
			SU_init_forms(&forms); //Allocate the space for forms and initialize it to null values
			char inflection_code[1024];
			unichar code_gramm[1024];
			/* We take the first grammatical code, and we extract from it the name
			 * of the inflection transducer to use */
			get_inflection_code(DELAS_entry->semantic_codes[0],
					inflection_code, code_gramm, &semitic);
			/* And we inflect the word */
			//   err=SU_inflect(DELAS_entry->lemma,inflection_code,&forms,semitic);
			err = SU_inflect(p_multiFlex_ctx,pL_MORPHO,encoding_output,bom_output,mask_encoding_compatibility_input,DELAS_entry->lemma, inflection_code,
					DELAS_entry->filters, &forms, semitic, korean,pkgdir);
#ifdef __GNUC__
#warning mettre toutes les entrees sur une meme ligne
#elif ((defined(__VISUALC__)) || defined(_MSC_VER))
#pragma message("warning : mettre toutes les entrees sur une meme ligne")
#endif
			/* Then, we print its inflected forms to the output */
			for (int i = 0; i < forms.no_forms; i++) {
			   
			   unichar foo[1024];   
			   /* In Korean mode, the form is converted from Hangul
			    * syllables to Jamo letters before being printed */
			   if (korean!=NULL) {
			      Hanguls_to_Jamos(forms.forms[i].form,foo,korean,1);
			   } else {
			      u_strcpy(foo,forms.forms[i].form);
			   }
			   
			   u_fprintf(dlcf, "%S,%S.%S", foo/*forms.forms[i].form*/,
						DELAS_entry->lemma, code_gramm);
				/* We add the semantic codes, if any */
				for (int j = 1; j < DELAS_entry->n_semantic_codes; j++) {
					u_fprintf(dlcf, "+%S", DELAS_entry->semantic_codes[j]);
				}
				if (forms.forms[i].local_semantic_code != NULL) {
					u_fprintf(dlcf, "%S", forms.forms[i].local_semantic_code);
				}
				if (forms.forms[i].raw_features != NULL
						&& forms.forms[i].raw_features[0] != '\0') {
					u_fprintf(dlcf, ":%S", forms.forms[i].raw_features);
				}
				u_fprintf(dlcf, "\n");
			}
			SU_delete_inflection(&forms);
			free_dela_entry(DELAS_entry);
			/* End of simple word case */
		} else {
			/* If we have not a simple word DELAS line, we try to analyse it
			 * as a compound word DELAC line */
			if (error_check_status==ONLY_SIMPLE_WORDS) {
				error("Unexpected compound word forbidden by -s:\n%S\n",input_line);
				goto next_line;
			}
			if (config_files_status != CONFIG_FILES_ERROR) {
				/* If this is a compound word, we process it if and only if the
				 * configuration files have been correctly loaded */
				dlc_entry = (DLC_entry_T*) malloc(sizeof(DLC_entry_T));
				if (!dlc_entry) {
					fatal_alloc_error("inflect");
				}
				/* Convert a DELAC entry into the internal multi-word format */
				err = DLC_line2entry(alph,pL_MORPHO,input_line, dlc_entry, D_CLASS_EQUIV);
				/* NOTE(review): when DLC_line2entry fails, dlc_entry is not freed
				 * here - confirm whether DLC_line2entry releases it on error, or
				 * whether this is a leak */
				if (!err) {
					//Inflect the entry
					MU_init_forms(&MU_forms);
					err = MU_inflect(p_multiFlex_ctx,pL_MORPHO,encoding_output,bom_output,
							mask_encoding_compatibility_input,dlc_entry->lemma, &MU_forms,pkgdir);
					if (!err) {
						int f; //index of the current inflected form
						//Inform the user if no form generated
						if (MU_forms.no_forms == 0) {
							error("No inflected form could be generated for ");
							DLC_print_entry(pL_MORPHO,dlc_entry);
						}
						//Print inflected forms
						for (f = 0; f < MU_forms.no_forms; f++) {
							//Format the inflected form to the DELACF format
							err = DLC_format_form(pL_MORPHO,output_line, DIC_LINE_SIZE
									- 1, MU_forms.forms[f], dlc_entry,
									D_CLASS_EQUIV);
							if (!err) {
								//Print one inflected form at a time to the DELACF file
								u_fprintf(dlcf, "%S\n", output_line);
							}
						}
					}
					MU_delete_inflection(&MU_forms);
					DLC_delete_entry(dlc_entry);
				}
			} else {
				/* We try to inflect a compound word whereas the "Morphology.txt" and/or
				 * "Equivalences.txt" file(s) has/have not been loaded */
				if (!flag) {
					/* We use a flag to print the error message only once */
					error(
							"WARNING: Compound words won't be inflected because configuration files\n");
					error("         have not been correctly loaded.\n");
					flag = 1;
				}
			}
		}
		/* Common continuation point: skipped entries jump here so the file
		 * handles and the read loop stay consistent */
		next_line:
		//Get next entry
		l = u_fgets(input_line, DIC_LINE_SIZE - 1, dlc);
		if (l!=EOF) {
			//Omit the final newline
			u_chomp_new_line(input_line);
			if (input_line[0]=='\0') {
				/* If we find an empty line, then we go on */
				goto next_line;
			}
		}
	}
	u_fclose(dlc);
	u_fclose(dlcf);
	return 0;
}
Beispiel #23
0
/**
 * Explores the node 'n' depth-first, dumps the corresponding lines to the
 * output file, and then frees the node (and, recursively, its subtree).
 *
 * 'inf'  holds the sorting configuration, the output file and a scratch
 *        array used to sort transitions.
 * 'last' is an in/out pointer used only in factorize mode: it holds the
 *        dic_entry of the last line printed, so that compatible entries can
 *        have their inflectional codes appended to the same output line.
 *        The special value (struct dela_entry*)-1 is treated like "no
 *        previous entry" but is never freed - presumably an initial
 *        sentinel set by the caller (TODO confirm).
 *
 * Returns SUCCESS_RETURN_CODE, or DEFAULT_ERROR_CODE on internal error.
 */
int explore_node(struct sort_tree_node* n, struct sort_infos* inf,
    struct dela_entry* *last) {
  int i, N;
  struct sort_tree_transition* t = NULL;
  struct couple* couple = NULL;
  struct couple* tmp    = NULL;
  if (n == NULL) {
    error("Internal error in explore_node\n");
    return DEFAULT_ERROR_CODE;
  }
  if (n->couples != NULL) {
    /* If the node is a final one, we print the corresponding lines */
    couple = n->couples;
    while (couple != NULL) {
      if (inf->factorize_inflectional_codes) {
        /* We look if the previously printed line, if any, did share
         * the same information. If so, we just append the new inflectional codes.
         * Otherwise, we print the new line.
         *
         * NOTE: in factorize mode, we always ignore duplicates */
        int err;
        struct dela_entry* entry = tokenize_DELAF_line(couple->s,1,&err,0);
        if (entry==NULL) {
          /* We have a non DELAF entry line, like for instance a comment one */
          if (*last!=NULL && *last!=(struct dela_entry*)-1) {
            /* If there was at least one line already printed, then this line
             * awaits for its \n */
            u_fprintf(inf->f_out, "\n");
          }
          /* Then we print the line */
          u_fprintf(inf->f_out, "%S\n",couple->s);
          /* And we reset *last */
          if (*last==(struct dela_entry*)-1) {
            /* The sentinel is not a real entry, so it must not be freed */
            *last=NULL;
          } else if (*last!=NULL) {
            free_dela_entry(*last);
            *last=NULL;
          }
        } else {
          /* So, we have a dic entry. Was there a previous one ? */
          if (*last==NULL || *last==(struct dela_entry*)-1) {
            /* No ? So we print the line, and the current entry becomes *last.
             * Note that no \n is printed yet: it will be emitted when we know
             * whether the next line can be factorized with this one */
            u_fputs(couple->s, inf->f_out);
            *last=entry;
          } else {
            /* Yes ? We must compare if the codes are compatible */
            if (are_compatible(*last,entry)) {
              /* We look for any code of entry if it was already in *last */
              for (int j=0;j<entry->n_inflectional_codes;j++) {
                if (!dic_entry_contain_inflectional_code(*last,entry->inflectional_codes[j])) {
                  u_fprintf(inf->f_out, ":%S",entry->inflectional_codes[j]);
                  /* We also have to add the newly printed code to *last */
                  (*last)->inflectional_codes[((*last)->n_inflectional_codes)++]=u_strdup(entry->inflectional_codes[j]);
                }
              }
              /* And we must free entry */
              free_dela_entry(entry);
            } else {
              /* If codes are not compatible, we print the \n for the previous
               * line, then the current line that becomes *last */
              u_fprintf(inf->f_out, "\n%S",couple->s);
              free_dela_entry(*last);
              *last=entry;
            }
          }
        }
      } else {
        /* Normal way: we print each line one after the other; couple->n is
         * the number of occurrences, so duplicates are kept here */
        for (i = 0; i < couple->n; i++) {
          u_fprintf(inf->f_out, "%S\n", couple->s);
          (inf->resulting_line_number)++;
        }
      }
      tmp = couple;
      couple = couple->next;
      free(tmp->s);
      free(tmp);
    }
    n->couples = NULL;
  }
  /* We convert the transition list into a sorted array, using the shared
   * scratch buffer inf->transitions (capacity 0x10000) */
  t = n->transitions;
  N = 0;
  while (t != NULL && N < 0x10000) {
    inf->transitions[N++] = t;
    t = t->next;
  }
  if (N == 0x10000) {
    /* NOTE(review): the message says "nodes" but the limit that was hit is
     * the number of outgoing transitions of this node */
    error("Internal error in explore_node: more than 0x10000 nodes\n");
    free_sort_tree_node(n);
    return DEFAULT_ERROR_CODE;
  }
  if (N > 1)
    quicksort(inf->transitions, 0, N - 1, inf);
  /* After sorting, we copy the result into the transitions of n */
  for (int j = 0; j < N - 1; j++) {
    inf->transitions[j]->next = inf->transitions[j + 1];
  }
  if (N > 0) {
    inf->transitions[N - 1]->next = NULL;
    n->transitions = inf->transitions[0];
  }
  /* Finally, we explore the outgoing transitions, stopping at the first
   * error reported by a recursive call */
  t = n->transitions;
  int explore_return_value = SUCCESS_RETURN_CODE;

  while (t != NULL && explore_return_value == SUCCESS_RETURN_CODE) {
    explore_return_value = explore_node(t->node, inf, last);
    if(explore_return_value == SUCCESS_RETURN_CODE) {
      t = t->next;
    }
  }

  /* And we free the node */
  free_sort_tree_node(n);
  return explore_return_value;
}
Beispiel #24
0
/**
 * This function loads a DLF or a DLC. It computes information about tokens
 * that will be used during the Locate operation. For instance, if we have the
 * following line:
 *
 *   extended,.A
 *
 * and if the .fst2 to be applied to the text contains the pattern <A> with,
 * number 456, then the function will mark the "extended" token to be matched
 * by the pattern 456. Moreover, all case variations will be taken into account,
 * so that the "Extended" and "EXTENDED" tokens will also be updated.
 *
 * The two parameters 'is_DIC_pattern' and 'is_CDIC_pattern'
 * indicate if the .fst2 contains the corresponding patterns. For instance, if
 * the pattern "<CDIC>" is used in the grammar, it means that any token sequence that is a
 * compound word must be marked as be matched by this pattern.
 *
 * Side effects on 'parameters': token_control bytes, matching_patterns bit
 * arrays and the DLC_tree are updated in place; 'root' receives one
 * (inflected form -> lemma) association per dictionary line.
 * On open failure, an error is reported and the function returns silently.
 */
void load_dic_for_locate(const char* dic_name,int mask_encoding_compatibility_input,Alphabet* alphabet,
                         int number_of_patterns,int is_DIC_pattern,
                         int is_CDIC_pattern,
                         struct lemma_node* root,struct locate_parameters* parameters) {
    struct string_hash* tokens=parameters->tokens;
    U_FILE* f;
    unichar line[DIC_LINE_SIZE];
    f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,dic_name,U_READ);
    if (f==NULL) {
        error("Cannot open dictionary %s\n",dic_name);
        return;
    }
    /* We parse all the lines */
    int lines=0;
    char name[FILENAME_MAX];
    remove_path(dic_name,name);
    /* NOTE(review): this u_fgets overload takes no explicit buffer size -
     * presumably it is bounded by DIC_LINE_SIZE internally; confirm there is
     * no overflow risk for very long dictionary lines */
    while (EOF!=u_fgets(line,f)) {
        lines++;
        if (lines%10000==0) {
            u_printf("%s: %d lines loaded...                          \r",name,lines);
        }
        if (line[0]=='/') {
            /* NOTE: DLF and DLC files are not supposed to contain comment
             *       lines, but we test them, just in the case */
            continue;
        }
        struct dela_entry* entry=tokenize_DELAF_line(line,1);
        if (entry==NULL) {
            /* This case should never happen */
            error("Invalid dictionary line in load_dic_for_locate\n");
            continue;
        }
        /* We add the inflected form to the list of forms associated to the lemma.
         * This will be used to replace patterns like "<be>" by the actual list of
         * forms that can be matched by it, for optimization reasons */
        add_inflected_form_for_lemma(entry->inflected,entry->lemma,root);
        /* We get the list of all tokens that can be matched by the inflected form of this
         * this entry, with regards to case variations (see the "extended" example above). */
        struct list_int* ptr=get_token_list_for_sequence(entry->inflected,alphabet,tokens);
        /* We save the list pointer to free it later */
        struct list_int* ptr_copy=ptr;
        /* Here, we will deal with all simple words */
        while (ptr!=NULL) {
            int i=ptr->n;
            /* If the current token can be matched, then it can be recognized by the "<DIC>" pattern */
            parameters->token_control[i]=(unsigned char)(get_control_byte(tokens->value[i],alphabet,NULL,parameters->tokenization_policy)|DIC_TOKEN_BIT_MASK);
            if (number_of_patterns) {
                /* We look for matching patterns only if there are some */
                struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
                if (list!=NULL) {
                    /* If we have some patterns to add */
                    if (parameters->matching_patterns[i]==NULL) {
                        /* We allocate the pattern bit array, if needed */
                        parameters->matching_patterns[i]=new_bit_array(number_of_patterns,ONE_BIT);
                    }
                    struct list_pointer* tmp=list;
                    while (tmp!=NULL) {
                        /* Then we add all the pattern numbers to the bit array */
                        set_value(parameters->matching_patterns[i],((struct constraint_list*)(tmp->pointer))->pattern_number,1);
                        tmp=tmp->next;
                    }
                    /* Finally, we free the constraint list */
                    free_list_pointer(list);
                }
            }
            ptr=ptr->next;
        }
        /* Finally, we free the token list */
        free_list_int(ptr_copy);
        if (!is_a_simple_word(entry->inflected,parameters->tokenization_policy,alphabet)) {
            /* If the inflected form is a compound word */
            if (is_DIC_pattern || is_CDIC_pattern) {
                /* If the .fst2 contains "<DIC>" and/or "<CDIC>", then we
                 * must note that all compound words can be matched by them */
                add_compound_word_with_no_pattern(entry->inflected,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy);
            }
            if (number_of_patterns) {
                /* We look for matching patterns only if there are some */
                /* We look if the compound word can be matched by some patterns */
                struct list_pointer* list=get_matching_patterns(entry,parameters->pattern_tree_root);
                struct list_pointer* tmp=list;
                while (tmp!=NULL) {
                    /* If the word is matched by at least one pattern, we store it. */
                    int pattern_number=((struct constraint_list*)(tmp->pointer))->pattern_number;
                    add_compound_word_with_pattern(entry->inflected,pattern_number,alphabet,tokens,parameters->DLC_tree,parameters->tokenization_policy);
                    tmp=tmp->next;
                }
                free_list_pointer(list);
            }
        }
        free_dela_entry(entry);
    }
    /* Print a final newline only if progress messages were emitted above */
    if (lines>10000) {
        u_printf("\n");
    }
    u_fclose(f);
}