/**
 * Creates a fresh, empty node for the compound word tree.
 *
 * All pointer fields start as NULL and both element counters start at 0;
 * 'count_reference' starts at 1. An allocation failure is reported with
 * fatal_alloc_error().
 */
struct DLC_tree_node* new_DLC_tree_node() {
struct DLC_tree_node* node=(struct DLC_tree_node*)malloc(sizeof(struct DLC_tree_node));
if (node==NULL) {
	fatal_alloc_error("new_DLC_tree_node");
}
/* Counters */
node->number_of_patterns=0;
node->number_of_transitions=0;
node->count_reference=1;
/* Pattern storage */
node->patterns=NULL;
node->array_of_patterns=NULL;
/* Transition storage */
node->transitions=NULL;
node->destination_tokens=NULL;
node->destination_nodes=NULL;
return node;
}
Example #2
0
/**
 * Builds a copy of the list cell 'e' and links it in front of 'next'.
 *
 * The cell is copied field by field; a non NULL output string is
 * duplicated so that the copy owns its own string. The tfst_match is
 * shared with the original cell, which is why its 'pointed_by' counter
 * is incremented.
 */
struct tfst_simple_match_list* new_tfst_simple_match_list(struct tfst_simple_match_list* e,
		                                                  struct tfst_simple_match_list* next) {
struct tfst_simple_match_list* copy;
copy=(struct tfst_simple_match_list*)malloc(sizeof(struct tfst_simple_match_list));
if (copy==NULL) {
	fatal_alloc_error("new_tfst_simple_match_list");
}
memcpy(copy,e,sizeof(struct tfst_simple_match_list));
/* The raw copy still points to the original's output: replace it by a clone */
if (copy->output!=NULL) {
	copy->output=u_strdup(copy->output);
}
/* One more cell now references the shared match */
if (copy->match!=NULL) {
	copy->match->pointed_by+=1;
}
copy->next=next;
return copy;
}
/**
 * Allocates a composition rule whose strings are all empty.
 *
 * Only the first element of the 'before' and 'after' arrays is reset;
 * every string of the 'then' part is set to the empty string.
 * An allocation failure is reported with fatal_alloc_error().
 */
struct composition_rule* new_composition_rule () {
  struct composition_rule* rule;
  rule = (struct composition_rule*)malloc(sizeof(struct composition_rule));
  if (rule==NULL) {
     fatal_alloc_error("new_composition_rule");
  }
  /* Context parts */
  rule->before[0].string[0] = '\0';
  rule->after[0].string[0] = '\0';
  /* Action part */
  rule->then.add[0] = '\0';
  rule->then.del[0] = '\0';
  rule->then.repl[0] = '\0';
  rule->then.substr_act[0] = '\0';
  rule->then.substr_next[0] = '\0';
  rule->then.undo_substr_act[0] = '\0';
  rule->then.undo_substr_next[0] = '\0';
  return rule;
}
Example #4
0
/**
 * Inserts the given DELA entry into the tree, in the entry list that is
 * associated with its inflected form. The insertion itself is delegated to
 * insert_if_not_present(): if the entry is already there, it is freed;
 * otherwise the tree keeps the pointer, so THE CALLER MUST NOT FREE IT!
 */
void add_entry(struct DELA_tree* tree,struct dela_entry* entry) {
int slot=get_value_index(entry->inflected,tree->inflected_forms);
if (slot!=tree->size) {
   /* The inflected form already has an entry list */
   tree->dela_entries[slot]=insert_if_not_present(entry,tree->dela_entries[slot]);
   return;
}
/* First entry for this inflected form: we need a new, empty entry list */
if (slot==tree->capacity) {
   /* The array is full, so we double its capacity */
   tree->capacity=2*tree->capacity;
   tree->dela_entries=(struct dela_entry_list**)realloc(tree->dela_entries,tree->capacity*sizeof(struct dela_entry_list*));
   if (tree->dela_entries==NULL) {
      fatal_alloc_error("add_entry");
   }
}
(tree->size)++;
tree->dela_entries[slot]=insert_if_not_present(entry,NULL);
}
Example #5
0
/**
 * Adds a value in the value array without associating it with a unicode string.
 * Returns the index of this value.
 *
 * The value is stored in the first free slot, i.e. at the index equal to the
 * number of elements before the call, so that valid indices always range
 * from 0 to size-1. The previous code used the size AFTER the increment,
 * which left slot 0 unused and returned an index equal to the new size.
 * NOTE(review): this assumes that keyed insertions number their values
 * from 0 as well, as the other users of string hashes do - confirm.
 *
 * It is a fatal error to call this function on a hash that was created
 * without a value array (capacity==DONT_USE_VALUES).
 */
int add_value(void* value,struct string_hash_ptr* hash) {
if (hash->capacity==DONT_USE_VALUES) {
   fatal_error("Value array doesn't exist in add_value\n");
}
/* The new element takes the first free slot */
int index=hash->hash->size;
(hash->hash->size)++;
hash->size=hash->hash->size;
if (hash->hash->size==hash->capacity) {
   /* The array is now full: we enlarge the 'value' array, doubling its
    * capacity, so that there is always room for the next insertion */
   hash->capacity=2*hash->capacity;
   hash->value=(void**)realloc(hash->value,sizeof(void*)*hash->capacity);
   if (hash->value==NULL) {
      fatal_alloc_error("add_value");
   }
}
hash->value[index]=value;
return index;
}
Example #6
0
/**
 * This function concatenates B at the end of A. A is modified.
 *
 * The states of B are first copied into A; 'renumber' maps each state
 * index of B to the index of its copy in A. Then, every state that was
 * final in A before the call is made non final and receives a copy of the
 * outgoing transitions of the copies of B's initial states. Such a state
 * becomes final again if one of B's initial states is final.
 */
void elag_concat(language_t* language,SingleGraph A,SingleGraph B) {
/* Number of states of A before the states of B are appended */
int oldnb=A->number_of_states;
/* renumber[q] is the index in A of the copy of B's state #q */
int* renumber=(int*)malloc(B->number_of_states*sizeof(int));
if (renumber==NULL) {
   fatal_alloc_error("elag_concat");
}
int q;
/* We copy the states of B into A */
/* First pass: we create the new states and record their indices, so that
 * 'renumber' is complete before any transition is cloned */
for (q=0;q<B->number_of_states;q++) {
   renumber[q]=A->number_of_states;
   add_state(A);
}
/* Second pass: we copy transitions, default states and finality, using
 * 'renumber' to translate B's state numbers */
for (q=0;q<B->number_of_states;q++) {
   A->states[renumber[q]]->outgoing_transitions=clone_transition_list(B->states[q]->outgoing_transitions,renumber,dup_symbol);
   /* -1 means that there is no default state */
   A->states[renumber[q]]->default_state=(B->states[q]->default_state!=-1)?renumber[B->states[q]->default_state]:-1;
   if (is_final_state(B->states[q])) {
      set_final_state(A->states[renumber[q]]);
   }
}
/* Then, we concatenate A and B.
 * 1) We replace default transitions that outgo from B's initial states
 *    by explicit transitions */
struct list_int* initials=get_initial_states(B);
for (struct list_int* tmp=initials;tmp!=NULL;tmp=tmp->next) {
   explicit_default_transition(language,A,renumber[tmp->n]);
}
/* 2) We only look at the states that A had before the copy of B */
for (q=0;q<oldnb;q++) {
   if (is_final_state(A->states[q])) {
      /* Each final state of A becomes non final. Moreover, we have
       * to explicit its default transition, because if not, the concatenation
       * algorithm will modify the recognized language. */
      unset_final_state(A->states[q]);
      explicit_default_transition(language,A,q);
      for (struct list_int* tmp=initials;tmp!=NULL;tmp=tmp->next) {
         /* The transitions to clone are already expressed with A's state
          * numbers, so no renumbering array is given (NULL) */
         concat(&(A->states[q]->outgoing_transitions),clone_transition_list(A->states[renumber[tmp->n]]->outgoing_transitions,NULL,dup_symbol));
         if (is_final_state(A->states[renumber[tmp->n]])) {
            set_final_state(A->states[q]);
         }
      }
   }
}
free(renumber);
free_list_int(initials);
}
/**
 * Builds the key "str1,str2" and returns the value index that
 * get_value_index() gives for this key in the given hash.
 *
 * The key is built in a stack buffer when it fits into
 * DEFAULT_TMP_GET_VALUE_INDEX_BUFFER_SIZE unichars, and in a temporary heap
 * buffer otherwise. The heap buffer is now sized in unichars: the previous
 * code multiplied by sizeof(unichar*), which allocated more than needed.
 */
static int get_value_index_for_string_colon_string(const unichar* str1,const unichar* str2,struct string_hash* hash) {
   unichar tmp_default[DEFAULT_TMP_GET_VALUE_INDEX_BUFFER_SIZE];
   unichar* tmp=tmp_default;
   unichar* allocated_buffer=NULL;
   /* +2: one unichar for the separator and one for the final '\0' */
   int nb_unichar_buffer=u_strlen(str1)+u_strlen(str2)+2;
   if (nb_unichar_buffer>DEFAULT_TMP_GET_VALUE_INDEX_BUFFER_SIZE) {
      allocated_buffer=(unichar*)malloc(sizeof(unichar)*nb_unichar_buffer);
      if (allocated_buffer==NULL) {
         fatal_alloc_error("get_value_index_for_string_colon_string");
      }
      tmp=allocated_buffer;
   }
   u_sprintf(tmp,"%S,%S",str1,str2);
   int value=get_value_index(tmp,hash);
   /* free(NULL) is a no-op, so no test is needed when the stack buffer was used */
   free(allocated_buffer);
   return value;
}
Example #8
0
/**
 * Picks one representative state for each color.
 *
 * For the color c, the representative is the first state s >= c such that
 * color[s]==c. The result is a newly allocated array of 'nbColors'
 * integers. If a color has no such state, a fatal error is raised.
 */
int* choose_states(int* color,int nbColors,int nbStates) {
int* representative=(int*)malloc(nbColors*sizeof(int));
if (representative==NULL) {
   fatal_alloc_error("choose_states");
}
for (int c=0;c<nbColors;c++) {
   /* The search starts at c, since the chosen state is >= its color number */
   int s=c;
   while (s<nbStates && color[s]!=c) {
      s++;
   }
   if (s<nbStates) {
      representative[c]=s;
   } else {
      fatal_error("choose_states: color %d not found!\n",c);
   }
}
return representative;
}
Example #9
0
/**
 * Inserts the graph number 'n' in the given condition list.
 */
void insert_graph_in_conditions(int n,ConditionList* l) {
ConditionList tmp;
if (*l==NULL) {
   /* If the condition list is empty, we create one */
   tmp=(ConditionList)malloc(sizeof(struct condition_list));
   if (tmp==NULL) {
      fatal_alloc_error("insert_graph_in_conditions");
   }
   tmp->next=NULL;
   tmp->condition=new_list_int(n);
   *l=tmp;
   return;
}
/* Otherwise, we insert the graph number in all the conditions of the list */
tmp=*l;
while (tmp!=NULL) {
   tmp->condition=sorted_insert(n,tmp->condition);
   tmp=tmp->next;
}
}
Example #10
0
/**
 * Copies the elements of the given list into a newly allocated integer
 * array, keeping the order of the list. '*size' receives the number of
 * elements. For an empty list, '*size' is set to 0 and NULL is returned.
 * The array is obtained with malloc_cb() and the given allocator.
 */
int* dump(struct list_int* list,int *size,Abstract_allocator prv_alloc) {
/* First pass: we count the elements */
int count=0;
for (struct list_int* cell=list;cell!=NULL;cell=cell->next) {
   count++;
}
*size=count;
if (count==0) {
   return NULL;
}
int* result=(int*)malloc_cb(count*sizeof(int),prv_alloc);
if (result==NULL) {
   fatal_alloc_error("dump");
}
/* Second pass: we copy the values */
int i=0;
for (struct list_int* cell=list;cell!=NULL;cell=cell->next) {
   result[i++]=cell->n;
}
return result;
}
Example #11
0
/**
 * Creates a tfst_match from the given values.
 *
 * The new match is not chained to anything ('next' is NULL), is not
 * referenced yet ('pointed_by' is 0), and its list of text tag numbers
 * only contains 'text_tag_number'.
 */
struct tfst_match* new_tfst_match(int source_state_text,
                                  int dest_state_text,
                                  Transition* fst2_transition,
                                  int pos_kr,
                                  int text_tag_number,
                                  int first_time) {
struct tfst_match* m;
m=(struct tfst_match*)malloc(sizeof(struct tfst_match));
if (m==NULL) {
   fatal_alloc_error("new_tfst_match");
}
/* Values given by the caller */
m->source_state_text=source_state_text;
m->dest_state_text=dest_state_text;
m->fst2_transition=fst2_transition;
m->pos_kr=pos_kr;
m->first_time=first_time;
/* A list that contains a single tag number */
m->text_tag_numbers=sorted_insert(text_tag_number,NULL);
/* Bookkeeping fields */
m->pointed_by=0;
m->next=NULL;
return m;
}
/**
 * Makes sure that the given token is in the token tree, and then puts the
 * transition (tag_number,dest_state) at the head of the transition list
 * that is associated with this token.
 */
void add_tag(unichar* token,int tag_number,int dest_state,struct fst2txt_token_tree* tree, Abstract_allocator prv_alloc) {
const int slot=get_value_index(token,tree->hash);
if (slot==tree->size) {
   /* The token was not in the tree yet, so it needs its own transition list */
   if (tree->capacity==tree->size) {
      /* The transition array is full: we double its capacity */
      const int previous_capacity=tree->capacity;
      tree->capacity=2*previous_capacity;
      tree->transition_array=(Transition**)realloc_cb(tree->transition_array,
                                                      previous_capacity*sizeof(Transition*),
                                                      tree->capacity*sizeof(Transition*),
                                                      prv_alloc);
      if (tree->transition_array==NULL) {
         fatal_alloc_error("add_tag");
      }
   }
   (tree->size)++;
   /* The new transition list starts empty */
   tree->transition_array[slot]=NULL;
}
/* The transition is added without looking for a duplicate: finding one
 * would mean that the fst2 is not deterministic. */
Transition* previous_head=tree->transition_array[slot];
tree->transition_array[slot]=new_Transition(tag_number,dest_state,previous_head,prv_alloc);
}
Example #13
0
/**
 * Builds the initial coloring of the states of 'A'.
 *
 * A state gets the color 0 if it has the same finality as the state #0,
 * and the color 1 otherwise, so that the state #0 is always colored with 0.
 * '*nbColors' is set to 2 if the color 1 has been used, and to 1 otherwise.
 * The returned array is newly allocated and has one cell per state.
 */
int* init_colors(SingleGraph A,int *nbColors) {
int* color=(int*)calloc(A->number_of_states,sizeof(int));
if (color==NULL) {
   fatal_alloc_error("init_colors");
}
/* The finality of the state #0 is the reference for the color 0 */
const int reference=is_final_state(A->states[0])?1:0;
int used_colors=1;
for (int e=0;e<A->number_of_states;e++) {
   const int finality=is_final_state(A->states[e])?1:0;
   if (finality==reference) {
      color[e]=0;
   } else {
      color[e]=1;
      used_colors=2;
   }
}
(*nbColors)=used_colors;
return color;
}
/**
 * Builds and returns the array of the optimized states of the given fst2,
 * with one optimized state per fst2 state.
 *
 * Each optimized state is tagged with the number of the graph it is
 * attributed to, its index in the whole fst2 and its index inside this
 * graph. Graph numbers start at 1 and are derived from
 * 'number_of_states_per_graphs'.
 * When AGGRESSIVE_OPTIMIZATION is defined, useless lexical transitions are
 * removed until no graph is emptied anymore, and token lists are turned
 * into arrays.
 */
OptimizedFst2State* build_optimized_fst2_states(Variables* v,OutputVariables* output,Fst2* fst2,
		Abstract_allocator prv_alloc) {
OptimizedFst2State* optimized_states;
optimized_states=(OptimizedFst2State*)malloc_cb(fst2->number_of_states*sizeof(OptimizedFst2State),prv_alloc);
if (optimized_states==NULL) {
   fatal_alloc_error("build_optimized_fst2_states");
}

int graph=1;
int offset_in_graph=0;
for (int state=0;state<fst2->number_of_states;state++) {
   optimized_states[state]=optimize_state(v,output,fst2,fst2->states[state],fst2->tags,prv_alloc);

   optimized_states[state]->graph_number=graph;
   optimized_states[state]->pos_transition_in_fst2=state;
   optimized_states[state]->pos_transition_in_graph=offset_in_graph;
   offset_in_graph++;

   /* Once all the states of the current graph have been seen, we move on
    * to the next graph */
   if (offset_in_graph >= fst2->number_of_states_per_graphs[graph]) {
      graph++;
      offset_in_graph=0;
   }
}

#ifdef AGGRESSIVE_OPTIMIZATION
/* We go through all the graphs again and again, until a whole pass
 * has emptied no graph */
int emptied;
do {
   emptied=0;
   for (int g=1;g<=fst2->number_of_graphs;g++) {
      emptied+=remove_useless_lexical_transitions(fst2,g,optimized_states,prv_alloc);
   }
} while (emptied!=0);
/* Finally, we convert token lists to sorted array suitable for binary search */
for (int state=0;state<fst2->number_of_states;state++) {
   token_list_2_token_array(optimized_states[state],prv_alloc);
}
#endif // AGGRESSIVE_OPTIMIZATION

return optimized_states;
}
Example #15
0
/**
 * Takes a string containing .bin names separated with semi-colons and
 * loads the corresponding dictionaries.
 *
 * Nothing is done if the list is NULL or empty. Otherwise, the four arrays
 * of 'p' that describe the morphological dictionaries are allocated with one
 * cell per name. For each name, the .bin file is loaded, and then the .inf
 * file whose name is obtained by replacing the extension by ".inf". If one
 * of the two files cannot be loaded, both pointers of the cell are NULL.
 *
 * Names are copied into fixed size buffers of FILENAME_MAX chars. A name
 * that is too long is now truncated instead of overflowing the buffer, and
 * a .inf name that would not fit is handled as a .inf file that cannot be
 * loaded.
 */
void load_morphological_dictionaries(const char* morpho_dic_list,struct locate_parameters* p) {
    if (morpho_dic_list==NULL || morpho_dic_list[0]=='\0') {
        return;
    }
    p->n_morpho_dics=1+count_semi_colons(morpho_dic_list);
    p->morpho_dic_bin=(const unsigned char**)malloc(p->n_morpho_dics*sizeof(const unsigned char*));
    p->morpho_dic_bin_free=(struct BIN_free_info*)malloc(p->n_morpho_dics*sizeof(struct BIN_free_info));
    p->morpho_dic_inf=(const struct INF_codes**)malloc(p->n_morpho_dics*sizeof(struct INF_codes*));
    p->morpho_dic_inf_free=(struct INF_free_info*)malloc(p->n_morpho_dics*sizeof(struct INF_free_info));
    if (p->morpho_dic_bin==NULL || p->morpho_dic_inf==NULL || p->morpho_dic_bin_free==NULL || p->morpho_dic_inf_free==NULL) {
        fatal_alloc_error("load_morphological_dictionaries");
    }
    char bin[FILENAME_MAX];
    for (int i=0; i<p->n_morpho_dics; i++) {
        /* We extract the next name, up to the next ';' or the end of the list */
        int pos=0;
        while (*morpho_dic_list!='\0' && *morpho_dic_list!=';') {
            if (pos<FILENAME_MAX-1) {
                /* We always keep one char for the final '\0'; the chars
                 * that do not fit are skipped */
                bin[pos++]=*morpho_dic_list;
            }
            morpho_dic_list++;
        }
        bin[pos]='\0';
        if (*morpho_dic_list==';') {
            morpho_dic_list++;
        }
        p->morpho_dic_bin[i]=load_abstract_BIN_file(bin,&(p->morpho_dic_bin_free[i]));
        p->morpho_dic_inf[i]=NULL;
        if (p->morpho_dic_bin[i]!=NULL) {
            char inf[FILENAME_MAX];
            /* NOTE(review): remove_extension() is assumed to write no more
             * chars than 'bin' contains - confirm */
            remove_extension(bin,inf);
            if (strlen(inf)+strlen(".inf")<FILENAME_MAX) {
                strcat(inf,".inf");
                p->morpho_dic_inf[i]=load_abstract_INF_file(inf,&(p->morpho_dic_inf_free[i]));
            }
            if (p->morpho_dic_inf[i]==NULL) {
                /* A .bin without its .inf is useless, so we release it */
                free_abstract_BIN(p->morpho_dic_bin[i],&(p->morpho_dic_bin_free[i]));
                p->morpho_dic_bin[i]=NULL;
            }
        }
    }
}
Example #16
0
/**
 * Loads the morphological dictionaries of 'morpho_dic_list' (.bin names
 * separated with semi-colons), together with the local dictionary
 * 'local_morpho_dic' if this file exists.
 *
 * When both are available, the local dictionary is put first in the list
 * that is given to the two-argument version of this function.
 */
void load_morphological_dictionaries(const char* morpho_dic_list,struct locate_parameters* p,
                                     const char* local_morpho_dic) {
    if (!fexists(local_morpho_dic)) {
        /* No local dictionary: the list is used as it is */
        load_morphological_dictionaries(morpho_dic_list,p);
        return;
    }
    if (morpho_dic_list==NULL || morpho_dic_list[0]=='\0') {
        /* The local dictionary is the only one */
        load_morphological_dictionaries(local_morpho_dic,p);
        return;
    }
    /* Both kinds of dictionaries: we build the list "local;others".
     * +2: one char for the ';' and one for the final '\0' */
    size_t needed=strlen(local_morpho_dic)+strlen(morpho_dic_list)+2;
    char* full_list=(char*)malloc(needed);
    if (full_list==NULL) {
        fatal_alloc_error("load_morphological_dictionaries");
    }
    sprintf(full_list,"%s;%s",local_morpho_dic,morpho_dic_list);
    load_morphological_dictionaries(full_list,p);
    free(full_list);
}
/**
 * Creates a match list cell from the given positions, output and weight,
 * and links it in front of 'next'.
 *
 * A non NULL output is duplicated with u_strdup(), so the cell does not
 * keep the pointer given by the caller. Both the cell and the copy of the
 * output are allocated with 'prv_alloc'.
 */
struct match_list* new_match(int start,int end,int start_char,int end_char,
                             int start_letter,int end_letter,unichar* output,
                             int weight,struct match_list* next,Abstract_allocator prv_alloc) {
struct match_list* cell=(struct match_list*)malloc_cb(sizeof(struct match_list),prv_alloc);
if (cell==NULL) {
   fatal_alloc_error("new_match");
}
/* Positions of the match, at the token, char and letter levels */
cell->m.start_pos_in_token=start;
cell->m.end_pos_in_token=end;
cell->m.start_pos_in_char=start_char;
cell->m.end_pos_in_char=end_char;
cell->m.start_pos_in_letter=start_letter;
cell->m.end_pos_in_letter=end_letter;
cell->weight=weight;
cell->output=(output==NULL)?NULL:u_strdup(output,prv_alloc);
cell->next=next;
return cell;
}
/**
 * Resizes the token array of the given block so that the token array can
 * contain 'new_number_of_elements' elements. The function doubles the size
 * of the array as many times as needed. If the array has already a sufficient
 * capacity, the function does nothing.
 *
 * When the current size is equal to 'token_array_base_memory_nb_item_size',
 * a brand new array is allocated (with a size at least 4 times bigger) and
 * the 'length' first elements are copied into it; the previous array is
 * not freed here. NOTE(review): presumably because it then belongs to a
 * shared base memory area - confirm. In all other cases, the array is
 * resized with realloc().
 *
 * An allocation failure is a fatal error. It is now detected before the new
 * array is used: the previous code copied the elements into the result of
 * malloc() before testing it.
 */
void realloc_tct_hash_block(struct tct_hash_block* block,int new_number_of_elements, int token_array_base_memory_nb_item_size) {
if (block->size >=new_number_of_elements)
  return;
int factor=2;
while (block->size*factor < new_number_of_elements) {
   factor*=2;
}
if ((block->size) == token_array_base_memory_nb_item_size) {
  if (factor < 4) factor=4;
  int* new_array=(int*)malloc(block->size*sizeof(int)*factor);
  if (new_array==NULL) {
     fatal_alloc_error("realloc_tct_hash_block");
  }
  for (int i=0;i<block->length;i++) {
     new_array[i]=block->token_array[i];
  }
  block->token_array=new_array;
}
else {
  block->token_array=(int*)realloc(block->token_array,block->size*sizeof(int)*factor);
  if (block->token_array==NULL) {
     fatal_alloc_error("realloc_tct_hash_block");
  }
}
block->size*=factor;
}
Example #19
0
/**
 * Looks for the given key in the string_hash_ptr and returns its index, or
 * -1 if the key was neither found nor inserted. If the lookup has inserted
 * the key (which depends on 'insert_policy'), 'value' is stored in the
 * 'value' array at the index of the key. The value of a key that was
 * already present is left unchanged.
 */
int get_value_index(const unichar* key,struct string_hash_ptr* hash,int insert_policy,void* value) {
const int size_before=hash->hash->size;
const int index=get_value_index_(key,0,hash->hash->root,hash->hash,insert_policy,NULL);
if (index==-1) {
   /* Neither found nor inserted */
   return -1;
}
if (hash->hash->size==size_before) {
   /* The size has not changed: the key was already there */
   return index;
}
/* The key has just been inserted, so its value must be recorded */
hash->size=hash->hash->size;
if (hash->hash->size==hash->capacity) {
   /* The 'value' array is full: we double its capacity */
   hash->capacity=2*hash->capacity;
   hash->value=(void**)realloc(hash->value,sizeof(void*)*hash->capacity);
   if (hash->value==NULL) {
      fatal_alloc_error("get_value_index\n");
   }
}
hash->value[index]=value;
return index;
}
Example #20
0
int main_CheckDic(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}

int is_a_DELAF=-1;
int strict_unprotected=0;
int skip_path=0;
char alph[FILENAME_MAX]="";
Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
int val,index=-1;
int space_warnings=1;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_CheckDic,lopts_CheckDic,&index,vars))) {
   switch(val) {
   case 'f': is_a_DELAF=1; break;
   case 's': is_a_DELAF=0; break;
   case 'h': usage(); return 0;
   case 'r': strict_unprotected=1; break;
   case 't': strict_unprotected=0; break;
   case 'n': space_warnings=0; break;
   case 'p': skip_path=1; break;
   case 'a': if (vars->optarg[0]=='\0') {
                fatal_error("Empty alphabet argument\n");
             }
             strcpy(alph,vars->optarg);
             break;
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_CheckDic[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   }
   index=-1;
}

if (is_a_DELAF==-1 || vars->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return 1;
}

U_FILE* dic=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ);
if (dic==NULL) {
	fatal_error("Cannot open dictionary %s\n",argv[vars->optind]);
}
Alphabet* alphabet0=NULL;
if (alph[0]!='\0') {
   alphabet0=load_alphabet(alph,1);
}
char output_filename[FILENAME_MAX];
get_path(argv[vars->optind],output_filename);
strcat(output_filename,"CHECK_DIC.TXT");
U_FILE* out=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,output_filename,U_WRITE);
if (out==NULL) {
	u_fclose(dic);
	fatal_error("Cannot create %s\n",output_filename);
}
u_printf("Checking %s...\n",argv[vars->optind]);
unichar line[CHECKDIC_LINE_SIZE];
int line_number=1;
/*
 * We declare and initialize an array in order to know which
 * letters are used in the dictionary.
 */
int i;
char* alphabet=(char*)malloc(sizeof(char)*MAX_NUMBER_OF_UNICODE_CHARS);
if (alphabet==NULL) {
	fatal_alloc_error("CheckDic's main");
}
memset(alphabet,0,sizeof(char)*MAX_NUMBER_OF_UNICODE_CHARS);
/*
 * We use two structures for the storage of the codes found in the
 * dictionary. Note that 'semantic_codes' is used to store both grammatical and
 * semantic codes.
 */
struct string_hash* semantic_codes=new_string_hash();
struct string_hash* inflectional_codes=new_string_hash();
struct string_hash* simple_lemmas=new_string_hash(DONT_USE_VALUES);
struct string_hash* compound_lemmas=new_string_hash(DONT_USE_VALUES);
int n_simple_entries=0;
int n_compound_entries=0;
/*
 * We read all the lines and check them.
 */
while (EOF!=u_fgets_limit2(line,DIC_LINE_SIZE,dic)) {
   if (line[0]=='\0') {
		/* If we have an empty line, we print a unicode error message
		 * into the output file */
		u_fprintf(out,"Line %d: empty line\n",line_number);
	}
	else if (line[0]=='/') {
		/* If a line starts with '/', it is a commment line, so
		 * we ignore it */
	}
	else {
		/* If we have a line to check, we check it according to the
		 * dictionary type */
		check_DELA_line(line,out,is_a_DELAF,line_number,alphabet,semantic_codes,
		                inflectional_codes,simple_lemmas,compound_lemmas,
		                &n_simple_entries,&n_compound_entries,alphabet0,strict_unprotected);
	}
	/* At regular intervals, we display a message on the standard
	 * output to show that the program is working */
	if (line_number%10000==0) {
		u_printf("%d lines read...\r",line_number);
	}
	line_number++;
}
u_printf("%d lines read\n",line_number-1);
u_fclose(dic);
/*
 * Once we have checked all the lines, we print some informations
 * in the output file.
 */
u_fprintf(out,"-----------------------------------\n");
u_fprintf(out,"-------------  Stats  -------------\n");
u_fprintf(out,"-----------------------------------\n");
if (skip_path != 0) { 
    char filename_without_path[FILENAME_MAX];
    remove_path(argv[vars->optind],filename_without_path);
    u_fprintf(out,"File: %s\n",filename_without_path);
}
else {
    u_fprintf(out,"File: %s\n",argv[vars->optind]);
}
u_fprintf(out,"Type: %s\n",is_a_DELAF?"DELAF":"DELAS");
u_fprintf(out,"%d line%s read\n",line_number-1,(line_number-1>1)?"s":"");
u_fprintf(out,"%d simple entr%s ",n_simple_entries,(n_simple_entries>1)?"ies":"y");
u_fprintf(out,"for %d distinct lemma%s\n",simple_lemmas->size,(simple_lemmas->size>1)?"s":"");
u_fprintf(out,"%d compound entr%s ",n_compound_entries,(n_compound_entries>1)?"ies":"y");
u_fprintf(out,"for %d distinct lemma%s\n",compound_lemmas->size,(compound_lemmas->size>1)?"s":"");
/**
 * We print the list of the characters that are used, with
 * their unicode numbers shown in hexadecimal. This can be useful
 * to detect different characters that are graphically identical
 * like 'A' (upper of latin 'a' or upper of greek alpha ?).
 */
u_fprintf(out,"-----------------------------------\n");
u_fprintf(out,"----  All chars used in forms  ----\n");
u_fprintf(out,"-----------------------------------\n");
unichar r[4];
unichar r2[7];
r[1]=' ';
r[2]='(';
r[3]='\0';
r2[5]='\n';
r2[6]='\0';
for (i=0;i<MAX_NUMBER_OF_UNICODE_CHARS;i++) {
	if (alphabet[i]) {
      u_fprintf(out,"%C (%04X)\n",i,i);
	}
}
/*
 * Then we print the list of all grammatical and semantic codes used in the
 * dictionary. If a code contains a non ASCII character, a space or a tabulation,
 * we print a warning.
 */
u_fprintf(out,"-------------------------------------------------------------\n");
u_fprintf(out,"----  %3d grammatical/semantic code%s",semantic_codes->size,(semantic_codes->size>1)?"s used in dictionary  ----\n":" used in dictionary  -----\n");
u_fprintf(out,"-------------------------------------------------------------\n");
unichar comment[2000];
for (i=0;i<semantic_codes->size;i++) {
	/* We print the code, followed if necessary by a warning */
	u_fprintf(out,"%S",semantic_codes->value[i]);
	if (warning_on_code(semantic_codes->value[i],comment,space_warnings)) {
		u_fprintf(out," %S",comment);
	}
	u_fprintf(out,"\n");
}
/*
 * Finally, we print the list of inflectional codes,
 * with warnings in the case of non ASCII letters, spaces
 * or tabulations.
 */
u_fprintf(out,"-----------------------------------------------------\n");
u_fprintf(out,"----  %3d inflectional code%s",inflectional_codes->size,(inflectional_codes->size>1)?"s used in dictionary  ----\n":" used in dictionary  -----\n");
u_fprintf(out,"-----------------------------------------------------\n");


for (i=0;i<inflectional_codes->size;i++) {
	u_fprintf(out,"%S",inflectional_codes->value[i]);
	if (warning_on_code(inflectional_codes->value[i],comment,space_warnings)) {
		u_fprintf(out," %S",comment);
	}
	u_fprintf(out,"\n");
}
u_fclose(out);
free_OptVars(vars);
u_printf("Done.\n");
/* Note that we don't free anything since it would only waste time */

free(alphabet);
if (alphabet0!=NULL) {
   free_alphabet(alphabet0);
}
#if (defined(UNITEX_LIBRARY) || defined(UNITEX_RELEASE_MEMORY_AT_EXIT))
/* cleanup for no leak on library */
free_string_hash(semantic_codes);
free_string_hash(inflectional_codes);
free_string_hash(simple_lemmas);
free_string_hash(compound_lemmas);
#endif
return 0;
}
/**
 * This function analyzes the given Elag rule automaton to find
 * where the rule and constraint parts are. As a side effect, it builds
 * a fst2 grammar ("foo.fst2" => "foo-conc.fst2") that can be used by
 * the Locate program to match the <!> .... <!> .... <!> part of the rule.
 *
 * 'rule->contexts' is allocated with nbConstraints+1 cells: the cell #0 is
 * for the <!> part, and each following cell is for one <=> constraint.
 * Any problem in the structure of the rule is a fatal error.
 *
 * Compared to the previous version:
 * - 'contexts[1]' is only read when there is at least one constraint, since
 *   the array has only one cell otherwise;
 * - the end of the first <!> part is checked before being used as the start
 *   of the second one;
 * - the "Too much '<!>' tags" error no longer gets an argument that its
 *   format did not use.
 */
void split_elag_rule(elRule* rule, const VersatileEncodingConfig* vec,language_t* language) {
int c;
/* This array contains the numbers of the states that are pointed to by
 * middle '<=>' of the constraints */
int constraints[ELAG_MAX_CONSTRAINTS];
int nbConstraints=count_constraints(rule->automaton,constraints);
/* +1 because we have to count the <!> .... <!> .... <!> part of the rule */
rule->nbContexts=nbConstraints+1;
rule->contexts=(elContext*)malloc(rule->nbContexts*sizeof(elContext));
if (rule->contexts==NULL) {
   fatal_alloc_error("split_elag_rule");
}
for (c=0;c<rule->nbContexts;c++) {
   rule->contexts[c].left=NULL;
   rule->contexts[c].right=NULL;
}
int endR1=ELAG_UNDEFINED;
int endR2=ELAG_UNDEFINED;
int endC2=ELAG_UNDEFINED;
/* We look at each transition that outgoes from the state #0 */
for (Transition* t=rule->automaton->automaton->states[0]->outgoing_transitions;t!=NULL;t=t->next) {
   symbol_t* symbol=t->label;
   switch (symbol->type) {
      /* We split the unique <!> .... <!> .... <!> part */
      case S_EXCLAM:
         if (rule->contexts[0].left!=NULL) {
            fatal_error("Too much '<!>' tags\n");
         }
         rule->contexts[0].left=new_SingleGraph(PTR_TAGS);
         /* We look for the end of the first part of the rule */
         endR1=get_sub_automaton(rule->automaton->automaton,rule->contexts[0].left,t->state_number,0,S_EXCLAM);
         if (endR1==ELAG_UNDEFINED) {
            fatal_error("split_elag_rule: %s: parse error in <!> part\n",rule->name);
         }
         /* The second part starts where the first one ends */
         rule->contexts[0].right=new_SingleGraph(PTR_TAGS);
         endR2=get_sub_automaton(rule->automaton->automaton,rule->contexts[0].right,endR1,0,S_EXCLAM);
         if (endR2==ELAG_UNDEFINED
             || !is_final_state(rule->automaton->automaton->states[endR2])) {
            fatal_error("split_elag_rule: %s: parse error in <!> part\n",rule->name);
         }
         break;

      /* We split the nbConstraints <=> .... <=> .... <=> parts */
      case S_EQUAL:
         /* All the constraints are built from the first <=> transition,
          * so finding them already built means that there are two such
          * transitions */
         if (nbConstraints>0 && rule->contexts[1].left!=NULL) {
            fatal_error("Non deterministic .fst2 file\n");
         }
         for (c=0;c<nbConstraints;c++) {
            rule->contexts[c+1].left=new_SingleGraph(PTR_TAGS);
            get_sub_automaton(rule->automaton->automaton,rule->contexts[c+1].left,t->state_number,1,constraints[c]);
            rule->contexts[c+1].right=new_SingleGraph(PTR_TAGS);
            endC2=get_sub_automaton(rule->automaton->automaton,rule->contexts[c+1].right,constraints[c],0,S_EQUAL);
            if (endC2==ELAG_UNDEFINED || !is_final_state(rule->automaton->automaton->states[endC2])) {
               fatal_error("split_elag_rule: %s: parse error in <=> part\n",rule->name);
            }
         }
         break;

      default: fatal_error("Left delimitor '<!>' or '<=>' missing\n");
   }
}
if (rule->contexts[0].left==NULL) {
   fatal_error("In grammar '%s': symbol '<!>' not found.\n",rule->name);
}
/* NOTE(review): nothing checks here that the name fits into 'buf' once
 * "-conc.fst2" has been appended - confirm that rule names are short enough */
char buf[FILENAME_MAX];
remove_extension(rule->name,buf);
strcat(buf,"-conc.fst2");

/* We create the .fst2 to be used by Locate */
Fst2Automaton* locate=make_locate_automaton(rule,language);
save_automaton(locate,buf,vec,FST_LOCATE);
free_Fst2Automaton(locate,free_symbol);
}
Example #22
0
int main_ConcorDiff(int argc,char* const argv[]) {
if (argc==1) {
	usage();
	return 0;
}


int val,index=-1;
char* out=NULL;
char* font=NULL;
int size=0;
char foo;
Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_ConcorDiff,lopts_ConcorDiff,&index,vars))) {
   switch(val) {
   case 'o': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty output file\n");
             }
             out=strdup(vars->optarg);
             if (out==NULL) {
                fatal_alloc_error("main_ConcorDiff");
             }
             break;
   case 'f': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty font name\n");
             }
             font=strdup(vars->optarg);
             if (font==NULL) {
                fatal_alloc_error("main_ConcorDiff");
             }
             break;
   case 's': if (1!=sscanf(vars->optarg,"%d%c",&size,&foo)
                 || size<=0) {
                /* foo is used to check that the font size is not like "45gjh" */
                fatal_error("Invalid font size argument: %s\n",vars->optarg);
             }
             break;
   case 'h': usage(); return 0;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_ConcorDiff[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   }
   index=-1;
}

if (out==NULL) {
   fatal_error("You must specify the output file\n");
}
if (font==NULL) {
   fatal_error("You must specify the font to use\n");
}
if (size==0) {
   fatal_error("You must specify the font size to use\n");
}
if (vars->optind!=argc-2) {
   error("Invalid arguments: rerun with --help\n");
   return 1;
}
diff(encoding_output,bom_output,mask_encoding_compatibility_input,argv[vars->optind],argv[vars->optind+1],out,font,size);
free(out);
free(font);
free_OptVars(vars);
return 0;
}
Example #23
0
/**
 * This function produces a normalized version of 'input' and stores it into 'output'.
 * The following rules are applied in the given order:
 *
 * 1) If there is a { at the current position, we try to read a {S}, a {STOP} or
 *    a tag token like {today,.ADV}. If we fail, we replace the { and the }, if any,
 *    according to the replacement rules. Otherwise, we let the token unchanged.
 * 2) If there is one or more replacement rules that can apply to the current
 *    position in 'input', then we apply the longest one.
 * 3) If we find a separator (space, tab, new line) sequence, we replace it:
 *    - by a new line if the sequence contains one and if 'carriage_return_policy' is
 *      set to KEEP_CARRIAGE_RETURN;
 *    - by a space otherwise.
 * 4) We copy the character that was read to the output.
 *
 * Note that 'replacements' is supposed to contain replacement rules for { and }
 */
int normalize(const char *fin, const char *fout, 
              Encoding encoding_output, int bom_output, int mask_encoding_compatibility_input,
              int carriage_return_policy, const char *rules) {
	U_FILE* input;
	input = u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,fin,U_READ);
	if (input == NULL) {
		error("Cannot open file %s\n", fin);
		return 1;
	}

	U_FILE* output;
	output = u_fopen_creating_versatile_encoding(encoding_output,bom_output,fout,U_WRITE);
	if (output == NULL) {
		error("Cannot create file %s\n", fout);
		u_fclose(input);
		return 1;
	}

	struct string_hash* replacements=NULL;
	if(rules != NULL && rules[0]!='\0') {
		replacements=load_key_value_list(rules,mask_encoding_compatibility_input,'\t');
		if (replacements==NULL) {
		   error("Cannot load replacement rules file %s\n", rules);
		   replacements=new_string_hash();
		}
	}
	/* If there is no replacement rules file, we simulate one */
	else {
	   replacements=new_string_hash();
	}

	/* If there is a replacement rule file, we ensure that there are replacement
	 * rules for { and }. If not, we add our default ones, so that in any case,
	 * we are sure to have rules for { and } */
	unichar key[2];
	unichar value[2];
	u_strcpy(key,"{");
	u_strcpy(value,"[");
	get_value_index(key,replacements,INSERT_IF_NEEDED,value);
	u_strcpy(key,"}");
	u_strcpy(value,"]");
	get_value_index(key,replacements,INSERT_IF_NEEDED,value);

    struct OUTBUF OutBuf;
    OutBuf.pos=0;
	unichar tmp[MAX_TAG_LENGTH];
	//struct buffer* buffer=new_buffer_for_file(UNICHAR_BUFFER,input);

    long save_pos=ftell(input);
    fseek(input,0,SEEK_END);
    long file_size_input=ftell(input);
    fseek(input,save_pos,SEEK_SET);

    int line_buffer_size = (int)(((file_size_input+1) < MAX_LINE_BUFFER_SIZE) ? (file_size_input+1) : MAX_LINE_BUFFER_SIZE);

    unichar *line_read;
    line_read=(unichar*)malloc((line_buffer_size+0x10)*sizeof(unichar));
    if (line_read==NULL) {
        fatal_alloc_error("normalize");
    }

	/* We define some things that will be used for parsing the buffer */


    static const unichar stop_chars[]= { '{', '}', 0 };
    static const unichar forbidden_chars[]= { '\n', 0 };
    static const unichar open_bracket[]= { '{', 0 };
    static const unichar close_bracket[]= { '}', 0 };
    static const unichar empty_string[]= { 0 };

   int corrupted_file=0;
   int eof_found=0;
   /* First, we fill the buffer */
	
    int lastline_was_terminated=0;

    while (eof_found==0) {
        int current_start_pos=0;
        int found_null=0;
        const unichar*buff=line_read;
        int result_read = 0;

        result_read = u_fgets_treat_cr_as_lf(line_read,line_buffer_size,input,1,&found_null);
        if ((found_null != 0) && (corrupted_file==0)) {
          corrupted_file=1;
          error("Corrupted text file containing NULL characters!\n");
          error("They have been ignored by Normalize, but you should clean your text\n");
        }

        if (result_read>0)
            if (line_read[result_read-1]==0x0d)
                line_read[result_read-1]='\n';
        
        if (result_read==EOF)
            break;

        if (lastline_was_terminated != 0)
            while (current_start_pos<result_read) {
                if (buff[current_start_pos]!=' ' && buff[current_start_pos]!='\t'
							    && buff[current_start_pos]!=0x0d
                                && buff[current_start_pos]!='\n')
                                break;
                current_start_pos++;
            }

        lastline_was_terminated = 0;
        if (result_read > 0)
            if ((buff[result_read-1]=='\n') || (buff[result_read-1]==0x0d))
                lastline_was_terminated = 1;


        while (current_start_pos<result_read) {
            if ((lastline_was_terminated == 0) && (eof_found == 0) && 
                (current_start_pos + MINIMAL_CHAR_IN_BUFFER_BEFORE_CONTINUE_LINE >= result_read))
            {
                int i;
                int nb_to_keep = result_read-current_start_pos;
                for (i=0;i<nb_to_keep;i++)
                    line_read[i]=line_read[current_start_pos+i];
                int found_null_read=0;
                int result_read_continue = u_fgets_treat_cr_as_lf(line_read+nb_to_keep,line_buffer_size-nb_to_keep,input,1,&found_null_read);

                if ((found_null_read != 0) && (corrupted_file==0)) {
                    corrupted_file=1;
                    error("Corrupted text file containing NULL characters!\n");
                    error("They have been ignored by Normalize, but you should clean your text\n");
                }

                if (result_read_continue>0)
                    if (line_read[(result_read_continue+nb_to_keep)-1]==0x0d)
                        line_read[(result_read_continue+nb_to_keep)-1]='\n';
                lastline_was_terminated = 0;
                if (result_read_continue==EOF)
                    eof_found = lastline_was_terminated = 1;

                if (result_read_continue > 0)
                    if ((buff[(result_read_continue+nb_to_keep)-1]=='\n') || (buff[(result_read_continue+nb_to_keep)-1]==0x0d))
                        lastline_was_terminated = 1;

                result_read = nb_to_keep;
                current_start_pos = 0;

                if (result_read_continue > 0)
                    result_read += result_read_continue;
            }

		if (buff[current_start_pos]=='{') {
			/* If we have a {, we try to find a sequence like {....}, that does not contain
			 * new lines. If the sequence contains protected character, we want to keep them
			 * protected. */
			int old_position=current_start_pos;
			/* If we don't increase the position, the parse will stop on the initial { */
			current_start_pos++;
			tmp[0]='{';
			int code=parse_string(buff,&current_start_pos,&(tmp[1]),stop_chars,forbidden_chars,NULL);
			if (code==P_FORBIDDEN_CHAR || code==P_BACKSLASH_AT_END || buff[current_start_pos]!='}') {
				/* If we have found a new line or a {, or if there is
				 * a backslash at the end of the buffer, or if we have reached the end
				 * of the buffer, we assume that the initial
				 * { was not a tag beginning, so we print the substitute of { */
				WriteOufBuf(&OutBuf,replacements->value[get_value_index(open_bracket,replacements)],output, 0);
				/* And we rewind the current position after the { */
				current_start_pos=old_position+1;
			}
			else {
				/* If we have read a sequence like {....}, we assume that there won't be
				 * a buffer overflow if we add the } */
				u_strcat(tmp,close_bracket);
				if (!u_strcmp(tmp,"{S}") || !u_strcmp(tmp,"{STOP}") || check_tag_token(tmp)) {
					/* If this is a special tag or a valid tag token, we just print
					 * it to the output */
					WriteOufBuf(&OutBuf,tmp,output, 0);
					current_start_pos++;
				}
				else {
					/* If we have a non valid tag token, we print the equivalent of {
					 * and we rewind the current position after the { */
					WriteOufBuf(&OutBuf,replacements->value[get_value_index(open_bracket,replacements)],output, 0);
					current_start_pos=old_position+1;
				}
			}
		}
		else {
			/* If we have a character that is not {, first we try to look if there
			 * is a replacement to do */
			int key_length;
			int index=get_longest_key_index(&buff[current_start_pos],&key_length,replacements);
			if (index!=NO_VALUE_INDEX) {
				/* If there is something to replace */
				WriteOufBuf(&OutBuf,replacements->value[index],output, 0);
				current_start_pos=current_start_pos+key_length;
			}
			else {
				if (buff[current_start_pos]==' ' || buff[current_start_pos]=='\t' || buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) {
					/* If we have a separator, we try to read the longest separator sequence
					 * that we can read. By the way, we note if it contains a new line */
					int new_line=0;
					while (buff[current_start_pos]==' ' || buff[current_start_pos]=='\t'
							|| buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) {
						/* Note 1: no bound check is needed, since an unichar buffer is always
						 *        ended by a \0
						 *
						 * Note 2: we don't take into account the case of a buffer ended by
						 *         separator while it's not the end of file: that would mean
						 *         that the text contains something like MARGIN_BEFORE_BUFFER_END
						 *         contiguous separators. Such a text would not be a reasonable one.
						 */
						if (buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) {
							new_line=1;
						}
						current_start_pos++;
					}
					if (new_line && (carriage_return_policy==KEEP_CARRIAGE_RETURN)) {
						/* We print a new line if the sequence contains one and if we are
						 * allowed to; otherwise, we print a space. */
						WriteOufBuf(&OutBuf,'\n',output, 0);
					}
					else {
						WriteOufBuf(&OutBuf,' ',output, 0);
					}
				}
				else {
					/* If, finally, we have a normal character to normalize, we just print it */
                    WriteOufBuf(&OutBuf,buff[current_start_pos++],output, 0);
				}
			}
		}
	    }
    }


    WriteOufBuf(&OutBuf,empty_string,output, 1);

	free(line_read);
	free_string_hash(replacements);

	u_fclose(input);
	u_fclose(output);
	return 0;
}
Example #24
0
/////////////////////////////////////////////////////////////////////////////////
// Scans a single unit from a DELAC entry. 'line' is non terminated by a newline.
// Initially, 'u' has its space allocated but is empty.
// Returns the length of the scanned sequence, -1 if a format error occurred, -2 if a memory allocation problem occured.
int DLC_scan_unit(Alphabet* alph,struct l_morpho_t* pL_MORPHO,SU_id_T* u, unichar* line, d_class_equiv_T* D_CLASS_EQUIV) {
	int l; //length of the scanned sequence
	int pos; //index of the next caracter to be scanned
	unichar tmp[DIC_LINE_SIZE];

	pos = 0;
	//Scan a unit
	l = SU_get_unit(tmp, line, DIC_LINE_SIZE - 1, alph, 0); //The single word module determines what is a word and what is a separator, etc.
	if (l <= 0) {
		return -1;
	}
	u->form = u_strdup(tmp);
	pos += l;

	//If no lemma indication
	if (line[pos] != (unichar) '(') {
		u->lemma = NULL;
		u->feat = NULL;
	}

	//Scan the unit's description contained between '(' and ')'
	else {
		pos++; //Omit the '('
		//Scan the lemma if any
		u->lemma = (SU_lemma_T*) malloc(sizeof(SU_lemma_T));
		if (!u->lemma) {
			fatal_alloc_error("DLC_scan_unit");
		}
		l = SU_get_unit(tmp, &(line[pos]), DIC_LINE_SIZE - 1, alph, 0); //The single word module determines what is a word and what is a separator, etc.
		if (l < 0) {
			free(u->form);
			SU_delete_lemma(u->lemma);
			return l;
		}
		u->lemma->unit = u_strdup(tmp);
		pos += l;

		//Scan the lemma's inflection paradigm
		if (line[pos] != (unichar) '.') {
			error("Dot missing after a unit's lemma:\n%S\n", line);
			free(u->form);
			SU_delete_lemma(u->lemma);
			return -1;
		}
		pos++; //Omit the dot
		unichar u_para[DIC_LINE_SIZE];
		l = u_scan_until_char(u_para, &(line[pos]), DIC_LINE_SIZE - 1, "+:\\",
				1);
		if (!l) {
			error(
					"Unit's inflection paradigm non existent in DELAC line:\n%S\n",
					line);
			free(u->form);
			SU_delete_lemma(u->lemma);
			return -1;
		}
		u->lemma->paradigm = (char*) malloc((u_strlen(u_para) + 1)
				* sizeof(char));
		if (!u->lemma->paradigm) {
			fatal_alloc_error("DLC_scan_unit");
		}
		for (unsigned int c = 0; c <= u_strlen(u_para); c++)
			u->lemma->paradigm[c] = (char) u_para[c];

		//Determine the lemma's inflection class (noun, adj, etc.)
		l_class_T* cl;
		cl = DLC_class_para(u_para, D_CLASS_EQUIV);
		if (!cl) {
			error(
					"Impossible to deduce the unit's inflection class (noun, adj, etc.):\n%S\n",
					line);
			free(u->form);
			SU_delete_lemma(u->lemma);
			return -1;
		}
		u->lemma->cl = cl;
		pos += l;

		//Scan the unit's inflection features
		unichar tmp_scan[DIC_LINE_SIZE];
		if (line[pos] != (unichar) ':') {
			error("Colon missing after a unit's lemma:\n%S\n", line);
			free(u->form);
			SU_delete_lemma(u->lemma);
			return -1;
		}
		pos++; //Omit the colon
		l = u_scan_until_char(tmp_scan, &(line[pos]), DIC_LINE_SIZE - 1, ")", 1);
		if (l <= 0) {
			error("Inflection features missing after ':' for a unit:\n%S\n",
					line);
			free(u->form);
			SU_delete_lemma(u->lemma);
			return -1;
		}
		pos += l;
		if (line[pos] != (unichar) ')') {
			error("')' missing after a unit's inflection features:\n%S\n", line);
			free(u->form);
			SU_delete_lemma(u->lemma);
			return -1;
		}
		pos++; //Omit the ')'
		u->feat = d_get_feat_str(pL_MORPHO,tmp_scan);
		if (!u->feat) {
			error("Incorrect inflection features in a unit:\n%S\n", line);
			free(u->form);
			SU_delete_lemma(u->lemma);
			return -1;
		}
	}
	return pos;
}
Example #25
0
void scan_graph(int n_graph,         // number of current graph
                     int e,          // number of current state
                     int pos,        //
                     int depth,
                     struct parsing_info** liste_arrivee,
                     unichar* mot_token_buffer,
                     struct fst2txt_parameters* p,Abstract_allocator prv_alloc_recycle) {
Fst2State etat_courant=p->fst2->states[e];
if (depth > MAX_DEPTH) {

  error(  "\n"
          "Maximal stack size reached in graph %i!\n"
          "Recognized more than %i tokens starting from:\n"
          "  ",
          n_graph, MAX_DEPTH);
  for (int i=0; i<60; i++) {
    error("%S",p->buffer[p->current_origin+i]);
  }
  error("\nSkipping match at this position, trying from next token!\n");
  p->output[0] = '\0';  // clear output
  p->input_length = 0; // reset taille_entree
  empty(p->stack);    // clear output stack
  if (liste_arrivee != NULL) {
    while (*liste_arrivee != NULL) { // free list of subgraph matches
      struct parsing_info* la_tmp=*liste_arrivee;
      *liste_arrivee=(*liste_arrivee)->next;
      la_tmp->next=NULL; // to don't free the next item
      free_parsing_info(la_tmp, prv_alloc_recycle);
    }
  }
  return;
  //  exit(1); // don't exit, try at next position
}
depth++;

if (is_final_state(etat_courant)) {
   // if we are in a final state
  p->stack->stack[p->stack->stack_pointer+1]='\0';
  if (n_graph == 0) { // in main graph
    if (pos>=p->input_length/*sommet>u_strlen(output)*/) {
      // and if the recognized input is longer than the current one, it replaces it
      u_strcpy(p->output,p->stack->stack);
      p->input_length=(pos);
    }
  } else { // in a subgraph
    (*liste_arrivee)=insert_if_absent(pos,-1,-1,(*liste_arrivee),p->stack->stack_pointer+1,
                                      p->stack->stack,p->variables,NULL,NULL,-1,-1,NULL,-1, prv_alloc_recycle);
  }
}

if (pos+p->current_origin==p->text_buffer->size) {
   // if we are at the end of the text, we return
   return;
}

int SOMMET=p->stack->stack_pointer+1;
int pos2;

/* If there are some letter sequence transitions like %hello, we process them */
if (p->token_tree[e]->transition_array!=NULL) {
   if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');}
   /* we don't keep this line because of problems occur in sentence tokenizing
    * if the return sequence is defautly considered as a separator like space
    else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);}
    */
   else pos2=pos;
   int position=0;
   unichar *token=mot_token_buffer;
   if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION
       || (is_letter(p->buffer[pos2+p->current_origin],p->alphabet) && (pos2+p->current_origin==0 || !is_letter(p->buffer[pos2+p->current_origin-1],p->alphabet)))) {
      /* If we are in character by character mode */
      while (pos2+p->current_origin<p->text_buffer->size && is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) {
         token[position++]=p->buffer[(pos2++)+p->current_origin];
         if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION) {
            break;
         }
      }
      token[position]='\0';
      if (position!=0 &&
          (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION || !(is_letter(token[position-1],p->alphabet) && is_letter(p->buffer[pos2+p->current_origin],p->alphabet)))) {
       // we proceed only if we have exactly read the contenu sequence
       // in both modes MERGE and REPLACE, we process the transduction if any
       int SOMMET2=p->stack->stack_pointer;
       Transition* RES=get_matching_tags(token,p->token_tree[e],p->alphabet);
       Transition* TMP;
       unichar* mot_token_new_recurse_buffer=NULL;
       if (RES!=NULL) {
          // we allocate a new mot_token_buffer for the scan_graph recursin because we need preserve current
          // token=mot_token_buffer
          mot_token_new_recurse_buffer=(unichar*)malloc(MOT_BUFFER_TOKEN_SIZE*sizeof(unichar));
          if (mot_token_new_recurse_buffer==NULL) {
            fatal_alloc_error("scan_graph");
          }
       }
       while (RES!=NULL) {
           p->stack->stack_pointer=SOMMET2;
          Fst2Tag etiq=p->fst2->tags[RES->tag_number];
          traiter_transduction(p,etiq->output);
          int longueur=u_strlen(etiq->input);
          unichar C=token[longueur];
          token[longueur]='\0';
          if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
             // if we are in MERGE mode, we add to ouput the char we have read
             push_input_string(p->stack,token,0);
          }
          token[longueur]=C;
          scan_graph(n_graph,RES->state_number,pos2-(position-longueur),depth,liste_arrivee,mot_token_new_recurse_buffer,p);
          TMP=RES;
          RES=RES->next;
          free(TMP);
       }
       if (mot_token_new_recurse_buffer!=NULL) {
         free(mot_token_new_recurse_buffer);
       }
   }
}
}

Transition* t=etat_courant->transitions;
while (t!=NULL) {
    p->stack->stack_pointer=SOMMET-1;
      // we process the transition of the current state
      int n_etiq=t->tag_number;
      if (n_etiq<0) {
         // case of a sub-graph
         struct parsing_info* liste=NULL;
         unichar* pile_old;
         p->stack->stack[p->stack->stack_pointer+1]='\0';
         pile_old = u_strdup(p->stack->stack);
         scan_graph((((unsigned)n_etiq)-1),p->fst2->initial_states[-n_etiq],pos,depth,&liste,mot_token_buffer,p);
         while (liste!=NULL) {
            p->stack->stack_pointer=liste->stack_pointer-1;
            u_strcpy(p->stack->stack,liste->stack);
            scan_graph(n_graph,t->state_number,liste->position,depth,liste_arrivee,mot_token_buffer,p);
            struct parsing_info* l_tmp=liste;
            liste=liste->next;
            l_tmp->next=NULL; // to don't free the next item
            free_parsing_info(l_tmp, prv_alloc_recycle);
         }
         u_strcpy(p->stack->stack,pile_old);
         free(pile_old);
         p->stack->stack_pointer=SOMMET-1;
      }
      else {
         // case of a normal tag
         Fst2Tag etiq=p->fst2->tags[n_etiq];
         unichar* contenu=etiq->input;
         int contenu_len_possible_match=u_len_possible_match(contenu);
         if (etiq->type==BEGIN_OUTPUT_VAR_TAG) {
        	 fatal_error("Unsupported $|XXX( tags in Fst2Txt\n");
         }
         if (etiq->type==END_OUTPUT_VAR_TAG) {
           	 fatal_error("Unsupported $|XXX) tags in Fst2Txt\n");
         }
         if (etiq->type==BEGIN_VAR_TAG) {
            // case of a $a( variable tag
            //int old;
            struct transduction_variable* L=get_transduction_variable(p->variables,etiq->variable);
            if (L==NULL) {
               fatal_error("Unknown variable: %S\n",etiq->variable);
            }
            //old=L->start;
            if (p->buffer[pos+p->current_origin]==' ' && pos+p->current_origin+1<p->text_buffer->size) {
               pos2=pos+1;
               if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');
            }
            //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);}
            else pos2=pos;
            L->start_in_tokens=pos2;
            scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p);
            //L->start=old;
         }
         else if (etiq->type==END_VAR_TAG) {
              // case of a $a) variable tag
              //int old;
              struct transduction_variable* L=get_transduction_variable(p->variables,etiq->variable);
              if (L==NULL) {
                 fatal_error("Unknown variable: %S\n",etiq->variable);
              }
              //old=L->end;
              if (pos>0)
                L->end_in_tokens=pos-1;
              else L->end_in_tokens=pos;
              // BUG: qd changement de buffer, penser au cas start dans ancien buffer et end dans nouveau
              scan_graph(n_graph,t->state_number,pos,depth,liste_arrivee,mot_token_buffer,p);
              //L->end=old;
         }
         else if ((contenu_len_possible_match==5) && (!u_trymatch_superfast5(contenu,ETIQ_MOT_LN5))) {
              // case of transition by any sequence of letters
              if (p->buffer[pos+p->current_origin]==' ' && pos+p->current_origin+1<p->text_buffer->size) {
                 pos2=pos+1;
                 if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');
              }
              //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);}
              else pos2=pos;
              unichar* mot=mot_token_buffer;
              int position=0;
              if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION ||
                  ((pos2+p->current_origin)==0 || !is_letter(p->buffer[pos2+p->current_origin-1],p->alphabet))) {
                     while (pos2+p->current_origin<p->text_buffer->size && is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) {
                           mot[position++]=p->buffer[(pos2++)+p->current_origin];
                     }
                     mot[position]='\0';
                     if (position!=0) {
                       // we proceed only if we have read a letter sequence
                       // in both modes MERGE and REPLACE, we process the transduction if any
                       traiter_transduction(p,etiq->output);
                       if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
                         // if we are in MERGE mode, we add to ouput the char we have read
                         push_output_string(p->stack,mot);
                       }
                       scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p);
                     }
              }
         }
         else if ((contenu_len_possible_match==4) && (!u_trymatch_superfast4(contenu,ETIQ_NB_LN4))) {
              // case of transition by any sequence of digits
              if (p->buffer[pos+p->current_origin]==' ') {
                 pos2=pos+1;
                 if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');
              }
              //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);}
              else pos2=pos;
              unichar* mot=mot_token_buffer;
              int position=0;
              while (pos2+p->current_origin<p->text_buffer->size && (p->buffer[pos2+p->current_origin]>='0')
                     && (p->buffer[pos2+p->current_origin]<='9')) {
                 mot[position++]=p->buffer[(pos2++)+p->current_origin];
              }
              mot[position]='\0';
              if (position!=0) {
                 // we proceed only if we have read a letter sequence
                 // in both modes MERGE and REPLACE, we process the transduction if any
                 traiter_transduction(p,etiq->output);
                 if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
                    // if we are in MERGE mode, we add to ouput the char we have read
                     push_output_string(p->stack,mot);
                 }
                 scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p);
              }
         }
         else if ((contenu_len_possible_match==5) && (!u_trymatch_superfast5(contenu,ETIQ_MAJ_LN5))) {
              // case of upper case letter sequence
              if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');}
              //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);}
              else pos2=pos;
              unichar* mot=mot_token_buffer;
              int position=0;
              if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION ||
                  ((pos2+p->current_origin)==0 || !is_letter(p->buffer[pos2+p->current_origin-1],p->alphabet))) {
                 while (pos2+p->current_origin<p->text_buffer->size && is_upper(p->buffer[pos2+p->current_origin],p->alphabet)) {
                    mot[position++]=p->buffer[(pos2++)+p->current_origin];
                 }
                 mot[position]='\0';
                 if (position!=0 && !is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) {
                   // we proceed only if we have read an upper case letter sequence
                   // which is not followed by a lower case letter
                   // in both modes MERGE and REPLACE, we process the transduction if any
                   traiter_transduction(p,etiq->output);
                   if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
                     // if we are in MERGE mode, we add to ouput the char we have read
                     push_input_string(p->stack,mot,0);
                   }
                   scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p);
                 }
              }
         }
         else if ((contenu_len_possible_match==5) && (!u_trymatch_superfast5(contenu,ETIQ_MIN_LN5))) {
              // case of lower case letter sequence
              if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');}
              //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);}
              else pos2=pos;
              unichar* mot=mot_token_buffer;
              int position=0;
              if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION ||
                  (pos2+p->current_origin==0 || !is_letter(p->buffer[pos2+p->current_origin-1],p->alphabet))) {
                 while (pos2+p->current_origin<p->text_buffer->size && is_lower(p->buffer[pos2+p->current_origin],p->alphabet)) {
                    mot[position++]=p->buffer[(pos2++)+p->current_origin];
                 }
                 mot[position]='\0';
                 if (position!=0 && !is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) {
                   // we proceed only if we have read a lower case letter sequence
                   // which is not followed by an upper case letter
                   // in both modes MERGE and REPLACE, we process the transduction if any
                   traiter_transduction(p,etiq->output);
                   if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
                     // if we are in MERGE mode, we add to ouput the char we have read
                     push_input_string(p->stack,mot,0);
                   }
                   scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p);
                 }
              }
         }
         else if ((contenu_len_possible_match==5) && (!u_trymatch_superfast5(contenu,ETIQ_PRE_LN5))) {
              // case of a sequence beginning by an upper case letter
              if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');}
              //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);}
              else pos2=pos;
              unichar* mot=mot_token_buffer;
              int position=0;
              if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION ||
                  (is_upper(p->buffer[pos2+p->current_origin],p->alphabet) && (pos2+p->current_origin==0 || !is_letter(p->buffer[pos2+p->current_origin-1],p->alphabet)))) {
                 while (pos2+p->current_origin<p->text_buffer->size && is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) {
                    mot[position++]=p->buffer[(pos2++)+p->current_origin];
                 }
                 mot[position]='\0';
                 if (position!=0 && !is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) {
                   // we proceed only if we have read a letter sequence
                   // which is not followed by a letter
                   // in both modes MERGE and REPLACE, we process the transduction if any
                   traiter_transduction(p,etiq->output);
                   if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
                     // if we are in MERGE mode, we add to ouput the char we have read
                     push_input_string(p->stack,mot,0);
                   }
                   scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p);
                 }
              }
         }
         else if ((contenu_len_possible_match==5) && (!u_trymatch_superfast5(contenu,ETIQ_PNC_LN5))) {
              // case of a punctuation sequence
              if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');}
              //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);}
              else pos2=pos;
              unichar C=p->buffer[pos2+p->current_origin];
              if (C==';' || C=='!' || C=='?' ||
                  C==':' ||  C==0xbf ||
                  C==0xa1 || C==0x0e4f || C==0x0e5a ||
                  C==0x0e5b || C==0x3001 || C==0x3002 ||
                  C==0x30fb) {
                 // in both modes MERGE and REPLACE, we process the transduction if any
                 traiter_transduction(p,etiq->output);
                 if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
                    // if we are in MERGE mode, we add to ouput the char we have read
                    push(p->stack,C);
                 }
                 scan_graph(n_graph,t->state_number,pos2+1,depth,liste_arrivee,mot_token_buffer,p);
              }
              else {
                   // we consider the case of ...
                   // BUG: if ... appears at the end of the buffer
                   if (C=='.') {
                      if ((pos2+p->current_origin+2)<p->text_buffer->size && p->buffer[pos2+p->current_origin+1]=='.' && p->buffer[pos2+p->current_origin+2]=='.') {
                         traiter_transduction(p,etiq->output);
                         if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
                            // if we are in MERGE mode, we add to ouput the ... we have read
                            push(p->stack,C);push(p->stack,C);push(p->stack,C);
                         }
                         scan_graph(n_graph,t->state_number,pos2+3,depth,liste_arrivee,mot_token_buffer,p);
                      } else {
                        // we consider the . as a normal punctuation sign
                        traiter_transduction(p,etiq->output);
                        if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
                          // if we are in MERGE mode, we add to ouput the char we have read
                          push(p->stack,C);
                        }
                        scan_graph(n_graph,t->state_number,pos2+1,depth,liste_arrivee,mot_token_buffer,p);
                      }
                   }
              }
         }
         else if ((contenu_len_possible_match==3) && (!u_trymatch_superfast3(contenu,ETIQ_E_LN3))) {
              // case of an empty sequence
              // in both modes MERGE and REPLACE, we process the transduction if any
              traiter_transduction(p,etiq->output);
              scan_graph(n_graph,t->state_number,pos,depth,liste_arrivee,mot_token_buffer,p);
         }
         else if ((contenu_len_possible_match==3) && (!u_trymatch_superfast3(contenu,ETIQ_CIRC_LN3))) {
              // case of a new line sequence
              if (p->buffer[pos+p->current_origin]=='\n') {
                 // in both modes MERGE and REPLACE, we process the transduction if any
                 traiter_transduction(p,etiq->output);
                 if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
                    // if we are in MERGE mode, we add to ouput the char we have read
                    push(p->stack,'\n');
                 }
                 scan_graph(n_graph,t->state_number,pos+1,depth,liste_arrivee,mot_token_buffer,p);
              }
         }
         else if ((contenu_len_possible_match==1) && (!u_trymatch_superfast1(contenu,'#')) && (!(etiq->control&RESPECT_CASE_TAG_BIT_MASK))) {
              // case of a no space condition
              if (p->buffer[pos+p->current_origin]!=' ') {
                // in both modes MERGE and REPLACE, we process the transduction if any
                traiter_transduction(p,etiq->output);
                scan_graph(n_graph,t->state_number,pos,depth,liste_arrivee,mot_token_buffer,p);
              }
         }
         else if ((contenu_len_possible_match==1) && (!u_trymatch_superfast1(contenu,' '))) {
         // case of an obligatory space
              if (p->buffer[pos+p->current_origin]==' ') {
                // in both modes MERGE and REPLACE, we process the transduction if any
                traiter_transduction(p,etiq->output);
                 if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
                    // if we are in MERGE mode, we add to ouput the char we have read
                    push(p->stack,' ');
                 }
                scan_graph(n_graph,t->state_number,pos+1,depth,liste_arrivee,mot_token_buffer,p);
              }
         }
         else if ((contenu_len_possible_match==3) && (!u_trymatch_superfast5(contenu,ETIQ_L_LN3))) {
              // case of a single letter
              if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');}
              //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);}
              else pos2=pos;
              if (is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) {
                // in both modes MERGE and REPLACE, we process the transduction if any
                traiter_transduction(p,etiq->output);
                 if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
                    // if we are in MERGE mode, we add to ouput the char we have read
                    push(p->stack,p->buffer[pos2+p->current_origin]);
                 }
                scan_graph(n_graph,t->state_number,pos2+1,depth,liste_arrivee,mot_token_buffer,p);
              }
         }
         else {
              // case of a normal letter sequence
              if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');}
              //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);}
              else pos2=pos;
              if (etiq->control&RESPECT_CASE_TAG_BIT_MASK) {
                 // case of exact case match
                 int position=0;
                 while (pos2+p->current_origin<p->text_buffer->size && p->buffer[pos2+p->current_origin]==contenu[position]) {
                   pos2++; position++;
                 }
                 if (contenu[position]=='\0' && position!=0 &&
                     !(is_letter(contenu[position-1],p->alphabet) && is_letter(p->buffer[pos2+p->current_origin],p->alphabet))) {
                   // we proceed only if we have exactly read the contenu sequence
                   // in both modes MERGE and REPLACE, we process the transduction if any
                   traiter_transduction(p,etiq->output);
                   if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
                     // if we are in MERGE mode, we add to ouput the char we have read
                     push_input_string(p->stack,contenu,0);
                   }
                   scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p);
                 }
              }
              else {
                 // case of variable case match
                 // the letter sequences may have been caught by the arbre_etiquette structure
                 int position=0;
                 unichar* mot=mot_token_buffer;
                 while (pos2+p->current_origin<p->text_buffer->size && is_equal_or_uppercase(contenu[position],p->buffer[pos2+p->current_origin],p->alphabet)) {
                   mot[position++]=p->buffer[(pos2++)+p->current_origin];
                 }
                 mot[position]='\0';
                 if (contenu[position]=='\0' && position!=0 &&
                     !(is_letter(contenu[position-1],p->alphabet) && is_letter(p->buffer[pos2+p->current_origin],p->alphabet))) {
                   // we proceed only if we have exactly read the contenu sequence
                   // in both modes MERGE and REPLACE, we process the transduction if any
                   traiter_transduction(p,etiq->output);
                   if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) {
                     // if we are in MERGE mode, we add to ouput the char we have read
                     push_input_string(p->stack,mot,0);
                   }
                   scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p);
                 }
              }
         }
      }
      t=t->next;
}
}
Example #26
0
/////////////////////////////////////////////////////////////////////////////////
// Inflect a DELAS/DELAC into a DELAF/DELACF.
// On error returns 1, 0 otherwise.
int inflect(char* DLC, char* DLCF, 
		    MultiFlex_ctx* p_multiFlex_ctx, struct l_morpho_t* pL_MORPHO, Alphabet* alph,
		    Encoding encoding_output, int bom_output, int mask_encoding_compatibility_input,
		    int config_files_status,
		    d_class_equiv_T* D_CLASS_EQUIV, int error_check_status,
		    Korean* korean,const char* pkgdir) {
	U_FILE *dlc, *dlcf; //DELAS/DELAC and DELAF/DELACF files
	unichar input_line[DIC_LINE_SIZE]; //current DELAS/DELAC line
	unichar output_line[DIC_LINE_SIZE]; //current DELAF/DELACF line
	int l; //length of the line scanned
	DLC_entry_T* dlc_entry;
	MU_forms_T MU_forms; //inflected forms of the MWU
	int err;

	//Open DELAS/DELAC
	dlc = u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input, DLC, U_READ);
	if (!dlc) {
		return 1;
	}
	//Open DELAF/DELACF
	dlcf = u_fopen_creating_versatile_encoding(encoding_output, bom_output, DLCF, U_WRITE);
	if (!dlcf) {
		error("Unable to open file: '%s' !\n", DLCF);
		return 1;
	}
	//Inflect one entry at a time
	l = u_fgets(input_line, DIC_LINE_SIZE - 1, dlc);
	//Omit the final newline
	u_chomp_new_line(input_line);
	int flag = 0;
	//If a line is empty the file is not necessarily finished.
	//If the last entry has no newline, we should not skip this entry
	struct dela_entry* DELAS_entry;
	int semitic;
	int current_line=0;
	while (l != EOF) {
	   current_line++;
		DELAS_entry = is_strict_DELAS_line(input_line, alph);
		if (DELAS_entry != NULL) {
			/* If we have a strict DELAS line, that is to say, one with
			 * a simple word */
			if (error_check_status==ONLY_COMPOUND_WORDS) {
				error("Unexpected simple word forbidden by -c:\n%S\n",input_line);
				free_dela_entry(DELAS_entry);
				goto next_line;
			}
			SU_forms_T forms;
			SU_init_forms(&forms); //Allocate the space for forms and initialize it to null values
			char inflection_code[1024];
			unichar code_gramm[1024];
			/* We take the first grammatical code, and we extract from it the name
			 * of the inflection transducer to use */
			get_inflection_code(DELAS_entry->semantic_codes[0],
					inflection_code, code_gramm, &semitic);
			/* And we inflect the word */
			//   err=SU_inflect(DELAS_entry->lemma,inflection_code,&forms,semitic);
			err = SU_inflect(p_multiFlex_ctx,pL_MORPHO,encoding_output,bom_output,mask_encoding_compatibility_input,DELAS_entry->lemma, inflection_code,
					DELAS_entry->filters, &forms, semitic, korean,pkgdir);
#ifdef __GNUC__
#warning mettre toutes les entrees sur une meme ligne
#elif ((defined(__VISUALC__)) || defined(_MSC_VER))
#pragma message("warning : mettre toutes les entrees sur une meme ligne")
#endif
			/* Then, we print its inflected forms to the output */
			for (int i = 0; i < forms.no_forms; i++) {
			   
			   unichar foo[1024];   
			   if (korean!=NULL) {
			      Hanguls_to_Jamos(forms.forms[i].form,foo,korean,1);
			   } else {
			      u_strcpy(foo,forms.forms[i].form);
			   }
			   
			   u_fprintf(dlcf, "%S,%S.%S", foo/*forms.forms[i].form*/,
						DELAS_entry->lemma, code_gramm);
				/* We add the semantic codes, if any */
				for (int j = 1; j < DELAS_entry->n_semantic_codes; j++) {
					u_fprintf(dlcf, "+%S", DELAS_entry->semantic_codes[j]);
				}
				if (forms.forms[i].local_semantic_code != NULL) {
					u_fprintf(dlcf, "%S", forms.forms[i].local_semantic_code);
				}
				if (forms.forms[i].raw_features != NULL
						&& forms.forms[i].raw_features[0] != '\0') {
					u_fprintf(dlcf, ":%S", forms.forms[i].raw_features);
				}
				u_fprintf(dlcf, "\n");
			}
			SU_delete_inflection(&forms);
			free_dela_entry(DELAS_entry);
			/* End of simple word case */
		} else {
			/* If we have not a simple word DELAS line, we try to analyse it
			 * as a compound word DELAC line */
			if (error_check_status==ONLY_SIMPLE_WORDS) {
				error("Unexpected compound word forbidden by -s:\n%S\n",input_line);
				goto next_line;
			}
			if (config_files_status != CONFIG_FILES_ERROR) {
				/* If this is a compound word, we process it if and only if the
				 * configuration files have been correctly loaded */
				dlc_entry = (DLC_entry_T*) malloc(sizeof(DLC_entry_T));
				if (!dlc_entry) {
					fatal_alloc_error("inflect");
				}
				/* Convert a DELAC entry into the internal multi-word format */
				err = DLC_line2entry(alph,pL_MORPHO,input_line, dlc_entry, D_CLASS_EQUIV);
				if (!err) {
					//Inflect the entry
					MU_init_forms(&MU_forms);
					err = MU_inflect(p_multiFlex_ctx,pL_MORPHO,encoding_output,bom_output,
							mask_encoding_compatibility_input,dlc_entry->lemma, &MU_forms,pkgdir);
					if (!err) {
						int f; //index of the current inflected form
						//Inform the user if no form generated
						if (MU_forms.no_forms == 0) {
							error("No inflected form could be generated for ");
							DLC_print_entry(pL_MORPHO,dlc_entry);
						}
						//Print inflected forms
						for (f = 0; f < MU_forms.no_forms; f++) {
							//Format the inflected form to the DELACF format
							err = DLC_format_form(pL_MORPHO,output_line, DIC_LINE_SIZE
									- 1, MU_forms.forms[f], dlc_entry,
									D_CLASS_EQUIV);
							if (!err) {
								//Print one inflected form at a time to the DELACF file
								u_fprintf(dlcf, "%S\n", output_line);
							}
						}
					}
					MU_delete_inflection(&MU_forms);
					DLC_delete_entry(dlc_entry);
				}
			} else {
				/* We try to inflect a compound word whereas the "Morphology.txt" and/or
				 * "Equivalences.txt" file(s) has/have not been loaded */
				if (!flag) {
					/* We use a flag to print the error message only once */
					error(
							"WARNING: Compound words won't be inflected because configuration files\n");
					error("         have not been correctly loaded.\n");
					flag = 1;
				}
			}
		}
		next_line:
		//Get next entry
		l = u_fgets(input_line, DIC_LINE_SIZE - 1, dlc);
		if (l!=EOF) {
			//Omit the final newline
			u_chomp_new_line(input_line);
			if (input_line[0]=='\0') {
				/* If we find an empty line, then we go on */
				goto next_line;
			}
		}
	}
	u_fclose(dlc);
	u_fclose(dlcf);
	return 0;
}
Example #27
0
/////////////////////////////////////////////////////////////////////////////////
// Converts a DELAC line ('line') into a structured DELAC entry ('entry').
// 'line' is non terminated by a newline.
// Initially, entry has its space allocated but is empty.
//
// The line is parsed in this order: single units up to the mandatory comma,
// the inflection paradigm, the semantic codes, then the comment.
//
// Returns 1 if 'line' is empty, 2 if its format is incorrect, 0 otherwise.
// Allocation failures are reported through fatal_alloc_error().
// On a non-zero return, everything this function stored in 'entry' has been
// released again; the 'entry' structure itself still belongs to the caller.
int DLC_line2entry(Alphabet* alph,struct l_morpho_t* pL_MORPHO,unichar* line, DLC_entry_T* entry,
		d_class_equiv_T* D_CLASS_EQUIV) {
	int l; //length of the scanned sequence
	int pos; //index of the next character to be read
	SU_id_T* unit;

	pos = 0;
	if (!line[pos]) //Empty line
		return 1;

	//Initalize the lemma
	entry->lemma = (MU_lemma_T*) malloc(sizeof(MU_lemma_T));
	if (!entry->lemma) {
		fatal_alloc_error("DLC_line2entry");
	}
	entry->lemma->no_units = 0;
	//Give every field a defined value: the error paths below hand the lemma
	//to MU_delete_lemma before the paradigm and class have been filled in
	entry->lemma->paradigm = NULL;
	entry->lemma->cl = NULL;

	//Capacity of the unit array embedded in the lemma
	const int max_units = (int) (sizeof(entry->lemma->units)
			/ sizeof(entry->lemma->units[0]));

	//Scan the single units
	while (line[pos] && line[pos] != (unichar) ',') { //Each DELAC line must contain a comma
		if (entry->lemma->no_units >= max_units) {
			//One more unit would be written past the end of the array
			error("Too many units in DELAC line:\n%S\n", line);
			MU_delete_lemma(entry->lemma);
			return 2;
		}
		unit = (SU_id_T*) malloc(sizeof(SU_id_T));
		if (!unit) {
			fatal_alloc_error("DLC_line2entry");
		}
		l = DLC_scan_unit(alph,pL_MORPHO,unit, &(line[pos]), D_CLASS_EQUIV);
		if (l <= 0) {
			free(unit);
			MU_delete_lemma(entry->lemma);
			return 2;
		}
		entry->lemma->units[entry->lemma->no_units] = unit;
		entry->lemma->no_units++;
		pos += l;
	}

	if (line[pos] != (unichar) ',') {
		error("Comma missing in DELAC line:\n%S\n", line);
		MU_delete_lemma(entry->lemma);
		return 2;
	}

	//Scan the inflection paradigm
	unichar tmp[DIC_LINE_SIZE];
	pos++; //Omit the comma
	l = u_scan_until_char(tmp, &(line[pos]), DIC_LINE_SIZE - 1, "+:)\\/", 1);
	pos += l;
	if (!l) {
		error("Inflection paradigm inexistent in line:\n%S\n", line);
		MU_delete_lemma(entry->lemma);
		return 2;
	}
	//Measure the paradigm once instead of on every loop iteration
	const unsigned int paradigm_len = u_strlen(tmp);
	entry->lemma->paradigm = (char*) malloc((paradigm_len + 1) * sizeof(char));
	if (!entry->lemma->paradigm) {
		fatal_alloc_error("DLC_line2entry");
	}
	for (unsigned int c = 0; c <= paradigm_len; c++) //Convert to char and copy (terminator included)
		entry->lemma->paradigm[c] = (char) tmp[c];

	//Determine the class (e.g. noun)
	l_class_T* cl;
	cl = DLC_class_para(tmp, D_CLASS_EQUIV);
	if (!cl) {
		error(
				"Impossible to deduce the compound's inflection class (noun, adj, etc.):\n%S\n",
				line);
		MU_delete_lemma(entry->lemma);
		return 2;
	}
	entry->lemma->cl = cl;

	//Scan the semantic codes
	l = DLC_scan_codes(entry->codes, &(line[pos]));
	pos += l;

	//Scan the comment
	l = DLC_scan_comment(&(entry->comment), &(line[pos]));
	pos += l;

	if (line[pos]) {
		//Something is left after the comment: the line is malformed
		error("Bad format in DELAC line:\n%S\n", line);
		MU_delete_lemma(entry->lemma); //delete lemma
		for (int c = 0; entry->codes[c]; c++) //delete codes
			free(entry->codes[c]);
		free(entry->comment); //delete comment
		return 2;
	}
	return 0;
}
Example #28
0
/**
 * Allocates, initializes and returns a new locate_parameters structure.
 */
struct locate_parameters* new_locate_parameters() {
    struct locate_parameters* p=(struct locate_parameters*)malloc(sizeof(struct locate_parameters));
    if (p==NULL) {
        fatal_alloc_error("new_locate_parameters");
    }
    p->tilde_negation_operator=1;
    p->useLocateCache=1;
    p->token_control=NULL;
    p->matching_patterns=NULL;
    p->current_compound_pattern=0;
    p->pattern_tree_root=NULL;
    /* We use -1 because there may be no space, {S} or {STOP} in the text */
    p->SPACE=-1;
    p->SENTENCE=-1;
    p->STOP=-1;
    p->tag_token_list=NULL;
#ifdef TRE_WCHAR
    p->filters=NULL;
    p->filter_match_index=NULL;
#endif
    p->DLC_tree=NULL;
    p->optimized_states=NULL;
    p->fst2=NULL;
    p->tokens=NULL;
    p->current_origin=-1;
    p->max_count_call=0;
    p->max_count_call_warning=0;
    p->buffer=NULL;
    p->tokenization_policy=WORD_BY_WORD_TOKENIZATION;
    p->space_policy=DONT_START_WITH_SPACE;
    p->matching_units=0;
    p->match_policy=LONGEST_MATCHES;
    p->output_policy=IGNORE_OUTPUTS;
    p->ambiguous_output_policy=ALLOW_AMBIGUOUS_OUTPUTS;
    p->variable_error_policy=IGNORE_VARIABLE_ERRORS;
    p->match_list=NULL;
    p->number_of_matches=0;
    p->number_of_outputs=0;
    p->start_position_last_printed_match=-1;
    p->end_position_last_printed_match=-1;
    p->search_limit=0;
    p->input_variables=NULL;
    p->output_variables=NULL;
    p->nb_output_variables=0;
    p->stack=new_stack_unichar(TRANSDUCTION_STACK_SIZE);
    p->alphabet=NULL;
    p->morpho_dic_inf=NULL;
    p->morpho_dic_inf_free=NULL;
    p->morpho_dic_bin=NULL;
    p->morpho_dic_bin_free=NULL;
    p->n_morpho_dics=0;
    p->dic_variables=NULL;
    p->left_ctx_shift=0;
    p->left_ctx_base=0;
    p->protect_dic_chars=0;
    p->graph_depth=0;
    p->korean=NULL;
    p->jamo_tags=NULL;
    p->mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
    p->recyclable_wchart_buffer=(wchar_t*)malloc(sizeof(wchar_t)*SIZE_RECYCLABLE_WCHAR_T_BUFFER);
    if (p->recyclable_wchart_buffer==NULL) {
        fatal_alloc_error("new_locate_parameters");
    }
    p->recyclable_unichar_buffer=(unichar*)malloc(sizeof(unichar)*SIZE_RECYCLABLE_UNICHAR_BUFFER);
    if (p->recyclable_unichar_buffer==NULL) {
        fatal_alloc_error("new_locate_parameters");
    }
    p->size_recyclable_unichar_buffer = SIZE_RECYCLABLE_UNICHAR_BUFFER;
    p->failfast=NULL;
    p->match_cache_first=NULL;
    p->match_cache_last=NULL;
    p->match_cache=NULL;
    p->prv_alloc=NULL;
    p->prv_alloc_recycle=NULL;
    p->token_error_ctx.last_length=0;
    p->token_error_ctx.last_start=0;
    p->token_error_ctx.n_errors=0;
    p->token_error_ctx.n_matches_at_token_pos__locate=0;
    p->token_error_ctx.n_matches_at_token_pos__morphological_locate=0;
    p->counting_step.count_call=0;
    p->counting_step.count_cancel_trying=0;
    p->explore_depth=0;
    p->backup_memory_reserve=NULL;
    p->cached_match_vector=new_vector_ptr(16);
    p->fnc_locate_trace_step=NULL;
    p->private_param_locate_trace=NULL;
    memset(&(p->arabic),0,sizeof(ArabicTypoRules));
    p->is_in_cancel_state = 0;
    p->is_in_trace_state = 0;
    p->counting_step_count_cancel_trying_real_in_debug_or_trace = 0;
    return p;
}
Example #29
0
int locate_pattern(const char* text_cod,const char* tokens,const char* fst2_name,const char* dlf,const char* dlc,const char* err,
                   const char* alphabet,MatchPolicy match_policy,OutputPolicy output_policy,
                   Encoding encoding_output,int bom_output,int mask_encoding_compatibility_input,
                   const char* dynamicDir,TokenizationPolicy tokenization_policy,
                   SpacePolicy space_policy,int search_limit,const char* morpho_dic_list,
                   AmbiguousOutputPolicy ambiguous_output_policy,
                   VariableErrorPolicy variable_error_policy,int protect_dic_chars,
                   int is_korean,int max_count_call,int max_count_call_warning,
                   char* arabic_rules,int tilde_negation_operator,int useLocateCache,int allow_trace) {

    U_FILE* out;
    U_FILE* info;
    struct locate_parameters* p=new_locate_parameters();
    p->text_cod=af_open_mapfile(text_cod,MAPFILE_OPTION_READ,0);
    p->buffer=(int*)af_get_mapfile_pointer(p->text_cod);
    long text_size=(long)af_get_mapfile_size(p->text_cod)/sizeof(int);
    p->buffer_size=(int)text_size;
    p->tilde_negation_operator=tilde_negation_operator;
    p->useLocateCache=useLocateCache;
    if (max_count_call == -1) {
        max_count_call = (int)text_size;
    }
    if (max_count_call_warning == -1) {
        max_count_call_warning = (int)text_size;
    }
    p->match_policy=match_policy;
    p->tokenization_policy=tokenization_policy;
    p->space_policy=space_policy;
    p->output_policy=output_policy;
    p->search_limit=search_limit;
    p->ambiguous_output_policy=ambiguous_output_policy;
    p->variable_error_policy=variable_error_policy;
    p->protect_dic_chars=protect_dic_chars;
    p->mask_encoding_compatibility_input = mask_encoding_compatibility_input;
    p->max_count_call = max_count_call;
    p->max_count_call_warning = max_count_call_warning;
    p->token_filename = tokens;
    char concord[FILENAME_MAX];
    char concord_info[FILENAME_MAX];

    strcpy(concord,dynamicDir);
    strcat(concord,"concord.ind");

    strcpy(concord_info,dynamicDir);
    strcat(concord_info,"concord.n");

    char morpho_bin[FILENAME_MAX];
    strcpy(morpho_bin,dynamicDir);
    strcat(morpho_bin,"morpho.bin");
    if (arabic_rules!=NULL && arabic_rules[0]!='\0') {
        load_arabic_typo_rules(arabic_rules,&(p->arabic));
    }
    out=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord,U_WRITE);
    if (out==NULL) {
        error("Cannot write %s\n",concord);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        u_fclose(out);
        return 0;
    }
    info=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord_info,U_WRITE);
    if (info==NULL) {
        error("Cannot write %s\n",concord_info);
    }
    switch(output_policy) {
    case IGNORE_OUTPUTS:
        u_fprintf(out,"#I\n");
        break;
    case MERGE_OUTPUTS:
        u_fprintf(out,"#M\n");
        break;
    case REPLACE_OUTPUTS:
        u_fprintf(out,"#R\n");
        break;
    }
    if (alphabet!=NULL && alphabet[0]!='\0') {
        u_printf("Loading alphabet...\n");
        p->alphabet=load_alphabet(alphabet,is_korean);
        if (p->alphabet==NULL) {
            error("Cannot load alphabet file %s\n",alphabet);
            af_release_mapfile_pointer(p->text_cod,p->buffer);
            af_close_mapfile(p->text_cod);
            free_stack_unichar(p->stack);
            free_locate_parameters(p);
            if (info!=NULL) u_fclose(info);
            u_fclose(out);
            return 0;
        }
    }
    struct string_hash* semantic_codes=new_string_hash();
    extract_semantic_codes(dlf,semantic_codes);
    extract_semantic_codes(dlc,semantic_codes);

    if (is_cancelling_requested() != 0) {
        error("user cancel request.\n");
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }

    u_printf("Loading fst2...\n");
    struct FST2_free_info fst2load_free;
    Fst2* fst2load=load_abstract_fst2(fst2_name,1,&fst2load_free);
    if (fst2load==NULL) {
        error("Cannot load grammar %s\n",fst2_name);
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }

    Abstract_allocator locate_abstract_allocator=create_abstract_allocator("locate_pattern",AllocatorCreationFlagAutoFreePrefered);


    p->fst2=new_Fst2_clone(fst2load,locate_abstract_allocator);
    free_abstract_Fst2(fst2load,&fst2load_free);

    if (is_cancelling_requested() != 0) {
        error("User cancel request..\n");
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        free_Fst2(p->fst2,locate_abstract_allocator);
        close_abstract_allocator(locate_abstract_allocator);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }

    p->tags=p->fst2->tags;
#ifdef TRE_WCHAR
    p->filters=new_FilterSet(p->fst2,p->alphabet);
    if (p->filters==NULL) {
        error("Cannot compile filter(s)\n");
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        free_Fst2(p->fst2,locate_abstract_allocator);
        close_abstract_allocator(locate_abstract_allocator);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }
#endif
    u_printf("Loading token list...\n");
    int n_text_tokens=0;

    p->tokens=load_text_tokens_hash(tokens,mask_encoding_compatibility_input,&(p->SENTENCE),&(p->STOP),&n_text_tokens);
    if (p->tokens==NULL) {
        error("Cannot load token list %s\n",tokens);
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        free_Fst2(p->fst2,locate_abstract_allocator);
        close_abstract_allocator(locate_abstract_allocator);
        free_locate_parameters(p);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }
    Abstract_allocator locate_work_abstract_allocator = locate_abstract_allocator;

    p->match_cache=(LocateCache*)malloc_cb(p->tokens->size * sizeof(LocateCache),locate_work_abstract_allocator);
    memset(p->match_cache,0,p->tokens->size * sizeof(LocateCache));
    if (p->match_cache==NULL) {
        fatal_alloc_error("locate_pattern");
    }

#ifdef TRE_WCHAR
    p->filter_match_index=new_FilterMatchIndex(p->filters,p->tokens);
    if (p->filter_match_index==NULL) {
        error("Cannot optimize filter(s)\n");
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        free_string_hash(p->tokens);
        close_abstract_allocator(locate_abstract_allocator);
        free_locate_parameters(p);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }
#endif

    if (allow_trace!=0) {
        open_locate_trace(p,&p->fnc_locate_trace_step,&p->private_param_locate_trace);
    }
    extract_semantic_codes_from_tokens(p->tokens,semantic_codes,locate_abstract_allocator);
    u_printf("Loading morphological dictionaries...\n");
    load_morphological_dictionaries(morpho_dic_list,p,morpho_bin);
    extract_semantic_codes_from_morpho_dics(p->morpho_dic_inf,p->n_morpho_dics,semantic_codes,locate_abstract_allocator);
    p->token_control=(unsigned char*)malloc(n_text_tokens*sizeof(unsigned char));
    if (p->token_control==NULL) {
        fatal_alloc_error("locate_pattern");
    }
    p->matching_patterns=(struct bit_array**)malloc(n_text_tokens*sizeof(struct bit_array*));
    if (p->matching_patterns==NULL) {
        fatal_alloc_error("locate_pattern");
    }
    for (int i=0; i<n_text_tokens; i++) {
        p->token_control[i]=0;
        p->matching_patterns[i]=NULL;
    }
    compute_token_controls(p->alphabet,err,p);
    int number_of_patterns,is_DIC,is_CDIC,is_SDIC;
    p->pattern_tree_root=new_pattern_node(locate_abstract_allocator);
    u_printf("Computing fst2 tags...\n");
    process_tags(&number_of_patterns,semantic_codes,&is_DIC,&is_CDIC,&is_SDIC,p,locate_abstract_allocator);
    p->current_compound_pattern=number_of_patterns;
    p->DLC_tree=new_DLC_tree(p->tokens->size);
    struct lemma_node* root=new_lemma_node();
    u_printf("Loading dlf...\n");
    load_dic_for_locate(dlf,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p);
    u_printf("Loading dlc...\n");
    load_dic_for_locate(dlc,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p);
    /* We look if tag tokens like "{today,.ADV}" verify some patterns */
    check_patterns_for_tag_tokens(p->alphabet,number_of_patterns,root,p,locate_abstract_allocator);
    u_printf("Optimizing fst2 pattern tags...\n");
    optimize_pattern_tags(p->alphabet,root,p,locate_abstract_allocator);
    u_printf("Optimizing compound word dictionary...\n");
    optimize_DLC(p->DLC_tree);
    free_string_hash(semantic_codes);
    int nb_input_variable=0;
    p->input_variables=new_Variables(p->fst2->input_variables,&nb_input_variable);
    p->output_variables=new_OutputVariables(p->fst2->output_variables,&p->nb_output_variables);


    Abstract_allocator locate_recycle_abstract_allocator=NULL;
    locate_recycle_abstract_allocator=create_abstract_allocator("locate_pattern_recycle",
                                      AllocatorFreeOnlyAtAllocatorDelete|AllocatorTipOftenRecycledObject,
                                      get_prefered_allocator_item_size_for_nb_variable(nb_input_variable));

    u_printf("Optimizing fst2...\n");
    p->optimized_states=build_optimized_fst2_states(p->input_variables,p->output_variables,p->fst2,locate_abstract_allocator);
    if (is_korean) {
        p->korean=new Korean(p->alphabet);
        p->jamo_tags=create_jamo_tags(p->korean,p->tokens);
    }
    p->failfast=new_bit_array(n_text_tokens,ONE_BIT);

    u_printf("Working...\n");
    p->prv_alloc=locate_work_abstract_allocator;
    p->prv_alloc_recycle=locate_recycle_abstract_allocator;
    launch_locate(out,text_size,info,p);
    if (allow_trace!=0) {
        close_locate_trace(p,p->fnc_locate_trace_step,p->private_param_locate_trace);
    }
    free_bit_array(p->failfast);
    free_Variables(p->input_variables);
    free_OutputVariables(p->output_variables);
    af_release_mapfile_pointer(p->text_cod,p->buffer);
    af_close_mapfile(p->text_cod);
    if (info!=NULL) u_fclose(info);
    u_fclose(out);

    if (p->match_cache!=NULL) {
        for (int i=0; i<p->tokens->size; i++) {
            free_LocateCache(p->match_cache[i],locate_work_abstract_allocator);
        }
        free_cb(p->match_cache,locate_work_abstract_allocator);
    }
    int free_abstract_allocator_item=(get_allocator_cb_flag(locate_abstract_allocator) & AllocatorGetFlagAutoFreePresent) ? 0 : 1;

    if (free_abstract_allocator_item) {
        free_optimized_states(p->optimized_states,p->fst2->number_of_states,locate_abstract_allocator);
    }
    free_stack_unichar(p->stack);
    /** Too long to free the DLC tree if it is big
     * free_DLC_tree(p->DLC_tree);
     */
    if (free_abstract_allocator_item) {
        free_pattern_node(p->pattern_tree_root,locate_abstract_allocator);
        free_Fst2(p->fst2,locate_abstract_allocator);
        free_list_int(p->tag_token_list,locate_abstract_allocator);
    }
    close_abstract_allocator(locate_abstract_allocator);
    close_abstract_allocator(locate_recycle_abstract_allocator);
    locate_recycle_abstract_allocator=locate_abstract_allocator=NULL;

    /* We don't free 'parameters->tags' because it was just a link on 'parameters->fst2->tags' */
    free_alphabet(p->alphabet);
    if (p->korean!=NULL) {
        delete p->korean;
    }
    if (p->jamo_tags!=NULL) {
        /* jamo tags must be freed before tokens, because we need to know how
         * many jamo tags there are, and this number is the number of tokens */
        for (int i=0; i<p->tokens->size; i++) {
            free(p->jamo_tags[i]);
        }
        free(p->jamo_tags);
    }
    free_string_hash(p->tokens);
    free_lemma_node(root);
    free(p->token_control);
    for (int i=0; i<n_text_tokens; i++) {
        free_bit_array(p->matching_patterns[i]);
    }
    free(p->matching_patterns);
#ifdef TRE_WCHAR
    free_FilterSet(p->filters);
    free_FilterMatchIndex(p->filter_match_index);
#endif
    for (int i=0; i<p->n_morpho_dics; i++) {
        free_abstract_INF(p->morpho_dic_inf[i],&(p->morpho_dic_inf_free[i]));
        free_abstract_BIN(p->morpho_dic_bin[i],&(p->morpho_dic_bin_free[i]));
    }
    free(p->morpho_dic_inf);
    free(p->morpho_dic_inf_free);
    free(p->morpho_dic_bin);
    free(p->morpho_dic_bin_free);
#if (defined(UNITEX_LIBRARY) || defined(UNITEX_RELEASE_MEMORY_AT_EXIT))
    free_DLC_tree(p->DLC_tree);
#endif
    free_locate_parameters(p);
    u_printf("Done.\n");
    return 1;
}
Example #30
0
/**
 * This function minimizes the given automaton in place: on return,
 * 'automaton' holds the minimized graph. Note that it must be
 * deterministic; a fatal error is raised if it has more than one initial
 * state, or if it has states but no initial one. An automaton with no
 * state and no initial state is left untouched. For more information,
 * see comments in this library's .h file.
 *
 * @param automaton the deterministic automaton to minimize
 * @param level     if >0, transitions that are included in the default
 *                  ones are removed (compact_default_transitions) before
 *                  the minimization itself
 */
void elag_minimize(SingleGraph automaton,int level) {
struct list_int* initials=get_initial_states(automaton);
if (initials==NULL) {
   /* No initial state should mean 'empty automaton' */
   if (automaton->number_of_states!=0) {
      /* If not, we fail */
      fatal_error("No initial state in non empty automaton in elag_minimize\n");
   }
   return;
}
if (initials->next!=NULL) {
   fatal_error("Non-deterministic automaton in elag_minimize\n");
}
/* The list was only needed to check the number of initial states */
free_list_int(initials);
if (level>0) {
   /* If necessary, we remove transitions that are included in the
    * default ones */
   compact_default_transitions(automaton);
}
SymbolAlphabet* alph=build_symbol_alphabet(automaton);
TransitionCollection** transitions=build_transition_collections(automaton,alph);
/* Now that we have numbered transitions, we don't need the symbol
 * alphabet anymore */
free_SymbolAlphabet(alph);
int nbColors;
int nbShades;
/* color[s] is the class of state #s at the previous iteration, and
 * shade[s] its class at the current one */
int* color=(int*)calloc(automaton->number_of_states,sizeof(int));
if (color==NULL) {
   fatal_alloc_error("elag_minimize");
}
int* shade=init_colors(automaton,&nbShades);
do {
   int s;
   /* We copy the shades into the color array */
   for (s=0;s<automaton->number_of_states;s++) {
      color[s]=shade[s];
   }
   nbColors=nbShades;
   nbShades=0;
   /* We update the colors of the transitions' destination states */
   update_colors(transitions,color,automaton->number_of_states);
   /* Now, for each state #s, we look for its shade, comparing it with
    * all the states #i so that i<s */
   for (s=0;s<automaton->number_of_states;s++) {
      shade[s]=get_shade(s,transitions,color,shade,&nbShades);
   }
   /* We stop when no more shades have been introduced */
} while (nbColors!=nbShades);
/* chosen[c] is used below as the number of the old state that stands for
 * color #c (presumably one representative per color; see choose_states) */
int* chosen=choose_states(color,nbColors,automaton->number_of_states);
for (int i=0;i<automaton->number_of_states;i++) {
   free_TransitionCollection(transitions[i]);
}
free(transitions);
free(shade);
/* We allocate the resulting automaton */
SingleGraph result=new_SingleGraph(nbColors,PTR_TAGS);
for (int c=0;c<nbColors;c++) {
   SingleGraphState state=add_state(result);
   SingleGraphState original=automaton->states[chosen[c]];
   /* We set the initiality and finality of the state */
   state->control=original->control;
   /* The transition list is moved, not copied: we detach it from the old
    * state so that it now only belongs to the new one */
   state->outgoing_transitions=original->outgoing_transitions;
   original->outgoing_transitions=NULL;
   /* We renumber the transitions' destination states */
   for (Transition* t1=state->outgoing_transitions;t1!=NULL;t1=t1->next) {
      t1->state_number=color[t1->state_number];
   }
   /* The default transition's destination is a state number of the old
    * automaton as well, so it has to be renumbered like the explicit
    * transitions; copying it verbatim would leave a stale number in the
    * new automaton. A value outside the old state range (presumably a
    * 'no default transition' marker such as -1 -- TODO confirm against
    * the SingleGraphState definition) is copied unchanged. */
   int old_default=original->default_state;
   if (old_default>=0 && old_default<automaton->number_of_states) {
      state->default_state=color[old_default];
   } else {
      state->default_state=old_default;
   }
}
/* Now we have to replace the old automaton by the new one */
move_SingleGraph(automaton,&result,free_symbol);
/* And we don't need these arrays anymore */
free(color);
free(chosen);
}