Example #1
/**
 * For each token of the text, we compute its associated control byte.
 * We use the unknown word file 'err' in order to determine if a token
 * must be matched by <!DIC>
 */
void compute_token_controls(Alphabet* alph,const char* err,struct locate_parameters* p) {
    struct string_hash* ERR=load_key_list(err,p->mask_encoding_compatibility_input);
    int n=p->tokens->size;
    for (int i=0; i<n; i++) {
        p->token_control[i]=get_control_byte(p->tokens->value[i],alph,ERR,p->tokenization_policy);
    }
    free_string_hash(ERR);
}
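For context, a minimal sketch of the load/use/free lifecycle this function relies on, assuming load_key_list and free_string_hash behave as above; the file name is a hypothetical placeholder:

/* Hedged sketch: load a word list into a string_hash, consult it, free it.
 * "unknown_words.txt" is a placeholder, not a file from the original code. */
void example_err_list_lifecycle(void) {
   int mask=DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
   struct string_hash* ERR=load_key_list("unknown_words.txt",mask);
   if (ERR==NULL) return;
   /* ... consult the unknown-word list here ... */
   free_string_hash(ERR);
}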
Example #2
/**
 * Frees all the memory associated to the given token tree structure.
 */
void free_fst2txt_token_tree(struct fst2txt_token_tree* t, Abstract_allocator prv_alloc) {
if (t==NULL) return;
free_string_hash(t->hash);
for (int i=0;i<t->size;i++) {
   free_Transition_list(t->transition_array[i], prv_alloc);
}
free_cb(t->transition_array,prv_alloc);
free_cb(t,prv_alloc);
}
Example #3
/**
 * Frees all the memory associated to the given DELA tree.
 */
void free_DELA_tree(struct DELA_tree* tree) {
if (tree==NULL) return;
free_string_hash(tree->inflected_forms);
for (int i=0;i<tree->size;i++) {
   free_dela_entry_list(tree->dela_entries[i]);
}
free(tree->dela_entries);
free(tree);
}
Example #4
/**
 * Saves the labels of the given .fst2, closes the file
 * and frees the associated memory.
 */
void fst_file_close_out(Elag_fst_file_out* fstout) {
write_fst_tags(fstout);
fseek(fstout->f,fstout->fstart,SEEK_SET);
/* We write the number of automata as a fixed 10-digit field */
u_fprintf(fstout->f,"%010d",fstout->nb_automata);
u_fclose(fstout->f);
free_string_hash(fstout->labels);
if (fstout->name!=NULL) free(fstout->name);
free(fstout);
}
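The %010d write works because a fixed-width field was presumably reserved at 'fstart' when the file was created, so patching it in place never shifts the rest of the file. A self-contained sketch of the same technique with plain stdio (not the Elag API):

#include <stdio.h>

/* Seek back to the reserved header field and overwrite it in place;
 * the count is padded to exactly 10 digits, like the placeholder. */
static void patch_automata_count(FILE* f,long fstart,int nb_automata) {
    fseek(f,fstart,SEEK_SET);
    fprintf(f,"%010d",nb_automata);
}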
Example #5
/**
 * This function frees the given string_hash_ptr, using 'free_' (if not NULL)
 * to free the elements of the 'value' array.
 */
void free_string_hash_ptr(struct string_hash_ptr* s,void (*free_)(void*)) {
if (s==NULL) return;
/* If necessary, we free the 'value' array */
if (free_!=NULL) {
   for (int i=0;i<s->hash->size;i++) {
      free_(s->value[i]);
   }
}
free(s->value);
free_string_hash(s->hash);
free(s);
}
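Typical calls, mirroring Example #11 below: pass a destructor cast to void(*)(void*) when the table owns its values, or NULL when it does not.

/* Sketch: the values are KeyWord_list objects, freed with the cast
 * destructor exactly as in Example #11; with NULL, only the table
 * itself and its 'value' array are freed. */
void release_keywords(struct string_hash_ptr* keywords) {
   free_string_hash_ptr(keywords,(void(*)(void*))free_KeyWord_list);
}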
Example #6
/**
 * This function constructs and returns a token tree from a normalization grammar.
 * Tokens are represented by integers.
 */
struct normalization_tree* load_normalization_fst2(const VersatileEncodingConfig* vec,const char* grammar,
		const Alphabet* alph,struct text_tokens* tok) {
struct FST2_free_info fst2_free;
Fst2* fst2=load_abstract_fst2(vec,grammar,0,&fst2_free);
if (fst2==NULL) {
   return NULL;
}
struct string_hash* hash=new_string_hash(DONT_USE_VALUES);
/* We index the tokens in a string_hash to speed up lookups */
for (int i=0;i<tok->N;i++) {
   get_value_index(tok->token[i],hash);
}
struct normalization_tree* root=new_normalization_tree();
explore_normalization_fst2(fst2,fst2->initial_states[1],root,hash,U_EMPTY,alph,NULL);
free_abstract_Fst2(fst2,&fst2_free);
free_string_hash(hash);
return root;
}
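The "tokens are represented by integers" part works because get_value_index returns a stable index per distinct string, so the normalization tree can store ints instead of strings. A minimal sketch, assuming the two-argument get_value_index used above:

/* The same token string always maps to the same index in the hash */
int token_to_int(const unichar* token,struct string_hash* hash) {
   return get_value_index(token,hash);
}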
Example #7
int main(void)
{
	string_tree t;
	string_hash h;
	it_string_tree itt;
	it_string_hash ith;

	if (!new_string_tree(&t)) {
		fprintf(stderr, "Error allocating tree.\n");
		exit(1);
	}
	if (!new_string_hash(&h)) {
		fprintf(stderr, "Error allocating hash.\n");
		exit(1);
	}

	insert_string_tree(&t, "cat");
	insert_string_tree(&t, "dog");
	insert_string_tree(&t, "mouse");

	insert_string_hash(&h, "cat");
	insert_string_hash(&h, "dog");
	insert_string_hash(&h, "mouse");

	itt = get_string_tree(&t, "dog");
	if (!itt)
		printf("Dog not found.\n");
	else
		printf("%s barks woof!\n", itt->value);

	ith = get_string_hash(&h, "cat");
	if (!ith)
		printf("Cat not found.\n");
	else
		printf("%s say meeow!\n", ith->value);

	free_string_tree(&t);
	free_string_hash(&h);
	return 0;
}
Example #8
int main_PolyLex(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

int language=-1;
char alphabet[FILENAME_MAX]="";
char name_bin[FILENAME_MAX]="";
char output[FILENAME_MAX]="";
char info[FILENAME_MAX]="";
VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_PolyLex,lopts_PolyLex,&index))) {
   switch(val) {
   case 'D': language=DUTCH; break;
   case 'G': language=GERMAN; break;
   case 'N': language=NORWEGIAN; break;
   case 'R': language=RUSSIAN; break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(alphabet,options.vars()->optarg);
             break;
   case 'd': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty dictionary file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(name_bin,options.vars()->optarg);
             break;
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty output file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(output,options.vars()->optarg);
             break;
   case 'i': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty information file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(info,options.vars()->optarg);
             break;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_PolyLex[index].name);
             return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (name_bin[0]=='\0') {
   error("You must specify the .bin dictionary to use\n");
   return USAGE_ERROR_CODE;
}

if (output[0]=='\0') {
   error("You must specify the output dictionary file name\n");
   return USAGE_ERROR_CODE;
}

if (language==-1) {
   error("You must specify the language\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

Alphabet* alph=NULL;
if (alphabet[0]!='\0') {
   u_printf("Loading alphabet...\n");
   alph=load_alphabet(&vec,alphabet);
   if (alph==NULL) {
      error("Cannot load alphabet file %s\n",alphabet);
      return USAGE_ERROR_CODE;
   }
}

char name_inf[FILENAME_MAX];
struct string_hash* forbiddenWords=NULL;
if (language==DUTCH || language==NORWEGIAN) {
   get_path(name_bin,name_inf);
   strcat(name_inf,"ForbiddenWords.txt");
   forbiddenWords=load_key_list(&vec,name_inf);
   if (forbiddenWords==NULL) {
       /* If there was no file, we don't want to block the process */
       forbiddenWords=new_string_hash(DONT_USE_VALUES);
   }
}

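/* Derive the .inf file name from the .bin name by overwriting the
 * three-letter extension: "dico.bin" -> "dico.inf" */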
strcpy(name_inf,name_bin);
name_inf[strlen(name_bin)-3]='\0';
strcat(name_inf,"inf");
Dictionary* d=new_Dictionary(&vec,name_bin,name_inf);
if (d==NULL) {
    error("Cannot load dictionary %s\n",name_bin);
    free_string_hash(forbiddenWords);
    free_alphabet(alph);
    return DEFAULT_ERROR_CODE;
}

char tmp[FILENAME_MAX];
strcpy(tmp,argv[options.vars()->optind]);
strcat(tmp,".tmp");

U_FILE* words=u_fopen(&vec,argv[options.vars()->optind],U_READ);
if (words==NULL) {
   error("Cannot open word list file %s\n",argv[options.vars()->optind]);
   free_Dictionary(d);
   free_string_hash(forbiddenWords);
   free_alphabet(alph);
   // here we return successfully in order not to block the preprocessing
   // in the Unitex/GramLab IDE interface if no dictionary was applied,
   // so that there is no "err" file
   return SUCCESS_RETURN_CODE;
}

U_FILE* new_unknown_words=u_fopen(&vec,tmp,U_WRITE);
if (new_unknown_words==NULL) {
   error("Cannot open temporary word list file %s\n",tmp);
   u_fclose(words);
   free_Dictionary(d);
   free_string_hash(forbiddenWords);
   free_alphabet(alph);
   return DEFAULT_ERROR_CODE;
}

U_FILE* res=u_fopen(&vec,output,U_APPEND);
if (res==NULL) {
   error("Cannot open result file %s\n",output);
   u_fclose(new_unknown_words);
   u_fclose(words);
   free_Dictionary(d);
   free_string_hash(forbiddenWords);
   free_alphabet(alph);
   return DEFAULT_ERROR_CODE;
}

U_FILE* debug=NULL;
if ((*info)!='\0') {
   debug=u_fopen(&vec,info,U_WRITE);
   if (debug==NULL) {
      error("Cannot open debug file %s\n",info);
   }
}
struct utags UTAG;

switch(language) {
  case DUTCH:
    analyse_dutch_unknown_words(alph,
                                d,
                                words,
                                res,
                                debug,
                                new_unknown_words,
                                forbiddenWords);
    break;
  case GERMAN:
    analyse_german_compounds(alph,
                             d,
                             words,
                             res,
                             debug,
                             new_unknown_words);
    break;
  case NORWEGIAN:
    analyse_norwegian_unknown_words(alph,
                                    d,
                                    words,
                                    res,
                                    debug,
                                    new_unknown_words,
                                    forbiddenWords);
    break;
  case RUSSIAN:
     init_russian(&UTAG);
     analyse_compounds(alph,
                       d,
                       words,
                       res,
                       debug,
                       new_unknown_words,
                       UTAG);
     break;
}

free_alphabet(alph);
free_Dictionary(d);
u_fclose(words);
u_fclose(new_unknown_words);
free_string_hash(forbiddenWords);
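/* Swap in the filtered word list: the .tmp file written above replaces
 * the original unknown-word file */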
af_remove(argv[options.vars()->optind]);
af_rename(tmp,argv[options.vars()->optind]);
u_fclose(res);

if (debug!=NULL) {
   u_fclose(debug);
}

return SUCCESS_RETURN_CODE;
}
Example #9
int locate_pattern(const char* text_cod,const char* tokens,const char* fst2_name,const char* dlf,const char* dlc,const char* err,
                   const char* alphabet,MatchPolicy match_policy,OutputPolicy output_policy,
                   Encoding encoding_output,int bom_output,int mask_encoding_compatibility_input,
                   const char* dynamicDir,TokenizationPolicy tokenization_policy,
                   SpacePolicy space_policy,int search_limit,const char* morpho_dic_list,
                   AmbiguousOutputPolicy ambiguous_output_policy,
                   VariableErrorPolicy variable_error_policy,int protect_dic_chars,
                   int is_korean,int max_count_call,int max_count_call_warning,
                   char* arabic_rules,int tilde_negation_operator,int useLocateCache,int allow_trace) {

    U_FILE* out;
    U_FILE* info;
    struct locate_parameters* p=new_locate_parameters();
    p->text_cod=af_open_mapfile(text_cod,MAPFILE_OPTION_READ,0);
    p->buffer=(int*)af_get_mapfile_pointer(p->text_cod);
    long text_size=(long)af_get_mapfile_size(p->text_cod)/sizeof(int);
    p->buffer_size=(int)text_size;
    p->tilde_negation_operator=tilde_negation_operator;
    p->useLocateCache=useLocateCache;
    if (max_count_call == -1) {
        max_count_call = (int)text_size;
    }
    if (max_count_call_warning == -1) {
        max_count_call_warning = (int)text_size;
    }
    p->match_policy=match_policy;
    p->tokenization_policy=tokenization_policy;
    p->space_policy=space_policy;
    p->output_policy=output_policy;
    p->search_limit=search_limit;
    p->ambiguous_output_policy=ambiguous_output_policy;
    p->variable_error_policy=variable_error_policy;
    p->protect_dic_chars=protect_dic_chars;
    p->mask_encoding_compatibility_input = mask_encoding_compatibility_input;
    p->max_count_call = max_count_call;
    p->max_count_call_warning = max_count_call_warning;
    p->token_filename = tokens;
    char concord[FILENAME_MAX];
    char concord_info[FILENAME_MAX];

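    /* Build the output file paths inside the dynamic (working) directory */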
    strcpy(concord,dynamicDir);
    strcat(concord,"concord.ind");

    strcpy(concord_info,dynamicDir);
    strcat(concord_info,"concord.n");

    char morpho_bin[FILENAME_MAX];
    strcpy(morpho_bin,dynamicDir);
    strcat(morpho_bin,"morpho.bin");
    if (arabic_rules!=NULL && arabic_rules[0]!='\0') {
        load_arabic_typo_rules(arabic_rules,&(p->arabic));
    }
    out=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord,U_WRITE);
    if (out==NULL) {
        error("Cannot write %s\n",concord);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        return 0;
    }
    info=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord_info,U_WRITE);
    if (info==NULL) {
        error("Cannot write %s\n",concord_info);
    }
    switch(output_policy) {
    case IGNORE_OUTPUTS:
        u_fprintf(out,"#I\n");
        break;
    case MERGE_OUTPUTS:
        u_fprintf(out,"#M\n");
        break;
    case REPLACE_OUTPUTS:
        u_fprintf(out,"#R\n");
        break;
    }
    if (alphabet!=NULL && alphabet[0]!='\0') {
        u_printf("Loading alphabet...\n");
        p->alphabet=load_alphabet(alphabet,is_korean);
        if (p->alphabet==NULL) {
            error("Cannot load alphabet file %s\n",alphabet);
            af_release_mapfile_pointer(p->text_cod,p->buffer);
            af_close_mapfile(p->text_cod);
            free_stack_unichar(p->stack);
            free_locate_parameters(p);
            if (info!=NULL) u_fclose(info);
            u_fclose(out);
            return 0;
        }
    }
    struct string_hash* semantic_codes=new_string_hash();
    extract_semantic_codes(dlf,semantic_codes);
    extract_semantic_codes(dlc,semantic_codes);

    if (is_cancelling_requested() != 0) {
        error("user cancel request.\n");
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }

    u_printf("Loading fst2...\n");
    struct FST2_free_info fst2load_free;
    Fst2* fst2load=load_abstract_fst2(fst2_name,1,&fst2load_free);
    if (fst2load==NULL) {
        error("Cannot load grammar %s\n",fst2_name);
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }

    Abstract_allocator locate_abstract_allocator=create_abstract_allocator("locate_pattern",AllocatorCreationFlagAutoFreePrefered);


    p->fst2=new_Fst2_clone(fst2load,locate_abstract_allocator);
    free_abstract_Fst2(fst2load,&fst2load_free);

    if (is_cancelling_requested() != 0) {
        error("User cancel request..\n");
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        free_Fst2(p->fst2,locate_abstract_allocator);
        close_abstract_allocator(locate_abstract_allocator);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }

    p->tags=p->fst2->tags;
#ifdef TRE_WCHAR
    p->filters=new_FilterSet(p->fst2,p->alphabet);
    if (p->filters==NULL) {
        error("Cannot compile filter(s)\n");
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        free_Fst2(p->fst2,locate_abstract_allocator);
        close_abstract_allocator(locate_abstract_allocator);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }
#endif
    u_printf("Loading token list...\n");
    int n_text_tokens=0;

    p->tokens=load_text_tokens_hash(tokens,mask_encoding_compatibility_input,&(p->SENTENCE),&(p->STOP),&n_text_tokens);
    if (p->tokens==NULL) {
        error("Cannot load token list %s\n",tokens);
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        free_Fst2(p->fst2,locate_abstract_allocator);
        close_abstract_allocator(locate_abstract_allocator);
        free_locate_parameters(p);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }
    Abstract_allocator locate_work_abstract_allocator = locate_abstract_allocator;

    p->match_cache=(LocateCache*)malloc_cb(p->tokens->size * sizeof(LocateCache),locate_work_abstract_allocator);
    if (p->match_cache==NULL) {
        fatal_alloc_error("locate_pattern");
    }
    memset(p->match_cache,0,p->tokens->size * sizeof(LocateCache));

#ifdef TRE_WCHAR
    p->filter_match_index=new_FilterMatchIndex(p->filters,p->tokens);
    if (p->filter_match_index==NULL) {
        error("Cannot optimize filter(s)\n");
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        free_string_hash(p->tokens);
        close_abstract_allocator(locate_abstract_allocator);
        free_locate_parameters(p);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }
#endif

    if (allow_trace!=0) {
        open_locate_trace(p,&p->fnc_locate_trace_step,&p->private_param_locate_trace);
    }
    extract_semantic_codes_from_tokens(p->tokens,semantic_codes,locate_abstract_allocator);
    u_printf("Loading morphological dictionaries...\n");
    load_morphological_dictionaries(morpho_dic_list,p,morpho_bin);
    extract_semantic_codes_from_morpho_dics(p->morpho_dic_inf,p->n_morpho_dics,semantic_codes,locate_abstract_allocator);
    p->token_control=(unsigned char*)malloc(n_text_tokens*sizeof(unsigned char));
    if (p->token_control==NULL) {
        fatal_alloc_error("locate_pattern");
    }
    p->matching_patterns=(struct bit_array**)malloc(n_text_tokens*sizeof(struct bit_array*));
    if (p->matching_patterns==NULL) {
        fatal_alloc_error("locate_pattern");
    }
    for (int i=0; i<n_text_tokens; i++) {
        p->token_control[i]=0;
        p->matching_patterns[i]=NULL;
    }
    compute_token_controls(p->alphabet,err,p);
    int number_of_patterns,is_DIC,is_CDIC,is_SDIC;
    p->pattern_tree_root=new_pattern_node(locate_abstract_allocator);
    u_printf("Computing fst2 tags...\n");
    process_tags(&number_of_patterns,semantic_codes,&is_DIC,&is_CDIC,&is_SDIC,p,locate_abstract_allocator);
    p->current_compound_pattern=number_of_patterns;
    p->DLC_tree=new_DLC_tree(p->tokens->size);
    struct lemma_node* root=new_lemma_node();
    u_printf("Loading dlf...\n");
    load_dic_for_locate(dlf,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p);
    u_printf("Loading dlc...\n");
    load_dic_for_locate(dlc,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p);
    /* We check whether tag tokens like "{today,.ADV}" match some patterns */
    check_patterns_for_tag_tokens(p->alphabet,number_of_patterns,root,p,locate_abstract_allocator);
    u_printf("Optimizing fst2 pattern tags...\n");
    optimize_pattern_tags(p->alphabet,root,p,locate_abstract_allocator);
    u_printf("Optimizing compound word dictionary...\n");
    optimize_DLC(p->DLC_tree);
    free_string_hash(semantic_codes);
    int nb_input_variable=0;
    p->input_variables=new_Variables(p->fst2->input_variables,&nb_input_variable);
    p->output_variables=new_OutputVariables(p->fst2->output_variables,&p->nb_output_variables);


    Abstract_allocator locate_recycle_abstract_allocator=NULL;
    locate_recycle_abstract_allocator=create_abstract_allocator("locate_pattern_recycle",
                                      AllocatorFreeOnlyAtAllocatorDelete|AllocatorTipOftenRecycledObject,
                                      get_prefered_allocator_item_size_for_nb_variable(nb_input_variable));

    u_printf("Optimizing fst2...\n");
    p->optimized_states=build_optimized_fst2_states(p->input_variables,p->output_variables,p->fst2,locate_abstract_allocator);
    if (is_korean) {
        p->korean=new Korean(p->alphabet);
        p->jamo_tags=create_jamo_tags(p->korean,p->tokens);
    }
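    /* One bit per token; presumably a fail cache marking tokens on which
     * matching has already failed, so they can be skipped quickly */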
    p->failfast=new_bit_array(n_text_tokens,ONE_BIT);

    u_printf("Working...\n");
    p->prv_alloc=locate_work_abstract_allocator;
    p->prv_alloc_recycle=locate_recycle_abstract_allocator;
    launch_locate(out,text_size,info,p);
    if (allow_trace!=0) {
        close_locate_trace(p,p->fnc_locate_trace_step,p->private_param_locate_trace);
    }
    free_bit_array(p->failfast);
    free_Variables(p->input_variables);
    free_OutputVariables(p->output_variables);
    af_release_mapfile_pointer(p->text_cod,p->buffer);
    af_close_mapfile(p->text_cod);
    if (info!=NULL) u_fclose(info);
    u_fclose(out);

    if (p->match_cache!=NULL) {
        for (int i=0; i<p->tokens->size; i++) {
            free_LocateCache(p->match_cache[i],locate_work_abstract_allocator);
        }
        free_cb(p->match_cache,locate_work_abstract_allocator);
    }
    int free_abstract_allocator_item=(get_allocator_cb_flag(locate_abstract_allocator) & AllocatorGetFlagAutoFreePresent) ? 0 : 1;

    if (free_abstract_allocator_item) {
        free_optimized_states(p->optimized_states,p->fst2->number_of_states,locate_abstract_allocator);
    }
    free_stack_unichar(p->stack);
    /* Freeing the DLC tree here would take too long when it is big:
     * free_DLC_tree(p->DLC_tree);
     */
    if (free_abstract_allocator_item) {
        free_pattern_node(p->pattern_tree_root,locate_abstract_allocator);
        free_Fst2(p->fst2,locate_abstract_allocator);
        free_list_int(p->tag_token_list,locate_abstract_allocator);
    }
    close_abstract_allocator(locate_abstract_allocator);
    close_abstract_allocator(locate_recycle_abstract_allocator);
    locate_recycle_abstract_allocator=locate_abstract_allocator=NULL;

    /* We don't free 'p->tags' because it was just an alias for 'p->fst2->tags' */
    free_alphabet(p->alphabet);
    if (p->korean!=NULL) {
        delete p->korean;
    }
    if (p->jamo_tags!=NULL) {
        /* jamo tags must be freed before tokens, because we need to know how
         * many jamo tags there are, and this number is the number of tokens */
        for (int i=0; i<p->tokens->size; i++) {
            free(p->jamo_tags[i]);
        }
        free(p->jamo_tags);
    }
    free_string_hash(p->tokens);
    free_lemma_node(root);
    free(p->token_control);
    for (int i=0; i<n_text_tokens; i++) {
        free_bit_array(p->matching_patterns[i]);
    }
    free(p->matching_patterns);
#ifdef TRE_WCHAR
    free_FilterSet(p->filters);
    free_FilterMatchIndex(p->filter_match_index);
#endif
    for (int i=0; i<p->n_morpho_dics; i++) {
        free_abstract_INF(p->morpho_dic_inf[i],&(p->morpho_dic_inf_free[i]));
        free_abstract_BIN(p->morpho_dic_bin[i],&(p->morpho_dic_bin_free[i]));
    }
    free(p->morpho_dic_inf);
    free(p->morpho_dic_inf_free);
    free(p->morpho_dic_bin);
    free(p->morpho_dic_bin_free);
#if (defined(UNITEX_LIBRARY) || defined(UNITEX_RELEASE_MEMORY_AT_EXIT))
    free_DLC_tree(p->DLC_tree);
#endif
    free_locate_parameters(p);
    u_printf("Done.\n");
    return 1;
}
Example #10
int main_CheckDic(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}

int is_a_DELAF=-1;
int strict_unprotected=0;
int skip_path=0;
char alph[FILENAME_MAX]="";
Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
int val,index=-1;
int space_warnings=1;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_CheckDic,lopts_CheckDic,&index,vars))) {
   switch(val) {
   case 'f': is_a_DELAF=1; break;
   case 's': is_a_DELAF=0; break;
   case 'h': usage(); return 0;
   case 'r': strict_unprotected=1; break;
   case 't': strict_unprotected=0; break;
   case 'n': space_warnings=0; break;
   case 'p': skip_path=1; break;
   case 'a': if (vars->optarg[0]=='\0') {
                fatal_error("Empty alphabet argument\n");
             }
             strcpy(alph,vars->optarg);
             break;
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_CheckDic[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   }
   index=-1;
}

if (is_a_DELAF==-1 || vars->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return 1;
}

U_FILE* dic=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ);
if (dic==NULL) {
	fatal_error("Cannot open dictionary %s\n",argv[vars->optind]);
}
Alphabet* alphabet0=NULL;
if (alph[0]!='\0') {
   alphabet0=load_alphabet(alph,1);
}
char output_filename[FILENAME_MAX];
get_path(argv[vars->optind],output_filename);
strcat(output_filename,"CHECK_DIC.TXT");
U_FILE* out=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,output_filename,U_WRITE);
if (out==NULL) {
	u_fclose(dic);
	fatal_error("Cannot create %s\n",output_filename);
}
u_printf("Checking %s...\n",argv[vars->optind]);
unichar line[CHECKDIC_LINE_SIZE];
int line_number=1;
/*
 * We declare and initialize an array in order to know which
 * letters are used in the dictionary.
 */
int i;
char* alphabet=(char*)malloc(sizeof(char)*MAX_NUMBER_OF_UNICODE_CHARS);
if (alphabet==NULL) {
	fatal_alloc_error("CheckDic's main");
}
memset(alphabet,0,sizeof(char)*MAX_NUMBER_OF_UNICODE_CHARS);
/*
 * We use two structures for the storage of the codes found in the
 * dictionary. Note that 'semantic_codes' is used to store both grammatical and
 * semantic codes.
 */
struct string_hash* semantic_codes=new_string_hash();
struct string_hash* inflectional_codes=new_string_hash();
struct string_hash* simple_lemmas=new_string_hash(DONT_USE_VALUES);
struct string_hash* compound_lemmas=new_string_hash(DONT_USE_VALUES);
int n_simple_entries=0;
int n_compound_entries=0;
/*
 * We read all the lines and check them.
 */
while (EOF!=u_fgets_limit2(line,DIC_LINE_SIZE,dic)) {
   if (line[0]=='\0') {
		/* If we have an empty line, we print a unicode error message
		 * into the output file */
		u_fprintf(out,"Line %d: empty line\n",line_number);
	}
	else if (line[0]=='/') {
		/* If a line starts with '/', it is a comment line, so
		 * we ignore it */
	}
	else {
		/* If we have a line to check, we check it according to the
		 * dictionary type */
		check_DELA_line(line,out,is_a_DELAF,line_number,alphabet,semantic_codes,
		                inflectional_codes,simple_lemmas,compound_lemmas,
		                &n_simple_entries,&n_compound_entries,alphabet0,strict_unprotected);
	}
	/* At regular intervals, we display a message on the standard
	 * output to show that the program is working */
	if (line_number%10000==0) {
		u_printf("%d lines read...\r",line_number);
	}
	line_number++;
}
u_printf("%d lines read\n",line_number-1);
u_fclose(dic);
/*
 * Once we have checked all the lines, we print some statistics
 * in the output file.
 */
u_fprintf(out,"-----------------------------------\n");
u_fprintf(out,"-------------  Stats  -------------\n");
u_fprintf(out,"-----------------------------------\n");
if (skip_path != 0) { 
    char filename_without_path[FILENAME_MAX];
    remove_path(argv[vars->optind],filename_without_path);
    u_fprintf(out,"File: %s\n",filename_without_path);
}
else {
    u_fprintf(out,"File: %s\n",argv[vars->optind]);
}
u_fprintf(out,"Type: %s\n",is_a_DELAF?"DELAF":"DELAS");
u_fprintf(out,"%d line%s read\n",line_number-1,(line_number-1>1)?"s":"");
u_fprintf(out,"%d simple entr%s ",n_simple_entries,(n_simple_entries>1)?"ies":"y");
u_fprintf(out,"for %d distinct lemma%s\n",simple_lemmas->size,(simple_lemmas->size>1)?"s":"");
u_fprintf(out,"%d compound entr%s ",n_compound_entries,(n_compound_entries>1)?"ies":"y");
u_fprintf(out,"for %d distinct lemma%s\n",compound_lemmas->size,(compound_lemmas->size>1)?"s":"");
/**
 * We print the list of the characters that are used, with
 * their unicode numbers shown in hexadecimal. This can be useful
 * to detect distinct characters that are graphically identical,
 * like 'A' (uppercase of Latin 'a' or uppercase of Greek alpha?).
 */
u_fprintf(out,"-----------------------------------\n");
u_fprintf(out,"----  All chars used in forms  ----\n");
u_fprintf(out,"-----------------------------------\n");
for (i=0;i<MAX_NUMBER_OF_UNICODE_CHARS;i++) {
	if (alphabet[i]) {
      u_fprintf(out,"%C (%04X)\n",i,i);
	}
}
/*
 * Then we print the list of all grammatical and semantic codes used in the
 * dictionary. If a code contains a non-ASCII character, a space or a tab,
 * we print a warning.
 */
u_fprintf(out,"-------------------------------------------------------------\n");
u_fprintf(out,"----  %3d grammatical/semantic code%s",semantic_codes->size,(semantic_codes->size>1)?"s used in dictionary  ----\n":" used in dictionary  -----\n");
u_fprintf(out,"-------------------------------------------------------------\n");
unichar comment[2000];
for (i=0;i<semantic_codes->size;i++) {
	/* We print the code, followed if necessary by a warning */
	u_fprintf(out,"%S",semantic_codes->value[i]);
	if (warning_on_code(semantic_codes->value[i],comment,space_warnings)) {
		u_fprintf(out," %S",comment);
	}
	u_fprintf(out,"\n");
}
/*
 * Finally, we print the list of inflectional codes,
 * with warnings in the case of non-ASCII letters, spaces
 * or tabs.
 */
u_fprintf(out,"-----------------------------------------------------\n");
u_fprintf(out,"----  %3d inflectional code%s",inflectional_codes->size,(inflectional_codes->size>1)?"s used in dictionary  ----\n":" used in dictionary  -----\n");
u_fprintf(out,"-----------------------------------------------------\n");


for (i=0;i<inflectional_codes->size;i++) {
	u_fprintf(out,"%S",inflectional_codes->value[i]);
	if (warning_on_code(inflectional_codes->value[i],comment,space_warnings)) {
		u_fprintf(out," %S",comment);
	}
	u_fprintf(out,"\n");
}
u_fclose(out);
free_OptVars(vars);
u_printf("Done.\n");
/* Note that the hash tables below are only freed in library builds,
 * since freeing them at process exit would only waste time */

free(alphabet);
if (alphabet0!=NULL) {
   free_alphabet(alphabet0);
}
#if (defined(UNITEX_LIBRARY) || defined(UNITEX_RELEASE_MEMORY_AT_EXIT))
/* cleanup for no leak on library */
free_string_hash(semantic_codes);
free_string_hash(inflectional_codes);
free_string_hash(simple_lemmas);
free_string_hash(compound_lemmas);
#endif
return 0;
}
Example #11
/**
 * The same as main, but with no call to setBufferMode.
 */
int main_KeyWords(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

VersatileEncodingConfig vec=VEC_DEFAULT;

char tokens[FILENAME_MAX];
char output[FILENAME_MAX]="";
char alph[FILENAME_MAX]="";
char cdic[FILENAME_MAX]="";
unichar* code=u_strdup("XXX");
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;

while (EOF!=(val=options.parse_long(argc,argv,optstring_KeyWords,lopts_KeyWords,&index))) {
   switch(val) {
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty output\n");
                free(code);
                return USAGE_ERROR_CODE;
             }
             strcpy(output,options.vars()->optarg);
             break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                free(code);
                return USAGE_ERROR_CODE;                
             }
             strcpy(alph,options.vars()->optarg);
             break;
   case 'f': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty forbidden code\n");
                free(code);
                return USAGE_ERROR_CODE;                
             }
             free(code);
             code=u_strdup(options.vars()->optarg);
             break;
   case 'c': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty file name\n");
                free(code);
                return USAGE_ERROR_CODE;                
             }
             strcpy(cdic,options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             free(code);
             return SUCCESS_RETURN_CODE;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                free(code);
                return USAGE_ERROR_CODE;                
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                free(code);
                return USAGE_ERROR_CODE;                
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_KeyWords[index].name);
             free(code);
             return USAGE_ERROR_CODE;
             break;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             free(code);
             return USAGE_ERROR_CODE;
             break;
   }
   index=-1;
}

if (options.vars()->optind==argc || options.vars()->optind==argc-1) {
   error("Invalid arguments: rerun with --help\n");
   free(code);
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  free(code);
  return SUCCESS_RETURN_CODE;
}

Alphabet* alphabet=NULL;
if (alph[0]!='\0') {
  alphabet=load_alphabet(&vec,alph);
  if (alphabet==NULL) {
    error("Cannot load alphabet file %s\n",alph);
    free(code);
    return DEFAULT_ERROR_CODE;    
  }
}

strcpy(tokens,argv[(options.vars()->optind++)]);
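/* If no output file was given, default to "keywords.txt" next to the tokens file */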
if (output[0]=='\0') {
	get_path(tokens,output);
	strcat(output,"keywords.txt");
}
struct string_hash_ptr* keywords=load_tokens_by_freq(tokens,&vec);
filter_non_letter_keywords(keywords,alphabet);
if (cdic[0]!='\0') {
	load_compound_words(cdic,&vec,keywords);
}

for (;options.vars()->optind!=argc;(options.vars()->optind)++) {
	filter_keywords_with_dic(keywords,argv[options.vars()->optind],&vec,alphabet);
}
merge_case_equivalent_unknown_words(keywords,alphabet);
struct string_hash* forbidden_lemmas=compute_forbidden_lemmas(keywords,code);
remove_keywords_with_forbidden_lemma(keywords,forbidden_lemmas);
free_string_hash(forbidden_lemmas);
vector_ptr* sorted=sort_keywords(keywords);
U_FILE* f_output=u_fopen(&vec,output,U_WRITE);

if (f_output==NULL) {
	error("Cannot write in file %s\n",output);
  free_vector_ptr(sorted,(void(*)(void*))free_KeyWord_list);
  free_string_hash_ptr(keywords,(void(*)(void*))free_KeyWord_list);
  free_alphabet(alphabet);
  free(code);
  return DEFAULT_ERROR_CODE;  
}

dump_keywords(sorted,f_output);

u_fclose(f_output);
free_vector_ptr(sorted,(void(*)(void*))free_KeyWord_list);
free_string_hash_ptr(keywords,(void(*)(void*))free_KeyWord_list);
free_alphabet(alphabet);
free(code);

return SUCCESS_RETURN_CODE;
}
Example #12
int main_PolyLex(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}


int language=-1;
char alphabet[FILENAME_MAX]="";
char dictionary[FILENAME_MAX]="";
char output[FILENAME_MAX]="";
char info[FILENAME_MAX]="";
Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
int val,index=-1;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_PolyLex,lopts_PolyLex,&index,vars))) {
   switch(val) {
   case 'D': language=DUTCH; break;
   case 'G': language=GERMAN; break;
   case 'N': language=NORWEGIAN; break;
   case 'R': language=RUSSIAN; break;
   case 'a': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty alphabet file name\n");
             }
             strcpy(alphabet,vars->optarg);
             break;
   case 'd': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty dictionary file name\n");
             }
             strcpy(dictionary,vars->optarg);
             break;
   case 'o': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty output file name\n");
             }
             strcpy(output,vars->optarg);
             break;
   case 'i': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty information file name\n");
             }
             strcpy(info,vars->optarg);
             break;
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   case 'h': usage(); return 0;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_PolyLex[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   }
   index=-1;
}

if (vars->optind!=argc-1) {
   fatal_error("Invalid arguments: rerun with --help\n");
}

if (dictionary[0]=='\0') {
   fatal_error("You must specify the .bin dictionary to use\n");
}
if (output[0]=='\0') {
   fatal_error("You must specify the output dictionary file name\n");
}
if (language==-1) {
   fatal_error("You must specify the language\n");
}

Alphabet* alph=NULL;
if (alphabet[0]!='\0') {
   u_printf("Loading alphabet...\n");
   alph=load_alphabet(alphabet);
   if (alph==NULL) {
      fatal_error("Cannot load alphabet file %s\n",alphabet);
   }
}
char temp[FILENAME_MAX];
struct string_hash* forbiddenWords=NULL;
if (language==DUTCH || language==NORWEGIAN) {
   get_path(dictionary,temp);
   strcat(temp,"ForbiddenWords.txt");
   forbiddenWords=load_key_list(temp,mask_encoding_compatibility_input);
}
u_printf("Loading BIN file...\n");
struct BIN_free_info bin_free;
const unsigned char* bin=load_abstract_BIN_file(dictionary,&bin_free);
if (bin==NULL) {
   error("Cannot load bin file %s\n",dictionary);
   free_alphabet(alph);
   free_string_hash(forbiddenWords);
   return 1;
}
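/* Derive the .inf name from the .bin name, as in Example #8 */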
strcpy(temp,dictionary);
temp[strlen(dictionary)-3]='\0';
strcat(temp,"inf");
u_printf("Loading INF file...\n");
struct INF_free_info inf_free;
const struct INF_codes* inf=load_abstract_INF_file(temp,&inf_free);
if (inf==NULL) {
   error("Cannot load inf file %s\n",temp);
   free_alphabet(alph);
   free_abstract_BIN(bin,&bin_free);
   free_string_hash(forbiddenWords);
   return 1;
}
char tmp[FILENAME_MAX];
strcpy(tmp,argv[vars->optind]);
strcat(tmp,".tmp");
U_FILE* words=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ);
if (words==NULL) {
   error("Cannot open word list file %s\n",argv[vars->optind]);
   free_alphabet(alph);
   free_abstract_BIN(bin,&bin_free);
   free_abstract_INF(inf,&inf_free);
   free_string_hash(forbiddenWords);
   // here we return 0 in order not to block the preprocessing
   // in the Unitex Java interface, if no dictionary was applied,
   // so that there is no "err" file
   return 0;
}
U_FILE* new_unknown_words=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,tmp,U_WRITE);
if (new_unknown_words==NULL) {
   error("Cannot open temporary word list file %s\n",tmp);
   free_alphabet(alph);
   free_abstract_BIN(bin,&bin_free);
   free_abstract_INF(inf,&inf_free);
   u_fclose(words);
   free_string_hash(forbiddenWords);
   return 1;
}

U_FILE* res=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,output,U_APPEND);
if (res==NULL) {
   error("Cannot open result file %s\n",output);
   free_alphabet(alph);
   free_abstract_BIN(bin,&bin_free);
   free_abstract_INF(inf,&inf_free);
   u_fclose(words);
   u_fclose(new_unknown_words);
   free_string_hash(forbiddenWords);
   return 1;
}
U_FILE* debug=NULL;
if (info[0]!='\0') {
   debug=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,info,U_WRITE);
   if (debug==NULL) {
      error("Cannot open debug file %s\n",info);
   }
}
struct utags UTAG;

switch(language) {
case DUTCH: analyse_dutch_unknown_words(alph,bin,inf,words,res,debug,new_unknown_words,forbiddenWords); break;
case GERMAN: analyse_german_compounds(alph,bin,inf,words,res,debug,new_unknown_words); break;
case NORWEGIAN: analyse_norwegian_unknown_words(alph,bin,inf,words,res,debug,new_unknown_words,forbiddenWords); break;
case RUSSIAN:
   init_russian(&UTAG);
   analyse_compounds(alph,bin,inf,words,res,debug,new_unknown_words,UTAG);
   break;
}

free_alphabet(alph);
free_abstract_BIN(bin,&bin_free);
free_abstract_INF(inf,&inf_free);
u_fclose(words);
u_fclose(new_unknown_words);
free_string_hash(forbiddenWords);
af_remove(argv[vars->optind]);
af_rename(tmp,argv[vars->optind]);
u_fclose(res);
if (debug!=NULL) {
   u_fclose(debug);
}
free_OptVars(vars);
return 0;
}
Example #13
/**
 * This function produces a normalized version of 'input' and stores it into 'output'.
 * The following rules are applied in the given order:
 *
 * 1) If there is a { at the current position, we try to read a {S}, a {STOP} or
 *    a tag token like {today,.ADV}. If we fail, we replace the { and the }, if any,
 *    according to the replacement rules. Otherwise, we leave the token unchanged.
 * 2) If there is one or more replacement rules that can apply to the current
 *    position in 'input', then we apply the longest one.
 * 3) If we find a separator (space, tab, new line) sequence, we replace it:
 *    - by a new line if the sequence contains one and if 'carriage_return_policy' is
 *      set to KEEP_CARRIAGE_RETURN;
 *    - by a space otherwise.
 * 4) We copy the character that was read to the output.
 *
 * Note that 'replacements' is supposed to contain replacement rules for { and }
 */
int normalize(const char *fin, const char *fout, 
              Encoding encoding_output, int bom_output, int mask_encoding_compatibility_input,
              int carriage_return_policy, const char *rules) {
	U_FILE* input;
	input = u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,fin,U_READ);
	if (input == NULL) {
		error("Cannot open file %s\n", fin);
		return 1;
	}

	U_FILE* output;
	output = u_fopen_creating_versatile_encoding(encoding_output,bom_output,fout,U_WRITE);
	if (output == NULL) {
		error("Cannot create file %s\n", fout);
		u_fclose(input);
		return 1;
	}

	struct string_hash* replacements=NULL;
	if(rules != NULL && rules[0]!='\0') {
		replacements=load_key_value_list(rules,mask_encoding_compatibility_input,'\t');
		if (replacements==NULL) {
		   error("Cannot load replacement rules file %s\n", rules);
		   replacements=new_string_hash();
		}
	}
	/* If there is no replacement rules file, we simulate one */
	else {
	   replacements=new_string_hash();
	}

	/* Whether the rules came from a file or not, we ensure that there are
	 * replacement rules for { and }. If not, we add our default ones, so that
	 * in any case we are sure to have rules for { and } */
	unichar key[2];
	unichar value[2];
	u_strcpy(key,"{");
	u_strcpy(value,"[");
	get_value_index(key,replacements,INSERT_IF_NEEDED,value);
	u_strcpy(key,"}");
	u_strcpy(value,"]");
	get_value_index(key,replacements,INSERT_IF_NEEDED,value);

    struct OUTBUF OutBuf;
    OutBuf.pos=0;
	unichar tmp[MAX_TAG_LENGTH];
	//struct buffer* buffer=new_buffer_for_file(UNICHAR_BUFFER,input);

    long save_pos=ftell(input);
    fseek(input,0,SEEK_END);
    long file_size_input=ftell(input);
    fseek(input,save_pos,SEEK_SET);

    int line_buffer_size = (int)(((file_size_input+1) < MAX_LINE_BUFFER_SIZE) ? (file_size_input+1) : MAX_LINE_BUFFER_SIZE);

    unichar *line_read;
    line_read=(unichar*)malloc((line_buffer_size+0x10)*sizeof(unichar));
    if (line_read==NULL) {
        fatal_alloc_error("normalize");
    }

	/* We define some things that will be used for parsing the buffer */


    static const unichar stop_chars[]= { '{', '}', 0 };
    static const unichar forbidden_chars[]= { '\n', 0 };
    static const unichar open_bracket[]= { '{', 0 };
    static const unichar close_bracket[]= { '}', 0 };
    static const unichar empty_string[]= { 0 };

   int corrupted_file=0;
   int eof_found=0;
   /* First, we fill the buffer */
	
    int lastline_was_terminated=0;

    while (eof_found==0) {
        int current_start_pos=0;
        int found_null=0;
        const unichar*buff=line_read;
        int result_read = 0;

        result_read = u_fgets_treat_cr_as_lf(line_read,line_buffer_size,input,1,&found_null);
        if ((found_null != 0) && (corrupted_file==0)) {
          corrupted_file=1;
          error("Corrupted text file containing NULL characters!\n");
          error("They have been ignored by Normalize, but you should clean your text\n");
        }

        if (result_read>0)
            if (line_read[result_read-1]==0x0d)
                line_read[result_read-1]='\n';
        
        if (result_read==EOF)
            break;

        if (lastline_was_terminated != 0)
            while (current_start_pos<result_read) {
                if (buff[current_start_pos]!=' ' && buff[current_start_pos]!='\t'
							    && buff[current_start_pos]!=0x0d
                                && buff[current_start_pos]!='\n')
                                break;
                current_start_pos++;
            }

        lastline_was_terminated = 0;
        if (result_read > 0)
            if ((buff[result_read-1]=='\n') || (buff[result_read-1]==0x0d))
                lastline_was_terminated = 1;


        while (current_start_pos<result_read) {
            if ((lastline_was_terminated == 0) && (eof_found == 0) && 
                (current_start_pos + MINIMAL_CHAR_IN_BUFFER_BEFORE_CONTINUE_LINE >= result_read))
            {
                int i;
                int nb_to_keep = result_read-current_start_pos;
                for (i=0;i<nb_to_keep;i++)
                    line_read[i]=line_read[current_start_pos+i];
                int found_null_read=0;
                int result_read_continue = u_fgets_treat_cr_as_lf(line_read+nb_to_keep,line_buffer_size-nb_to_keep,input,1,&found_null_read);

                if ((found_null_read != 0) && (corrupted_file==0)) {
                    corrupted_file=1;
                    error("Corrupted text file containing NULL characters!\n");
                    error("They have been ignored by Normalize, but you should clean your text\n");
                }

                if (result_read_continue>0)
                    if (line_read[(result_read_continue+nb_to_keep)-1]==0x0d)
                        line_read[(result_read_continue+nb_to_keep)-1]='\n';
                lastline_was_terminated = 0;
                if (result_read_continue==EOF)
                    eof_found = lastline_was_terminated = 1;

                if (result_read_continue > 0)
                    if ((buff[(result_read_continue+nb_to_keep)-1]=='\n') || (buff[(result_read_continue+nb_to_keep)-1]==0x0d))
                        lastline_was_terminated = 1;

                result_read = nb_to_keep;
                current_start_pos = 0;

                if (result_read_continue > 0)
                    result_read += result_read_continue;
            }

		if (buff[current_start_pos]=='{') {
			/* If we have a {, we try to find a sequence like {....} that does not
			 * contain new lines. If the sequence contains protected characters, we
			 * want to keep them protected. */
			int old_position=current_start_pos;
			/* If we don't increase the position, the parse will stop on the initial { */
			current_start_pos++;
			tmp[0]='{';
			int code=parse_string(buff,&current_start_pos,&(tmp[1]),stop_chars,forbidden_chars,NULL);
			if (code==P_FORBIDDEN_CHAR || code==P_BACKSLASH_AT_END || buff[current_start_pos]!='}') {
				/* If we have found a new line or a {, or if there is
				 * a backslash at the end of the buffer, or if we have reached the end
				 * of the buffer, we assume that the initial
				 * { was not a tag beginning, so we print the substitute of { */
				WriteOufBuf(&OutBuf,replacements->value[get_value_index(open_bracket,replacements)],output, 0);
				/* And we rewind the current position after the { */
				current_start_pos=old_position+1;
			}
			else {
				/* If we have read a sequence like {....}, we assume that there won't be
				 * a buffer overflow if we add the } */
				u_strcat(tmp,close_bracket);
				if (!u_strcmp(tmp,"{S}") || !u_strcmp(tmp,"{STOP}") || check_tag_token(tmp)) {
					/* If this is a special tag or a valid tag token, we just print
					 * it to the output */
					WriteOufBuf(&OutBuf,tmp,output, 0);
					current_start_pos++;
				}
				else {
				/* If we have an invalid tag token, we print the equivalent of {
					 * and we rewind the current position after the { */
					WriteOufBuf(&OutBuf,replacements->value[get_value_index(open_bracket,replacements)],output, 0);
					current_start_pos=old_position+1;
				}
			}
		}
		else {
			/* If we have a character that is not {, first we try to look if there
			 * is a replacement to do */
			int key_length;
			int index=get_longest_key_index(&buff[current_start_pos],&key_length,replacements);
			if (index!=NO_VALUE_INDEX) {
				/* If there is something to replace */
				WriteOufBuf(&OutBuf,replacements->value[index],output, 0);
				current_start_pos=current_start_pos+key_length;
			}
			else {
				if (buff[current_start_pos]==' ' || buff[current_start_pos]=='\t' || buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) {
					/* If we have a separator, we try to read the longest separator sequence
					 * that we can. Along the way, we note whether it contains a new line */
					int new_line=0;
					while (buff[current_start_pos]==' ' || buff[current_start_pos]=='\t'
							|| buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) {
						/* Note 1: no bound check is needed, since a unichar buffer is always
						 *        ended by a \0
						 *
						 * Note 2: we don't take into account the case of a buffer ended by
						 *         separator while it's not the end of file: that would mean
						 *         that the text contains something like MARGIN_BEFORE_BUFFER_END
						 *         contiguous separators. Such a text would not be a reasonable one.
						 */
						if (buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) {
							new_line=1;
						}
						current_start_pos++;
					}
					if (new_line && (carriage_return_policy==KEEP_CARRIAGE_RETURN)) {
						/* We print a new line if the sequence contains one and if we are
						 * allowed to; otherwise, we print a space. */
						WriteOufBuf(&OutBuf,'\n',output, 0);
					}
					else {
						WriteOufBuf(&OutBuf,' ',output, 0);
					}
				}
				else {
					/* If, finally, we have a normal character to normalize, we just print it */
                    WriteOufBuf(&OutBuf,buff[current_start_pos++],output, 0);
				}
			}
		}
	    }
    }


    WriteOufBuf(&OutBuf,empty_string,output, 1);

	free(line_read);
	free_string_hash(replacements);

	u_fclose(input);
	u_fclose(output);
	return 0;
}