/**
 * For each token of the text, we compute its associated control byte.
 * We use the unknown word file 'err' in order to determine if a token
 * must be matched by <!DIC>
 */
void compute_token_controls(Alphabet* alph,const char* err,struct locate_parameters* p) {
    struct string_hash* ERR=load_key_list(err,p->mask_encoding_compatibility_input);
    int n=p->tokens->size;
    for (int i=0; i<n; i++) {
        p->token_control[i]=get_control_byte(p->tokens->value[i],alph,ERR,p->tokenization_policy);
    }
    free_string_hash(ERR);
}
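/*
 * Usage sketch (not from the original sources): after compute_token_controls()
 * has run, each entry of p->token_control can be tested with bit masks to
 * decide how the token may be matched. The mask name DIC_TOKEN_BIT_MASK below
 * is an assumption for illustration; the actual constants live in the Locate
 * headers.
 */
static int is_dictionary_token(const struct locate_parameters* p,int token) {
    /* assumed mask: set when the token can be matched by <DIC>, cleared for
     * tokens listed in 'err', which must then be matched by <!DIC> */
    return (p->token_control[token] & DIC_TOKEN_BIT_MASK)!=0;
}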
/**
 * Frees all the memory associated to the given token tree structure.
 */
void free_fst2txt_token_tree(struct fst2txt_token_tree* t, Abstract_allocator prv_alloc) {
    if (t==NULL) return;
    free_string_hash(t->hash);
    for (int i=0;i<t->size;i++) {
        free_Transition_list(t->transition_array[i], prv_alloc);
    }
    free_cb(t->transition_array,prv_alloc);
    free_cb(t,prv_alloc);
}
/**
 * Frees all the memory associated to the given DELA tree.
 */
void free_DELA_tree(struct DELA_tree* tree) {
    if (tree==NULL) return;
    free_string_hash(tree->inflected_forms);
    for (int i=0;i<tree->size;i++) {
        free_dela_entry_list(tree->dela_entries[i]);
    }
    free(tree->dela_entries);
    free(tree);
}
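/*
 * Structure note (sketch, not from the original sources): a DELA_tree pairs
 * the 'inflected_forms' string_hash with the parallel 'dela_entries' array,
 * so looking up the entries of an inflected form is a hash probe followed by
 * an array access. DONT_INSERT and NO_VALUE_INDEX are assumed to be the
 * query mode and miss value used by get_value_index().
 */
struct dela_entry_list* get_dela_entries(struct DELA_tree* tree,const unichar* inflected) {
    int index=get_value_index(inflected,tree->inflected_forms,DONT_INSERT);
    if (index==NO_VALUE_INDEX) {
        /* unknown inflected form */
        return NULL;
    }
    return tree->dela_entries[index];
}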
/**
 * Saves the labels of the given .fst2, closes the file
 * and frees the associated memory.
 */
void fst_file_close_out(Elag_fst_file_out* fstout) {
    write_fst_tags(fstout);
    fseek(fstout->f,fstout->fstart,SEEK_SET);
    /* We print the number of automata on 10 digits */
    u_fprintf(fstout->f,"%010d",fstout->nb_automata);
    u_fclose(fstout->f);
    free_string_hash(fstout->labels);
    if (fstout->name!=NULL) {
        free(fstout->name);
    }
    free(fstout);
}
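/*
 * The function above relies on a reserve-then-patch pattern: when the file was
 * opened, the automata count was written as a fixed-width 10-digit placeholder
 * at 'fstart', so fseek() can overwrite it in place at close time without
 * shifting the rest of the file. A minimal standalone sketch of the same idea
 * with plain stdio (illustrative only):
 */
#include <stdio.h>

void write_patched_count(const char* filename,int final_count) {
    FILE* f=fopen(filename,"wb");
    if (f==NULL) return;
    long count_offset=ftell(f);
    fprintf(f,"%010d\n",0);           /* reserve a fixed-width field */
    fprintf(f,"...file body...\n");   /* write the rest of the file */
    fseek(f,count_offset,SEEK_SET);   /* come back to the placeholder */
    fprintf(f,"%010d",final_count);   /* same width, so nothing shifts */
    fclose(f);
}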
/**
 * This function frees the given string_hash_ptr, using 'free_' (if not NULL)
 * to free the elements of the 'value' array.
 */
void free_string_hash_ptr(struct string_hash_ptr* s,void (*free_)(void*)) {
    if (s==NULL) return;
    /* If necessary, we free the 'value' array */
    if (free_!=NULL) {
        for (int i=0;i<s->hash->size;i++) {
            free_(s->value[i]);
        }
    }
    free(s->value);
    free_string_hash(s->hash);
    free(s);
}
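/*
 * Usage sketch (illustrative): the 'free_' callback lets the caller decide how
 * elements of 'value' are released. Elsewhere in this codebase the pattern is
 * a destructor cast to void(*)(void*):
 *
 *   free_string_hash_ptr(keywords,(void(*)(void*))free_KeyWord_list);
 *
 * For raw malloc'ed values, libc free() can be passed directly, and NULL means
 * the hash does not own its values:
 *
 *   free_string_hash_ptr(s,free);
 *   free_string_hash_ptr(s,NULL);
 */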
/**
 * This function constructs and returns a normalization tree from a
 * normalization grammar. Tokens are represented by integers.
 */
struct normalization_tree* load_normalization_fst2(const VersatileEncodingConfig* vec,const char* grammar,
                                                   const Alphabet* alph,struct text_tokens* tok) {
    struct FST2_free_info fst2_free;
    Fst2* fst2=load_abstract_fst2(vec,grammar,0,&fst2_free);
    if (fst2==NULL) {
        return NULL;
    }
    struct string_hash* hash=new_string_hash(DONT_USE_VALUES);
    /* We index all the text tokens in a hash to speed up lookups */
    for (int i=0;i<tok->N;i++) {
        get_value_index(tok->token[i],hash);
    }
    struct normalization_tree* root=new_normalization_tree();
    explore_normalization_fst2(fst2,fst2->initial_states[1],root,hash,U_EMPTY,alph,NULL);
    free_abstract_Fst2(fst2,&fst2_free);
    free_string_hash(hash);
    return root;
}
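/*
 * Usage sketch (not from the original sources): a typical call site loads the
 * normalization grammar once, reuses the resulting tree, and then frees it.
 * The grammar name "Norm.fst2" is illustrative, and free_normalization_tree()
 * is assumed to be the name of the matching destructor.
 */
void normalization_demo(const VersatileEncodingConfig* vec,const Alphabet* alph,struct text_tokens* tok) {
    struct normalization_tree* norm=load_normalization_fst2(vec,"Norm.fst2",alph,tok);
    if (norm==NULL) {
        error("Cannot load normalization grammar\n");
        return;
    }
    /* ... apply the normalization tree to the token stream ... */
    free_normalization_tree(norm);   /* assumed destructor name */
}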
#include <stdio.h>
#include <stdlib.h>
/* string_tree/string_hash declarations are assumed to come from the project's headers */

int main(void) {
    string_tree t;
    string_hash h;
    it_string_tree itt;
    it_string_hash ith;
    if (!new_string_tree(&t)) {
        fprintf(stderr, "Error allocating tree.\n");
        exit(1);
    }
    if (!new_string_hash(&h)) {
        fprintf(stderr, "Error allocating hash.\n");
        exit(1);
    }
    insert_string_tree(&t, "cat");
    insert_string_tree(&t, "dog");
    insert_string_tree(&t, "mouse");
    insert_string_hash(&h, "cat");
    insert_string_hash(&h, "dog");
    insert_string_hash(&h, "mouse");
    itt = get_string_tree(&t, "dog");
    if (!itt) printf("Dog not found.\n");
    else printf("%s barks woof!\n", itt->value);
    ith = get_string_hash(&h, "cat");
    if (!ith) printf("Cat not found.\n");
    else printf("%s says meow!\n", ith->value);
    free_string_tree(&t);
    free_string_hash(&h);
    return 0;
}
int main_PolyLex(int argc,char* const argv[]) { if (argc==1) { usage(); return SUCCESS_RETURN_CODE; } int language=-1; char alphabet[FILENAME_MAX]=""; char name_bin[FILENAME_MAX]=""; char output[FILENAME_MAX]=""; char info[FILENAME_MAX]=""; VersatileEncodingConfig vec=VEC_DEFAULT; int val,index=-1; bool only_verify_arguments = false; UnitexGetOpt options; while (EOF!=(val=options.parse_long(argc,argv,optstring_PolyLex,lopts_PolyLex,&index))) { switch(val) { case 'D': language=DUTCH; break; case 'G': language=GERMAN; break; case 'N': language=NORWEGIAN; break; case 'R': language=RUSSIAN; break; case 'a': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty alphabet file name\n"); return USAGE_ERROR_CODE; } strcpy(alphabet,options.vars()->optarg); break; case 'd': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty dictionary file name\n"); return USAGE_ERROR_CODE; } strcpy(name_bin,options.vars()->optarg); break; case 'o': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty output file name\n"); return USAGE_ERROR_CODE; } strcpy(output,options.vars()->optarg); break; case 'i': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty information file name\n"); return USAGE_ERROR_CODE; } strcpy(info,options.vars()->optarg); break; case 'k': if (options.vars()->optarg[0]=='\0') { error("Empty input_encoding argument\n"); return USAGE_ERROR_CODE; } decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg); break; case 'q': if (options.vars()->optarg[0]=='\0') { error("Empty output_encoding argument\n"); return USAGE_ERROR_CODE; } decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg); break; case 'V': only_verify_arguments = true; break; case 'h': usage(); return SUCCESS_RETURN_CODE; case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) : error("Missing argument for option --%s\n",lopts_PolyLex[index].name); return USAGE_ERROR_CODE; case '?': index==-1 ? 
error("Invalid option -%c\n",options.vars()->optopt) : error("Invalid option --%s\n",options.vars()->optarg); return USAGE_ERROR_CODE; } index=-1; } if (options.vars()->optind!=argc-1) { error("Invalid arguments: rerun with --help\n"); return USAGE_ERROR_CODE; } if (name_bin[0]=='\0') { error("You must specify the .bin dictionary to use\n"); return USAGE_ERROR_CODE; } if (output[0]=='\0') { error("You must specify the output dictionary file name\n"); return USAGE_ERROR_CODE; } if (language==-1) { error("You must specify the language\n"); return USAGE_ERROR_CODE; } if (only_verify_arguments) { // freeing all allocated memory return SUCCESS_RETURN_CODE; } Alphabet* alph=NULL; if (alphabet[0]!='\0') { u_printf("Loading alphabet...\n"); alph=load_alphabet(&vec,alphabet); if (alph==NULL) { error("Cannot load alphabet file %s\n",alphabet); return USAGE_ERROR_CODE; } } char name_inf[FILENAME_MAX]; struct string_hash* forbiddenWords=NULL; if (language==DUTCH || language==NORWEGIAN) { get_path(name_bin,name_inf); strcat(name_inf,"ForbiddenWords.txt"); forbiddenWords=load_key_list(&vec,name_inf); if (forbiddenWords==NULL) { /* If there was no file, we don't want to block the process */ forbiddenWords=new_string_hash(DONT_USE_VALUES); } } strcpy(name_inf,name_bin); name_inf[strlen(name_bin)-3]='\0'; strcat(name_inf,"inf"); Dictionary* d=new_Dictionary(&vec,name_bin,name_inf); if (d==NULL) { error("Cannot load dictionary %s\n",name_bin); free_string_hash(forbiddenWords); free_alphabet(alph); return DEFAULT_ERROR_CODE; } char tmp[FILENAME_MAX]; strcpy(tmp,argv[options.vars()->optind]); strcat(tmp,".tmp"); U_FILE* words=u_fopen(&vec,argv[options.vars()->optind],U_READ); if (words==NULL) { error("Cannot open word list file %s\n",argv[options.vars()->optind]); free_Dictionary(d); free_string_hash(forbiddenWords); free_alphabet(alph); // here we return 0 in order to do not block the preprocessing // in the Unitex/GramLab IDE interface, if no dictionary was applied // so that there is no "err" file return SUCCESS_RETURN_CODE; } U_FILE* new_unknown_words=u_fopen(&vec,tmp,U_WRITE); if (new_unknown_words==NULL) { error("Cannot open temporary word list file %s\n",tmp); u_fclose(words); free_Dictionary(d); free_string_hash(forbiddenWords); free_alphabet(alph); return DEFAULT_ERROR_CODE; } U_FILE* res=u_fopen(&vec,output,U_APPEND); if (res==NULL) { error("Cannot open result file %s\n",output); u_fclose(new_unknown_words); u_fclose(words); free_Dictionary(d); free_string_hash(forbiddenWords); free_alphabet(alph); u_fclose(words); return DEFAULT_ERROR_CODE; } U_FILE* debug=NULL; if ((*info)!='\0') { debug=u_fopen(&vec,info,U_WRITE); if (debug==NULL) { error("Cannot open debug file %s\n",info); } } struct utags UTAG; switch(language) { case DUTCH: analyse_dutch_unknown_words(alph, d, words, res, debug, new_unknown_words, forbiddenWords); break; case GERMAN: analyse_german_compounds(alph, d, words, res, debug, new_unknown_words); break; case NORWEGIAN: analyse_norwegian_unknown_words(alph, d, words, res, debug, new_unknown_words, forbiddenWords); break; case RUSSIAN: init_russian(&UTAG); analyse_compounds(alph, d, words, res, debug, new_unknown_words, UTAG); break; } free_alphabet(alph); free_Dictionary(d); u_fclose(words); u_fclose(new_unknown_words); free_string_hash(forbiddenWords); af_remove(argv[options.vars()->optind]); af_rename(tmp,argv[options.vars()->optind]); u_fclose(res); if (debug!=NULL) { u_fclose(debug); } return SUCCESS_RETURN_CODE; }
int locate_pattern(const char* text_cod,const char* tokens,const char* fst2_name,const char* dlf,const char* dlc,const char* err, const char* alphabet,MatchPolicy match_policy,OutputPolicy output_policy, Encoding encoding_output,int bom_output,int mask_encoding_compatibility_input, const char* dynamicDir,TokenizationPolicy tokenization_policy, SpacePolicy space_policy,int search_limit,const char* morpho_dic_list, AmbiguousOutputPolicy ambiguous_output_policy, VariableErrorPolicy variable_error_policy,int protect_dic_chars, int is_korean,int max_count_call,int max_count_call_warning, char* arabic_rules,int tilde_negation_operator,int useLocateCache,int allow_trace) { U_FILE* out; U_FILE* info; struct locate_parameters* p=new_locate_parameters(); p->text_cod=af_open_mapfile(text_cod,MAPFILE_OPTION_READ,0); p->buffer=(int*)af_get_mapfile_pointer(p->text_cod); long text_size=(long)af_get_mapfile_size(p->text_cod)/sizeof(int); p->buffer_size=(int)text_size; p->tilde_negation_operator=tilde_negation_operator; p->useLocateCache=useLocateCache; if (max_count_call == -1) { max_count_call = (int)text_size; } if (max_count_call_warning == -1) { max_count_call_warning = (int)text_size; } p->match_policy=match_policy; p->tokenization_policy=tokenization_policy; p->space_policy=space_policy; p->output_policy=output_policy; p->search_limit=search_limit; p->ambiguous_output_policy=ambiguous_output_policy; p->variable_error_policy=variable_error_policy; p->protect_dic_chars=protect_dic_chars; p->mask_encoding_compatibility_input = mask_encoding_compatibility_input; p->max_count_call = max_count_call; p->max_count_call_warning = max_count_call_warning; p->token_filename = tokens; char concord[FILENAME_MAX]; char concord_info[FILENAME_MAX]; strcpy(concord,dynamicDir); strcat(concord,"concord.ind"); strcpy(concord_info,dynamicDir); strcat(concord_info,"concord.n"); char morpho_bin[FILENAME_MAX]; strcpy(morpho_bin,dynamicDir); strcat(morpho_bin,"morpho.bin"); if (arabic_rules!=NULL && arabic_rules[0]!='\0') { load_arabic_typo_rules(arabic_rules,&(p->arabic)); } out=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord,U_WRITE); if (out==NULL) { error("Cannot write %s\n",concord); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); free_stack_unichar(p->stack); free_locate_parameters(p); return 0; } info=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord_info,U_WRITE); if (info==NULL) { error("Cannot write %s\n",concord_info); } switch(output_policy) { case IGNORE_OUTPUTS: u_fprintf(out,"#I\n"); break; case MERGE_OUTPUTS: u_fprintf(out,"#M\n"); break; case REPLACE_OUTPUTS: u_fprintf(out,"#R\n"); break; } if (alphabet!=NULL && alphabet[0]!='\0') { u_printf("Loading alphabet...\n"); p->alphabet=load_alphabet(alphabet,is_korean); if (p->alphabet==NULL) { error("Cannot load alphabet file %s\n",alphabet); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); free_stack_unichar(p->stack); free_locate_parameters(p); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } } struct string_hash* semantic_codes=new_string_hash(); extract_semantic_codes(dlf,semantic_codes); extract_semantic_codes(dlc,semantic_codes); if (is_cancelling_requested() != 0) { error("User cancel request.\n"); free_alphabet(p->alphabet); free_string_hash(semantic_codes); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); free_stack_unichar(p->stack);
free_locate_parameters(p); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } u_printf("Loading fst2...\n"); struct FST2_free_info fst2load_free; Fst2* fst2load=load_abstract_fst2(fst2_name,1,&fst2load_free); if (fst2load==NULL) { error("Cannot load grammar %s\n",fst2_name); free_alphabet(p->alphabet); free_string_hash(semantic_codes); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); free_stack_unichar(p->stack); free_locate_parameters(p); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } Abstract_allocator locate_abstract_allocator=create_abstract_allocator("locate_pattern",AllocatorCreationFlagAutoFreePrefered); p->fst2=new_Fst2_clone(fst2load,locate_abstract_allocator); free_abstract_Fst2(fst2load,&fst2load_free); if (is_cancelling_requested() != 0) { error("User cancel request.\n"); free_alphabet(p->alphabet); free_string_hash(semantic_codes); free_Fst2(p->fst2,locate_abstract_allocator); close_abstract_allocator(locate_abstract_allocator); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); free_stack_unichar(p->stack); free_locate_parameters(p); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } p->tags=p->fst2->tags; #ifdef TRE_WCHAR p->filters=new_FilterSet(p->fst2,p->alphabet); if (p->filters==NULL) { error("Cannot compile filter(s)\n"); free_alphabet(p->alphabet); free_string_hash(semantic_codes); free_Fst2(p->fst2,locate_abstract_allocator); close_abstract_allocator(locate_abstract_allocator); free_stack_unichar(p->stack); free_locate_parameters(p); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } #endif u_printf("Loading token list...\n"); int n_text_tokens=0; p->tokens=load_text_tokens_hash(tokens,mask_encoding_compatibility_input,&(p->SENTENCE),&(p->STOP),&n_text_tokens); if (p->tokens==NULL) { error("Cannot load token list %s\n",tokens); free_alphabet(p->alphabet); free_string_hash(semantic_codes); free_Fst2(p->fst2,locate_abstract_allocator); close_abstract_allocator(locate_abstract_allocator); free_locate_parameters(p); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } Abstract_allocator locate_work_abstract_allocator = locate_abstract_allocator; p->match_cache=(LocateCache*)malloc_cb(p->tokens->size * sizeof(LocateCache),locate_work_abstract_allocator); if (p->match_cache==NULL) { fatal_alloc_error("locate_pattern"); } memset(p->match_cache,0,p->tokens->size * sizeof(LocateCache)); #ifdef TRE_WCHAR p->filter_match_index=new_FilterMatchIndex(p->filters,p->tokens); if (p->filter_match_index==NULL) { error("Cannot optimize filter(s)\n"); free_alphabet(p->alphabet); free_string_hash(semantic_codes); free_string_hash(p->tokens); close_abstract_allocator(locate_abstract_allocator); free_locate_parameters(p); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } #endif if (allow_trace!=0) { open_locate_trace(p,&p->fnc_locate_trace_step,&p->private_param_locate_trace); } extract_semantic_codes_from_tokens(p->tokens,semantic_codes,locate_abstract_allocator); u_printf("Loading morphological dictionaries...\n"); load_morphological_dictionaries(morpho_dic_list,p,morpho_bin); extract_semantic_codes_from_morpho_dics(p->morpho_dic_inf,p->n_morpho_dics,semantic_codes,locate_abstract_allocator); p->token_control=(unsigned
char*)malloc(n_text_tokens*sizeof(unsigned char)); if (p->token_control==NULL) { fatal_alloc_error("locate_pattern"); } p->matching_patterns=(struct bit_array**)malloc(n_text_tokens*sizeof(struct bit_array*)); if (p->matching_patterns==NULL) { fatal_alloc_error("locate_pattern"); } for (int i=0; i<n_text_tokens; i++) { p->token_control[i]=0; p->matching_patterns[i]=NULL; } compute_token_controls(p->alphabet,err,p); int number_of_patterns,is_DIC,is_CDIC,is_SDIC; p->pattern_tree_root=new_pattern_node(locate_abstract_allocator); u_printf("Computing fst2 tags...\n"); process_tags(&number_of_patterns,semantic_codes,&is_DIC,&is_CDIC,&is_SDIC,p,locate_abstract_allocator); p->current_compound_pattern=number_of_patterns; p->DLC_tree=new_DLC_tree(p->tokens->size); struct lemma_node* root=new_lemma_node(); u_printf("Loading dlf...\n"); load_dic_for_locate(dlf,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p); u_printf("Loading dlc...\n"); load_dic_for_locate(dlc,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p); /* We check whether tag tokens like "{today,.ADV}" match some patterns */ check_patterns_for_tag_tokens(p->alphabet,number_of_patterns,root,p,locate_abstract_allocator); u_printf("Optimizing fst2 pattern tags...\n"); optimize_pattern_tags(p->alphabet,root,p,locate_abstract_allocator); u_printf("Optimizing compound word dictionary...\n"); optimize_DLC(p->DLC_tree); free_string_hash(semantic_codes); int nb_input_variable=0; p->input_variables=new_Variables(p->fst2->input_variables,&nb_input_variable); p->output_variables=new_OutputVariables(p->fst2->output_variables,&p->nb_output_variables); Abstract_allocator locate_recycle_abstract_allocator=NULL; locate_recycle_abstract_allocator=create_abstract_allocator("locate_pattern_recycle", AllocatorFreeOnlyAtAllocatorDelete|AllocatorTipOftenRecycledObject, get_prefered_allocator_item_size_for_nb_variable(nb_input_variable)); u_printf("Optimizing fst2...\n"); p->optimized_states=build_optimized_fst2_states(p->input_variables,p->output_variables,p->fst2,locate_abstract_allocator); if (is_korean) { p->korean=new Korean(p->alphabet); p->jamo_tags=create_jamo_tags(p->korean,p->tokens); } p->failfast=new_bit_array(n_text_tokens,ONE_BIT); u_printf("Working...\n"); p->prv_alloc=locate_work_abstract_allocator; p->prv_alloc_recycle=locate_recycle_abstract_allocator; launch_locate(out,text_size,info,p); if (allow_trace!=0) { close_locate_trace(p,p->fnc_locate_trace_step,p->private_param_locate_trace); } free_bit_array(p->failfast); free_Variables(p->input_variables); free_OutputVariables(p->output_variables); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); if (info!=NULL) u_fclose(info); u_fclose(out); if (p->match_cache!=NULL) { for (int i=0; i<p->tokens->size; i++) { free_LocateCache(p->match_cache[i],locate_work_abstract_allocator); } free_cb(p->match_cache,locate_work_abstract_allocator); } int free_abstract_allocator_item=(get_allocator_cb_flag(locate_abstract_allocator) & AllocatorGetFlagAutoFreePresent) ?
0 : 1; if (free_abstract_allocator_item) { free_optimized_states(p->optimized_states,p->fst2->number_of_states,locate_abstract_allocator); } free_stack_unichar(p->stack); /* Freeing the DLC tree here would take too long if it is big: * free_DLC_tree(p->DLC_tree); */ if (free_abstract_allocator_item) { free_pattern_node(p->pattern_tree_root,locate_abstract_allocator); free_Fst2(p->fst2,locate_abstract_allocator); free_list_int(p->tag_token_list,locate_abstract_allocator); } close_abstract_allocator(locate_abstract_allocator); close_abstract_allocator(locate_recycle_abstract_allocator); locate_recycle_abstract_allocator=locate_abstract_allocator=NULL; /* We don't free 'parameters->tags' because it is just a link to 'parameters->fst2->tags' */ free_alphabet(p->alphabet); if (p->korean!=NULL) { delete p->korean; } if (p->jamo_tags!=NULL) { /* jamo tags must be freed before tokens, because we need to know how * many jamo tags there are, and this number is the number of tokens */ for (int i=0; i<p->tokens->size; i++) { free(p->jamo_tags[i]); } free(p->jamo_tags); } free_string_hash(p->tokens); free_lemma_node(root); free(p->token_control); for (int i=0; i<n_text_tokens; i++) { free_bit_array(p->matching_patterns[i]); } free(p->matching_patterns); #ifdef TRE_WCHAR free_FilterSet(p->filters); free_FilterMatchIndex(p->filter_match_index); #endif for (int i=0; i<p->n_morpho_dics; i++) { free_abstract_INF(p->morpho_dic_inf[i],&(p->morpho_dic_inf_free[i])); free_abstract_BIN(p->morpho_dic_bin[i],&(p->morpho_dic_bin_free[i])); } free(p->morpho_dic_inf); free(p->morpho_dic_inf_free); free(p->morpho_dic_bin); free(p->morpho_dic_bin_free); #if (defined(UNITEX_LIBRARY) || defined(UNITEX_RELEASE_MEMORY_AT_EXIT)) free_DLC_tree(p->DLC_tree); #endif free_locate_parameters(p); u_printf("Done.\n"); return 1; }
int main_CheckDic(int argc,char* const argv[]) { if (argc==1) { usage(); return 0; } int is_a_DELAF=-1; int strict_unprotected=0; int skip_path=0; char alph[FILENAME_MAX]=""; Encoding encoding_output = DEFAULT_ENCODING_OUTPUT; int bom_output = DEFAULT_BOM_OUTPUT; int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT; int val,index=-1; int space_warnings=1; struct OptVars* vars=new_OptVars(); while (EOF!=(val=getopt_long_TS(argc,argv,optstring_CheckDic,lopts_CheckDic,&index,vars))) { switch(val) { case 'f': is_a_DELAF=1; break; case 's': is_a_DELAF=0; break; case 'h': usage(); return 0; case 'r': strict_unprotected=1; break; case 't': strict_unprotected=0; break; case 'n': space_warnings=0; break; case 'p': skip_path=1; break; case 'a': if (vars->optarg[0]=='\0') { fatal_error("Empty alphabet argument\n"); } strcpy(alph,vars->optarg); break; case 'k': if (vars->optarg[0]=='\0') { fatal_error("Empty input_encoding argument\n"); } decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg); break; case 'q': if (vars->optarg[0]=='\0') { fatal_error("Empty output_encoding argument\n"); } decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg); break; case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt); else fatal_error("Missing argument for option --%s\n",lopts_CheckDic[index].name); case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt); else fatal_error("Invalid option --%s\n",vars->optarg); break; } index=-1; } if (is_a_DELAF==-1 || vars->optind!=argc-1) { error("Invalid arguments: rerun with --help\n"); return 1; } U_FILE* dic=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ); if (dic==NULL) { fatal_error("Cannot open dictionary %s\n",argv[vars->optind]); } Alphabet* alphabet0=NULL; if (alph[0]!='\0') { alphabet0=load_alphabet(alph,1); } char output_filename[FILENAME_MAX]; get_path(argv[vars->optind],output_filename); strcat(output_filename,"CHECK_DIC.TXT"); U_FILE* out=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,output_filename,U_WRITE); if (out==NULL) { u_fclose(dic); fatal_error("Cannot create %s\n",output_filename); } u_printf("Checking %s...\n",argv[vars->optind]); unichar line[CHECKDIC_LINE_SIZE]; int line_number=1; /* * We declare and initialize an array in order to know which * letters are used in the dictionary. */ int i; char* alphabet=(char*)malloc(sizeof(char)*MAX_NUMBER_OF_UNICODE_CHARS); if (alphabet==NULL) { fatal_alloc_error("CheckDic's main"); } memset(alphabet,0,sizeof(char)*MAX_NUMBER_OF_UNICODE_CHARS); /* * We use two structures for the storage of the codes found in the * dictionary. Note that 'semantic_codes' is used to store both grammatical and * semantic codes. */ struct string_hash* semantic_codes=new_string_hash(); struct string_hash* inflectional_codes=new_string_hash(); struct string_hash* simple_lemmas=new_string_hash(DONT_USE_VALUES); struct string_hash* compound_lemmas=new_string_hash(DONT_USE_VALUES); int n_simple_entries=0; int n_compound_entries=0; /* * We read all the lines and check them. 
*/ while (EOF!=u_fgets_limit2(line,CHECKDIC_LINE_SIZE,dic)) { if (line[0]=='\0') { /* If we have an empty line, we print a unicode error message * into the output file */ u_fprintf(out,"Line %d: empty line\n",line_number); } else if (line[0]=='/') { /* If a line starts with '/', it is a comment line, so * we ignore it */ } else { /* If we have a line to check, we check it according to the * dictionary type */ check_DELA_line(line,out,is_a_DELAF,line_number,alphabet,semantic_codes, inflectional_codes,simple_lemmas,compound_lemmas, &n_simple_entries,&n_compound_entries,alphabet0,strict_unprotected); } /* At regular intervals, we display a message on the standard * output to show that the program is working */ if (line_number%10000==0) { u_printf("%d lines read...\r",line_number); } line_number++; } u_printf("%d lines read\n",line_number-1); u_fclose(dic); /* * Once we have checked all the lines, we print some information * in the output file. */ u_fprintf(out,"-----------------------------------\n"); u_fprintf(out,"------------- Stats -------------\n"); u_fprintf(out,"-----------------------------------\n"); if (skip_path != 0) { char filename_without_path[FILENAME_MAX]; remove_path(argv[vars->optind],filename_without_path); u_fprintf(out,"File: %s\n",filename_without_path); } else { u_fprintf(out,"File: %s\n",argv[vars->optind]); } u_fprintf(out,"Type: %s\n",is_a_DELAF?"DELAF":"DELAS"); u_fprintf(out,"%d line%s read\n",line_number-1,(line_number-1>1)?"s":""); u_fprintf(out,"%d simple entr%s ",n_simple_entries,(n_simple_entries>1)?"ies":"y"); u_fprintf(out,"for %d distinct lemma%s\n",simple_lemmas->size,(simple_lemmas->size>1)?"s":""); u_fprintf(out,"%d compound entr%s ",n_compound_entries,(n_compound_entries>1)?"ies":"y"); u_fprintf(out,"for %d distinct lemma%s\n",compound_lemmas->size,(compound_lemmas->size>1)?"s":""); /** * We print the list of the characters that are used, with * their unicode numbers shown in hexadecimal. This can be useful * to detect different characters that are graphically identical, * like 'A' (uppercase of Latin 'a' or uppercase of Greek alpha?). */ u_fprintf(out,"-----------------------------------\n"); u_fprintf(out,"---- All chars used in forms ----\n"); u_fprintf(out,"-----------------------------------\n"); for (i=0;i<MAX_NUMBER_OF_UNICODE_CHARS;i++) { if (alphabet[i]) { u_fprintf(out,"%C (%04X)\n",i,i); } } /* * Then we print the list of all grammatical and semantic codes used in the * dictionary. If a code contains a non-ASCII character, a space or a tab, * we print a warning. */ u_fprintf(out,"-------------------------------------------------------------\n"); u_fprintf(out,"---- %3d grammatical/semantic code%s",semantic_codes->size,(semantic_codes->size>1)?"s used in dictionary ----\n":" used in dictionary -----\n"); u_fprintf(out,"-------------------------------------------------------------\n"); unichar comment[2000]; for (i=0;i<semantic_codes->size;i++) { /* We print the code, followed if necessary by a warning */ u_fprintf(out,"%S",semantic_codes->value[i]); if (warning_on_code(semantic_codes->value[i],comment,space_warnings)) { u_fprintf(out," %S",comment); } u_fprintf(out,"\n"); } /* * Finally, we print the list of inflectional codes, * with warnings in the case of non-ASCII letters, spaces * or tabs.
*/ u_fprintf(out,"-----------------------------------------------------\n"); u_fprintf(out,"---- %3d inflectional code%s",inflectional_codes->size,(inflectional_codes->size>1)?"s used in dictionary ----\n":" used in dictionary -----\n"); u_fprintf(out,"-----------------------------------------------------\n"); for (i=0;i<inflectional_codes->size;i++) { u_fprintf(out,"%S",inflectional_codes->value[i]); if (warning_on_code(inflectional_codes->value[i],comment,space_warnings)) { u_fprintf(out," %S",comment); } u_fprintf(out,"\n"); } u_fclose(out); free_OptVars(vars); u_printf("Done.\n"); /* Note that we don't free anything since it would only waste time */ free(alphabet); if (alphabet0!=NULL) { free_alphabet(alphabet0); } #if (defined(UNITEX_LIBRARY) || defined(UNITEX_RELEASE_MEMORY_AT_EXIT)) /* cleanup for no leak on library */ free_string_hash(semantic_codes); free_string_hash(inflectional_codes); free_string_hash(simple_lemmas); free_string_hash(compound_lemmas); #endif return 0; }
/** * The same as main, but without any call to setBufferMode. */ int main_KeyWords(int argc,char* const argv[]) { if (argc==1) { usage(); return SUCCESS_RETURN_CODE; } VersatileEncodingConfig vec=VEC_DEFAULT; char tokens[FILENAME_MAX]; char output[FILENAME_MAX]=""; char alph[FILENAME_MAX]=""; char cdic[FILENAME_MAX]=""; unichar* code=u_strdup("XXX"); int val,index=-1; bool only_verify_arguments = false; UnitexGetOpt options; while (EOF!=(val=options.parse_long(argc,argv,optstring_KeyWords,lopts_KeyWords,&index))) { switch(val) { case 'o': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty output\n"); free(code); return USAGE_ERROR_CODE; } strcpy(output,options.vars()->optarg); break; case 'a': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty alphabet file name\n"); free(code); return USAGE_ERROR_CODE; } strcpy(alph,options.vars()->optarg); break; case 'f': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty forbidden code\n"); free(code); return USAGE_ERROR_CODE; } free(code); code=u_strdup(options.vars()->optarg); break; case 'c': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty file name\n"); free(code); return USAGE_ERROR_CODE; } strcpy(cdic,options.vars()->optarg); break; case 'V': only_verify_arguments = true; break; case 'h': usage(); free(code); return SUCCESS_RETURN_CODE; case 'k': if (options.vars()->optarg[0]=='\0') { error("Empty input_encoding argument\n"); free(code); return USAGE_ERROR_CODE; } decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg); break; case 'q': if (options.vars()->optarg[0]=='\0') { error("Empty output_encoding argument\n"); free(code); return USAGE_ERROR_CODE; } decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg); break; case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) : error("Missing argument for option --%s\n",lopts_KeyWords[index].name); free(code); return USAGE_ERROR_CODE; break; case '?': index==-1 ?
error("Invalid option -%c\n",options.vars()->optopt) : error("Invalid option --%s\n",options.vars()->optarg); free(code); return USAGE_ERROR_CODE; break; } index=-1; } if (options.vars()->optind==argc || options.vars()->optind==argc-1) { error("Invalid arguments: rerun with --help\n"); free(code); return USAGE_ERROR_CODE; } if (only_verify_arguments) { // freeing all allocated memory free(code); return SUCCESS_RETURN_CODE; } Alphabet* alphabet=NULL; if (alph[0]!='\0') { alphabet=load_alphabet(&vec,alph); if (alphabet==NULL) { error("Cannot load alphabet file %s\n",alph); free(code); return DEFAULT_ERROR_CODE; } } strcpy(tokens,argv[(options.vars()->optind++)]); if (output[0]=='\0') { get_path(tokens,output); strcat(output,"keywords.txt"); } struct string_hash_ptr* keywords=load_tokens_by_freq(tokens,&vec); filter_non_letter_keywords(keywords,alphabet); if (cdic[0]!='\0') { load_compound_words(cdic,&vec,keywords); } for (;options.vars()->optind!=argc;(options.vars()->optind)++) { filter_keywords_with_dic(keywords,argv[options.vars()->optind],&vec,alphabet); } merge_case_equivalent_unknown_words(keywords,alphabet); struct string_hash* forbidden_lemmas=compute_forbidden_lemmas(keywords,code); remove_keywords_with_forbidden_lemma(keywords,forbidden_lemmas); free_string_hash(forbidden_lemmas); vector_ptr* sorted=sort_keywords(keywords); U_FILE* f_output=u_fopen(&vec,output,U_WRITE); if (f_output==NULL) { error("Cannot write in file %s\n",output); free_vector_ptr(sorted,(void(*)(void*))free_KeyWord_list); free_string_hash_ptr(keywords,(void(*)(void*))free_KeyWord_list); free_alphabet(alphabet); free(code); return DEFAULT_ERROR_CODE; } dump_keywords(sorted,f_output); u_fclose(f_output); free_vector_ptr(sorted,(void(*)(void*))free_KeyWord_list); free_string_hash_ptr(keywords,(void(*)(void*))free_KeyWord_list); free_alphabet(alphabet); free(code); return SUCCESS_RETURN_CODE; }
int main_PolyLex(int argc,char* const argv[]) { if (argc==1) { usage(); return 0; } int language=-1; char alphabet[FILENAME_MAX]=""; char dictionary[FILENAME_MAX]=""; char output[FILENAME_MAX]=""; char info[FILENAME_MAX]=""; Encoding encoding_output = DEFAULT_ENCODING_OUTPUT; int bom_output = DEFAULT_BOM_OUTPUT; int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT; int val,index=-1; struct OptVars* vars=new_OptVars(); while (EOF!=(val=getopt_long_TS(argc,argv,optstring_PolyLex,lopts_PolyLex,&index,vars))) { switch(val) { case 'D': language=DUTCH; break; case 'G': language=GERMAN; break; case 'N': language=NORWEGIAN; break; case 'R': language=RUSSIAN; break; case 'a': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty alphabet file name\n"); } strcpy(alphabet,vars->optarg); break; case 'd': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty dictionary file name\n"); } strcpy(dictionary,vars->optarg); break; case 'o': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty output file name\n"); } strcpy(output,vars->optarg); break; case 'i': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty information file name\n"); } strcpy(info,vars->optarg); break; case 'k': if (vars->optarg[0]=='\0') { fatal_error("Empty input_encoding argument\n"); } decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg); break; case 'q': if (vars->optarg[0]=='\0') { fatal_error("Empty output_encoding argument\n"); } decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg); break; case 'h': usage(); return 0; case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt); else fatal_error("Missing argument for option --%s\n",lopts_PolyLex[index].name); case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt); else fatal_error("Invalid option --%s\n",vars->optarg); break; } index=-1; } if (vars->optind!=argc-1) { fatal_error("Invalid arguments: rerun with --help\n"); } if (dictionary[0]=='\0') { fatal_error("You must specify the .bin dictionary to use\n"); } if (output[0]=='\0') { fatal_error("You must specify the output dictionary file name\n"); } if (language==-1) { fatal_error("You must specify the language\n"); } Alphabet* alph=NULL; if (alphabet[0]!='\0') { u_printf("Loading alphabet...\n"); alph=load_alphabet(alphabet); if (alph==NULL) { fatal_error("Cannot load alphabet file %s\n",alphabet); } } char temp[FILENAME_MAX]; struct string_hash* forbiddenWords=NULL; if (language==DUTCH || language==NORWEGIAN) { get_path(dictionary,temp); strcat(temp,"ForbiddenWords.txt"); forbiddenWords=load_key_list(temp,mask_encoding_compatibility_input); } u_printf("Loading BIN file...\n"); struct BIN_free_info bin_free; const unsigned char* bin=load_abstract_BIN_file(dictionary,&bin_free); if (bin==NULL) { error("Cannot load bin file %s\n",dictionary); free_alphabet(alph); free_string_hash(forbiddenWords); return 1; } strcpy(temp,dictionary); temp[strlen(dictionary)-3]='\0'; strcat(temp,"inf"); u_printf("Loading INF file...\n"); struct INF_free_info inf_free; const struct INF_codes* inf=load_abstract_INF_file(temp,&inf_free); if (inf==NULL) { error("Cannot load inf file %s\n",temp); free_alphabet(alph); free_abstract_BIN(bin,&bin_free); free_string_hash(forbiddenWords); return 1; } char tmp[FILENAME_MAX]; strcpy(tmp,argv[vars->optind]); strcat(tmp,".tmp"); U_FILE* 
words=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ); if (words==NULL) { error("Cannot open word list file %s\n",argv[vars->optind]); free_alphabet(alph); free_abstract_BIN(bin,&bin_free); free_abstract_INF(inf,&inf_free); free_string_hash(forbiddenWords); // here we return 0 in order not to block the preprocessing // in the Unitex Java interface if no dictionary was applied, // so that there is no "err" file return 0; } U_FILE* new_unknown_words=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,tmp,U_WRITE); if (new_unknown_words==NULL) { error("Cannot open temporary word list file %s\n",tmp); free_alphabet(alph); free_abstract_BIN(bin,&bin_free); free_abstract_INF(inf,&inf_free); u_fclose(words); free_string_hash(forbiddenWords); return 1; } U_FILE* res=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,output,U_APPEND); if (res==NULL) { error("Cannot open result file %s\n",output); free_alphabet(alph); free_abstract_BIN(bin,&bin_free); free_abstract_INF(inf,&inf_free); u_fclose(words); u_fclose(new_unknown_words); free_string_hash(forbiddenWords); return 1; } U_FILE* debug=NULL; if (info[0]!='\0') { debug=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,info,U_WRITE); if (debug==NULL) { error("Cannot open debug file %s\n",info); } } struct utags UTAG; switch(language) { case DUTCH: analyse_dutch_unknown_words(alph,bin,inf,words,res,debug,new_unknown_words,forbiddenWords); break; case GERMAN: analyse_german_compounds(alph,bin,inf,words,res,debug,new_unknown_words); break; case NORWEGIAN: analyse_norwegian_unknown_words(alph,bin,inf,words,res,debug,new_unknown_words,forbiddenWords); break; case RUSSIAN: init_russian(&UTAG); analyse_compounds(alph,bin,inf,words,res,debug,new_unknown_words,UTAG); break; } free_alphabet(alph); free_abstract_BIN(bin,&bin_free); free_abstract_INF(inf,&inf_free); u_fclose(words); u_fclose(new_unknown_words); free_string_hash(forbiddenWords); af_remove(argv[vars->optind]); af_rename(tmp,argv[vars->optind]); u_fclose(res); if (debug!=NULL) { u_fclose(debug); } free_OptVars(vars); return 0; }
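/*
 * Both PolyLex variants derive the .inf path from the .bin path by chopping
 * the 3-character extension and appending "inf". A small sketch of that
 * pattern, extracted for clarity (assumes 'name_bin' really ends in ".bin"
 * and that 'name_inf' holds at least FILENAME_MAX chars):
 */
#include <string.h>

void bin_to_inf_name(const char* name_bin,char* name_inf) {
    strcpy(name_inf,name_bin);
    name_inf[strlen(name_bin)-3]='\0';   /* drop "bin", keep the dot */
    strcat(name_inf,"inf");              /* ".bin" -> ".inf" */
}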
/** * This function produces a normalized version of 'input' and stores it into 'output'. * The following rules are applied in the given order: * * 1) If there is a { at the current position, we try to read a {S}, a {STOP} or * a tag token like {today,.ADV}. If we fail, we replace the { and the }, if any, * according to the replacement rules. Otherwise, we leave the token unchanged. * 2) If there is one or more replacement rules that can apply to the current * position in 'input', then we apply the longest one. * 3) If we find a separator (space, tab, new line) sequence, we replace it: * - by a new line if the sequence contains one and if 'carriage_return_policy' is * set to KEEP_CARRIAGE_RETURN; * - by a space otherwise. * 4) We copy the character that was read to the output. * * Note that 'replacements' is supposed to contain replacement rules for { and } */ int normalize(const char *fin, const char *fout, Encoding encoding_output, int bom_output, int mask_encoding_compatibility_input, int carriage_return_policy, const char *rules) { U_FILE* input; input = u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,fin,U_READ); if (input == NULL) { error("Cannot open file %s\n", fin); return 1; } U_FILE* output; output = u_fopen_creating_versatile_encoding(encoding_output,bom_output,fout,U_WRITE); if (output == NULL) { error("Cannot create file %s\n", fout); u_fclose(input); return 1; } struct string_hash* replacements=NULL; if (rules != NULL && rules[0]!='\0') { replacements=load_key_value_list(rules,mask_encoding_compatibility_input,'\t'); if (replacements==NULL) { error("Cannot load replacement rules file %s\n", rules); replacements=new_string_hash(); } } /* If there is no replacement rules file, we simulate one */ else { replacements=new_string_hash(); } /* If there is a replacement rule file, we ensure that there are replacement * rules for { and }. If not, we add our default ones, so that in any case, * we are sure to have rules for { and } */ unichar key[2]; unichar value[2]; u_strcpy(key,"{"); u_strcpy(value,"["); get_value_index(key,replacements,INSERT_IF_NEEDED,value); u_strcpy(key,"}"); u_strcpy(value,"]"); get_value_index(key,replacements,INSERT_IF_NEEDED,value); struct OUTBUF OutBuf; OutBuf.pos=0; unichar tmp[MAX_TAG_LENGTH]; //struct buffer* buffer=new_buffer_for_file(UNICHAR_BUFFER,input); long save_pos=ftell(input); fseek(input,0,SEEK_END); long file_size_input=ftell(input); fseek(input,save_pos,SEEK_SET); int line_buffer_size = (int)(((file_size_input+1) < MAX_LINE_BUFFER_SIZE) ?
(file_size_input+1) : MAX_LINE_BUFFER_SIZE); unichar *line_read; line_read=(unichar*)malloc((line_buffer_size+0x10)*sizeof(unichar)); if (line_read==NULL) { fatal_alloc_error("normalize"); } /* We define some things that will be used for parsing the buffer */ static const unichar stop_chars[]= { '{', '}', 0 }; static const unichar forbidden_chars[]= { '\n', 0 }; static const unichar open_bracket[]= { '{', 0 }; static const unichar close_bracket[]= { '}', 0 }; static const unichar empty_string[]= { 0 }; int corrupted_file=0; int eof_found=0; /* First, we fill the buffer */ int lastline_was_terminated=0; while (eof_found==0) { int current_start_pos=0; int found_null=0; const unichar* buff=line_read; int result_read = 0; result_read = u_fgets_treat_cr_as_lf(line_read,line_buffer_size,input,1,&found_null); if ((found_null != 0) && (corrupted_file==0)) { corrupted_file=1; error("Corrupted text file containing NULL characters!\n"); error("They have been ignored by Normalize, but you should clean your text\n"); } if (result_read>0) if (line_read[result_read-1]==0x0d) line_read[result_read-1]='\n'; if (result_read==EOF) break; if (lastline_was_terminated != 0) while (current_start_pos<result_read) { if (buff[current_start_pos]!=' ' && buff[current_start_pos]!='\t' && buff[current_start_pos]!=0x0d && buff[current_start_pos]!='\n') break; current_start_pos++; } lastline_was_terminated = 0; if (result_read > 0) if ((buff[result_read-1]=='\n') || (buff[result_read-1]==0x0d)) lastline_was_terminated = 1; while (current_start_pos<result_read) { if ((lastline_was_terminated == 0) && (eof_found == 0) && (current_start_pos + MINIMAL_CHAR_IN_BUFFER_BEFORE_CONTINUE_LINE >= result_read)) { int i; int nb_to_keep = result_read-current_start_pos; for (i=0;i<nb_to_keep;i++) line_read[i]=line_read[current_start_pos+i]; int found_null_read=0; int result_read_continue = u_fgets_treat_cr_as_lf(line_read+nb_to_keep,line_buffer_size-nb_to_keep,input,1,&found_null_read); if ((found_null_read != 0) && (corrupted_file==0)) { corrupted_file=1; error("Corrupted text file containing NULL characters!\n"); error("They have been ignored by Normalize, but you should clean your text\n"); } if (result_read_continue>0) if (line_read[(result_read_continue+nb_to_keep)-1]==0x0d) line_read[(result_read_continue+nb_to_keep)-1]='\n'; lastline_was_terminated = 0; if (result_read_continue==EOF) eof_found = lastline_was_terminated = 1; if (result_read_continue > 0) if ((buff[(result_read_continue+nb_to_keep)-1]=='\n') || (buff[(result_read_continue+nb_to_keep)-1]==0x0d)) lastline_was_terminated = 1; result_read = nb_to_keep; current_start_pos = 0; if (result_read_continue > 0) result_read += result_read_continue; } if (buff[current_start_pos]=='{') { /* If we have a {, we try to find a sequence like {....}, that does not contain * new lines. If the sequence contains protected characters, we want to keep them * protected.
*/ int old_position=current_start_pos; /* If we don't increase the position, the parse will stop on the initial { */ current_start_pos++; tmp[0]='{'; int code=parse_string(buff,&current_start_pos,&(tmp[1]),stop_chars,forbidden_chars,NULL); if (code==P_FORBIDDEN_CHAR || code==P_BACKSLASH_AT_END || buff[current_start_pos]!='}') { /* If we have found a new line or a {, or if there is * a backslash at the end of the buffer, or if we have reached the end * of the buffer, we assume that the initial * { was not a tag beginning, so we print the substitute of { */ WriteOufBuf(&OutBuf,replacements->value[get_value_index(open_bracket,replacements)],output, 0); /* And we rewind the current position after the { */ current_start_pos=old_position+1; } else { /* If we have read a sequence like {....}, we assume that there won't be * a buffer overflow if we add the } */ u_strcat(tmp,close_bracket); if (!u_strcmp(tmp,"{S}") || !u_strcmp(tmp,"{STOP}") || check_tag_token(tmp)) { /* If this is a special tag or a valid tag token, we just print * it to the output */ WriteOufBuf(&OutBuf,tmp,output, 0); current_start_pos++; } else { /* If we have a non valid tag token, we print the equivalent of { * and we rewind the current position after the { */ WriteOufBuf(&OutBuf,replacements->value[get_value_index(open_bracket,replacements)],output, 0); current_start_pos=old_position+1; } } } else { /* If we have a character that is not {, first we try to look if there * is a replacement to do */ int key_length; int index=get_longest_key_index(&buff[current_start_pos],&key_length,replacements); if (index!=NO_VALUE_INDEX) { /* If there is something to replace */ WriteOufBuf(&OutBuf,replacements->value[index],output, 0); current_start_pos=current_start_pos+key_length; } else { if (buff[current_start_pos]==' ' || buff[current_start_pos]=='\t' || buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) { /* If we have a separator, we try to read the longest separator sequence * that we can read. Along the way, we note whether it contains a new line */ int new_line=0; while (buff[current_start_pos]==' ' || buff[current_start_pos]=='\t' || buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) { /* Note 1: no bound check is needed, since a unichar buffer is always * ended by a \0 * * Note 2: we don't take into account the case of a buffer ended by * separator while it's not the end of file: that would mean * that the text contains something like MARGIN_BEFORE_BUFFER_END * contiguous separators. Such a text would not be a reasonable one. */ if (buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) { new_line=1; } current_start_pos++; } if (new_line && (carriage_return_policy==KEEP_CARRIAGE_RETURN)) { /* We print a new line if the sequence contains one and if we are * allowed to; otherwise, we print a space. */ WriteOufBuf(&OutBuf,'\n',output, 0); } else { WriteOufBuf(&OutBuf,' ',output, 0); } } else { /* If, finally, we have a normal character to normalize, we just print it */ WriteOufBuf(&OutBuf,buff[current_start_pos++],output, 0); } } } } } WriteOufBuf(&OutBuf,empty_string,output, 1); free(line_read); free_string_hash(replacements); u_fclose(input); u_fclose(output); return 0; }
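/*
 * Example (illustrative): 'rules' is read with load_key_value_list(...,'\t'),
 * i.e. one "key<TAB>value" pair per line. A minimal rules file could be:
 *
 *   {<TAB>[
 *   }<TAB>]
 *   ...<TAB>…
 *
 * With such rules and KEEP_CARRIAGE_RETURN, the input "a  {   b" normalizes
 * to "a [ b": "{   b" cannot be parsed as a tag token, so the brace is
 * replaced by its substitute, and each separator run collapses to a single
 * space (or to a newline if the run contained one).
 */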