/**
 * Opens a .fst2 file in output mode and returns the associated
 * Elag_fst_file_out structure, or NULL in case of error.
 */
Elag_fst_file_out* fst_file_out_open(const VersatileEncodingConfig* vec,const char* fname,int type) {
/* Fix: validate the type before allocating anything */
if (type<0 || type>=FST_BAD_TYPE) {
   fatal_error("fst_file_out_open: bad FST_TYPE\n");
}
Elag_fst_file_out* res=(Elag_fst_file_out*)malloc(sizeof(Elag_fst_file_out));
if (res==NULL) {
   fatal_alloc_error("fst_file_out_open");
}
if ((res->f=u_fopen(vec,fname,U_WRITE))==NULL) {
   /* Fix: the message used to say "fst_out_open", which is not this function's name */
   error("fst_file_out_open: unable to open '%s'\n",fname);
   free(res);
   return NULL;
}
/* We remember where the 10-digit placeholder starts, so that the real
 * automata count can be written over it later */
res->fstart=ftell(res->f);
u_fprintf(res->f,"0000000000\n");
res->name=strdup(fname);
if (res->name==NULL) {
   fatal_alloc_error("fst_file_out_open");
}
res->type=type;
res->nb_automata=0;
res->labels=new_string_hash(16);
/* We add <E> to the tags in order to be sure that this special tag will have #0 */
get_value_index(EPSILON,res->labels);
return res;
}
/**
 * Returns the hash of the given string value, computing and caching it
 * on first use.
 *
 * NOTE(review): 0 is used as the "hash not yet computed" sentinel. If
 * new_string_hash() can legitimately return 0 for some input, that
 * string's hash would be recomputed on every call — confirm the hash
 * function never yields 0, or remap 0 at the computation site.
 */
uint32_t need_hash(V string) {
    NewString *s = toNewString(string);
    if (s->hash == 0) {
        /* Lazy computation: hash over the string's length and bytes */
        s->hash = new_string_hash(s->size, s->text);
    }
    return s->hash;
}
/** * Loads the lines of a text file into a string_hash and returns it, or NULL * if the file can not be opened. We arbitrary fix the limit of a line to 4096 * characters. Each line is splitted into a key and a value, according to a * given separator character. An error message will be printed if a line does not * contain the separator character, if an empty line is found, or if a line contains * an empty key. In case of empty values, the empty string will be used. * Note that keys and values can contain characters protected with the \ character, * including protected new lines like: * * 123\ * =ONE_TWO_THREE_NEW_LINE * */ struct string_hash* load_key_value_list(const char* name,int mask_encoding_compatibility_input,unichar separator) { U_FILE* f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,name,U_READ); if (f==NULL) return NULL; struct string_hash* hash=new_string_hash(); unichar temp[4096]; unichar key[4096]; unichar value[4096]; /* We build a string with the separator character */ unichar stop[2]; stop[0]=separator; stop[1]='\0'; int code; while (EOF!=(code=u_fgets2(temp,f))) { if (code==0) { error("Empty line\n"); } else { /* First, we try to read a non empty key */ int pos=0; code=parse_string(temp,&pos,key,stop); if (code==P_BACKSLASH_AT_END) { error("Backslash at end of line:<%S>\n\n",temp); } else if (pos==0 &&temp[pos]=='\0') { /* Empty line */ continue; } else if (pos==0) { /* If the line starts with the separator */ error("Line with empty key:\n<%S>\n",temp); } else { /* We jump over the separator */ pos++; /* We initialize 'value' with the empty string in case it is not * defined in the file */ value[0]='\0'; if(P_BACKSLASH_AT_END==parse_string(temp,&pos,value,P_EMPTY)) { error("Backslash at end of line:\n<%S>\n",temp); } else { /* If we have a valid (key,value) pair, we insert it into the string_hash */ get_value_index(key,hash,INSERT_IF_NEEDED,value); } } } } u_fclose(f); return hash; }
/**
 * Allocates, initializes and returns a new, empty DELA tree.
 */
struct DELA_tree* new_DELA_tree() {
struct DELA_tree* t=(struct DELA_tree*)malloc(sizeof(struct DELA_tree));
if (t==NULL) {
   fatal_alloc_error("new_DELA_tree");
}
/* Inflected forms are only used as keys, so the hash stores no values */
t->inflected_forms=new_string_hash(DONT_USE_VALUES);
t->size=0;
t->capacity=256;
t->dela_entries=(struct dela_entry_list**)malloc(sizeof(struct dela_entry_list*)*t->capacity);
if (t->dela_entries==NULL) {
   fatal_alloc_error("new_DELA_tree");
}
return t;
}
/** * Loads the lines of a text file info a string_hash and returns it, or NULL * if the file can not be opened. We arbitrary fix the limit of a line to 4096 * characters. For each line, we ignore the carriage return, if any, and we use * the remaining string as key and value. An error message will be printed if * an empty line is found. */ struct string_hash* load_key_list(const char* name,int mask_encoding_compatibility_input) { U_FILE* f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,name,U_READ); if (f==NULL) return NULL; struct string_hash* hash=new_string_hash(DONT_USE_VALUES); unichar temp[4096]; while (EOF!=u_fgets_limit2(temp,4096,f)) { if (temp[0]=='\0') { error("Empty line in %s\n",name); } else { get_value_index(temp,hash); } } u_fclose(f); return hash; }
/** * We remove every keyword that is tagged with the forbidden code. If * a forbidden keyword has several tags, all of them are removed: * * the,.DET + the,.XXX => all 'the' keywords are removed */ struct string_hash* compute_forbidden_lemmas(struct string_hash_ptr* keywords,unichar* code) { struct string_hash* hash=new_string_hash(DONT_USE_VALUES,DONT_ENLARGE); Ustring* tmp=new_Ustring(); for (int i=0;i<keywords->size;i++) { KeyWord* list=(KeyWord*)(keywords->value[i]); while (list!=NULL) { if (get_forbidden_keyword(list,code,tmp)) { get_value_index(tmp->str,hash); } list=list->next; } } free_Ustring(tmp); return hash; }
/**
 * Returns a new string_hash_ptr object with the given capacity.
 * All such objects use values. If not, the normal string_hash should be
 * used. The 'value' array will be enlarged if needed.
 */
struct string_hash_ptr* new_string_hash_ptr(int capacity) {
struct string_hash_ptr* result=(struct string_hash_ptr*)malloc(sizeof(struct string_hash_ptr));
if (result==NULL) {
   fatal_alloc_error("new_string_hash_ptr");
}
/* We don't use the unichar* values of the normal string hash */
result->hash=new_string_hash(DONT_USE_VALUES);
result->size=0;
result->capacity=capacity;
result->value=(void**)malloc(sizeof(void*)*capacity);
if (result->value==NULL) {
   fatal_alloc_error("new_string_hash_ptr");
}
return result;
}
/**
 * Allocates, initializes and returns a new token tree.
 */
struct fst2txt_token_tree* new_fst2txt_token_tree(Abstract_allocator prv_alloc) {
struct fst2txt_token_tree* tree=(struct fst2txt_token_tree*)malloc_cb(sizeof(struct fst2txt_token_tree),prv_alloc);
if (tree==NULL) {
   fatal_alloc_error("new_fst2txt_token_tree");
}
tree->hash=new_string_hash(DONT_USE_VALUES);
/* A small default capacity is enough, since one such structure exists
 * for every state of the fst2 */
tree->size=0;
tree->capacity=2;
tree->transition_array=(Transition**)malloc_cb(sizeof(Transition*)*tree->capacity,prv_alloc);
if (tree->transition_array==NULL) {
   fatal_alloc_error("new_fst2txt_token_tree");
}
return tree;
}
/** * This function constructs and returns a token tree from a normalization grammar. * Tokens are represented by integers. */ struct normalization_tree* load_normalization_fst2(const VersatileEncodingConfig* vec,const char* grammar, const Alphabet* alph,struct text_tokens* tok) { struct FST2_free_info fst2_free; Fst2* fst2=load_abstract_fst2(vec,grammar,0,&fst2_free); if (fst2==NULL) { return NULL; } struct string_hash* hash=new_string_hash(DONT_USE_VALUES); /* We create the token tree to speed up the consultation */ for (int i=0;i<tok->N;i++) { get_value_index(tok->token[i],hash); } struct normalization_tree* root=new_normalization_tree(); explore_normalization_fst2(fst2,fst2->initial_states[1],root,hash,U_EMPTY,alph,NULL); free_abstract_Fst2(fst2,&fst2_free); free_string_hash(hash); return root; }
int main(void) { string_tree t; string_hash h; it_string_tree itt; it_string_hash ith; if (!new_string_tree(&t)) { fprintf(stderr, "Error allocating tree.\n"); exit(1); } if (!new_string_hash(&h)) { fprintf(stderr, "Error allocating hash.\n"); exit(1); } insert_string_tree(&t, "cat"); insert_string_tree(&t, "dog"); insert_string_tree(&t, "mouse"); insert_string_hash(&h, "cat"); insert_string_hash(&h, "dog"); insert_string_hash(&h, "mouse"); itt = get_string_tree(&t, "dog"); if (!itt) printf("Dog not found.\n"); else printf("%s barks woof!\n", itt->value); ith = get_string_hash(&h, "cat"); if (!ith) printf("Cat not found.\n"); else printf("%s say meeow!\n", ith->value); free_string_tree(&t); free_string_hash(&h); return 0; }
int locate_pattern(const char* text_cod,const char* tokens,const char* fst2_name,const char* dlf,const char* dlc,const char* err, const char* alphabet,MatchPolicy match_policy,OutputPolicy output_policy, Encoding encoding_output,int bom_output,int mask_encoding_compatibility_input, const char* dynamicDir,TokenizationPolicy tokenization_policy, SpacePolicy space_policy,int search_limit,const char* morpho_dic_list, AmbiguousOutputPolicy ambiguous_output_policy, VariableErrorPolicy variable_error_policy,int protect_dic_chars, int is_korean,int max_count_call,int max_count_call_warning, char* arabic_rules,int tilde_negation_operator,int useLocateCache,int allow_trace) { U_FILE* out; U_FILE* info; struct locate_parameters* p=new_locate_parameters(); p->text_cod=af_open_mapfile(text_cod,MAPFILE_OPTION_READ,0); p->buffer=(int*)af_get_mapfile_pointer(p->text_cod); long text_size=(long)af_get_mapfile_size(p->text_cod)/sizeof(int); p->buffer_size=(int)text_size; p->tilde_negation_operator=tilde_negation_operator; p->useLocateCache=useLocateCache; if (max_count_call == -1) { max_count_call = (int)text_size; } if (max_count_call_warning == -1) { max_count_call_warning = (int)text_size; } p->match_policy=match_policy; p->tokenization_policy=tokenization_policy; p->space_policy=space_policy; p->output_policy=output_policy; p->search_limit=search_limit; p->ambiguous_output_policy=ambiguous_output_policy; p->variable_error_policy=variable_error_policy; p->protect_dic_chars=protect_dic_chars; p->mask_encoding_compatibility_input = mask_encoding_compatibility_input; p->max_count_call = max_count_call; p->max_count_call_warning = max_count_call_warning; p->token_filename = tokens; char concord[FILENAME_MAX]; char concord_info[FILENAME_MAX]; strcpy(concord,dynamicDir); strcat(concord,"concord.ind"); strcpy(concord_info,dynamicDir); strcat(concord_info,"concord.n"); char morpho_bin[FILENAME_MAX]; strcpy(morpho_bin,dynamicDir); strcat(morpho_bin,"morpho.bin"); if (arabic_rules!=NULL && 
arabic_rules[0]!='\0') { load_arabic_typo_rules(arabic_rules,&(p->arabic)); } out=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord,U_WRITE); if (out==NULL) { error("Cannot write %s\n",concord); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); free_stack_unichar(p->stack); free_locate_parameters(p); u_fclose(out); return 0; } info=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord_info,U_WRITE); if (info==NULL) { error("Cannot write %s\n",concord_info); } switch(output_policy) { case IGNORE_OUTPUTS: u_fprintf(out,"#I\n"); break; case MERGE_OUTPUTS: u_fprintf(out,"#M\n"); break; case REPLACE_OUTPUTS: u_fprintf(out,"#R\n"); break; } if (alphabet!=NULL && alphabet[0]!='\0') { u_printf("Loading alphabet...\n"); p->alphabet=load_alphabet(alphabet,is_korean); if (p->alphabet==NULL) { error("Cannot load alphabet file %s\n",alphabet); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); free_stack_unichar(p->stack); free_locate_parameters(p); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } } struct string_hash* semantic_codes=new_string_hash(); extract_semantic_codes(dlf,semantic_codes); extract_semantic_codes(dlc,semantic_codes); if (is_cancelling_requested() != 0) { error("user cancel request.\n"); free_alphabet(p->alphabet); free_string_hash(semantic_codes); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); free_stack_unichar(p->stack); free_locate_parameters(p); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } u_printf("Loading fst2...\n"); struct FST2_free_info fst2load_free; Fst2* fst2load=load_abstract_fst2(fst2_name,1,&fst2load_free); if (fst2load==NULL) { error("Cannot load grammar %s\n",fst2_name); free_alphabet(p->alphabet); free_string_hash(semantic_codes); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); 
free_stack_unichar(p->stack); free_locate_parameters(p); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } Abstract_allocator locate_abstract_allocator=create_abstract_allocator("locate_pattern",AllocatorCreationFlagAutoFreePrefered); p->fst2=new_Fst2_clone(fst2load,locate_abstract_allocator); free_abstract_Fst2(fst2load,&fst2load_free); if (is_cancelling_requested() != 0) { error("User cancel request..\n"); free_alphabet(p->alphabet); free_string_hash(semantic_codes); free_Fst2(p->fst2,locate_abstract_allocator); close_abstract_allocator(locate_abstract_allocator); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); free_stack_unichar(p->stack); free_locate_parameters(p); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } p->tags=p->fst2->tags; #ifdef TRE_WCHAR p->filters=new_FilterSet(p->fst2,p->alphabet); if (p->filters==NULL) { error("Cannot compile filter(s)\n"); free_alphabet(p->alphabet); free_string_hash(semantic_codes); free_Fst2(p->fst2,locate_abstract_allocator); close_abstract_allocator(locate_abstract_allocator); free_stack_unichar(p->stack); free_locate_parameters(p); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } #endif u_printf("Loading token list...\n"); int n_text_tokens=0; p->tokens=load_text_tokens_hash(tokens,mask_encoding_compatibility_input,&(p->SENTENCE),&(p->STOP),&n_text_tokens); if (p->tokens==NULL) { error("Cannot load token list %s\n",tokens); free_alphabet(p->alphabet); free_string_hash(semantic_codes); free_Fst2(p->fst2,locate_abstract_allocator); close_abstract_allocator(locate_abstract_allocator); free_locate_parameters(p); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } Abstract_allocator locate_work_abstract_allocator = locate_abstract_allocator; p->match_cache=(LocateCache*)malloc_cb(p->tokens->size 
* sizeof(LocateCache),locate_work_abstract_allocator); memset(p->match_cache,0,p->tokens->size * sizeof(LocateCache)); if (p->match_cache==NULL) { fatal_alloc_error("locate_pattern"); } #ifdef TRE_WCHAR p->filter_match_index=new_FilterMatchIndex(p->filters,p->tokens); if (p->filter_match_index==NULL) { error("Cannot optimize filter(s)\n"); free_alphabet(p->alphabet); free_string_hash(semantic_codes); free_string_hash(p->tokens); close_abstract_allocator(locate_abstract_allocator); free_locate_parameters(p); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } #endif if (allow_trace!=0) { open_locate_trace(p,&p->fnc_locate_trace_step,&p->private_param_locate_trace); } extract_semantic_codes_from_tokens(p->tokens,semantic_codes,locate_abstract_allocator); u_printf("Loading morphological dictionaries...\n"); load_morphological_dictionaries(morpho_dic_list,p,morpho_bin); extract_semantic_codes_from_morpho_dics(p->morpho_dic_inf,p->n_morpho_dics,semantic_codes,locate_abstract_allocator); p->token_control=(unsigned char*)malloc(n_text_tokens*sizeof(unsigned char)); if (p->token_control==NULL) { fatal_alloc_error("locate_pattern"); } p->matching_patterns=(struct bit_array**)malloc(n_text_tokens*sizeof(struct bit_array*)); if (p->matching_patterns==NULL) { fatal_alloc_error("locate_pattern"); } for (int i=0; i<n_text_tokens; i++) { p->token_control[i]=0; p->matching_patterns[i]=NULL; } compute_token_controls(p->alphabet,err,p); int number_of_patterns,is_DIC,is_CDIC,is_SDIC; p->pattern_tree_root=new_pattern_node(locate_abstract_allocator); u_printf("Computing fst2 tags...\n"); process_tags(&number_of_patterns,semantic_codes,&is_DIC,&is_CDIC,&is_SDIC,p,locate_abstract_allocator); p->current_compound_pattern=number_of_patterns; p->DLC_tree=new_DLC_tree(p->tokens->size); struct lemma_node* root=new_lemma_node(); u_printf("Loading dlf...\n"); 
load_dic_for_locate(dlf,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p); u_printf("Loading dlc...\n"); load_dic_for_locate(dlc,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p); /* We look if tag tokens like "{today,.ADV}" verify some patterns */ check_patterns_for_tag_tokens(p->alphabet,number_of_patterns,root,p,locate_abstract_allocator); u_printf("Optimizing fst2 pattern tags...\n"); optimize_pattern_tags(p->alphabet,root,p,locate_abstract_allocator); u_printf("Optimizing compound word dictionary...\n"); optimize_DLC(p->DLC_tree); free_string_hash(semantic_codes); int nb_input_variable=0; p->input_variables=new_Variables(p->fst2->input_variables,&nb_input_variable); p->output_variables=new_OutputVariables(p->fst2->output_variables,&p->nb_output_variables); Abstract_allocator locate_recycle_abstract_allocator=NULL; locate_recycle_abstract_allocator=create_abstract_allocator("locate_pattern_recycle", AllocatorFreeOnlyAtAllocatorDelete|AllocatorTipOftenRecycledObject, get_prefered_allocator_item_size_for_nb_variable(nb_input_variable)); u_printf("Optimizing fst2...\n"); p->optimized_states=build_optimized_fst2_states(p->input_variables,p->output_variables,p->fst2,locate_abstract_allocator); if (is_korean) { p->korean=new Korean(p->alphabet); p->jamo_tags=create_jamo_tags(p->korean,p->tokens); } p->failfast=new_bit_array(n_text_tokens,ONE_BIT); u_printf("Working...\n"); p->prv_alloc=locate_work_abstract_allocator; p->prv_alloc_recycle=locate_recycle_abstract_allocator; launch_locate(out,text_size,info,p); if (allow_trace!=0) { close_locate_trace(p,p->fnc_locate_trace_step,p->private_param_locate_trace); } free_bit_array(p->failfast); free_Variables(p->input_variables); free_OutputVariables(p->output_variables); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); if (info!=NULL) u_fclose(info); u_fclose(out); if (p->match_cache!=NULL) { for (int i=0; 
i<p->tokens->size; i++) { free_LocateCache(p->match_cache[i],locate_work_abstract_allocator); } free_cb(p->match_cache,locate_work_abstract_allocator); } int free_abstract_allocator_item=(get_allocator_cb_flag(locate_abstract_allocator) & AllocatorGetFlagAutoFreePresent) ? 0 : 1; if (free_abstract_allocator_item) { free_optimized_states(p->optimized_states,p->fst2->number_of_states,locate_abstract_allocator); } free_stack_unichar(p->stack); /** Too long to free the DLC tree if it is big * free_DLC_tree(p->DLC_tree); */ if (free_abstract_allocator_item) { free_pattern_node(p->pattern_tree_root,locate_abstract_allocator); free_Fst2(p->fst2,locate_abstract_allocator); free_list_int(p->tag_token_list,locate_abstract_allocator); } close_abstract_allocator(locate_abstract_allocator); close_abstract_allocator(locate_recycle_abstract_allocator); locate_recycle_abstract_allocator=locate_abstract_allocator=NULL; /* We don't free 'parameters->tags' because it was just a link on 'parameters->fst2->tags' */ free_alphabet(p->alphabet); if (p->korean!=NULL) { delete p->korean; } if (p->jamo_tags!=NULL) { /* jamo tags must be freed before tokens, because we need to know how * many jamo tags there are, and this number is the number of tokens */ for (int i=0; i<p->tokens->size; i++) { free(p->jamo_tags[i]); } free(p->jamo_tags); } free_string_hash(p->tokens); free_lemma_node(root); free(p->token_control); for (int i=0; i<n_text_tokens; i++) { free_bit_array(p->matching_patterns[i]); } free(p->matching_patterns); #ifdef TRE_WCHAR free_FilterSet(p->filters); free_FilterMatchIndex(p->filter_match_index); #endif for (int i=0; i<p->n_morpho_dics; i++) { free_abstract_INF(p->morpho_dic_inf[i],&(p->morpho_dic_inf_free[i])); free_abstract_BIN(p->morpho_dic_bin[i],&(p->morpho_dic_bin_free[i])); } free(p->morpho_dic_inf); free(p->morpho_dic_inf_free); free(p->morpho_dic_bin); free(p->morpho_dic_bin_free); #if (defined(UNITEX_LIBRARY) || defined(UNITEX_RELEASE_MEMORY_AT_EXIT)) 
free_DLC_tree(p->DLC_tree); #endif free_locate_parameters(p); u_printf("Done.\n"); return 1; }
int main_CheckDic(int argc,char* const argv[]) { if (argc==1) { usage(); return 0; } int is_a_DELAF=-1; int strict_unprotected=0; int skip_path=0; char alph[FILENAME_MAX]=""; Encoding encoding_output = DEFAULT_ENCODING_OUTPUT; int bom_output = DEFAULT_BOM_OUTPUT; int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT; int val,index=-1; int space_warnings=1; struct OptVars* vars=new_OptVars(); while (EOF!=(val=getopt_long_TS(argc,argv,optstring_CheckDic,lopts_CheckDic,&index,vars))) { switch(val) { case 'f': is_a_DELAF=1; break; case 's': is_a_DELAF=0; break; case 'h': usage(); return 0; case 'r': strict_unprotected=1; break; case 't': strict_unprotected=0; break; case 'n': space_warnings=0; break; case 'p': skip_path=1; break; case 'a': if (vars->optarg[0]=='\0') { fatal_error("Empty alphabet argument\n"); } strcpy(alph,vars->optarg); break; case 'k': if (vars->optarg[0]=='\0') { fatal_error("Empty input_encoding argument\n"); } decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg); break; case 'q': if (vars->optarg[0]=='\0') { fatal_error("Empty output_encoding argument\n"); } decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg); break; case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt); else fatal_error("Missing argument for option --%s\n",lopts_CheckDic[index].name); case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt); else fatal_error("Invalid option --%s\n",vars->optarg); break; } index=-1; } if (is_a_DELAF==-1 || vars->optind!=argc-1) { error("Invalid arguments: rerun with --help\n"); return 1; } U_FILE* dic=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ); if (dic==NULL) { fatal_error("Cannot open dictionary %s\n",argv[vars->optind]); } Alphabet* alphabet0=NULL; if (alph[0]!='\0') { alphabet0=load_alphabet(alph,1); } char output_filename[FILENAME_MAX]; 
get_path(argv[vars->optind],output_filename); strcat(output_filename,"CHECK_DIC.TXT"); U_FILE* out=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,output_filename,U_WRITE); if (out==NULL) { u_fclose(dic); fatal_error("Cannot create %s\n",output_filename); } u_printf("Checking %s...\n",argv[vars->optind]); unichar line[CHECKDIC_LINE_SIZE]; int line_number=1; /* * We declare and initialize an array in order to know which * letters are used in the dictionary. */ int i; char* alphabet=(char*)malloc(sizeof(char)*MAX_NUMBER_OF_UNICODE_CHARS); if (alphabet==NULL) { fatal_alloc_error("CheckDic's main"); } memset(alphabet,0,sizeof(char)*MAX_NUMBER_OF_UNICODE_CHARS); /* * We use two structures for the storage of the codes found in the * dictionary. Note that 'semantic_codes' is used to store both grammatical and * semantic codes. */ struct string_hash* semantic_codes=new_string_hash(); struct string_hash* inflectional_codes=new_string_hash(); struct string_hash* simple_lemmas=new_string_hash(DONT_USE_VALUES); struct string_hash* compound_lemmas=new_string_hash(DONT_USE_VALUES); int n_simple_entries=0; int n_compound_entries=0; /* * We read all the lines and check them. 
*/ while (EOF!=u_fgets_limit2(line,DIC_LINE_SIZE,dic)) { if (line[0]=='\0') { /* If we have an empty line, we print a unicode error message * into the output file */ u_fprintf(out,"Line %d: empty line\n",line_number); } else if (line[0]=='/') { /* If a line starts with '/', it is a commment line, so * we ignore it */ } else { /* If we have a line to check, we check it according to the * dictionary type */ check_DELA_line(line,out,is_a_DELAF,line_number,alphabet,semantic_codes, inflectional_codes,simple_lemmas,compound_lemmas, &n_simple_entries,&n_compound_entries,alphabet0,strict_unprotected); } /* At regular intervals, we display a message on the standard * output to show that the program is working */ if (line_number%10000==0) { u_printf("%d lines read...\r",line_number); } line_number++; } u_printf("%d lines read\n",line_number-1); u_fclose(dic); /* * Once we have checked all the lines, we print some informations * in the output file. */ u_fprintf(out,"-----------------------------------\n"); u_fprintf(out,"------------- Stats -------------\n"); u_fprintf(out,"-----------------------------------\n"); if (skip_path != 0) { char filename_without_path[FILENAME_MAX]; remove_path(argv[vars->optind],filename_without_path); u_fprintf(out,"File: %s\n",filename_without_path); } else { u_fprintf(out,"File: %s\n",argv[vars->optind]); } u_fprintf(out,"Type: %s\n",is_a_DELAF?"DELAF":"DELAS"); u_fprintf(out,"%d line%s read\n",line_number-1,(line_number-1>1)?"s":""); u_fprintf(out,"%d simple entr%s ",n_simple_entries,(n_simple_entries>1)?"ies":"y"); u_fprintf(out,"for %d distinct lemma%s\n",simple_lemmas->size,(simple_lemmas->size>1)?"s":""); u_fprintf(out,"%d compound entr%s ",n_compound_entries,(n_compound_entries>1)?"ies":"y"); u_fprintf(out,"for %d distinct lemma%s\n",compound_lemmas->size,(compound_lemmas->size>1)?"s":""); /** * We print the list of the characters that are used, with * their unicode numbers shown in hexadecimal. 
This can be useful * to detect different characters that are graphically identical * like 'A' (upper of latin 'a' or upper of greek alpha ?). */ u_fprintf(out,"-----------------------------------\n"); u_fprintf(out,"---- All chars used in forms ----\n"); u_fprintf(out,"-----------------------------------\n"); unichar r[4]; unichar r2[7]; r[1]=' '; r[2]='('; r[3]='\0'; r2[5]='\n'; r2[6]='\0'; for (i=0;i<MAX_NUMBER_OF_UNICODE_CHARS;i++) { if (alphabet[i]) { u_fprintf(out,"%C (%04X)\n",i,i); } } /* * Then we print the list of all grammatical and semantic codes used in the * dictionary. If a code contains a non ASCII character, a space or a tabulation, * we print a warning. */ u_fprintf(out,"-------------------------------------------------------------\n"); u_fprintf(out,"---- %3d grammatical/semantic code%s",semantic_codes->size,(semantic_codes->size>1)?"s used in dictionary ----\n":" used in dictionary -----\n"); u_fprintf(out,"-------------------------------------------------------------\n"); unichar comment[2000]; for (i=0;i<semantic_codes->size;i++) { /* We print the code, followed if necessary by a warning */ u_fprintf(out,"%S",semantic_codes->value[i]); if (warning_on_code(semantic_codes->value[i],comment,space_warnings)) { u_fprintf(out," %S",comment); } u_fprintf(out,"\n"); } /* * Finally, we print the list of inflectional codes, * with warnings in the case of non ASCII letters, spaces * or tabulations. 
*/ u_fprintf(out,"-----------------------------------------------------\n"); u_fprintf(out,"---- %3d inflectional code%s",inflectional_codes->size,(inflectional_codes->size>1)?"s used in dictionary ----\n":" used in dictionary -----\n"); u_fprintf(out,"-----------------------------------------------------\n"); for (i=0;i<inflectional_codes->size;i++) { u_fprintf(out,"%S",inflectional_codes->value[i]); if (warning_on_code(inflectional_codes->value[i],comment,space_warnings)) { u_fprintf(out," %S",comment); } u_fprintf(out,"\n"); } u_fclose(out); free_OptVars(vars); u_printf("Done.\n"); /* Note that we don't free anything since it would only waste time */ free(alphabet); if (alphabet0!=NULL) { free_alphabet(alphabet0); } #if (defined(UNITEX_LIBRARY) || defined(UNITEX_RELEASE_MEMORY_AT_EXIT)) /* cleanup for no leak on library */ free_string_hash(semantic_codes); free_string_hash(inflectional_codes); free_string_hash(simple_lemmas); free_string_hash(compound_lemmas); #endif return 0; }
/**
 * Returns a new string_hash object with the default capacity.
 * Its bound policy will be to enlarge the 'value' array if needed.
 *
 * C++ overload that simply delegates to the parameterized constructor
 * function with the default size.
 */
struct string_hash* new_string_hash() {
return new_string_hash(DEFAULT_STRING_HASH_SIZE,ENLARGE_IF_NEEDED);
}
/**
 * Returns a new string_hash object with the given capacity. Its
 * bound policy will be to enlarge the 'value' array if needed.
 *
 * C++ overload that delegates to the two-argument version with the
 * ENLARGE_IF_NEEDED policy.
 */
struct string_hash* new_string_hash(int capacity) {
return new_string_hash(capacity,ENLARGE_IF_NEEDED);
}
/** * This function produces a normalized version of 'input' and stores it into 'ouput'. * The following rules are applied in the given order: * * 1) If there is a { at the current position, we try to read a {S}, a {STOP} or * a tag token like {today,.ADV}. If we fail, we replace the { and the }, if any, * according to the replacement rules. Otherwise, we let the token unchanged. * 2) If there is one or more replacement rules that can apply to the current * position in 'input', then we apply the longest one. * 3) If we we find a separator (space, tab, new line) sequence, we replace it: * - by a new line if the sequence contains one and if 'carriage_return_policy' is * set to KEEP_CARRIAGE_RETURN; * - by a space otherwise. * 4) We copy the character that was read to the output. * * Note that 'replacements' is supposed to contain replacement rules for { and } */ int normalize(const char *fin, const char *fout, Encoding encoding_output, int bom_output, int mask_encoding_compatibility_input, int carriage_return_policy, const char *rules) { U_FILE* input; input = u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,fin,U_READ); if (input == NULL) { error("Cannot open file %s\n", fin); return 1; } U_FILE* output; output = u_fopen_creating_versatile_encoding(encoding_output,bom_output,fout,U_WRITE); if (output == NULL) { error("Cannot create file %s\n", fout); u_fclose(input); return 1; } struct string_hash* replacements=NULL; if(rules != NULL && rules[0]!='\0') { replacements=load_key_value_list(rules,mask_encoding_compatibility_input,'\t'); if (replacements==NULL) { error("Cannot load replacement rules file %s\n", rules); replacements=new_string_hash(); } } /* If there is no replacement rules file, we simulate one */ else { replacements=new_string_hash(); } /* If there is a replacement rule file, we ensure that there are replacement * rules for { and }. 
If not, we add our default ones, so that in any case, * we are sure to have rules for { and } */ unichar key[2]; unichar value[2]; u_strcpy(key,"{"); u_strcpy(value,"["); get_value_index(key,replacements,INSERT_IF_NEEDED,value); u_strcpy(key,"}"); u_strcpy(value,"]"); get_value_index(key,replacements,INSERT_IF_NEEDED,value); struct OUTBUF OutBuf; OutBuf.pos=0; unichar tmp[MAX_TAG_LENGTH]; //struct buffer* buffer=new_buffer_for_file(UNICHAR_BUFFER,input); long save_pos=ftell(input); fseek(input,0,SEEK_END); long file_size_input=ftell(input); fseek(input,save_pos,SEEK_SET); int line_buffer_size = (int)(((file_size_input+1) < MAX_LINE_BUFFER_SIZE) ? (file_size_input+1) : MAX_LINE_BUFFER_SIZE); unichar *line_read; line_read=(unichar*)malloc((line_buffer_size+0x10)*sizeof(unichar)); if (line_read==NULL) { fatal_alloc_error("normalize"); } /* We define some things that will be used for parsing the buffer */ static const unichar stop_chars[]= { '{', '}', 0 }; static const unichar forbidden_chars[]= { '\n', 0 }; static const unichar open_bracket[]= { '{', 0 }; static const unichar close_bracket[]= { '}', 0 }; static const unichar empty_string[]= { 0 }; int corrupted_file=0; int eof_found=0; /* First, we fill the buffer */ int lastline_was_terminated=0; while (eof_found==0) { int current_start_pos=0; int found_null=0; const unichar*buff=line_read; int result_read = 0; result_read = u_fgets_treat_cr_as_lf(line_read,line_buffer_size,input,1,&found_null); if ((found_null != 0) && (corrupted_file==0)) { corrupted_file=1; error("Corrupted text file containing NULL characters!\n"); error("They have been ignored by Normalize, but you should clean your text\n"); } if (result_read>0) if (line_read[result_read-1]==0x0d) line_read[result_read-1]='\n'; if (result_read==EOF) break; if (lastline_was_terminated != 0) while (current_start_pos<result_read) { if (buff[current_start_pos]!=' ' && buff[current_start_pos]!='\t' && buff[current_start_pos]!=0x0d && buff[current_start_pos]!='\n') 
break; current_start_pos++; } lastline_was_terminated = 0; if (result_read > 0) if ((buff[result_read-1]=='\n') || (buff[result_read-1]==0x0d)) lastline_was_terminated = 1; while (current_start_pos<result_read) { if ((lastline_was_terminated == 0) && (eof_found == 0) && (current_start_pos + MINIMAL_CHAR_IN_BUFFER_BEFORE_CONTINUE_LINE >= result_read)) { int i; int nb_to_keep = result_read-current_start_pos; for (i=0;i<nb_to_keep;i++) line_read[i]=line_read[current_start_pos+i]; int found_null_read=0; int result_read_continue = u_fgets_treat_cr_as_lf(line_read+nb_to_keep,line_buffer_size-nb_to_keep,input,1,&found_null_read); if ((found_null_read != 0) && (corrupted_file==0)) { corrupted_file=1; error("Corrupted text file containing NULL characters!\n"); error("They have been ignored by Normalize, but you should clean your text\n"); } if (result_read_continue>0) if (line_read[(result_read_continue+nb_to_keep)-1]==0x0d) line_read[(result_read_continue+nb_to_keep)-1]='\n'; lastline_was_terminated = 0; if (result_read_continue==EOF) eof_found = lastline_was_terminated = 1; if (result_read_continue > 0) if ((buff[(result_read_continue+nb_to_keep)-1]=='\n') || (buff[(result_read_continue+nb_to_keep)-1]==0x0d)) lastline_was_terminated = 1; result_read = nb_to_keep; current_start_pos = 0; if (result_read_continue > 0) result_read += result_read_continue; } if (buff[current_start_pos]=='{') { /* If we have a {, we try to find a sequence like {....}, that does not contain * new lines. If the sequence contains protected character, we want to keep them * protected. 
*/ int old_position=current_start_pos; /* If we don't increase the position, the parse will stop on the initial { */ current_start_pos++; tmp[0]='{'; int code=parse_string(buff,¤t_start_pos,&(tmp[1]),stop_chars,forbidden_chars,NULL); if (code==P_FORBIDDEN_CHAR || code==P_BACKSLASH_AT_END || buff[current_start_pos]!='}') { /* If we have found a new line or a {, or if there is * a backslash at the end of the buffer, or if we have reached the end * of the buffer, we assume that the initial * { was not a tag beginning, so we print the substitute of { */ WriteOufBuf(&OutBuf,replacements->value[get_value_index(open_bracket,replacements)],output, 0); /* And we rewind the current position after the { */ current_start_pos=old_position+1; } else { /* If we have read a sequence like {....}, we assume that there won't be * a buffer overflow if we add the } */ u_strcat(tmp,close_bracket); if (!u_strcmp(tmp,"{S}") || !u_strcmp(tmp,"{STOP}") || check_tag_token(tmp)) { /* If this is a special tag or a valid tag token, we just print * it to the output */ WriteOufBuf(&OutBuf,tmp,output, 0); current_start_pos++; } else { /* If we have a non valid tag token, we print the equivalent of { * and we rewind the current position after the { */ WriteOufBuf(&OutBuf,replacements->value[get_value_index(open_bracket,replacements)],output, 0); current_start_pos=old_position+1; } } } else { /* If we have a character that is not {, first we try to look if there * is a replacement to do */ int key_length; int index=get_longest_key_index(&buff[current_start_pos],&key_length,replacements); if (index!=NO_VALUE_INDEX) { /* If there is something to replace */ WriteOufBuf(&OutBuf,replacements->value[index],output, 0); current_start_pos=current_start_pos+key_length; } else { if (buff[current_start_pos]==' ' || buff[current_start_pos]=='\t' || buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) { /* If we have a separator, we try to read the longest separator sequence * that we can read. 
By the way, we note if it contains a new line */ int new_line=0; while (buff[current_start_pos]==' ' || buff[current_start_pos]=='\t' || buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) { /* Note 1: no bound check is needed, since an unichar buffer is always * ended by a \0 * * Note 2: we don't take into account the case of a buffer ended by * separator while it's not the end of file: that would mean * that the text contains something like MARGIN_BEFORE_BUFFER_END * contiguous separators. Such a text would not be a reasonable one. */ if (buff[current_start_pos]=='\n' || buff[current_start_pos]==0x0d) { new_line=1; } current_start_pos++; } if (new_line && (carriage_return_policy==KEEP_CARRIAGE_RETURN)) { /* We print a new line if the sequence contains one and if we are * allowed to; otherwise, we print a space. */ WriteOufBuf(&OutBuf,'\n',output, 0); } else { WriteOufBuf(&OutBuf,' ',output, 0); } } else { /* If, finally, we have a normal character to normalize, we just print it */ WriteOufBuf(&OutBuf,buff[current_start_pos++],output, 0); } } } } } WriteOufBuf(&OutBuf,empty_string,output, 1); free(line_read); free_string_hash(replacements); u_fclose(input); u_fclose(output); return 0; }
int main_PolyLex(int argc,char* const argv[]) { if (argc==1) { usage(); return SUCCESS_RETURN_CODE; } int language=-1; char alphabet[FILENAME_MAX]=""; char name_bin[FILENAME_MAX]=""; char output[FILENAME_MAX]=""; char info[FILENAME_MAX]=""; VersatileEncodingConfig vec=VEC_DEFAULT; int val,index=-1; bool only_verify_arguments = false; UnitexGetOpt options; while (EOF!=(val=options.parse_long(argc,argv,optstring_PolyLex,lopts_PolyLex,&index))) { switch(val) { case 'D': language=DUTCH; break; case 'G': language=GERMAN; break; case 'N': language=NORWEGIAN; break; case 'R': language=RUSSIAN; break; case 'a': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty alphabet file name\n"); return USAGE_ERROR_CODE; } strcpy(alphabet,options.vars()->optarg); break; case 'd': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty dictionary file name\n"); return USAGE_ERROR_CODE; } strcpy(name_bin,options.vars()->optarg); break; case 'o': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty output file name\n"); return USAGE_ERROR_CODE; } strcpy(output,options.vars()->optarg); break; case 'i': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty information file name\n"); return USAGE_ERROR_CODE; } strcpy(info,options.vars()->optarg); break; case 'k': if (options.vars()->optarg[0]=='\0') { error("Empty input_encoding argument\n"); return USAGE_ERROR_CODE; } decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg); break; case 'q': if (options.vars()->optarg[0]=='\0') { error("Empty output_encoding argument\n"); return USAGE_ERROR_CODE; } decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg); break; case 'V': only_verify_arguments = true; break; case 'h': usage(); return SUCCESS_RETURN_CODE; case ':': index==-1 ? 
error("Missing argument for option -%c\n",options.vars()->optopt) : error("Missing argument for option --%s\n",lopts_PolyLex[index].name); return USAGE_ERROR_CODE; case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) : error("Invalid option --%s\n",options.vars()->optarg); return USAGE_ERROR_CODE; } index=-1; } if (options.vars()->optind!=argc-1) { error("Invalid arguments: rerun with --help\n"); return USAGE_ERROR_CODE; } if (name_bin[0]=='\0') { error("You must specify the .bin dictionary to use\n"); return USAGE_ERROR_CODE; } if (output[0]=='\0') { error("You must specify the output dictionary file name\n"); return USAGE_ERROR_CODE; } if (language==-1) { error("You must specify the language\n"); return USAGE_ERROR_CODE; } if (only_verify_arguments) { // freeing all allocated memory return SUCCESS_RETURN_CODE; } Alphabet* alph=NULL; if (alphabet[0]!='\0') { u_printf("Loading alphabet...\n"); alph=load_alphabet(&vec,alphabet); if (alph==NULL) { error("Cannot load alphabet file %s\n",alphabet); return USAGE_ERROR_CODE; } } char name_inf[FILENAME_MAX]; struct string_hash* forbiddenWords=NULL; if (language==DUTCH || language==NORWEGIAN) { get_path(name_bin,name_inf); strcat(name_inf,"ForbiddenWords.txt"); forbiddenWords=load_key_list(&vec,name_inf); if (forbiddenWords==NULL) { /* If there was no file, we don't want to block the process */ forbiddenWords=new_string_hash(DONT_USE_VALUES); } } strcpy(name_inf,name_bin); name_inf[strlen(name_bin)-3]='\0'; strcat(name_inf,"inf"); Dictionary* d=new_Dictionary(&vec,name_bin,name_inf); if (d==NULL) { error("Cannot load dictionary %s\n",name_bin); free_string_hash(forbiddenWords); free_alphabet(alph); return DEFAULT_ERROR_CODE; } char tmp[FILENAME_MAX]; strcpy(tmp,argv[options.vars()->optind]); strcat(tmp,".tmp"); U_FILE* words=u_fopen(&vec,argv[options.vars()->optind],U_READ); if (words==NULL) { error("Cannot open word list file %s\n",argv[options.vars()->optind]); free_Dictionary(d); 
free_string_hash(forbiddenWords); free_alphabet(alph); // here we return 0 in order to do not block the preprocessing // in the Unitex/GramLab IDE interface, if no dictionary was applied // so that there is no "err" file return SUCCESS_RETURN_CODE; } U_FILE* new_unknown_words=u_fopen(&vec,tmp,U_WRITE); if (new_unknown_words==NULL) { error("Cannot open temporary word list file %s\n",tmp); u_fclose(words); free_Dictionary(d); free_string_hash(forbiddenWords); free_alphabet(alph); return DEFAULT_ERROR_CODE; } U_FILE* res=u_fopen(&vec,output,U_APPEND); if (res==NULL) { error("Cannot open result file %s\n",output); u_fclose(new_unknown_words); u_fclose(words); free_Dictionary(d); free_string_hash(forbiddenWords); free_alphabet(alph); u_fclose(words); return DEFAULT_ERROR_CODE; } U_FILE* debug=NULL; if ((*info)!='\0') { debug=u_fopen(&vec,info,U_WRITE); if (debug==NULL) { error("Cannot open debug file %s\n",info); } } struct utags UTAG; switch(language) { case DUTCH: analyse_dutch_unknown_words(alph, d, words, res, debug, new_unknown_words, forbiddenWords); break; case GERMAN: analyse_german_compounds(alph, d, words, res, debug, new_unknown_words); break; case NORWEGIAN: analyse_norwegian_unknown_words(alph, d, words, res, debug, new_unknown_words, forbiddenWords); break; case RUSSIAN: init_russian(&UTAG); analyse_compounds(alph, d, words, res, debug, new_unknown_words, UTAG); break; } free_alphabet(alph); free_Dictionary(d); u_fclose(words); u_fclose(new_unknown_words); free_string_hash(forbiddenWords); af_remove(argv[options.vars()->optind]); af_rename(tmp,argv[options.vars()->optind]); u_fclose(res); if (debug!=NULL) { u_fclose(debug); } return SUCCESS_RETURN_CODE; }
// // this function builds the normalization grammar adapted to the match list // passed in parameter // void build_portuguese_normalization_grammar(const Alphabet* alph,struct match_list* list,const unsigned char* root_bin, const struct INF_codes* root_inf,const unsigned char* inflected_bin, const struct INF_codes* inflected_inf,const char* res_grf_name, Encoding encoding_output, int bom_output, struct normalization_tree* norm_tree, struct normalization_tree* nasal_norm_tree) { DISCARD_UNUSED_PARAMETER(nasal_norm_tree) struct match_list* L=list; int N=0; unichar temp[2000]; unichar prefix[2000]; struct string_hash* hash=new_string_hash(); while (L!=NULL) { if (L->output!=NULL) { // first, we normalize the sequences by removing all spaces u_strcpy_without_space(temp,L->output); u_strcpy(L->output,temp); // then we check if this sequence has already been processed int J=get_value_index(L->output,hash,DONT_INSERT); if (J!=-1) { // if the sequence has already been analyzed, we do nothing } else { get_value_index(L->output,hash); get_bracket_prefix(L->output,prefix); if (!u_strcmp(prefix,"FuturConditional")) { N=N+replace_match_output_by_normalization_line(L,alph,root_bin,root_inf,inflected_bin,