/**
 * Applies the matches of one cascade stage to the token list.
 *
 * Reads the concord.ind file produced for 'text', and for each match found
 * there, splices the tokenized replacement label into 'list' at the match
 * position, tagging it with 'transducer_id'.
 *
 * Returns the (head of the) token list; the list is modified in place.
 */
cassys_tokens_list *add_replaced_text(const char *text, cassys_tokens_list *list,
        int transducer_id, const char *alphabet_name,
        int mask_encoding_compatibility_input) {
    Alphabet *alph = load_alphabet(alphabet_name);
    struct snt_files *snt_files = new_snt_files(text);
    struct fifo *matches = read_concord_file(snt_files->concord_ind,
            mask_encoding_compatibility_input);
    /* Matches come out of the concordance sorted by position, so we keep a
     * cursor into the token list and resume each lookup from there instead
     * of rescanning from the head (performance enhancement). */
    cassys_tokens_list *cursor = list;
    long cursor_token = 0;
    int match_count = 0;
    while (!is_empty(matches)) {
        match_count++;
        locate_pos *match = (locate_pos*) take_ptr(matches);
        struct list_ustring *replacement_tokens =
                cassys_tokenize_word_by_word(match->label, alph);
        cassys_tokens_list *replacement =
                new_list(replacement_tokens, transducer_id);
        /* Offset is relative to the cursor, hence the subtraction. */
        cassys_tokens_list *anchor = get_element_at(cursor, transducer_id - 1,
                match->token_start_offset - cursor_token);
        int replaced_length = match->token_end_offset - match->token_start_offset + 1;
        int replacement_length = length(replacement_tokens);
        add_output(anchor, replacement, transducer_id,
                replaced_length, replacement_length - 1);
        /* Advance the cursor for the next (later) match. */
        cursor = anchor;
        cursor_token = match->token_start_offset;
        free(match->label);
        free(match);
        free_list_ustring(replacement_tokens);
    }
    free_fifo(matches);
    free_snt_files(snt_files);
    free_alphabet(alph);
    return list;
}
/**
 * Loads an alphabet file and returns the associated 'Alphabet*' structure.
 * If 'korean' is non null, we compute the equivalences between Chinese and Hangul
 * characters.
 *
 * Returns NULL if the file cannot be opened or is malformed.
 *
 * Fix: the results of u_fgetc() are now checked against EOF before being
 * cast to unichar; previously a truncated file silently produced bogus
 * 0xFFFF letters in the alphabet.
 */
Alphabet* load_alphabet(const VersatileEncodingConfig* vec,const char* filename,int korean) {
    /* Reuse an already-loaded persistent alphabet if one is registered. */
    void* a=get_persistent_structure(filename);
    if (a!=NULL) {
        return (Alphabet*)a;
    }
    U_FILE* f=u_fopen(vec,filename,U_READ);
    if (f==NULL) {
        return NULL;
    }
    Alphabet* alphabet=new_alphabet(korean);
    int c;
    unichar lower,upper;
    while ((c=u_fgetc(f))!=EOF) {
        upper=(unichar)c;
        if (upper=='\n') {
            /* We skip empty lines */
            continue;
        }
        if (upper=='#') {
            /* We are in the case of an interval #AZ -> [A..Z] */
            int lo=u_fgetc(f);
            int up=u_fgetc(f);
            if (lo==EOF || up==EOF) {
                /* Fix: reject a truncated interval instead of casting EOF */
                error("Error in alphabet file: truncated interval\n");
                free_alphabet(alphabet);
                u_fclose(f);
                return NULL;
            }
            lower=(unichar)lo;
            upper=(unichar)up;
            if (lower>upper) {
                error("Error in alphabet file: for an interval like #AZ, A must be before Z\n");
                free_alphabet(alphabet);
                u_fclose(f);
                return NULL;
            }
            for (c=lower;c<=upper;c++) {
                /* Each letter in the interval is both an upper- and lower-case
                 * letter (flags 1|2) and is its own equivalence. */
                SET_CASE_FLAG_MACRO(c,alphabet,1|2);
                add_letter_equivalence(alphabet,(unichar)c,(unichar)c);
            }
            u_fgetc(f); /* reading the \n */
        } else {
            SET_CASE_FLAG_MACRO(upper,alphabet,1);
            int lo=u_fgetc(f);
            if (lo!=EOF && (unichar)lo!='\n') {
                /* Normal "Aa" line: upper-case letter followed by its lower-case */
                lower=(unichar)lo;
                SET_CASE_FLAG_MACRO(lower,alphabet,2);
                u_fgetc(f); /* reading the \n */
                add_letter_equivalence(alphabet,lower,upper);
            } else {
                /* We are in the case of a single letter with no min/maj
                 * distinction, like in Thai (EOF is treated like '\n' so a
                 * missing final newline does not corrupt the alphabet). */
                SET_CASE_FLAG_MACRO(upper,alphabet,2);
                add_letter_equivalence(alphabet,upper,upper);
            }
        }
    }
    u_fclose(f);
    return alphabet;
}
/** * Frees the given structure */ void free_fst2txt_parameters(struct fst2txt_parameters* p) { if (p==NULL) return; free(p->text_file); free(p->temp_file); free(p->fst_file); free(p->alphabet_file); for (int i=0;i<p->n_token_trees;i++) { free_fst2txt_token_tree(p->token_tree[i]); } if (p->token_tree!=NULL) { free(p->token_tree); } free_Variables(p->variables); free_buffer(p->text_buffer); free_abstract_Fst2(p->fst2,NULL); free_alphabet(p->alphabet); free_stack_unichar(p->stack); free(p); }
/**
 * Deallocate the quantizer list as well as any alphabets or pmfs that are stored
 * @param list The conditional quantizer list to deallocate (NULL is allowed)
 */
void free_cond_quantizer_list(struct cond_quantizer_list_t *list) {
    uint32_t i, j;
    /* Fix: tolerate NULL like free() and the other free_* helpers do;
     * previously list->columns was dereferenced unconditionally. */
    if (list == NULL)
        return;
    for (i = 0; i < list->columns; ++i) {
        /* Columns with no quantizer array own no per-column resources. */
        if (list->q[i]) {
            for (j = 0; j < list->input_alphabets[i]->size; ++j) {
                if (list->q[i][j])
                    free_quantizer(list->q[i][j]);
            }
            free_alphabet(list->input_alphabets[i]);
            free(list->q[i]);
            free(list->ratio[i]);
            free(list->qratio[i]);
        }
    }
    free(list->qratio);
    free(list->ratio);
    free(list->q);
    free(list->input_alphabets);
    free(list);
}
int main_PolyLex(int argc,char* const argv[]) { if (argc==1) { usage(); return SUCCESS_RETURN_CODE; } int language=-1; char alphabet[FILENAME_MAX]=""; char name_bin[FILENAME_MAX]=""; char output[FILENAME_MAX]=""; char info[FILENAME_MAX]=""; VersatileEncodingConfig vec=VEC_DEFAULT; int val,index=-1; bool only_verify_arguments = false; UnitexGetOpt options; while (EOF!=(val=options.parse_long(argc,argv,optstring_PolyLex,lopts_PolyLex,&index))) { switch(val) { case 'D': language=DUTCH; break; case 'G': language=GERMAN; break; case 'N': language=NORWEGIAN; break; case 'R': language=RUSSIAN; break; case 'a': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty alphabet file name\n"); return USAGE_ERROR_CODE; } strcpy(alphabet,options.vars()->optarg); break; case 'd': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty dictionary file name\n"); return USAGE_ERROR_CODE; } strcpy(name_bin,options.vars()->optarg); break; case 'o': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty output file name\n"); return USAGE_ERROR_CODE; } strcpy(output,options.vars()->optarg); break; case 'i': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty information file name\n"); return USAGE_ERROR_CODE; } strcpy(info,options.vars()->optarg); break; case 'k': if (options.vars()->optarg[0]=='\0') { error("Empty input_encoding argument\n"); return USAGE_ERROR_CODE; } decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg); break; case 'q': if (options.vars()->optarg[0]=='\0') { error("Empty output_encoding argument\n"); return USAGE_ERROR_CODE; } decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg); break; case 'V': only_verify_arguments = true; break; case 'h': usage(); return SUCCESS_RETURN_CODE; case ':': index==-1 ? 
error("Missing argument for option -%c\n",options.vars()->optopt) : error("Missing argument for option --%s\n",lopts_PolyLex[index].name); return USAGE_ERROR_CODE; case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) : error("Invalid option --%s\n",options.vars()->optarg); return USAGE_ERROR_CODE; } index=-1; } if (options.vars()->optind!=argc-1) { error("Invalid arguments: rerun with --help\n"); return USAGE_ERROR_CODE; } if (name_bin[0]=='\0') { error("You must specify the .bin dictionary to use\n"); return USAGE_ERROR_CODE; } if (output[0]=='\0') { error("You must specify the output dictionary file name\n"); return USAGE_ERROR_CODE; } if (language==-1) { error("You must specify the language\n"); return USAGE_ERROR_CODE; } if (only_verify_arguments) { // freeing all allocated memory return SUCCESS_RETURN_CODE; } Alphabet* alph=NULL; if (alphabet[0]!='\0') { u_printf("Loading alphabet...\n"); alph=load_alphabet(&vec,alphabet); if (alph==NULL) { error("Cannot load alphabet file %s\n",alphabet); return USAGE_ERROR_CODE; } } char name_inf[FILENAME_MAX]; struct string_hash* forbiddenWords=NULL; if (language==DUTCH || language==NORWEGIAN) { get_path(name_bin,name_inf); strcat(name_inf,"ForbiddenWords.txt"); forbiddenWords=load_key_list(&vec,name_inf); if (forbiddenWords==NULL) { /* If there was no file, we don't want to block the process */ forbiddenWords=new_string_hash(DONT_USE_VALUES); } } strcpy(name_inf,name_bin); name_inf[strlen(name_bin)-3]='\0'; strcat(name_inf,"inf"); Dictionary* d=new_Dictionary(&vec,name_bin,name_inf); if (d==NULL) { error("Cannot load dictionary %s\n",name_bin); free_string_hash(forbiddenWords); free_alphabet(alph); return DEFAULT_ERROR_CODE; } char tmp[FILENAME_MAX]; strcpy(tmp,argv[options.vars()->optind]); strcat(tmp,".tmp"); U_FILE* words=u_fopen(&vec,argv[options.vars()->optind],U_READ); if (words==NULL) { error("Cannot open word list file %s\n",argv[options.vars()->optind]); free_Dictionary(d); 
free_string_hash(forbiddenWords); free_alphabet(alph); // here we return 0 in order to do not block the preprocessing // in the Unitex/GramLab IDE interface, if no dictionary was applied // so that there is no "err" file return SUCCESS_RETURN_CODE; } U_FILE* new_unknown_words=u_fopen(&vec,tmp,U_WRITE); if (new_unknown_words==NULL) { error("Cannot open temporary word list file %s\n",tmp); u_fclose(words); free_Dictionary(d); free_string_hash(forbiddenWords); free_alphabet(alph); return DEFAULT_ERROR_CODE; } U_FILE* res=u_fopen(&vec,output,U_APPEND); if (res==NULL) { error("Cannot open result file %s\n",output); u_fclose(new_unknown_words); u_fclose(words); free_Dictionary(d); free_string_hash(forbiddenWords); free_alphabet(alph); u_fclose(words); return DEFAULT_ERROR_CODE; } U_FILE* debug=NULL; if ((*info)!='\0') { debug=u_fopen(&vec,info,U_WRITE); if (debug==NULL) { error("Cannot open debug file %s\n",info); } } struct utags UTAG; switch(language) { case DUTCH: analyse_dutch_unknown_words(alph, d, words, res, debug, new_unknown_words, forbiddenWords); break; case GERMAN: analyse_german_compounds(alph, d, words, res, debug, new_unknown_words); break; case NORWEGIAN: analyse_norwegian_unknown_words(alph, d, words, res, debug, new_unknown_words, forbiddenWords); break; case RUSSIAN: init_russian(&UTAG); analyse_compounds(alph, d, words, res, debug, new_unknown_words, UTAG); break; } free_alphabet(alph); free_Dictionary(d); u_fclose(words); u_fclose(new_unknown_words); free_string_hash(forbiddenWords); af_remove(argv[options.vars()->optind]); af_rename(tmp,argv[options.vars()->optind]); u_fclose(res); if (debug!=NULL) { u_fclose(debug); } return SUCCESS_RETURN_CODE; }
int main_MultiFlex(int argc,char* const argv[]) { if (argc==1) { usage(); return SUCCESS_RETURN_CODE; } char output[FILENAME_MAX]=""; char config_dir[FILENAME_MAX]=""; char alphabet[FILENAME_MAX]=""; char pkgdir[FILENAME_MAX]=""; char* named=NULL; int is_korean=0; // default policy is to compile only out of date graphs GraphRecompilationPolicy graph_recompilation_policy = ONLY_OUT_OF_DATE; //Current language's alphabet int error_check_status=SIMPLE_AND_COMPOUND_WORDS; VersatileEncodingConfig vec=VEC_DEFAULT; int val,index=-1; bool only_verify_arguments = false; UnitexGetOpt options; while (EOF!=(val=options.parse_long(argc,argv,optstring_MultiFlex,lopts_MultiFlex,&index))) { switch(val) { case 'o': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty DELAF file name\n"); free(named); return USAGE_ERROR_CODE; } strcpy(output,options.vars()->optarg); break; case 'a': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty alphabet file name\n"); free(named); return USAGE_ERROR_CODE; } strcpy(alphabet,options.vars()->optarg); break; case 'd': strcpy(config_dir,options.vars()->optarg); break; case 'K': is_korean=1; break; case 's': error_check_status=ONLY_SIMPLE_WORDS; break; case 'c': error_check_status=ONLY_COMPOUND_WORDS; break; case 'f': graph_recompilation_policy = ALWAYS_RECOMPILE; break; case 'n': graph_recompilation_policy = NEVER_RECOMPILE; break; case 't': graph_recompilation_policy = ONLY_OUT_OF_DATE; break; case 'k': if (options.vars()->optarg[0]=='\0') { error("Empty input_encoding argument\n"); free(named); return USAGE_ERROR_CODE; } decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg); break; case 'q': if (options.vars()->optarg[0]=='\0') { error("Empty output_encoding argument\n"); free(named); return USAGE_ERROR_CODE; } decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg); break; case 'p': if 
(options.vars()->optarg[0]=='\0') { error("You must specify a non empty package directory name\n"); free(named); return USAGE_ERROR_CODE; } strcpy(pkgdir,options.vars()->optarg); break; case 'r': if (named==NULL) { named=strdup(options.vars()->optarg); if (named==NULL) { alloc_error("main_Grf2Fst2"); return ALLOC_ERROR_CODE; } } else { char* more_names = (char*)realloc((void*)named,strlen(named)+strlen(options.vars()->optarg)+2); if (more_names) { named = more_names; } else { alloc_error("main_MultiFlex"); free(named); return ALLOC_ERROR_CODE; } strcat(named,";"); strcat(named,options.vars()->optarg); } break; case 'V': only_verify_arguments = true; break; case 'h': usage(); free(named); return SUCCESS_RETURN_CODE; case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) : error("Missing argument for option --%s\n",lopts_MultiFlex[index].name); free(named); return USAGE_ERROR_CODE; case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) : error("Invalid option --%s\n",options.vars()->optarg); free(named); return USAGE_ERROR_CODE; } index=-1; } if (options.vars()->optind!=argc-1) { error("Invalid arguments: rerun with --help\n"); free(named); return USAGE_ERROR_CODE; } if (output[0]=='\0') { error("You must specify the output DELAF name\n"); free(named); return USAGE_ERROR_CODE; } if (only_verify_arguments) { // freeing all allocated memory free(named); return SUCCESS_RETURN_CODE; } //Load morphology description char morphology[FILENAME_MAX]; new_file(config_dir,"Morphology.txt",morphology); //int config_files_status=CONFIG_FILES_OK; Alphabet* alph=NULL; if (alphabet[0]!='\0') { //Load alphabet alph=load_alphabet(&vec,alphabet,1); //To be done once at the beginning of the inflection if (alph==NULL) { error("Cannot open alphabet file %s\n",alphabet); free(named); return DEFAULT_ERROR_CODE; } } //Init equivalence files char equivalences[FILENAME_MAX]; new_file(config_dir,"Equivalences.txt",equivalences); /* Korean */ 
Korean* korean=NULL; if (is_korean) { if (alph==NULL) { error("Cannot initialize Korean data with a NULL alphabet\n"); free(named); return DEFAULT_ERROR_CODE; } korean=new Korean(alph); } MultiFlex_ctx* p_multiFlex_ctx=new_MultiFlex_ctx(config_dir, morphology, equivalences, &vec, korean, pkgdir, named, graph_recompilation_policy); //DELAC inflection int return_value = inflect(argv[options.vars()->optind],output,p_multiFlex_ctx,alph,error_check_status); free(named); for (int count_free_fst2=0;count_free_fst2<p_multiFlex_ctx->n_fst2;count_free_fst2++) { free_abstract_Fst2(p_multiFlex_ctx->fst2[count_free_fst2],&(p_multiFlex_ctx->fst2_free[count_free_fst2])); p_multiFlex_ctx->fst2[count_free_fst2] = NULL; } free_alphabet(alph); free_MultiFlex_ctx(p_multiFlex_ctx); if (korean!=NULL) { delete korean; } u_printf("Done.\n"); return return_value; }
int locate_pattern(const char* text_cod,const char* tokens,const char* fst2_name,const char* dlf,const char* dlc,const char* err, const char* alphabet,MatchPolicy match_policy,OutputPolicy output_policy, Encoding encoding_output,int bom_output,int mask_encoding_compatibility_input, const char* dynamicDir,TokenizationPolicy tokenization_policy, SpacePolicy space_policy,int search_limit,const char* morpho_dic_list, AmbiguousOutputPolicy ambiguous_output_policy, VariableErrorPolicy variable_error_policy,int protect_dic_chars, int is_korean,int max_count_call,int max_count_call_warning, char* arabic_rules,int tilde_negation_operator,int useLocateCache,int allow_trace) { U_FILE* out; U_FILE* info; struct locate_parameters* p=new_locate_parameters(); p->text_cod=af_open_mapfile(text_cod,MAPFILE_OPTION_READ,0); p->buffer=(int*)af_get_mapfile_pointer(p->text_cod); long text_size=(long)af_get_mapfile_size(p->text_cod)/sizeof(int); p->buffer_size=(int)text_size; p->tilde_negation_operator=tilde_negation_operator; p->useLocateCache=useLocateCache; if (max_count_call == -1) { max_count_call = (int)text_size; } if (max_count_call_warning == -1) { max_count_call_warning = (int)text_size; } p->match_policy=match_policy; p->tokenization_policy=tokenization_policy; p->space_policy=space_policy; p->output_policy=output_policy; p->search_limit=search_limit; p->ambiguous_output_policy=ambiguous_output_policy; p->variable_error_policy=variable_error_policy; p->protect_dic_chars=protect_dic_chars; p->mask_encoding_compatibility_input = mask_encoding_compatibility_input; p->max_count_call = max_count_call; p->max_count_call_warning = max_count_call_warning; p->token_filename = tokens; char concord[FILENAME_MAX]; char concord_info[FILENAME_MAX]; strcpy(concord,dynamicDir); strcat(concord,"concord.ind"); strcpy(concord_info,dynamicDir); strcat(concord_info,"concord.n"); char morpho_bin[FILENAME_MAX]; strcpy(morpho_bin,dynamicDir); strcat(morpho_bin,"morpho.bin"); if (arabic_rules!=NULL && 
arabic_rules[0]!='\0') { load_arabic_typo_rules(arabic_rules,&(p->arabic)); } out=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord,U_WRITE); if (out==NULL) { error("Cannot write %s\n",concord); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); free_stack_unichar(p->stack); free_locate_parameters(p); u_fclose(out); return 0; } info=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord_info,U_WRITE); if (info==NULL) { error("Cannot write %s\n",concord_info); } switch(output_policy) { case IGNORE_OUTPUTS: u_fprintf(out,"#I\n"); break; case MERGE_OUTPUTS: u_fprintf(out,"#M\n"); break; case REPLACE_OUTPUTS: u_fprintf(out,"#R\n"); break; } if (alphabet!=NULL && alphabet[0]!='\0') { u_printf("Loading alphabet...\n"); p->alphabet=load_alphabet(alphabet,is_korean); if (p->alphabet==NULL) { error("Cannot load alphabet file %s\n",alphabet); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); free_stack_unichar(p->stack); free_locate_parameters(p); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } } struct string_hash* semantic_codes=new_string_hash(); extract_semantic_codes(dlf,semantic_codes); extract_semantic_codes(dlc,semantic_codes); if (is_cancelling_requested() != 0) { error("user cancel request.\n"); free_alphabet(p->alphabet); free_string_hash(semantic_codes); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); free_stack_unichar(p->stack); free_locate_parameters(p); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } u_printf("Loading fst2...\n"); struct FST2_free_info fst2load_free; Fst2* fst2load=load_abstract_fst2(fst2_name,1,&fst2load_free); if (fst2load==NULL) { error("Cannot load grammar %s\n",fst2_name); free_alphabet(p->alphabet); free_string_hash(semantic_codes); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); 
free_stack_unichar(p->stack); free_locate_parameters(p); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } Abstract_allocator locate_abstract_allocator=create_abstract_allocator("locate_pattern",AllocatorCreationFlagAutoFreePrefered); p->fst2=new_Fst2_clone(fst2load,locate_abstract_allocator); free_abstract_Fst2(fst2load,&fst2load_free); if (is_cancelling_requested() != 0) { error("User cancel request..\n"); free_alphabet(p->alphabet); free_string_hash(semantic_codes); free_Fst2(p->fst2,locate_abstract_allocator); close_abstract_allocator(locate_abstract_allocator); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); free_stack_unichar(p->stack); free_locate_parameters(p); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } p->tags=p->fst2->tags; #ifdef TRE_WCHAR p->filters=new_FilterSet(p->fst2,p->alphabet); if (p->filters==NULL) { error("Cannot compile filter(s)\n"); free_alphabet(p->alphabet); free_string_hash(semantic_codes); free_Fst2(p->fst2,locate_abstract_allocator); close_abstract_allocator(locate_abstract_allocator); free_stack_unichar(p->stack); free_locate_parameters(p); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } #endif u_printf("Loading token list...\n"); int n_text_tokens=0; p->tokens=load_text_tokens_hash(tokens,mask_encoding_compatibility_input,&(p->SENTENCE),&(p->STOP),&n_text_tokens); if (p->tokens==NULL) { error("Cannot load token list %s\n",tokens); free_alphabet(p->alphabet); free_string_hash(semantic_codes); free_Fst2(p->fst2,locate_abstract_allocator); close_abstract_allocator(locate_abstract_allocator); free_locate_parameters(p); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } Abstract_allocator locate_work_abstract_allocator = locate_abstract_allocator; p->match_cache=(LocateCache*)malloc_cb(p->tokens->size 
* sizeof(LocateCache),locate_work_abstract_allocator); memset(p->match_cache,0,p->tokens->size * sizeof(LocateCache)); if (p->match_cache==NULL) { fatal_alloc_error("locate_pattern"); } #ifdef TRE_WCHAR p->filter_match_index=new_FilterMatchIndex(p->filters,p->tokens); if (p->filter_match_index==NULL) { error("Cannot optimize filter(s)\n"); free_alphabet(p->alphabet); free_string_hash(semantic_codes); free_string_hash(p->tokens); close_abstract_allocator(locate_abstract_allocator); free_locate_parameters(p); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); if (info!=NULL) u_fclose(info); u_fclose(out); return 0; } #endif if (allow_trace!=0) { open_locate_trace(p,&p->fnc_locate_trace_step,&p->private_param_locate_trace); } extract_semantic_codes_from_tokens(p->tokens,semantic_codes,locate_abstract_allocator); u_printf("Loading morphological dictionaries...\n"); load_morphological_dictionaries(morpho_dic_list,p,morpho_bin); extract_semantic_codes_from_morpho_dics(p->morpho_dic_inf,p->n_morpho_dics,semantic_codes,locate_abstract_allocator); p->token_control=(unsigned char*)malloc(n_text_tokens*sizeof(unsigned char)); if (p->token_control==NULL) { fatal_alloc_error("locate_pattern"); } p->matching_patterns=(struct bit_array**)malloc(n_text_tokens*sizeof(struct bit_array*)); if (p->matching_patterns==NULL) { fatal_alloc_error("locate_pattern"); } for (int i=0; i<n_text_tokens; i++) { p->token_control[i]=0; p->matching_patterns[i]=NULL; } compute_token_controls(p->alphabet,err,p); int number_of_patterns,is_DIC,is_CDIC,is_SDIC; p->pattern_tree_root=new_pattern_node(locate_abstract_allocator); u_printf("Computing fst2 tags...\n"); process_tags(&number_of_patterns,semantic_codes,&is_DIC,&is_CDIC,&is_SDIC,p,locate_abstract_allocator); p->current_compound_pattern=number_of_patterns; p->DLC_tree=new_DLC_tree(p->tokens->size); struct lemma_node* root=new_lemma_node(); u_printf("Loading dlf...\n"); 
load_dic_for_locate(dlf,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p); u_printf("Loading dlc...\n"); load_dic_for_locate(dlc,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p); /* We look if tag tokens like "{today,.ADV}" verify some patterns */ check_patterns_for_tag_tokens(p->alphabet,number_of_patterns,root,p,locate_abstract_allocator); u_printf("Optimizing fst2 pattern tags...\n"); optimize_pattern_tags(p->alphabet,root,p,locate_abstract_allocator); u_printf("Optimizing compound word dictionary...\n"); optimize_DLC(p->DLC_tree); free_string_hash(semantic_codes); int nb_input_variable=0; p->input_variables=new_Variables(p->fst2->input_variables,&nb_input_variable); p->output_variables=new_OutputVariables(p->fst2->output_variables,&p->nb_output_variables); Abstract_allocator locate_recycle_abstract_allocator=NULL; locate_recycle_abstract_allocator=create_abstract_allocator("locate_pattern_recycle", AllocatorFreeOnlyAtAllocatorDelete|AllocatorTipOftenRecycledObject, get_prefered_allocator_item_size_for_nb_variable(nb_input_variable)); u_printf("Optimizing fst2...\n"); p->optimized_states=build_optimized_fst2_states(p->input_variables,p->output_variables,p->fst2,locate_abstract_allocator); if (is_korean) { p->korean=new Korean(p->alphabet); p->jamo_tags=create_jamo_tags(p->korean,p->tokens); } p->failfast=new_bit_array(n_text_tokens,ONE_BIT); u_printf("Working...\n"); p->prv_alloc=locate_work_abstract_allocator; p->prv_alloc_recycle=locate_recycle_abstract_allocator; launch_locate(out,text_size,info,p); if (allow_trace!=0) { close_locate_trace(p,p->fnc_locate_trace_step,p->private_param_locate_trace); } free_bit_array(p->failfast); free_Variables(p->input_variables); free_OutputVariables(p->output_variables); af_release_mapfile_pointer(p->text_cod,p->buffer); af_close_mapfile(p->text_cod); if (info!=NULL) u_fclose(info); u_fclose(out); if (p->match_cache!=NULL) { for (int i=0; 
i<p->tokens->size; i++) { free_LocateCache(p->match_cache[i],locate_work_abstract_allocator); } free_cb(p->match_cache,locate_work_abstract_allocator); } int free_abstract_allocator_item=(get_allocator_cb_flag(locate_abstract_allocator) & AllocatorGetFlagAutoFreePresent) ? 0 : 1; if (free_abstract_allocator_item) { free_optimized_states(p->optimized_states,p->fst2->number_of_states,locate_abstract_allocator); } free_stack_unichar(p->stack); /** Too long to free the DLC tree if it is big * free_DLC_tree(p->DLC_tree); */ if (free_abstract_allocator_item) { free_pattern_node(p->pattern_tree_root,locate_abstract_allocator); free_Fst2(p->fst2,locate_abstract_allocator); free_list_int(p->tag_token_list,locate_abstract_allocator); } close_abstract_allocator(locate_abstract_allocator); close_abstract_allocator(locate_recycle_abstract_allocator); locate_recycle_abstract_allocator=locate_abstract_allocator=NULL; /* We don't free 'parameters->tags' because it was just a link on 'parameters->fst2->tags' */ free_alphabet(p->alphabet); if (p->korean!=NULL) { delete p->korean; } if (p->jamo_tags!=NULL) { /* jamo tags must be freed before tokens, because we need to know how * many jamo tags there are, and this number is the number of tokens */ for (int i=0; i<p->tokens->size; i++) { free(p->jamo_tags[i]); } free(p->jamo_tags); } free_string_hash(p->tokens); free_lemma_node(root); free(p->token_control); for (int i=0; i<n_text_tokens; i++) { free_bit_array(p->matching_patterns[i]); } free(p->matching_patterns); #ifdef TRE_WCHAR free_FilterSet(p->filters); free_FilterMatchIndex(p->filter_match_index); #endif for (int i=0; i<p->n_morpho_dics; i++) { free_abstract_INF(p->morpho_dic_inf[i],&(p->morpho_dic_inf_free[i])); free_abstract_BIN(p->morpho_dic_bin[i],&(p->morpho_dic_bin_free[i])); } free(p->morpho_dic_inf); free(p->morpho_dic_inf_free); free(p->morpho_dic_bin); free(p->morpho_dic_bin_free); #if (defined(UNITEX_LIBRARY) || defined(UNITEX_RELEASE_MEMORY_AT_EXIT)) 
free_DLC_tree(p->DLC_tree); #endif free_locate_parameters(p); u_printf("Done.\n"); return 1; }
int main_Untokenize(int argc,char* const argv[]) { if (argc==1) { usage(); return SUCCESS_RETURN_CODE; } char alphabet[FILENAME_MAX]=""; char token_file[FILENAME_MAX]=""; char dynamicSntDir[FILENAME_MAX]=""; VersatileEncodingConfig vec=VEC_DEFAULT; int val,index=-1; int range_start,range_stop,use_range; int token_step_number=0; range_start=range_stop=use_range=0; char foo=0; bool only_verify_arguments = false; UnitexGetOpt options; while (EOF!=(val=options.parse_long(argc,argv,optstring_Untokenize,lopts_Untokenize,&index))) { switch(val) { case 'a': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty alphabet file name\n"); return USAGE_ERROR_CODE; } strcpy(alphabet,options.vars()->optarg); break; case 'd': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty snt dir name\n"); return USAGE_ERROR_CODE; } strcpy(dynamicSntDir,options.vars()->optarg); break; case 't': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty token file name\n"); return USAGE_ERROR_CODE; } strcpy(token_file,options.vars()->optarg); break; case 'k': if (options.vars()->optarg[0]=='\0') { error("Empty input_encoding argument\n"); return USAGE_ERROR_CODE; } decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg); break; case 'q': if (options.vars()->optarg[0]=='\0') { error("Empty output_encoding argument\n"); return USAGE_ERROR_CODE; } decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg); break; case 'n': if (1!=sscanf(options.vars()->optarg,"%d%c",&token_step_number,&foo) || token_step_number<=0) { /* foo is used to check that the search limit is not like "45gjh" */ error("Invalid token numbering argument: %s\n",options.vars()->optarg); return USAGE_ERROR_CODE; } break; case 'r': { int param1 = 0; int param2 = 0; int ret_scan = sscanf(options.vars()->optarg,"%d,%d%c",¶m1,¶m2,&foo); if (ret_scan == 2) { range_start = param1; range_stop = 
param2; use_range=1; if (((range_start < -1)) || (range_stop < -1)) { /* foo is used to check that the search limit is not like "45gjh" */ error("Invalid stop count argument: %s\n",options.vars()->optarg); return USAGE_ERROR_CODE; } } else if (1!=sscanf(options.vars()->optarg,"%d%c",&range_start,&foo) || (range_start < -1)) { /* foo is used to check that the search limit is not like "45gjh" */ error("Invalid stop count argument: %s\n",options.vars()->optarg); return USAGE_ERROR_CODE; } use_range=1; } break; case 'V': only_verify_arguments = true; break; case 'h': usage(); return SUCCESS_RETURN_CODE; case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) : error("Missing argument for option --%s\n",lopts_Untokenize[index].name); return USAGE_ERROR_CODE; case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) : error("Invalid option --%s\n",options.vars()->optarg); return USAGE_ERROR_CODE; } index=-1; } if (options.vars()->optind!=argc-1) { error("Invalid arguments: rerun with --help\n"); return USAGE_ERROR_CODE; } if (only_verify_arguments) { // freeing all allocated memory return SUCCESS_RETURN_CODE; } char tokens_txt[FILENAME_MAX]; char text_cod[FILENAME_MAX]; char enter_pos[FILENAME_MAX]; if (dynamicSntDir[0]=='\0') { get_snt_path(argv[options.vars()->optind],dynamicSntDir); } strcpy(text_cod,dynamicSntDir); strcat(text_cod,"text.cod"); strcpy(enter_pos,dynamicSntDir); strcat(enter_pos,"enter.pos"); strcpy(tokens_txt,dynamicSntDir); strcat(tokens_txt,"tokens.txt"); Alphabet* alph=NULL; if (alphabet[0]!='\0') { alph=load_alphabet(&vec,alphabet); if (alph==NULL) { error("Cannot load alphabet file %s\n",alphabet); return DEFAULT_ERROR_CODE; } } ABSTRACTMAPFILE* af_text_cod=af_open_mapfile(text_cod,MAPFILE_OPTION_READ,0); if (af_text_cod==NULL) { error("Cannot open file %s\n",text_cod); free_alphabet(alph); return DEFAULT_ERROR_CODE; } ABSTRACTMAPFILE* af_enter_pos=af_open_mapfile(enter_pos,MAPFILE_OPTION_READ,0); 
if (af_enter_pos==NULL) { error("Cannot open file %s\n",enter_pos); af_close_mapfile(af_text_cod); free_alphabet(alph); return DEFAULT_ERROR_CODE; } U_FILE* text = u_fopen(&vec,argv[options.vars()->optind],U_WRITE); if (text==NULL) { error("Cannot create text file %s\n",argv[options.vars()->optind]); af_close_mapfile(af_enter_pos); af_close_mapfile(af_text_cod); free_alphabet(alph); return DEFAULT_ERROR_CODE; } struct text_tokens* tok=load_text_tokens(&vec,tokens_txt); u_printf("Untokenizing text...\n"); size_t nb_item = af_get_mapfile_size(af_text_cod)/sizeof(int); const int* buf=(const int*)af_get_mapfile_pointer(af_text_cod); size_t nb_item_enter_pos=0; const int* buf_enter=NULL; if (af_enter_pos!=NULL) { buf_enter=(const int*)af_get_mapfile_pointer(af_enter_pos); if (buf_enter!=NULL) { nb_item_enter_pos=af_get_mapfile_size(af_enter_pos)/sizeof(int); } } size_t count_pos=0; for (size_t i=0;i<nb_item;i++) { int is_in_range=1; if ((use_range!=0) && (i<(size_t)range_start)) { is_in_range=0; } if ((use_range!=0) && (range_stop!=0) && (i>(size_t)range_stop)) { is_in_range=0; } int is_newline=0; if (count_pos<nb_item_enter_pos) { if (i==(size_t)(*(buf_enter+count_pos))) { is_newline = 1; count_pos++; } } if (is_in_range!=0) { if (token_step_number != 0) if ((i%token_step_number)==0) u_fprintf(text,"\n\nToken %d : ", (int)i); if (is_newline!=0) { u_fprintf(text,"\n", tok->token[*(buf+i)]); } else { u_fputs(tok->token[*(buf+i)], text); } } } af_release_mapfile_pointer(af_text_cod,buf); af_release_mapfile_pointer(af_enter_pos,buf_enter); af_close_mapfile(af_enter_pos); af_close_mapfile(af_text_cod); free_text_tokens(tok); u_fclose(text); free_alphabet(alph); u_printf("\nDone.\n"); return SUCCESS_RETURN_CODE; }
void free_persistent_alphabet(const char* name) { Alphabet* a=(Alphabet*)get_persistent_structure(name); set_persistent_structure(name,NULL); free_alphabet(a); }
int main_CheckDic(int argc,char* const argv[]) { if (argc==1) { usage(); return 0; } int is_a_DELAF=-1; int strict_unprotected=0; int skip_path=0; char alph[FILENAME_MAX]=""; Encoding encoding_output = DEFAULT_ENCODING_OUTPUT; int bom_output = DEFAULT_BOM_OUTPUT; int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT; int val,index=-1; int space_warnings=1; struct OptVars* vars=new_OptVars(); while (EOF!=(val=getopt_long_TS(argc,argv,optstring_CheckDic,lopts_CheckDic,&index,vars))) { switch(val) { case 'f': is_a_DELAF=1; break; case 's': is_a_DELAF=0; break; case 'h': usage(); return 0; case 'r': strict_unprotected=1; break; case 't': strict_unprotected=0; break; case 'n': space_warnings=0; break; case 'p': skip_path=1; break; case 'a': if (vars->optarg[0]=='\0') { fatal_error("Empty alphabet argument\n"); } strcpy(alph,vars->optarg); break; case 'k': if (vars->optarg[0]=='\0') { fatal_error("Empty input_encoding argument\n"); } decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg); break; case 'q': if (vars->optarg[0]=='\0') { fatal_error("Empty output_encoding argument\n"); } decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg); break; case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt); else fatal_error("Missing argument for option --%s\n",lopts_CheckDic[index].name); case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt); else fatal_error("Invalid option --%s\n",vars->optarg); break; } index=-1; } if (is_a_DELAF==-1 || vars->optind!=argc-1) { error("Invalid arguments: rerun with --help\n"); return 1; } U_FILE* dic=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ); if (dic==NULL) { fatal_error("Cannot open dictionary %s\n",argv[vars->optind]); } Alphabet* alphabet0=NULL; if (alph[0]!='\0') { alphabet0=load_alphabet(alph,1); } char output_filename[FILENAME_MAX]; 
get_path(argv[vars->optind],output_filename); strcat(output_filename,"CHECK_DIC.TXT"); U_FILE* out=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,output_filename,U_WRITE); if (out==NULL) { u_fclose(dic); fatal_error("Cannot create %s\n",output_filename); } u_printf("Checking %s...\n",argv[vars->optind]); unichar line[CHECKDIC_LINE_SIZE]; int line_number=1; /* * We declare and initialize an array in order to know which * letters are used in the dictionary. */ int i; char* alphabet=(char*)malloc(sizeof(char)*MAX_NUMBER_OF_UNICODE_CHARS); if (alphabet==NULL) { fatal_alloc_error("CheckDic's main"); } memset(alphabet,0,sizeof(char)*MAX_NUMBER_OF_UNICODE_CHARS); /* * We use two structures for the storage of the codes found in the * dictionary. Note that 'semantic_codes' is used to store both grammatical and * semantic codes. */ struct string_hash* semantic_codes=new_string_hash(); struct string_hash* inflectional_codes=new_string_hash(); struct string_hash* simple_lemmas=new_string_hash(DONT_USE_VALUES); struct string_hash* compound_lemmas=new_string_hash(DONT_USE_VALUES); int n_simple_entries=0; int n_compound_entries=0; /* * We read all the lines and check them. 
*/ while (EOF!=u_fgets_limit2(line,DIC_LINE_SIZE,dic)) { if (line[0]=='\0') { /* If we have an empty line, we print a unicode error message * into the output file */ u_fprintf(out,"Line %d: empty line\n",line_number); } else if (line[0]=='/') { /* If a line starts with '/', it is a commment line, so * we ignore it */ } else { /* If we have a line to check, we check it according to the * dictionary type */ check_DELA_line(line,out,is_a_DELAF,line_number,alphabet,semantic_codes, inflectional_codes,simple_lemmas,compound_lemmas, &n_simple_entries,&n_compound_entries,alphabet0,strict_unprotected); } /* At regular intervals, we display a message on the standard * output to show that the program is working */ if (line_number%10000==0) { u_printf("%d lines read...\r",line_number); } line_number++; } u_printf("%d lines read\n",line_number-1); u_fclose(dic); /* * Once we have checked all the lines, we print some informations * in the output file. */ u_fprintf(out,"-----------------------------------\n"); u_fprintf(out,"------------- Stats -------------\n"); u_fprintf(out,"-----------------------------------\n"); if (skip_path != 0) { char filename_without_path[FILENAME_MAX]; remove_path(argv[vars->optind],filename_without_path); u_fprintf(out,"File: %s\n",filename_without_path); } else { u_fprintf(out,"File: %s\n",argv[vars->optind]); } u_fprintf(out,"Type: %s\n",is_a_DELAF?"DELAF":"DELAS"); u_fprintf(out,"%d line%s read\n",line_number-1,(line_number-1>1)?"s":""); u_fprintf(out,"%d simple entr%s ",n_simple_entries,(n_simple_entries>1)?"ies":"y"); u_fprintf(out,"for %d distinct lemma%s\n",simple_lemmas->size,(simple_lemmas->size>1)?"s":""); u_fprintf(out,"%d compound entr%s ",n_compound_entries,(n_compound_entries>1)?"ies":"y"); u_fprintf(out,"for %d distinct lemma%s\n",compound_lemmas->size,(compound_lemmas->size>1)?"s":""); /** * We print the list of the characters that are used, with * their unicode numbers shown in hexadecimal. 
This can be useful * to detect different characters that are graphically identical * like 'A' (upper of latin 'a' or upper of greek alpha ?). */ u_fprintf(out,"-----------------------------------\n"); u_fprintf(out,"---- All chars used in forms ----\n"); u_fprintf(out,"-----------------------------------\n"); unichar r[4]; unichar r2[7]; r[1]=' '; r[2]='('; r[3]='\0'; r2[5]='\n'; r2[6]='\0'; for (i=0;i<MAX_NUMBER_OF_UNICODE_CHARS;i++) { if (alphabet[i]) { u_fprintf(out,"%C (%04X)\n",i,i); } } /* * Then we print the list of all grammatical and semantic codes used in the * dictionary. If a code contains a non ASCII character, a space or a tabulation, * we print a warning. */ u_fprintf(out,"-------------------------------------------------------------\n"); u_fprintf(out,"---- %3d grammatical/semantic code%s",semantic_codes->size,(semantic_codes->size>1)?"s used in dictionary ----\n":" used in dictionary -----\n"); u_fprintf(out,"-------------------------------------------------------------\n"); unichar comment[2000]; for (i=0;i<semantic_codes->size;i++) { /* We print the code, followed if necessary by a warning */ u_fprintf(out,"%S",semantic_codes->value[i]); if (warning_on_code(semantic_codes->value[i],comment,space_warnings)) { u_fprintf(out," %S",comment); } u_fprintf(out,"\n"); } /* * Finally, we print the list of inflectional codes, * with warnings in the case of non ASCII letters, spaces * or tabulations. 
*/ u_fprintf(out,"-----------------------------------------------------\n"); u_fprintf(out,"---- %3d inflectional code%s",inflectional_codes->size,(inflectional_codes->size>1)?"s used in dictionary ----\n":" used in dictionary -----\n"); u_fprintf(out,"-----------------------------------------------------\n"); for (i=0;i<inflectional_codes->size;i++) { u_fprintf(out,"%S",inflectional_codes->value[i]); if (warning_on_code(inflectional_codes->value[i],comment,space_warnings)) { u_fprintf(out," %S",comment); } u_fprintf(out,"\n"); } u_fclose(out); free_OptVars(vars); u_printf("Done.\n"); /* Note that we don't free anything since it would only waste time */ free(alphabet); if (alphabet0!=NULL) { free_alphabet(alphabet0); } #if (defined(UNITEX_LIBRARY) || defined(UNITEX_RELEASE_MEMORY_AT_EXIT)) /* cleanup for no leak on library */ free_string_hash(semantic_codes); free_string_hash(inflectional_codes); free_string_hash(simple_lemmas); free_string_hash(compound_lemmas); #endif return 0; }
/**
 * Entry point of the KeyWords program (same as main(), but without a call to
 * setBufferMode): extracts keywords from a token-frequency file, filtering
 * out non-letter tokens, tokens matched by the given dictionaries, and
 * tokens whose lemma carries the forbidden code (default "XXX"), then dumps
 * the surviving keywords sorted to the output file.
 *
 * Command line: KeyWords [options] <tok_by_freq file> <dictionary>...
 * Returns SUCCESS_RETURN_CODE, USAGE_ERROR_CODE or DEFAULT_ERROR_CODE.
 */
int main_KeyWords(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}
VersatileEncodingConfig vec=VEC_DEFAULT;
char tokens[FILENAME_MAX];            /* token-frequency file (first free argument) */
char output[FILENAME_MAX]="";         /* empty = derive "keywords.txt" next to input */
char alph[FILENAME_MAX]="";
char cdic[FILENAME_MAX]="";
unichar* code=u_strdup("XXX");        /* forbidden lemma code; heap-owned, freed on every exit path */
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_KeyWords,lopts_KeyWords,&index))) {
   switch(val) {
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty output\n");
                free(code);
                return USAGE_ERROR_CODE;
             }
             strcpy(output,options.vars()->optarg);
             break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                free(code);
                return USAGE_ERROR_CODE;
             }
             strcpy(alph,options.vars()->optarg);
             break;
   case 'f': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty forbidden code\n");
                free(code);
                return USAGE_ERROR_CODE;
             }
             /* replace the default "XXX" with the user-supplied code */
             free(code);
             code=u_strdup(options.vars()->optarg);
             break;
   case 'c': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty file name\n");
                free(code);
                return USAGE_ERROR_CODE;
             }
             strcpy(cdic,options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             free(code);
             return SUCCESS_RETURN_CODE;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                free(code);
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                free(code);
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_KeyWords[index].name);
             free(code);
             return USAGE_ERROR_CODE;
             break;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             free(code);
             return USAGE_ERROR_CODE;
             break;
   }
   index=-1;
}
/* At least two free arguments are required: the token file and one dictionary */
if (options.vars()->optind==argc || options.vars()->optind==argc-1) {
   error("Invalid arguments: rerun with --help\n");
   free(code);
   return USAGE_ERROR_CODE;
}
if (only_verify_arguments) {
   // freeing all allocated memory
   free(code);
   return SUCCESS_RETURN_CODE;
}
Alphabet* alphabet=NULL;
if (alph[0]!='\0') {
   alphabet=load_alphabet(&vec,alph);
   if (alphabet==NULL) {
      error("Cannot load alphabet file %s\n",alph);
      free(code);
      return DEFAULT_ERROR_CODE;
   }
}
strcpy(tokens,argv[(options.vars()->optind++)]);
if (output[0]=='\0') {
   /* default output: keywords.txt in the same directory as the token file */
   get_path(tokens,output);
   strcat(output,"keywords.txt");
}
/* NOTE(review): keywords is used unchecked below — presumably
 * load_tokens_by_freq never returns NULL; confirm against its contract */
struct string_hash_ptr* keywords=load_tokens_by_freq(tokens,&vec);
filter_non_letter_keywords(keywords,alphabet);
if (cdic[0]!='\0') {
   load_compound_words(cdic,&vec,keywords);
}
/* every remaining argument is a dictionary used to discard known words */
for (;options.vars()->optind!=argc;(options.vars()->optind)++) {
   filter_keywords_with_dic(keywords,argv[options.vars()->optind],&vec,alphabet);
}
merge_case_equivalent_unknown_words(keywords,alphabet);
/* remove keywords whose lemma carries the forbidden code */
struct string_hash* forbidden_lemmas=compute_forbidden_lemmas(keywords,code);
remove_keywords_with_forbidden_lemma(keywords,forbidden_lemmas);
free_string_hash(forbidden_lemmas);
vector_ptr* sorted=sort_keywords(keywords);
U_FILE* f_output=u_fopen(&vec,output,U_WRITE);
if (f_output==NULL) {
   error("Cannot write in file %s\n",output);
   free_vector_ptr(sorted,(void(*)(void*))free_KeyWord_list);
   free_string_hash_ptr(keywords,(void(*)(void*))free_KeyWord_list);
   free_alphabet(alphabet);
   free(code);
   return DEFAULT_ERROR_CODE;
}
dump_keywords(sorted,f_output);
u_fclose(f_output);
free_vector_ptr(sorted,(void(*)(void*))free_KeyWord_list);
free_string_hash_ptr(keywords,(void(*)(void*))free_KeyWord_list);
free_alphabet(alphabet);
free(code);
return SUCCESS_RETURN_CODE;
}
int main_PolyLex(int argc,char* const argv[]) { if (argc==1) { usage(); return 0; } int language=-1; char alphabet[FILENAME_MAX]=""; char dictionary[FILENAME_MAX]=""; char output[FILENAME_MAX]=""; char info[FILENAME_MAX]=""; Encoding encoding_output = DEFAULT_ENCODING_OUTPUT; int bom_output = DEFAULT_BOM_OUTPUT; int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT; int val,index=-1; struct OptVars* vars=new_OptVars(); while (EOF!=(val=getopt_long_TS(argc,argv,optstring_PolyLex,lopts_PolyLex,&index,vars))) { switch(val) { case 'D': language=DUTCH; break; case 'G': language=GERMAN; break; case 'N': language=NORWEGIAN; break; case 'R': language=RUSSIAN; break; case 'a': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty alphabet file name\n"); } strcpy(alphabet,vars->optarg); break; case 'd': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty dictionary file name\n"); } strcpy(dictionary,vars->optarg); break; case 'o': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty output file name\n"); } strcpy(output,vars->optarg); break; case 'i': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty information file name\n"); } strcpy(info,vars->optarg); break; case 'k': if (vars->optarg[0]=='\0') { fatal_error("Empty input_encoding argument\n"); } decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg); break; case 'q': if (vars->optarg[0]=='\0') { fatal_error("Empty output_encoding argument\n"); } decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg); break; case 'h': usage(); return 0; case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt); else fatal_error("Missing argument for option --%s\n",lopts_PolyLex[index].name); case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt); else fatal_error("Invalid option --%s\n",vars->optarg); break; } index=-1; } if 
(vars->optind!=argc-1) { fatal_error("Invalid arguments: rerun with --help\n"); } if (dictionary[0]=='\0') { fatal_error("You must specify the .bin dictionary to use\n"); } if (output[0]=='\0') { fatal_error("You must specify the output dictionary file name\n"); } if (language==-1) { fatal_error("You must specify the language\n"); } Alphabet* alph=NULL; if (alphabet[0]!='\0') { u_printf("Loading alphabet...\n"); alph=load_alphabet(alphabet); if (alph==NULL) { fatal_error("Cannot load alphabet file %s\n",alphabet); } } char temp[FILENAME_MAX]; struct string_hash* forbiddenWords=NULL; if (language==DUTCH || language==NORWEGIAN) { get_path(dictionary,temp); strcat(temp,"ForbiddenWords.txt"); forbiddenWords=load_key_list(temp,mask_encoding_compatibility_input); } u_printf("Loading BIN file...\n"); struct BIN_free_info bin_free; const unsigned char* bin=load_abstract_BIN_file(dictionary,&bin_free); if (bin==NULL) { error("Cannot load bin file %s\n",dictionary); free_alphabet(alph); free_string_hash(forbiddenWords); return 1; } strcpy(temp,dictionary); temp[strlen(dictionary)-3]='\0'; strcat(temp,"inf"); u_printf("Loading INF file...\n"); struct INF_free_info inf_free; const struct INF_codes* inf=load_abstract_INF_file(temp,&inf_free); if (inf==NULL) { error("Cannot load inf file %s\n",temp); free_alphabet(alph); free_abstract_BIN(bin,&bin_free); free_string_hash(forbiddenWords); return 1; } char tmp[FILENAME_MAX]; strcpy(tmp,argv[vars->optind]); strcat(tmp,".tmp"); U_FILE* words=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ); if (words==NULL) { error("Cannot open word list file %s\n",argv[vars->optind]); free_alphabet(alph); free_abstract_BIN(bin,&bin_free); free_abstract_INF(inf,&inf_free); free_string_hash(forbiddenWords); // here we return 0 in order to do not block the preprocessing // in the Unitex Java interface, if no dictionary was applied // so that there is no "err" file return 0; } U_FILE* 
new_unknown_words=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,tmp,U_WRITE); if (new_unknown_words==NULL) { error("Cannot open temporary word list file %s\n",tmp); free_alphabet(alph); free_abstract_BIN(bin,&bin_free); free_abstract_INF(inf,&inf_free); u_fclose(words); free_string_hash(forbiddenWords); return 1; } U_FILE* res=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,output,U_APPEND); if (res==NULL) { error("Cannot open result file %s\n",output); free_alphabet(alph); free_abstract_BIN(bin,&bin_free); free_abstract_INF(inf,&inf_free); u_fclose(words); u_fclose(new_unknown_words); free_string_hash(forbiddenWords); return 1; } U_FILE* debug=NULL; if (info!=NULL) { debug=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,info,U_WRITE); if (debug==NULL) { error("Cannot open debug file %s\n",info); } } struct utags UTAG; switch(language) { case DUTCH: analyse_dutch_unknown_words(alph,bin,inf,words,res,debug,new_unknown_words,forbiddenWords); break; case GERMAN: analyse_german_compounds(alph,bin,inf,words,res,debug,new_unknown_words); break; case NORWEGIAN: analyse_norwegian_unknown_words(alph,bin,inf,words,res,debug,new_unknown_words,forbiddenWords); break; case RUSSIAN: init_russian(&UTAG); analyse_compounds(alph,bin,inf,words,res,debug,new_unknown_words,UTAG); break; } free_alphabet(alph); free_abstract_BIN(bin,&bin_free); free_abstract_INF(inf,&inf_free); u_fclose(words); u_fclose(new_unknown_words); free_string_hash(forbiddenWords); af_remove(argv[vars->optind]); af_rename(tmp,argv[vars->optind]); u_fclose(res); if (debug!=NULL) { u_fclose(debug); } free_OptVars(vars); return 0; }
int main_Tokenize(int argc,char* const argv[]) { if (argc==1) { usage(); return 0; } char alphabet[FILENAME_MAX]=""; char token_file[FILENAME_MAX]=""; Encoding encoding_output = DEFAULT_ENCODING_OUTPUT; int bom_output = DEFAULT_BOM_OUTPUT; int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT; int val,index=-1; int mode=NORMAL; struct OptVars* vars=new_OptVars(); while (EOF!=(val=getopt_long_TS(argc,argv,optstring_Tokenize,lopts_Tokenize,&index,vars))) { switch(val) { case 'a': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty alphabet file name\n"); } strcpy(alphabet,vars->optarg); break; case 'c': mode=CHAR_BY_CHAR; break; case 'w': mode=NORMAL; break; case 't': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty token file name\n"); } strcpy(token_file,vars->optarg); break; case 'k': if (vars->optarg[0]=='\0') { fatal_error("Empty input_encoding argument\n"); } decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg); break; case 'q': if (vars->optarg[0]=='\0') { fatal_error("Empty output_encoding argument\n"); } decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg); break; case 'h': usage(); return 0; case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt); else fatal_error("Missing argument for option --%s\n",lopts_Tokenize[index].name); case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt); else fatal_error("Invalid option --%s\n",vars->optarg); break; } index=-1; } if (vars->optind!=argc-1) { fatal_error("Invalid arguments: rerun with --help\n"); } U_FILE* text; U_FILE* out; U_FILE* output; U_FILE* enter; char tokens_txt[FILENAME_MAX]; char text_cod[FILENAME_MAX]; char enter_pos[FILENAME_MAX]; Alphabet* alph=NULL; get_snt_path(argv[vars->optind],text_cod); strcat(text_cod,"text.cod"); get_snt_path(argv[vars->optind],tokens_txt); strcat(tokens_txt,"tokens.txt"); 
get_snt_path(argv[vars->optind],enter_pos); strcat(enter_pos,"enter.pos"); text=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ); if (text==NULL) { fatal_error("Cannot open text file %s\n",argv[vars->optind]); } if (alphabet[0]!='\0') { alph=load_alphabet(alphabet); if (alph==NULL) { error("Cannot load alphabet file %s\n",alphabet); u_fclose(text); return 1; } } out=u_fopen(BINARY,text_cod,U_WRITE); if (out==NULL) { error("Cannot create file %s\n",text_cod); u_fclose(text); if (alph!=NULL) { free_alphabet(alph); } return 1; } enter=u_fopen(BINARY,enter_pos,U_WRITE); if (enter==NULL) { error("Cannot create file %s\n",enter_pos); u_fclose(text); u_fclose(out); if (alph!=NULL) { free_alphabet(alph); } return 1; } vector_ptr* tokens=new_vector_ptr(4096); vector_int* n_occur=new_vector_int(4096); vector_int* n_enter_pos=new_vector_int(4096); struct hash_table* hashtable=new_hash_table((HASH_FUNCTION)hash_unichar,(EQUAL_FUNCTION)u_equal, (FREE_FUNCTION)free,NULL,(KEYCOPY_FUNCTION)keycopy); if (token_file[0]!='\0') { load_token_file(token_file,mask_encoding_compatibility_input,tokens,hashtable,n_occur); } output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE); if (output==NULL) { error("Cannot create file %s\n",tokens_txt); u_fclose(text); u_fclose(out); u_fclose(enter); if (alph!=NULL) { free_alphabet(alph); } free_hash_table(hashtable); free_vector_ptr(tokens,free); free_vector_int(n_occur); free_vector_int(n_enter_pos); return 1; } u_fprintf(output,"0000000000\n"); int SENTENCES=0; int TOKENS_TOTAL=0; int WORDS_TOTAL=0; int DIGITS_TOTAL=0; u_printf("Tokenizing text...\n"); if (mode==NORMAL) { normal_tokenization(text,out,output,alph,tokens,hashtable,n_occur,n_enter_pos, &SENTENCES,&TOKENS_TOTAL,&WORDS_TOTAL,&DIGITS_TOTAL); } else { char_by_char_tokenization(text,out,output,alph,tokens,hashtable,n_occur,n_enter_pos, &SENTENCES,&TOKENS_TOTAL,&WORDS_TOTAL,&DIGITS_TOTAL); } 
u_printf("\nDone.\n"); save_new_line_positions(enter,n_enter_pos); u_fclose(enter); u_fclose(text); u_fclose(out); u_fclose(output); write_number_of_tokens(tokens_txt,encoding_output,bom_output,tokens->nbelems); // we compute some statistics get_snt_path(argv[vars->optind],tokens_txt); strcat(tokens_txt,"stats.n"); output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE); if (output==NULL) { error("Cannot write %s\n",tokens_txt); } else { compute_statistics(output,tokens,alph,SENTENCES,TOKENS_TOTAL,WORDS_TOTAL,DIGITS_TOTAL); u_fclose(output); } // we save the tokens by frequence get_snt_path(argv[vars->optind],tokens_txt); strcat(tokens_txt,"tok_by_freq.txt"); output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE); if (output==NULL) { error("Cannot write %s\n",tokens_txt); } else { sort_and_save_by_frequence(output,tokens,n_occur); u_fclose(output); } // we save the tokens by alphabetical order get_snt_path(argv[vars->optind],tokens_txt); strcat(tokens_txt,"tok_by_alph.txt"); output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE); if (output==NULL) { error("Cannot write %s\n",tokens_txt); } else { sort_and_save_by_alph_order(output,tokens,n_occur); u_fclose(output); } free_hash_table(hashtable); free_vector_ptr(tokens,free); free_vector_int(n_occur); free_vector_int(n_enter_pos); if (alph!=NULL) { free_alphabet(alph); } free_OptVars(vars); return 0; }
/**
 * Entry point of the BuildKrMwuDic program (same as main(), but without a
 * call to setBufferMode): builds a Korean multi-word-unit dictionary graph
 * from a DELAS file, an alphabet, an inflection directory and a .bin/.inf
 * dictionary pair.
 *
 * Returns 0 on success; fatal_error() aborts on any error.
 */
int main_BuildKrMwuDic(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}
int val,index=-1;
char output[FILENAME_MAX]="";
char inflection_dir[FILENAME_MAX]="";
char alphabet[FILENAME_MAX]="";
char dic_bin[FILENAME_MAX]="";
char dic_inf[FILENAME_MAX]="";
Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_BuildKrMwuDic,lopts_BuildKrMwuDic,&index,vars))) {
   switch(val) {
   case 'o': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty output file name\n");
             }
             strcpy(output,vars->optarg);
             break;
   case 'd': if (vars->optarg[0]=='\0') {
                fatal_error("Empty inflection directory\n");
             }
             strcpy(inflection_dir,vars->optarg);
             break;
   case 'a': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty alphabet file name\n");
             }
             strcpy(alphabet,vars->optarg);
             break;
   case 'b': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty binary dictionary name\n");
             }
             /* the .inf name is derived from the .bin name */
             strcpy(dic_bin,vars->optarg);
             remove_extension(dic_bin,dic_inf);
             strcat(dic_inf,".inf");
             break;
   case 'h': usage(); return 0;
   /* ':' has no break: presumably harmless since fatal_error() does not
    * return, so the fall-through into '?' is unreachable */
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_BuildKrMwuDic[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   }
   index=-1;
}
if (vars->optind!=argc-1) {
   fatal_error("Invalid arguments: rerun with --help\n");
}
if (output[0]=='\0') {
   fatal_error("Output file must be specified\n");
}
if (inflection_dir[0]=='\0') {
   fatal_error("Inflection directory must be specified\n");
}
if (alphabet[0]=='\0') {
   fatal_error("Alphabet file must be specified\n");
}
if (dic_bin[0]=='\0') {
   fatal_error("Binary dictionary must be specified\n");
}
U_FILE* delas=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ);
if (delas==NULL) {
   fatal_error("Cannot open %s\n",argv[vars->optind]);
}
/* NOTE(review): the output graph is opened with the "existing" variant even
 * though it is written — confirm this is intended (it may fail if the file
 * does not exist yet) */
U_FILE* grf=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,output,U_WRITE);
if (grf==NULL) {
   fatal_error("Cannot open %s\n",output);
}
Alphabet* alph=load_alphabet(alphabet,1);   /* 1 = Korean mode */
if (alph==NULL) {
   fatal_error("Cannot open alphabet file %s\n",alphabet);
}
Korean* korean=new Korean(alph);
MultiFlex_ctx* multiFlex_ctx = (MultiFlex_ctx*)malloc(sizeof(MultiFlex_ctx));
if (multiFlex_ctx==NULL) {
   fatal_alloc_error("main_BuildKrMwuDic");
}
strcpy(multiFlex_ctx->inflection_directory,inflection_dir);
if (init_transducer_tree(multiFlex_ctx)) {
   fatal_error("init_transducer_tree error\n");
}
struct l_morpho_t* pL_MORPHO=init_langage_morph();
if (pL_MORPHO == NULL) {
   fatal_error("init_langage_morph error\n");
}
/* NOTE(review): bin/inf are used unchecked — presumably load_BIN_file and
 * load_INF_file abort or never fail here; confirm against their contracts */
unsigned char* bin=load_BIN_file(dic_bin);
struct INF_codes* inf=load_INF_file(dic_inf);
create_mwu_dictionary(delas,grf,multiFlex_ctx,korean,pL_MORPHO,encoding_output,
                      bom_output,mask_encoding_compatibility_input,bin,inf);
/* teardown, in reverse order of acquisition where it matters */
free(bin);   /* NOTE(review): plain free() — assumes load_BIN_file returns a single malloc'd block; confirm */
free_INF_codes(inf);
u_fclose(delas);
u_fclose(grf);
free_alphabet(alph);
delete korean;
free_transducer_tree(multiFlex_ctx);
for (int count_free_fst2=0;count_free_fst2<multiFlex_ctx->n_fst2;count_free_fst2++) {
   free_abstract_Fst2(multiFlex_ctx->fst2[count_free_fst2],&(multiFlex_ctx->fst2_free[count_free_fst2]));
   multiFlex_ctx->fst2[count_free_fst2]=NULL;
}
free_language_morpho(pL_MORPHO);
free(multiFlex_ctx);
free_OptVars(vars);
u_printf("Done.\n");
return 0;
}
/**
 * Entry point of the BuildKrMwuDic program (same as main(), but without a
 * call to setBufferMode): builds a Korean multi-word-unit dictionary graph
 * from a DELAS file, an alphabet, an inflection directory and a .bin/.inf
 * dictionary pair. This is the VersatileEncodingConfig-based version of the
 * tool, with -V argument verification and graph recompilation policy options.
 *
 * Returns SUCCESS_RETURN_CODE, USAGE_ERROR_CODE or DEFAULT_ERROR_CODE.
 */
int main_BuildKrMwuDic(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}
int val,index=-1;
char output[FILENAME_MAX]="";
char inflection_dir[FILENAME_MAX]="";
char alphabet[FILENAME_MAX]="";
char dic_bin[FILENAME_MAX]="";
char dic_inf[FILENAME_MAX]="";
// default policy is to compile only out of date graphs
GraphRecompilationPolicy graph_recompilation_policy = ONLY_OUT_OF_DATE;
VersatileEncodingConfig vec=VEC_DEFAULT;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_BuildKrMwuDic,lopts_BuildKrMwuDic,&index))) {
   switch(val) {
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty output file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(output,options.vars()->optarg);
             break;
   case 'd': if (options.vars()->optarg[0]=='\0') {
                error("Empty inflection directory\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(inflection_dir,options.vars()->optarg);
             break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(alphabet,options.vars()->optarg);
             break;
   case 'b': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty binary dictionary name\n");
                return USAGE_ERROR_CODE;
             }
             /* the .inf name is derived from the .bin name */
             strcpy(dic_bin,options.vars()->optarg);
             remove_extension(dic_bin,dic_inf);
             strcat(dic_inf,".inf");
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             return SUCCESS_RETURN_CODE;
   case 'f': graph_recompilation_policy = ALWAYS_RECOMPILE; break;
   case 'n': graph_recompilation_policy = NEVER_RECOMPILE; break;
   case 't': graph_recompilation_policy = ONLY_OUT_OF_DATE; break;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_BuildKrMwuDic[index].name);
             return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   }
   index=-1;
}
if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}
if (output[0]=='\0') {
   error("Output file must be specified\n");
   return USAGE_ERROR_CODE;
}
if (inflection_dir[0]=='\0') {
   error("Inflection directory must be specified\n");
   return USAGE_ERROR_CODE;
}
if (alphabet[0]=='\0') {
   error("Alphabet file must be specified\n");
   return USAGE_ERROR_CODE;
}
if (dic_bin[0]=='\0') {
   error("Binary dictionary must be specified\n");
   return USAGE_ERROR_CODE;
}
if (only_verify_arguments) {
   // freeing all allocated memory
   return SUCCESS_RETURN_CODE;
}
U_FILE* delas=u_fopen(&vec,argv[options.vars()->optind],U_READ);
if (delas==NULL) {
   error("Cannot open %s\n",argv[options.vars()->optind]);
   return DEFAULT_ERROR_CODE;
}
U_FILE* grf=u_fopen(&vec,output,U_WRITE);
if (grf==NULL) {
   error("Cannot open %s\n",output);
   u_fclose(delas);
   return DEFAULT_ERROR_CODE;
}
Alphabet* alph=load_alphabet(&vec,alphabet,1);   /* 1 = Korean mode */
if (alph==NULL) {
   u_fclose(grf);
   u_fclose(delas);
   error("Cannot open alphabet file %s\n",alphabet);
   return DEFAULT_ERROR_CODE;
}
Korean* korean=new Korean(alph);
/* NOTE(review): multiFlex_ctx and d are used unchecked — presumably
 * new_MultiFlex_ctx and new_Dictionary abort on failure rather than
 * returning NULL; confirm against their contracts */
MultiFlex_ctx* multiFlex_ctx=new_MultiFlex_ctx(inflection_dir,
                                               NULL,
                                               NULL,
                                               &vec,
                                               korean,
                                               NULL,
                                               NULL,
                                               graph_recompilation_policy);
Dictionary* d=new_Dictionary(&vec,dic_bin,dic_inf);
create_mwu_dictionary(delas,grf,multiFlex_ctx,d);
/* teardown */
free_Dictionary(d);
u_fclose(delas);
u_fclose(grf);
free_alphabet(alph);
delete korean;
for (int count_free_fst2=0;count_free_fst2<multiFlex_ctx->n_fst2;count_free_fst2++) {
   free_abstract_Fst2(multiFlex_ctx->fst2[count_free_fst2],&(multiFlex_ctx->fst2_free[count_free_fst2]));
   multiFlex_ctx->fst2[count_free_fst2]=NULL;
}
free_MultiFlex_ctx(multiFlex_ctx);
u_printf("Done.\n");
return SUCCESS_RETURN_CODE;
}