/**
 * Gets a const void* pointer to the raw content of a Unitex file
 * (coming from the system filesystem or from the abstract filespace).
 * *buffer receives the pointer to the file content,
 * *size_file receives the file size, and
 * *umf receives a handle that is needed to release the mapping later.
 */
UNITEX_FUNC void UNITEX_CALL GetUnitexFileReadBuffer(const char* name, UNITEXFILEMAPPED** umf,
                                                     const void** buffer, size_t* size_file) {
*buffer = NULL;
*size_file = 0;
ABSTRACTMAPFILE* amf = af_open_mapfile(name, MAPFILE_OPTION_READ, 0);
if (amf != NULL) {
   *size_file = af_get_mapfile_size(amf);
   *buffer = af_get_mapfile_pointer(amf, 0, *size_file);
}
*umf = (UNITEXFILEMAPPED*)amf;
}
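/* Illustrative usage sketch (not part of the original file): it maps a file,
 * exposes its content and then releases the mapping by casting the handle back
 * to ABSTRACTMAPFILE*, mirroring what GetUnitexFileReadBuffer stores in *umf
 * above. The public API very likely offers a dedicated release function for
 * UNITEXFILEMAPPED handles; since it is not shown in this excerpt, the sketch
 * sticks to the af_* primitives used above. The helper name is hypothetical. */
static size_t example_get_unitex_file_size(const char* filename) {
   UNITEXFILEMAPPED* handle = NULL;
   const void* buffer = NULL;
   size_t size = 0;
   GetUnitexFileReadBuffer(filename, &handle, &buffer, &size);
   if (handle == NULL) {
      /* af_open_mapfile() failed: nothing to release */
      return 0;
   }
   /* ... the mapped content is available as buffer[0 .. size-1] ... */
   /* release: the reverse of what GetUnitexFileReadBuffer stored in *umf */
   ABSTRACTMAPFILE* amf = (ABSTRACTMAPFILE*)handle;
   af_release_mapfile_pointer(amf, buffer);
   af_close_mapfile(amf);
   return size;
}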
int locate_pattern(const char* text_cod,const char* tokens,const char* fst2_name,const char* dlf,
                   const char* dlc,const char* err,const char* alphabet,MatchPolicy match_policy,
                   OutputPolicy output_policy,Encoding encoding_output,int bom_output,
                   int mask_encoding_compatibility_input,const char* dynamicDir,
                   TokenizationPolicy tokenization_policy,SpacePolicy space_policy,int search_limit,
                   const char* morpho_dic_list,AmbiguousOutputPolicy ambiguous_output_policy,
                   VariableErrorPolicy variable_error_policy,int protect_dic_chars,int is_korean,
                   int max_count_call,int max_count_call_warning,char* arabic_rules,
                   int tilde_negation_operator,int useLocateCache,int allow_trace) {
U_FILE* out;
U_FILE* info;
struct locate_parameters* p=new_locate_parameters();
/* Map text.cod: a flat array of ints, one token index per token of the text */
p->text_cod=af_open_mapfile(text_cod,MAPFILE_OPTION_READ,0);
p->buffer=(int*)af_get_mapfile_pointer(p->text_cod);
long text_size=(long)af_get_mapfile_size(p->text_cod)/sizeof(int);
p->buffer_size=(int)text_size;
p->tilde_negation_operator=tilde_negation_operator;
p->useLocateCache=useLocateCache;
if (max_count_call == -1) {
   max_count_call = (int)text_size;
}
if (max_count_call_warning == -1) {
   max_count_call_warning = (int)text_size;
}
p->match_policy=match_policy;
p->tokenization_policy=tokenization_policy;
p->space_policy=space_policy;
p->output_policy=output_policy;
p->search_limit=search_limit;
p->ambiguous_output_policy=ambiguous_output_policy;
p->variable_error_policy=variable_error_policy;
p->protect_dic_chars=protect_dic_chars;
p->mask_encoding_compatibility_input = mask_encoding_compatibility_input;
p->max_count_call = max_count_call;
p->max_count_call_warning = max_count_call_warning;
p->token_filename = tokens;
/* Build the names of the files produced in the dynamic directory */
char concord[FILENAME_MAX];
char concord_info[FILENAME_MAX];
strcpy(concord,dynamicDir);
strcat(concord,"concord.ind");
strcpy(concord_info,dynamicDir);
strcat(concord_info,"concord.n");
char morpho_bin[FILENAME_MAX];
strcpy(morpho_bin,dynamicDir);
strcat(morpho_bin,"morpho.bin");
if (arabic_rules!=NULL && arabic_rules[0]!='\0') {
   load_arabic_typo_rules(arabic_rules,&(p->arabic));
}
out=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord,U_WRITE);
if (out==NULL) {
   error("Cannot write %s\n",concord);
   af_release_mapfile_pointer(p->text_cod,p->buffer);
   af_close_mapfile(p->text_cod);
   free_stack_unichar(p->stack);
   free_locate_parameters(p);
   return 0;
}
info=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord_info,U_WRITE);
if (info==NULL) {
   error("Cannot write %s\n",concord_info);
}
/* Write the concordance header that encodes the output policy */
switch(output_policy) {
   case IGNORE_OUTPUTS: u_fprintf(out,"#I\n"); break;
   case MERGE_OUTPUTS: u_fprintf(out,"#M\n"); break;
   case REPLACE_OUTPUTS: u_fprintf(out,"#R\n"); break;
}
if (alphabet!=NULL && alphabet[0]!='\0') {
   u_printf("Loading alphabet...\n");
   p->alphabet=load_alphabet(alphabet,is_korean);
   if (p->alphabet==NULL) {
      error("Cannot load alphabet file %s\n",alphabet);
      af_release_mapfile_pointer(p->text_cod,p->buffer);
      af_close_mapfile(p->text_cod);
      free_stack_unichar(p->stack);
      free_locate_parameters(p);
      if (info!=NULL) u_fclose(info);
      u_fclose(out);
      return 0;
   }
}
struct string_hash* semantic_codes=new_string_hash();
extract_semantic_codes(dlf,semantic_codes);
extract_semantic_codes(dlc,semantic_codes);
if (is_cancelling_requested() != 0) {
   error("User cancel request.\n");
   free_alphabet(p->alphabet);
   free_string_hash(semantic_codes);
   af_release_mapfile_pointer(p->text_cod,p->buffer);
   af_close_mapfile(p->text_cod);
   free_stack_unichar(p->stack);
   free_locate_parameters(p);
   if (info!=NULL) u_fclose(info);
   u_fclose(out);
   return 0;
}
u_printf("Loading fst2...\n");
struct FST2_free_info fst2load_free;
Fst2* fst2load=load_abstract_fst2(fst2_name,1,&fst2load_free);
if (fst2load==NULL) {
   error("Cannot load grammar %s\n",fst2_name);
   free_alphabet(p->alphabet);
   free_string_hash(semantic_codes);
   af_release_mapfile_pointer(p->text_cod,p->buffer);
   af_close_mapfile(p->text_cod);
   free_stack_unichar(p->stack);
   free_locate_parameters(p);
   if (info!=NULL) u_fclose(info);
   u_fclose(out);
   return 0;
}
Abstract_allocator locate_abstract_allocator=create_abstract_allocator("locate_pattern",AllocatorCreationFlagAutoFreePrefered);
p->fst2=new_Fst2_clone(fst2load,locate_abstract_allocator);
free_abstract_Fst2(fst2load,&fst2load_free);
if (is_cancelling_requested() != 0) {
   error("User cancel request.\n");
   free_alphabet(p->alphabet);
   free_string_hash(semantic_codes);
   free_Fst2(p->fst2,locate_abstract_allocator);
   close_abstract_allocator(locate_abstract_allocator);
   af_release_mapfile_pointer(p->text_cod,p->buffer);
   af_close_mapfile(p->text_cod);
   free_stack_unichar(p->stack);
   free_locate_parameters(p);
   if (info!=NULL) u_fclose(info);
   u_fclose(out);
   return 0;
}
p->tags=p->fst2->tags;
#ifdef TRE_WCHAR
p->filters=new_FilterSet(p->fst2,p->alphabet);
if (p->filters==NULL) {
   error("Cannot compile filter(s)\n");
   free_alphabet(p->alphabet);
   free_string_hash(semantic_codes);
   free_Fst2(p->fst2,locate_abstract_allocator);
   close_abstract_allocator(locate_abstract_allocator);
   af_release_mapfile_pointer(p->text_cod,p->buffer);
   af_close_mapfile(p->text_cod);
   free_stack_unichar(p->stack);
   free_locate_parameters(p);
   if (info!=NULL) u_fclose(info);
   u_fclose(out);
   return 0;
}
#endif
u_printf("Loading token list...\n");
int n_text_tokens=0;
p->tokens=load_text_tokens_hash(tokens,mask_encoding_compatibility_input,&(p->SENTENCE),&(p->STOP),&n_text_tokens);
if (p->tokens==NULL) {
   error("Cannot load token list %s\n",tokens);
   free_alphabet(p->alphabet);
   free_string_hash(semantic_codes);
   free_Fst2(p->fst2,locate_abstract_allocator);
   close_abstract_allocator(locate_abstract_allocator);
   af_release_mapfile_pointer(p->text_cod,p->buffer);
   af_close_mapfile(p->text_cod);
   free_locate_parameters(p);
   if (info!=NULL) u_fclose(info);
   u_fclose(out);
   return 0;
}
Abstract_allocator locate_work_abstract_allocator = locate_abstract_allocator;
/* One cache slot per distinct token */
p->match_cache=(LocateCache*)malloc_cb(p->tokens->size * sizeof(LocateCache),locate_work_abstract_allocator);
if (p->match_cache==NULL) {
   fatal_alloc_error("locate_pattern");
}
memset(p->match_cache,0,p->tokens->size * sizeof(LocateCache));
#ifdef TRE_WCHAR
p->filter_match_index=new_FilterMatchIndex(p->filters,p->tokens);
if (p->filter_match_index==NULL) {
   error("Cannot optimize filter(s)\n");
   free_alphabet(p->alphabet);
   free_string_hash(semantic_codes);
   free_string_hash(p->tokens);
   close_abstract_allocator(locate_abstract_allocator);
   af_release_mapfile_pointer(p->text_cod,p->buffer);
   af_close_mapfile(p->text_cod);
   free_locate_parameters(p);
   if (info!=NULL) u_fclose(info);
   u_fclose(out);
   return 0;
}
#endif
if (allow_trace!=0) {
   open_locate_trace(p,&p->fnc_locate_trace_step,&p->private_param_locate_trace);
}
extract_semantic_codes_from_tokens(p->tokens,semantic_codes,locate_abstract_allocator);
u_printf("Loading morphological dictionaries...\n");
load_morphological_dictionaries(morpho_dic_list,p,morpho_bin);
extract_semantic_codes_from_morpho_dics(p->morpho_dic_inf,p->n_morpho_dics,semantic_codes,locate_abstract_allocator);
p->token_control=(unsigned char*)malloc(n_text_tokens*sizeof(unsigned char));
if (p->token_control==NULL) {
   fatal_alloc_error("locate_pattern");
}
p->matching_patterns=(struct bit_array**)malloc(n_text_tokens*sizeof(struct bit_array*));
if (p->matching_patterns==NULL) {
   fatal_alloc_error("locate_pattern");
}
for (int i=0; i<n_text_tokens; i++) {
   p->token_control[i]=0;
   p->matching_patterns[i]=NULL;
}
compute_token_controls(p->alphabet,err,p);
int number_of_patterns,is_DIC,is_CDIC,is_SDIC;
p->pattern_tree_root=new_pattern_node(locate_abstract_allocator);
u_printf("Computing fst2 tags...\n");
process_tags(&number_of_patterns,semantic_codes,&is_DIC,&is_CDIC,&is_SDIC,p,locate_abstract_allocator);
p->current_compound_pattern=number_of_patterns;
p->DLC_tree=new_DLC_tree(p->tokens->size);
struct lemma_node* root=new_lemma_node();
u_printf("Loading dlf...\n");
load_dic_for_locate(dlf,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p);
u_printf("Loading dlc...\n");
load_dic_for_locate(dlc,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p);
/* We check whether tag tokens like "{today,.ADV}" verify some patterns */
check_patterns_for_tag_tokens(p->alphabet,number_of_patterns,root,p,locate_abstract_allocator);
u_printf("Optimizing fst2 pattern tags...\n");
optimize_pattern_tags(p->alphabet,root,p,locate_abstract_allocator);
u_printf("Optimizing compound word dictionary...\n");
optimize_DLC(p->DLC_tree);
free_string_hash(semantic_codes);
int nb_input_variable=0;
p->input_variables=new_Variables(p->fst2->input_variables,&nb_input_variable);
p->output_variables=new_OutputVariables(p->fst2->output_variables,&p->nb_output_variables);
Abstract_allocator locate_recycle_abstract_allocator=NULL;
locate_recycle_abstract_allocator=create_abstract_allocator("locate_pattern_recycle",
      AllocatorFreeOnlyAtAllocatorDelete|AllocatorTipOftenRecycledObject,
      get_prefered_allocator_item_size_for_nb_variable(nb_input_variable));
u_printf("Optimizing fst2...\n");
p->optimized_states=build_optimized_fst2_states(p->input_variables,p->output_variables,p->fst2,locate_abstract_allocator);
if (is_korean) {
   p->korean=new Korean(p->alphabet);
   p->jamo_tags=create_jamo_tags(p->korean,p->tokens);
}
p->failfast=new_bit_array(n_text_tokens,ONE_BIT);
u_printf("Working...\n");
p->prv_alloc=locate_work_abstract_allocator;
p->prv_alloc_recycle=locate_recycle_abstract_allocator;
launch_locate(out,text_size,info,p);
if (allow_trace!=0) {
   close_locate_trace(p,p->fnc_locate_trace_step,p->private_param_locate_trace);
}
free_bit_array(p->failfast);
free_Variables(p->input_variables);
free_OutputVariables(p->output_variables);
af_release_mapfile_pointer(p->text_cod,p->buffer);
af_close_mapfile(p->text_cod);
if (info!=NULL) u_fclose(info);
u_fclose(out);
if (p->match_cache!=NULL) {
   for (int i=0; i<p->tokens->size; i++) {
      free_LocateCache(p->match_cache[i],locate_work_abstract_allocator);
   }
   free_cb(p->match_cache,locate_work_abstract_allocator);
}
int free_abstract_allocator_item=(get_allocator_cb_flag(locate_abstract_allocator) & AllocatorGetFlagAutoFreePresent) ? 0 : 1;
if (free_abstract_allocator_item) {
   free_optimized_states(p->optimized_states,p->fst2->number_of_states,locate_abstract_allocator);
}
free_stack_unichar(p->stack);
/* Too long to free the DLC tree if it is big:
 * free_DLC_tree(p->DLC_tree);
 */
if (free_abstract_allocator_item) {
   free_pattern_node(p->pattern_tree_root,locate_abstract_allocator);
   free_Fst2(p->fst2,locate_abstract_allocator);
   free_list_int(p->tag_token_list,locate_abstract_allocator);
}
close_abstract_allocator(locate_abstract_allocator);
close_abstract_allocator(locate_recycle_abstract_allocator);
locate_recycle_abstract_allocator=locate_abstract_allocator=NULL;
/* We don't free 'parameters->tags' because it was just a link to 'parameters->fst2->tags' */
free_alphabet(p->alphabet);
if (p->korean!=NULL) {
   delete p->korean;
}
if (p->jamo_tags!=NULL) {
   /* jamo tags must be freed before tokens, because we need to know how
    * many jamo tags there are, and this number is the number of tokens */
   for (int i=0; i<p->tokens->size; i++) {
      free(p->jamo_tags[i]);
   }
   free(p->jamo_tags);
}
free_string_hash(p->tokens);
free_lemma_node(root);
free(p->token_control);
for (int i=0; i<n_text_tokens; i++) {
   free_bit_array(p->matching_patterns[i]);
}
free(p->matching_patterns);
#ifdef TRE_WCHAR
free_FilterSet(p->filters);
free_FilterMatchIndex(p->filter_match_index);
#endif
for (int i=0; i<p->n_morpho_dics; i++) {
   free_abstract_INF(p->morpho_dic_inf[i],&(p->morpho_dic_inf_free[i]));
   free_abstract_BIN(p->morpho_dic_bin[i],&(p->morpho_dic_bin_free[i]));
}
free(p->morpho_dic_inf);
free(p->morpho_dic_inf_free);
free(p->morpho_dic_bin);
free(p->morpho_dic_bin_free);
#if (defined(UNITEX_LIBRARY) || defined(UNITEX_RELEASE_MEMORY_AT_EXIT))
free_DLC_tree(p->DLC_tree);
#endif
free_locate_parameters(p);
u_printf("Done.\n");
return 1;
}
int main_Untokenize(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}
char alphabet[FILENAME_MAX]="";
char token_file[FILENAME_MAX]="";
char dynamicSntDir[FILENAME_MAX]="";
VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
int range_start,range_stop,use_range;
int token_step_number=0;
range_start=range_stop=use_range=0;
char foo=0;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_Untokenize,lopts_Untokenize,&index))) {
   switch(val) {
   case 'a':
      if (options.vars()->optarg[0]=='\0') {
         error("You must specify a non empty alphabet file name\n");
         return USAGE_ERROR_CODE;
      }
      strcpy(alphabet,options.vars()->optarg);
      break;
   case 'd':
      if (options.vars()->optarg[0]=='\0') {
         error("You must specify a non empty snt dir name\n");
         return USAGE_ERROR_CODE;
      }
      strcpy(dynamicSntDir,options.vars()->optarg);
      break;
   case 't':
      if (options.vars()->optarg[0]=='\0') {
         error("You must specify a non empty token file name\n");
         return USAGE_ERROR_CODE;
      }
      strcpy(token_file,options.vars()->optarg);
      break;
   case 'k':
      if (options.vars()->optarg[0]=='\0') {
         error("Empty input_encoding argument\n");
         return USAGE_ERROR_CODE;
      }
      decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
      break;
   case 'q':
      if (options.vars()->optarg[0]=='\0') {
         error("Empty output_encoding argument\n");
         return USAGE_ERROR_CODE;
      }
      decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
      break;
   case 'n':
      /* foo is used to check that the argument is not something like "45gjh" */
      if (1!=sscanf(options.vars()->optarg,"%d%c",&token_step_number,&foo) || token_step_number<=0) {
         error("Invalid token numbering argument: %s\n",options.vars()->optarg);
         return USAGE_ERROR_CODE;
      }
      break;
   case 'r': {
      int param1 = 0;
      int param2 = 0;
      /* foo is used to check that the argument is not something like "45gjh" */
      int ret_scan = sscanf(options.vars()->optarg,"%d,%d%c",&param1,&param2,&foo);
      if (ret_scan == 2) {
         range_start = param1;
         range_stop = param2;
         use_range=1;
         if ((range_start < -1) || (range_stop < -1)) {
            error("Invalid stop count argument: %s\n",options.vars()->optarg);
            return USAGE_ERROR_CODE;
         }
      } else if (1!=sscanf(options.vars()->optarg,"%d%c",&range_start,&foo) || (range_start < -1)) {
         error("Invalid stop count argument: %s\n",options.vars()->optarg);
         return USAGE_ERROR_CODE;
      }
      use_range=1;
      break;
   }
   case 'V': only_verify_arguments = true; break;
   case 'h': usage(); return SUCCESS_RETURN_CODE;
   case ':':
      index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt)
                : error("Missing argument for option --%s\n",lopts_Untokenize[index].name);
      return USAGE_ERROR_CODE;
   case '?':
      index==-1 ? error("Invalid option -%c\n",options.vars()->optopt)
                : error("Invalid option --%s\n",options.vars()->optarg);
      return USAGE_ERROR_CODE;
   }
   index=-1;
}
if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}
if (only_verify_arguments) {
   // freeing all allocated memory
   return SUCCESS_RETURN_CODE;
}
char tokens_txt[FILENAME_MAX];
char text_cod[FILENAME_MAX];
char enter_pos[FILENAME_MAX];
if (dynamicSntDir[0]=='\0') {
   get_snt_path(argv[options.vars()->optind],dynamicSntDir);
}
strcpy(text_cod,dynamicSntDir);
strcat(text_cod,"text.cod");
strcpy(enter_pos,dynamicSntDir);
strcat(enter_pos,"enter.pos");
strcpy(tokens_txt,dynamicSntDir);
strcat(tokens_txt,"tokens.txt");
Alphabet* alph=NULL;
if (alphabet[0]!='\0') {
   alph=load_alphabet(&vec,alphabet);
   if (alph==NULL) {
      error("Cannot load alphabet file %s\n",alphabet);
      return DEFAULT_ERROR_CODE;
   }
}
ABSTRACTMAPFILE* af_text_cod=af_open_mapfile(text_cod,MAPFILE_OPTION_READ,0);
if (af_text_cod==NULL) {
   error("Cannot open file %s\n",text_cod);
   free_alphabet(alph);
   return DEFAULT_ERROR_CODE;
}
ABSTRACTMAPFILE* af_enter_pos=af_open_mapfile(enter_pos,MAPFILE_OPTION_READ,0);
if (af_enter_pos==NULL) {
   error("Cannot open file %s\n",enter_pos);
   af_close_mapfile(af_text_cod);
   free_alphabet(alph);
   return DEFAULT_ERROR_CODE;
}
U_FILE* text = u_fopen(&vec,argv[options.vars()->optind],U_WRITE);
if (text==NULL) {
   error("Cannot create text file %s\n",argv[options.vars()->optind]);
   af_close_mapfile(af_enter_pos);
   af_close_mapfile(af_text_cod);
   free_alphabet(alph);
   return DEFAULT_ERROR_CODE;
}
struct text_tokens* tok=load_text_tokens(&vec,tokens_txt);
if (tok==NULL) {
   error("Cannot load token list %s\n",tokens_txt);
   u_fclose(text);
   af_close_mapfile(af_enter_pos);
   af_close_mapfile(af_text_cod);
   free_alphabet(alph);
   return DEFAULT_ERROR_CODE;
}
u_printf("Untokenizing text...\n");
/* text.cod contains one int per token occurrence; enter.pos lists the
 * token positions that were newlines in the original text */
size_t nb_item = af_get_mapfile_size(af_text_cod)/sizeof(int);
const int* buf=(const int*)af_get_mapfile_pointer(af_text_cod);
size_t nb_item_enter_pos=0;
const int* buf_enter=NULL;
if (af_enter_pos!=NULL) {
   buf_enter=(const int*)af_get_mapfile_pointer(af_enter_pos);
   if (buf_enter!=NULL) {
      nb_item_enter_pos=af_get_mapfile_size(af_enter_pos)/sizeof(int);
   }
}
size_t count_pos=0;
for (size_t i=0;i<nb_item;i++) {
   int is_in_range=1;
   if ((use_range!=0) && (i<(size_t)range_start)) {
      is_in_range=0;
   }
   if ((use_range!=0) && (range_stop!=0) && (i>(size_t)range_stop)) {
      is_in_range=0;
   }
   int is_newline=0;
   if (count_pos<nb_item_enter_pos) {
      if (i==(size_t)(*(buf_enter+count_pos))) {
         is_newline = 1;
         count_pos++;
      }
   }
   if (is_in_range!=0) {
      if (token_step_number != 0) {
         if ((i%token_step_number)==0) {
            u_fprintf(text,"\n\nToken %d : ", (int)i);
         }
      }
      if (is_newline!=0) {
         /* this position was a newline in the original text */
         u_fprintf(text,"\n");
      } else {
         u_fputs(tok->token[*(buf+i)], text);
      }
   }
}
af_release_mapfile_pointer(af_text_cod,buf);
af_release_mapfile_pointer(af_enter_pos,buf_enter);
af_close_mapfile(af_enter_pos);
af_close_mapfile(af_text_cod);
free_text_tokens(tok);
u_fclose(text);
free_alphabet(alph);
u_printf("\nDone.\n");
return SUCCESS_RETURN_CODE;
}
int main_Extract(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}
int val,index=-1;
char extract_matching_units=1;
char text_name[FILENAME_MAX]="";
char concord_ind[FILENAME_MAX]="";
char output[FILENAME_MAX]="";
Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_Extract,lopts_Extract,&index,vars))) {
   switch(val) {
   case 'y': extract_matching_units=1; break;
   case 'n': extract_matching_units=0; break;
   case 'o':
      if (vars->optarg[0]=='\0') {
         fatal_error("You must specify a non empty output file name\n");
      }
      strcpy(output,vars->optarg);
      break;
   case 'i':
      if (vars->optarg[0]=='\0') {
         fatal_error("You must specify a non empty concordance file name\n");
      }
      strcpy(concord_ind,vars->optarg);
      break;
   case 'h': usage(); return 0;
   case ':':
      if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
      else fatal_error("Missing argument for option --%s\n",lopts_Extract[index].name);
      /* fatal_error() never returns */
   case '?':
      if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
      else fatal_error("Invalid option --%s\n",vars->optarg);
      break;
   case 'k':
      if (vars->optarg[0]=='\0') {
         fatal_error("Empty input_encoding argument\n");
      }
      decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
      break;
   case 'q':
      if (vars->optarg[0]=='\0') {
         fatal_error("Empty output_encoding argument\n");
      }
      decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
      break;
   }
   index=-1;
}
if (output[0]=='\0') {
   fatal_error("You must specify the output text file\n");
}
if (vars->optind!=argc-1) {
   fatal_error("Invalid arguments: rerun with --help\n");
}
strcpy(text_name,argv[vars->optind]);
struct snt_files* snt_files=new_snt_files(text_name);
ABSTRACTMAPFILE* text=af_open_mapfile(snt_files->text_cod,MAPFILE_OPTION_READ,0);
if (text==NULL) {
   error("Cannot open %s\n",snt_files->text_cod);
   return 1;
}
struct text_tokens* tok=load_text_tokens(snt_files->tokens_txt,mask_encoding_compatibility_input);
if (tok==NULL) {
   error("Cannot load token list %s\n",snt_files->tokens_txt);
   af_close_mapfile(text);
   return 1;
}
if (tok->SENTENCE_MARKER==-1) {
   error("The text does not contain any sentence marker {S}\n");
   af_close_mapfile(text);
   free_text_tokens(tok);
   return 1;
}
if (concord_ind[0]=='\0') {
   /* If no concordance file was given, look for <text>_snt/concord.ind */
   char tmp[FILENAME_MAX];
   get_extension(text_name,tmp);
   if (strcmp(tmp,"snt")) {
      fatal_error("Unable to find the concord.ind file. Please specify it explicitly\n");
   }
   remove_extension(text_name,concord_ind);
   strcat(concord_ind,"_snt");
   strcat(concord_ind,PATH_SEPARATOR_STRING);
   strcat(concord_ind,"concord.ind");
}
U_FILE* concord=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,concord_ind,U_READ);
if (concord==NULL) {
   error("Cannot open concordance %s\n",concord_ind);
   af_close_mapfile(text);
   free_text_tokens(tok);
   return 1;
}
U_FILE* result=u_fopen_creating_versatile_encoding(encoding_output,bom_output,output,U_WRITE);
if (result==NULL) {
   error("Cannot write output file %s\n",output);
   af_close_mapfile(text);
   u_fclose(concord);
   free_text_tokens(tok);
   return 1;
}
free_snt_files(snt_files);
extract_units(extract_matching_units,text,tok,concord,result);
af_close_mapfile(text);
u_fclose(concord);
u_fclose(result);
free_text_tokens(tok);
free_OptVars(vars);
u_printf("Done.\n");
return 0;
}
/**
 * The same as main, but without a call to setBufferMode.
 */
int main_Concord(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}
int val,index=-1;
struct conc_opt* concord_options = new_conc_opt();
char foo;
VersatileEncodingConfig vec=VEC_DEFAULT;
int ret;
char offset_file[FILENAME_MAX]="";
char PRLG[FILENAME_MAX]="";
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_Concord,lopts_Concord,&index))) {
   switch(val) {
   case 'f':
      if (options.vars()->optarg[0]=='\0') {
         error("Empty font name argument\n");
         free_conc_opt(concord_options);
         return USAGE_ERROR_CODE;
      }
      concord_options->fontname=strdup(options.vars()->optarg);
      if (concord_options->fontname==NULL) {
         alloc_error("main_Concord");
         free_conc_opt(concord_options);
         return ALLOC_ERROR_CODE;
      }
      break;
   case 's':
      /* foo is used to check that the font size is not something like "45gjh" */
      if (1!=sscanf(options.vars()->optarg,"%d%c",&(concord_options->fontsize),&foo)) {
         error("Invalid font size argument: %s\n",options.vars()->optarg);
         free_conc_opt(concord_options);
         return USAGE_ERROR_CODE;
      }
      break;
   case 'l':
      ret=sscanf(options.vars()->optarg,"%d%c%c",&(concord_options->left_context),&foo,&foo);
      if (ret==0 || ret==3 || (ret==2 && foo!='s') || concord_options->left_context<0) {
         error("Invalid left context argument: %s\n",options.vars()->optarg);
         free_conc_opt(concord_options);
         return USAGE_ERROR_CODE;
      }
      if (ret==2) {
         concord_options->left_context_until_eos=1;
      }
      break;
   case 'r':
      ret=sscanf(options.vars()->optarg,"%d%c%c",&(concord_options->right_context),&foo,&foo);
      if (ret==0 || ret==3 || (ret==2 && foo!='s') || concord_options->right_context<0) {
         error("Invalid right context argument: %s\n",options.vars()->optarg);
         free_conc_opt(concord_options);
         return USAGE_ERROR_CODE;
      }
      if (ret==2) {
         concord_options->right_context_until_eos=1;
      }
      break;
   case 'L': concord_options->convLFtoCRLF=0; break;
   case 0: concord_options->sort_mode=TEXT_ORDER; break;
   case 1: concord_options->sort_mode=LEFT_CENTER; break;
   case 2: concord_options->sort_mode=LEFT_RIGHT; break;
   case 3: concord_options->sort_mode=CENTER_LEFT; break;
   case 4: concord_options->sort_mode=CENTER_RIGHT; break;
   case 5: concord_options->sort_mode=RIGHT_LEFT; break;
   case 6: concord_options->sort_mode=RIGHT_CENTER; break;
   case 7: concord_options->result_mode=DIFF_; break;
   case 8: concord_options->only_ambiguous=1; break;
   case 9: {
      /* --PRLG=<PRLG file>,<offset file> */
      strcpy(PRLG,options.vars()->optarg);
      char* pos=strchr(PRLG,',');
      if (pos==NULL || pos==PRLG || *(pos+1)=='\0') {
         error("Invalid argument for option --PRLG: %s\n",options.vars()->optarg);
         free_conc_opt(concord_options);
         return USAGE_ERROR_CODE;
      }
      *pos='\0';
      strcpy(offset_file,pos+1);
      break;
   }
   case 10: concord_options->only_matches=1; break;
   case 11: concord_options->result_mode=LEMMATIZE_; break;
   case 12: concord_options->result_mode=CSV_; break;
   case 'H': concord_options->result_mode=HTML_; break;
   case 't': {
      concord_options->result_mode=TEXT_;
      if (options.vars()->optarg!=NULL) {
         strcpy(concord_options->output,options.vars()->optarg);
      }
      break;
   }
   case 'g':
      concord_options->result_mode=GLOSSANET_;
      if (options.vars()->optarg[0]=='\0') {
         error("Empty glossanet script argument\n");
         free_conc_opt(concord_options);
         return USAGE_ERROR_CODE;
      }
      concord_options->script=strdup(options.vars()->optarg);
      if (concord_options->script==NULL) {
         alloc_error("main_Concord");
         free_conc_opt(concord_options);
         return ALLOC_ERROR_CODE;
      }
      break;
   case 'p':
      concord_options->result_mode=SCRIPT_;
      if (options.vars()->optarg[0]=='\0') {
         error("Empty script argument\n");
         free_conc_opt(concord_options);
         return USAGE_ERROR_CODE;
      }
      concord_options->script=strdup(options.vars()->optarg);
      if (concord_options->script==NULL) {
         alloc_error("main_Concord");
         free_conc_opt(concord_options);
         return ALLOC_ERROR_CODE;
      }
      break;
   case 'i': concord_options->result_mode=INDEX_; break;
   case 'u':
      concord_options->result_mode=UIMA_;
      if (options.vars()->optarg!=NULL) {
         strcpy(offset_file,options.vars()->optarg);
      }
      concord_options->original_file_offsets=1;
      break;
   case 'e':
      concord_options->result_mode=XML_;
      if (options.vars()->optarg!=NULL) {
         strcpy(offset_file,options.vars()->optarg);
         concord_options->original_file_offsets=1;
      }
      break;
   case 'w':
      concord_options->result_mode=XML_WITH_HEADER_;
      if (options.vars()->optarg!=NULL) {
         strcpy(offset_file,options.vars()->optarg);
         concord_options->original_file_offsets = 1;
      }
      break;
   case '$':
      if (options.vars()->optarg[0]=='\0') {
         error("Empty input_offsets argument\n");
         free_conc_opt(concord_options);
         return USAGE_ERROR_CODE;
      }
      strcpy(concord_options->input_offsets,options.vars()->optarg);
      break;
   case '@':
      if (options.vars()->optarg[0]=='\0') {
         error("Empty output_offsets argument\n");
         free_conc_opt(concord_options);
         return USAGE_ERROR_CODE;
      }
      strcpy(concord_options->output_offsets,options.vars()->optarg);
      break;
   case 'A': concord_options->result_mode=AXIS_; break;
   case 'x': concord_options->result_mode=XALIGN_; break;
   case 'm':
      concord_options->result_mode=MERGE_;
      if (options.vars()->optarg[0]=='\0') {
         error("Empty output file name argument\n");
         free_conc_opt(concord_options);
         return USAGE_ERROR_CODE;
      }
      strcpy(concord_options->output,options.vars()->optarg);
      break;
   case 'a':
      if (options.vars()->optarg[0]=='\0') {
         error("Empty alphabet argument\n");
         free_conc_opt(concord_options);
         return USAGE_ERROR_CODE;
      }
      concord_options->sort_alphabet=strdup(options.vars()->optarg);
      if (concord_options->sort_alphabet==NULL) {
         alloc_error("main_Concord");
         free_conc_opt(concord_options);
         return ALLOC_ERROR_CODE;
      }
      break;
   case 'T': concord_options->thai_mode=1; break;
   case 'd':
      if (options.vars()->optarg[0]=='\0') {
         error("Empty snt directory argument\n");
         free_conc_opt(concord_options);
         return USAGE_ERROR_CODE;
      }
      strcpy(concord_options->working_directory,options.vars()->optarg);
      break;
   case 'V': only_verify_arguments = true; break;
   case 'h':
      usage();
      free_conc_opt(concord_options);
      return SUCCESS_RETURN_CODE;
   case ':':
      index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt)
                : error("Missing argument for option --%s\n",lopts_Concord[index].name);
      free_conc_opt(concord_options);
      return USAGE_ERROR_CODE;
   case '?':
      index==-1 ? error("Invalid option -%c\n",options.vars()->optopt)
                : error("Invalid option --%s\n",options.vars()->optarg);
      free_conc_opt(concord_options);
      return USAGE_ERROR_CODE;
   case 'k':
      if (options.vars()->optarg[0]=='\0') {
         error("Empty input_encoding argument\n");
         free_conc_opt(concord_options);
         return USAGE_ERROR_CODE;
      }
      decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
      break;
   case 'q':
      if (options.vars()->optarg[0]=='\0') {
         error("Empty output_encoding argument\n");
         free_conc_opt(concord_options);
         return USAGE_ERROR_CODE;
      }
      decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
      break;
   }
   index=-1;
}
if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   free_conc_opt(concord_options);
   return USAGE_ERROR_CODE;
}
if (concord_options->fontname==NULL || concord_options->fontsize<=0) {
   if (concord_options->result_mode==HTML_ || concord_options->result_mode==GLOSSANET_) {
      error("The specified output mode is an HTML file: you must specify font parameters\n");
      free_conc_opt(concord_options);
      return USAGE_ERROR_CODE;
   }
}
if (only_verify_arguments) {
   // freeing all allocated memory
   free_conc_opt(concord_options);
   return SUCCESS_RETURN_CODE;
}
U_FILE* concor=u_fopen(&vec,argv[options.vars()->optind],U_READ);
if (concor==NULL) {
   error("Cannot open concordance index file %s\n",argv[options.vars()->optind]);
   free_conc_opt(concord_options);
   return DEFAULT_ERROR_CODE;
}
if (concord_options->working_directory[0]=='\0') {
   get_path(argv[options.vars()->optind],concord_options->working_directory);
}
if (concord_options->only_matches) {
   concord_options->left_context=0;
   concord_options->right_context=0;
}
/* We compute the names of the files associated with the text */
struct snt_files* snt_files=new_snt_files_from_path(concord_options->working_directory);
ABSTRACTMAPFILE* text=af_open_mapfile(snt_files->text_cod,MAPFILE_OPTION_READ,0);
if (text==NULL) {
   error("Cannot open file %s\n",snt_files->text_cod);
   free_snt_files(snt_files);
   u_fclose(concor);
   free_conc_opt(concord_options);
   return DEFAULT_ERROR_CODE;
}
struct text_tokens* tok=load_text_tokens(&vec,snt_files->tokens_txt);
if (tok==NULL) {
   error("Cannot load text token file %s\n",snt_files->tokens_txt);
   af_close_mapfile(text);
   free_snt_files(snt_files);
   u_fclose(concor);
   free_conc_opt(concord_options);
   return DEFAULT_ERROR_CODE;
}
U_FILE* f_enter=u_fopen(BINARY,snt_files->enter_pos,U_READ);
int n_enter_char=0;
int* enter_pos=NULL;
/* Newlines are encoded in 'enter.pos' files.
 * Those files will disappear in the future. */
if (f_enter==NULL) {
   error("Cannot open file %s\n",snt_files->enter_pos);
} else {
   long size=get_file_size(f_enter);
   enter_pos=(int*)malloc(size);
   if (enter_pos==NULL) {
      alloc_error("main_Concord");
      u_fclose(f_enter);
      free_text_tokens(tok);
      af_close_mapfile(text);
      free_snt_files(snt_files);
      u_fclose(concor);
      free_conc_opt(concord_options);
      return ALLOC_ERROR_CODE;
   }
   n_enter_char=(int)fread(enter_pos,sizeof(int),size/sizeof(int),f_enter);
   if (n_enter_char!=(int)(size/sizeof(int))) {
      error("Read error on enter.pos file in main_Concord\n");
      u_fclose(f_enter);
      free(enter_pos);
      free_text_tokens(tok);
      af_close_mapfile(text);
      free_snt_files(snt_files);
      u_fclose(concor);
      free_conc_opt(concord_options);
      return DEFAULT_ERROR_CODE;
   }
   u_fclose(f_enter);
}
if (concord_options->result_mode==INDEX_ || concord_options->result_mode==UIMA_
      || concord_options->result_mode==XML_ || concord_options->result_mode==XML_WITH_HEADER_
      || concord_options->result_mode==AXIS_) {
   /* We force some options for index, uima, xml and axis files */
   concord_options->left_context=0;
   concord_options->right_context=0;
   concord_options->sort_mode=TEXT_ORDER;
}
if (concord_options->only_ambiguous && concord_options->result_mode!=LEMMATIZE_) {
   /* We force text order when displaying only ambiguous outputs */
   concord_options->sort_mode=TEXT_ORDER;
}
if (concord_options->result_mode==HTML_ || concord_options->result_mode==DIFF_
      || concord_options->result_mode==LEMMATIZE_) {
   /* We need the offset file if and only if we have to produce
    * an html concordance with positions in the .snt file */
   concord_options->snt_offsets=load_snt_offsets(snt_files->snt_offsets_pos);
   if (concord_options->snt_offsets==NULL) {
      error("Cannot read snt offset file %s\n",snt_files->snt_offsets_pos);
      free(enter_pos);
      free_text_tokens(tok);
      af_close_mapfile(text);
      free_snt_files(snt_files);
      u_fclose(concor);
      free_conc_opt(concord_options);
      return DEFAULT_ERROR_CODE;
   }
}
if (offset_file[0]!='\0') {
   concord_options->uima_offsets=load_uima_offsets(&vec,offset_file);
   if (concord_options->uima_offsets==NULL) {
      error("Cannot read offset file %s\n",offset_file);
      free(enter_pos);
      free_text_tokens(tok);
      af_close_mapfile(text);
      free_snt_files(snt_files);
      u_fclose(concor);
      free_conc_opt(concord_options);
      return DEFAULT_ERROR_CODE;
   }
}
if (PRLG[0]!='\0') {
   concord_options->PRLG_data=load_PRLG_data(&vec,PRLG);
   if (concord_options->PRLG_data==NULL) {
      error("Cannot read PRLG file %s\n",PRLG);
      free(enter_pos);
      free_text_tokens(tok);
      af_close_mapfile(text);
      free_snt_files(snt_files);
      u_fclose(concor);
      free_conc_opt(concord_options);
      return DEFAULT_ERROR_CODE;
   }
}
if (concord_options->result_mode==CSV_) {
   concord_options->sort_mode=TEXT_ORDER;
   concord_options->only_matches=1;
}
/* Once we have set all parameters, we call the function that
 * will actually create the concordance */
create_concordance(&vec,concor,text,tok,n_enter_char,enter_pos,concord_options);
free(enter_pos);
free_text_tokens(tok);
af_close_mapfile(text);
free_snt_files(snt_files);
u_fclose(concor);
free_conc_opt(concord_options);
u_printf("Done.\n");
return SUCCESS_RETURN_CODE;
}
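/* Illustrative sketch (not part of the original file): main_Concord above
 * reads enter.pos as a flat array of native-endian ints, each one the token
 * position at which the original text contained a newline. The helper below
 * reproduces that read with plain stdio; its name is hypothetical. */
#include <stdio.h>
#include <stdlib.h>

static int* read_enter_pos(const char* path, int* n_enter_char) {
   *n_enter_char = 0;
   FILE* f = fopen(path, "rb");
   if (f == NULL) {
      return NULL;
   }
   fseek(f, 0, SEEK_END);
   long size = ftell(f);
   fseek(f, 0, SEEK_SET);
   if (size <= 0) {
      fclose(f);
      return NULL;
   }
   int* positions = (int*)malloc((size_t)size);
   if (positions == NULL) {
      fclose(f);
      return NULL;
   }
   /* one int per newline of the original text, in ascending token position */
   *n_enter_char = (int)fread(positions, sizeof(int), (size_t)size / sizeof(int), f);
   fclose(f);
   return positions;
}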