Пример #1
0
/**
  * Release buffer returned by GetUnitexFileReadBuffer
  */
UNITEX_FUNC void UNITEX_CALL CloseUnitexFileReadBuffer(UNITEXFILEMAPPED *umf,const void*buffer,size_t size_file)
{
    ABSTRACTMAPFILE*amf=(ABSTRACTMAPFILE*)umf;
    if (amf != NULL)
    {
        af_release_mapfile_pointer(amf,buffer,size_file);
        af_close_mapfile(amf);
    }
}
Пример #2
0
int locate_pattern(const char* text_cod,const char* tokens,const char* fst2_name,const char* dlf,const char* dlc,const char* err,
                   const char* alphabet,MatchPolicy match_policy,OutputPolicy output_policy,
                   Encoding encoding_output,int bom_output,int mask_encoding_compatibility_input,
                   const char* dynamicDir,TokenizationPolicy tokenization_policy,
                   SpacePolicy space_policy,int search_limit,const char* morpho_dic_list,
                   AmbiguousOutputPolicy ambiguous_output_policy,
                   VariableErrorPolicy variable_error_policy,int protect_dic_chars,
                   int is_korean,int max_count_call,int max_count_call_warning,
                   char* arabic_rules,int tilde_negation_operator,int useLocateCache,int allow_trace) {

    U_FILE* out;
    U_FILE* info;
    struct locate_parameters* p=new_locate_parameters();
    p->text_cod=af_open_mapfile(text_cod,MAPFILE_OPTION_READ,0);
    p->buffer=(int*)af_get_mapfile_pointer(p->text_cod);
    long text_size=(long)af_get_mapfile_size(p->text_cod)/sizeof(int);
    p->buffer_size=(int)text_size;
    p->tilde_negation_operator=tilde_negation_operator;
    p->useLocateCache=useLocateCache;
    if (max_count_call == -1) {
        max_count_call = (int)text_size;
    }
    if (max_count_call_warning == -1) {
        max_count_call_warning = (int)text_size;
    }
    p->match_policy=match_policy;
    p->tokenization_policy=tokenization_policy;
    p->space_policy=space_policy;
    p->output_policy=output_policy;
    p->search_limit=search_limit;
    p->ambiguous_output_policy=ambiguous_output_policy;
    p->variable_error_policy=variable_error_policy;
    p->protect_dic_chars=protect_dic_chars;
    p->mask_encoding_compatibility_input = mask_encoding_compatibility_input;
    p->max_count_call = max_count_call;
    p->max_count_call_warning = max_count_call_warning;
    p->token_filename = tokens;
    char concord[FILENAME_MAX];
    char concord_info[FILENAME_MAX];

    strcpy(concord,dynamicDir);
    strcat(concord,"concord.ind");

    strcpy(concord_info,dynamicDir);
    strcat(concord_info,"concord.n");

    char morpho_bin[FILENAME_MAX];
    strcpy(morpho_bin,dynamicDir);
    strcat(morpho_bin,"morpho.bin");
    if (arabic_rules!=NULL && arabic_rules[0]!='\0') {
        load_arabic_typo_rules(arabic_rules,&(p->arabic));
    }
    out=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord,U_WRITE);
    if (out==NULL) {
        error("Cannot write %s\n",concord);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        u_fclose(out);
        return 0;
    }
    info=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,concord_info,U_WRITE);
    if (info==NULL) {
        error("Cannot write %s\n",concord_info);
    }
    switch(output_policy) {
    case IGNORE_OUTPUTS:
        u_fprintf(out,"#I\n");
        break;
    case MERGE_OUTPUTS:
        u_fprintf(out,"#M\n");
        break;
    case REPLACE_OUTPUTS:
        u_fprintf(out,"#R\n");
        break;
    }
    if (alphabet!=NULL && alphabet[0]!='\0') {
        u_printf("Loading alphabet...\n");
        p->alphabet=load_alphabet(alphabet,is_korean);
        if (p->alphabet==NULL) {
            error("Cannot load alphabet file %s\n",alphabet);
            af_release_mapfile_pointer(p->text_cod,p->buffer);
            af_close_mapfile(p->text_cod);
            free_stack_unichar(p->stack);
            free_locate_parameters(p);
            if (info!=NULL) u_fclose(info);
            u_fclose(out);
            return 0;
        }
    }
    struct string_hash* semantic_codes=new_string_hash();
    extract_semantic_codes(dlf,semantic_codes);
    extract_semantic_codes(dlc,semantic_codes);

    if (is_cancelling_requested() != 0) {
        error("user cancel request.\n");
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }

    u_printf("Loading fst2...\n");
    struct FST2_free_info fst2load_free;
    Fst2* fst2load=load_abstract_fst2(fst2_name,1,&fst2load_free);
    if (fst2load==NULL) {
        error("Cannot load grammar %s\n",fst2_name);
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }

    Abstract_allocator locate_abstract_allocator=create_abstract_allocator("locate_pattern",AllocatorCreationFlagAutoFreePrefered);


    p->fst2=new_Fst2_clone(fst2load,locate_abstract_allocator);
    free_abstract_Fst2(fst2load,&fst2load_free);

    if (is_cancelling_requested() != 0) {
        error("User cancel request..\n");
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        free_Fst2(p->fst2,locate_abstract_allocator);
        close_abstract_allocator(locate_abstract_allocator);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }

    p->tags=p->fst2->tags;
#ifdef TRE_WCHAR
    p->filters=new_FilterSet(p->fst2,p->alphabet);
    if (p->filters==NULL) {
        error("Cannot compile filter(s)\n");
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        free_Fst2(p->fst2,locate_abstract_allocator);
        close_abstract_allocator(locate_abstract_allocator);
        free_stack_unichar(p->stack);
        free_locate_parameters(p);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }
#endif
    u_printf("Loading token list...\n");
    int n_text_tokens=0;

    p->tokens=load_text_tokens_hash(tokens,mask_encoding_compatibility_input,&(p->SENTENCE),&(p->STOP),&n_text_tokens);
    if (p->tokens==NULL) {
        error("Cannot load token list %s\n",tokens);
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        free_Fst2(p->fst2,locate_abstract_allocator);
        close_abstract_allocator(locate_abstract_allocator);
        free_locate_parameters(p);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }
    Abstract_allocator locate_work_abstract_allocator = locate_abstract_allocator;

    p->match_cache=(LocateCache*)malloc_cb(p->tokens->size * sizeof(LocateCache),locate_work_abstract_allocator);
    memset(p->match_cache,0,p->tokens->size * sizeof(LocateCache));
    if (p->match_cache==NULL) {
        fatal_alloc_error("locate_pattern");
    }

#ifdef TRE_WCHAR
    p->filter_match_index=new_FilterMatchIndex(p->filters,p->tokens);
    if (p->filter_match_index==NULL) {
        error("Cannot optimize filter(s)\n");
        free_alphabet(p->alphabet);
        free_string_hash(semantic_codes);
        free_string_hash(p->tokens);
        close_abstract_allocator(locate_abstract_allocator);
        free_locate_parameters(p);
        af_release_mapfile_pointer(p->text_cod,p->buffer);
        af_close_mapfile(p->text_cod);
        if (info!=NULL) u_fclose(info);
        u_fclose(out);
        return 0;
    }
#endif

    if (allow_trace!=0) {
        open_locate_trace(p,&p->fnc_locate_trace_step,&p->private_param_locate_trace);
    }
    extract_semantic_codes_from_tokens(p->tokens,semantic_codes,locate_abstract_allocator);
    u_printf("Loading morphological dictionaries...\n");
    load_morphological_dictionaries(morpho_dic_list,p,morpho_bin);
    extract_semantic_codes_from_morpho_dics(p->morpho_dic_inf,p->n_morpho_dics,semantic_codes,locate_abstract_allocator);
    p->token_control=(unsigned char*)malloc(n_text_tokens*sizeof(unsigned char));
    if (p->token_control==NULL) {
        fatal_alloc_error("locate_pattern");
    }
    p->matching_patterns=(struct bit_array**)malloc(n_text_tokens*sizeof(struct bit_array*));
    if (p->matching_patterns==NULL) {
        fatal_alloc_error("locate_pattern");
    }
    for (int i=0; i<n_text_tokens; i++) {
        p->token_control[i]=0;
        p->matching_patterns[i]=NULL;
    }
    compute_token_controls(p->alphabet,err,p);
    int number_of_patterns,is_DIC,is_CDIC,is_SDIC;
    p->pattern_tree_root=new_pattern_node(locate_abstract_allocator);
    u_printf("Computing fst2 tags...\n");
    process_tags(&number_of_patterns,semantic_codes,&is_DIC,&is_CDIC,&is_SDIC,p,locate_abstract_allocator);
    p->current_compound_pattern=number_of_patterns;
    p->DLC_tree=new_DLC_tree(p->tokens->size);
    struct lemma_node* root=new_lemma_node();
    u_printf("Loading dlf...\n");
    load_dic_for_locate(dlf,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p);
    u_printf("Loading dlc...\n");
    load_dic_for_locate(dlc,mask_encoding_compatibility_input,p->alphabet,number_of_patterns,is_DIC,is_CDIC,root,p);
    /* We look if tag tokens like "{today,.ADV}" verify some patterns */
    check_patterns_for_tag_tokens(p->alphabet,number_of_patterns,root,p,locate_abstract_allocator);
    u_printf("Optimizing fst2 pattern tags...\n");
    optimize_pattern_tags(p->alphabet,root,p,locate_abstract_allocator);
    u_printf("Optimizing compound word dictionary...\n");
    optimize_DLC(p->DLC_tree);
    free_string_hash(semantic_codes);
    int nb_input_variable=0;
    p->input_variables=new_Variables(p->fst2->input_variables,&nb_input_variable);
    p->output_variables=new_OutputVariables(p->fst2->output_variables,&p->nb_output_variables);


    Abstract_allocator locate_recycle_abstract_allocator=NULL;
    locate_recycle_abstract_allocator=create_abstract_allocator("locate_pattern_recycle",
                                      AllocatorFreeOnlyAtAllocatorDelete|AllocatorTipOftenRecycledObject,
                                      get_prefered_allocator_item_size_for_nb_variable(nb_input_variable));

    u_printf("Optimizing fst2...\n");
    p->optimized_states=build_optimized_fst2_states(p->input_variables,p->output_variables,p->fst2,locate_abstract_allocator);
    if (is_korean) {
        p->korean=new Korean(p->alphabet);
        p->jamo_tags=create_jamo_tags(p->korean,p->tokens);
    }
    p->failfast=new_bit_array(n_text_tokens,ONE_BIT);

    u_printf("Working...\n");
    p->prv_alloc=locate_work_abstract_allocator;
    p->prv_alloc_recycle=locate_recycle_abstract_allocator;
    launch_locate(out,text_size,info,p);
    if (allow_trace!=0) {
        close_locate_trace(p,p->fnc_locate_trace_step,p->private_param_locate_trace);
    }
    free_bit_array(p->failfast);
    free_Variables(p->input_variables);
    free_OutputVariables(p->output_variables);
    af_release_mapfile_pointer(p->text_cod,p->buffer);
    af_close_mapfile(p->text_cod);
    if (info!=NULL) u_fclose(info);
    u_fclose(out);

    if (p->match_cache!=NULL) {
        for (int i=0; i<p->tokens->size; i++) {
            free_LocateCache(p->match_cache[i],locate_work_abstract_allocator);
        }
        free_cb(p->match_cache,locate_work_abstract_allocator);
    }
    int free_abstract_allocator_item=(get_allocator_cb_flag(locate_abstract_allocator) & AllocatorGetFlagAutoFreePresent) ? 0 : 1;

    if (free_abstract_allocator_item) {
        free_optimized_states(p->optimized_states,p->fst2->number_of_states,locate_abstract_allocator);
    }
    free_stack_unichar(p->stack);
    /** Too long to free the DLC tree if it is big
     * free_DLC_tree(p->DLC_tree);
     */
    if (free_abstract_allocator_item) {
        free_pattern_node(p->pattern_tree_root,locate_abstract_allocator);
        free_Fst2(p->fst2,locate_abstract_allocator);
        free_list_int(p->tag_token_list,locate_abstract_allocator);
    }
    close_abstract_allocator(locate_abstract_allocator);
    close_abstract_allocator(locate_recycle_abstract_allocator);
    locate_recycle_abstract_allocator=locate_abstract_allocator=NULL;

    /* We don't free 'parameters->tags' because it was just a link on 'parameters->fst2->tags' */
    free_alphabet(p->alphabet);
    if (p->korean!=NULL) {
        delete p->korean;
    }
    if (p->jamo_tags!=NULL) {
        /* jamo tags must be freed before tokens, because we need to know how
         * many jamo tags there are, and this number is the number of tokens */
        for (int i=0; i<p->tokens->size; i++) {
            free(p->jamo_tags[i]);
        }
        free(p->jamo_tags);
    }
    free_string_hash(p->tokens);
    free_lemma_node(root);
    free(p->token_control);
    for (int i=0; i<n_text_tokens; i++) {
        free_bit_array(p->matching_patterns[i]);
    }
    free(p->matching_patterns);
#ifdef TRE_WCHAR
    free_FilterSet(p->filters);
    free_FilterMatchIndex(p->filter_match_index);
#endif
    for (int i=0; i<p->n_morpho_dics; i++) {
        free_abstract_INF(p->morpho_dic_inf[i],&(p->morpho_dic_inf_free[i]));
        free_abstract_BIN(p->morpho_dic_bin[i],&(p->morpho_dic_bin_free[i]));
    }
    free(p->morpho_dic_inf);
    free(p->morpho_dic_inf_free);
    free(p->morpho_dic_bin);
    free(p->morpho_dic_bin_free);
#if (defined(UNITEX_LIBRARY) || defined(UNITEX_RELEASE_MEMORY_AT_EXIT))
    free_DLC_tree(p->DLC_tree);
#endif
    free_locate_parameters(p);
    u_printf("Done.\n");
    return 1;
}
Пример #3
0
int main_Untokenize(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

char alphabet[FILENAME_MAX]="";
char token_file[FILENAME_MAX]="";
char dynamicSntDir[FILENAME_MAX]="";
VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
int range_start,range_stop,use_range;
int token_step_number=0;
range_start=range_stop=use_range=0;
char foo=0;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_Untokenize,lopts_Untokenize,&index))) {
   switch(val) {
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(alphabet,options.vars()->optarg);
             break;
   case 'd': if (options.vars()->optarg[0]=='\0') {
                   error("You must specify a non empty snt dir name\n");
                   return USAGE_ERROR_CODE;
                }
                strcpy(dynamicSntDir,options.vars()->optarg);
                break;
   case 't': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty token file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(token_file,options.vars()->optarg);
             break;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;

   case 'n': if (1!=sscanf(options.vars()->optarg,"%d%c",&token_step_number,&foo) || token_step_number<=0) {
                /* foo is used to check that the search limit is not like "45gjh" */
                error("Invalid token numbering argument: %s\n",options.vars()->optarg);
                return USAGE_ERROR_CODE;
             }
             break;
   case 'r': {
                int param1 = 0;
                int param2 = 0;
                int ret_scan = sscanf(options.vars()->optarg,"%d,%d%c",&param1,&param2,&foo);
                if (ret_scan == 2) {
                    range_start = param1;
                    range_stop  = param2;
                    use_range=1;
                    if (((range_start < -1)) || (range_stop < -1)) {
                        /* foo is used to check that the search limit is not like "45gjh" */
                        error("Invalid stop count argument: %s\n",options.vars()->optarg);
                        return USAGE_ERROR_CODE;
                    }
                }
                else
                    if (1!=sscanf(options.vars()->optarg,"%d%c",&range_start,&foo) || (range_start < -1)) {
                        /* foo is used to check that the search limit is not like "45gjh" */
                        error("Invalid stop count argument: %s\n",options.vars()->optarg);
                        return USAGE_ERROR_CODE;
                    }
                    use_range=1;
             }
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage(); 
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_Untokenize[index].name);
             return USAGE_ERROR_CODE;            
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

char tokens_txt[FILENAME_MAX];
char text_cod[FILENAME_MAX];
char enter_pos[FILENAME_MAX];

if (dynamicSntDir[0]=='\0') {
    get_snt_path(argv[options.vars()->optind],dynamicSntDir);
}

strcpy(text_cod,dynamicSntDir);
strcat(text_cod,"text.cod");
strcpy(enter_pos,dynamicSntDir);
strcat(enter_pos,"enter.pos");
strcpy(tokens_txt,dynamicSntDir);
strcat(tokens_txt,"tokens.txt");

Alphabet* alph=NULL;
if (alphabet[0]!='\0') {
   alph=load_alphabet(&vec,alphabet);
   if (alph==NULL) {
      error("Cannot load alphabet file %s\n",alphabet);
      return DEFAULT_ERROR_CODE;
   }
}

ABSTRACTMAPFILE* af_text_cod=af_open_mapfile(text_cod,MAPFILE_OPTION_READ,0);
if (af_text_cod==NULL) {
  error("Cannot open file %s\n",text_cod);
  free_alphabet(alph);
  return DEFAULT_ERROR_CODE;
}

ABSTRACTMAPFILE* af_enter_pos=af_open_mapfile(enter_pos,MAPFILE_OPTION_READ,0);
if (af_enter_pos==NULL) {
  error("Cannot open file %s\n",enter_pos);
  af_close_mapfile(af_text_cod);
  free_alphabet(alph);
  return DEFAULT_ERROR_CODE;
}

U_FILE* text = u_fopen(&vec,argv[options.vars()->optind],U_WRITE);
if (text==NULL) {
  error("Cannot create text file %s\n",argv[options.vars()->optind]);
  af_close_mapfile(af_enter_pos);
  af_close_mapfile(af_text_cod);
  free_alphabet(alph);
  return DEFAULT_ERROR_CODE;
}

struct text_tokens* tok=load_text_tokens(&vec,tokens_txt);
u_printf("Untokenizing text...\n");
size_t nb_item = af_get_mapfile_size(af_text_cod)/sizeof(int);
const int* buf=(const int*)af_get_mapfile_pointer(af_text_cod);

size_t nb_item_enter_pos=0;
const int* buf_enter=NULL;

if (af_enter_pos!=NULL) {
    buf_enter=(const int*)af_get_mapfile_pointer(af_enter_pos);
    if (buf_enter!=NULL) {
        nb_item_enter_pos=af_get_mapfile_size(af_enter_pos)/sizeof(int);
    }
}

size_t count_pos=0;
for (size_t i=0;i<nb_item;i++) {
    int is_in_range=1;
    if ((use_range!=0) && (i<(size_t)range_start)) {
        is_in_range=0;
    }
    if ((use_range!=0) && (range_stop!=0) && (i>(size_t)range_stop)) {
        is_in_range=0;
    }
    int is_newline=0;
    if (count_pos<nb_item_enter_pos) {
        if (i==(size_t)(*(buf_enter+count_pos))) {
            is_newline = 1;
            count_pos++;
        }
    }

    if (is_in_range!=0) {
        if (token_step_number != 0)
            if ((i%token_step_number)==0)
                u_fprintf(text,"\n\nToken %d : ", (int)i);

        if (is_newline!=0) {
            u_fprintf(text,"\n", tok->token[*(buf+i)]);
        }
        else {
			u_fputs(tok->token[*(buf+i)], text);
        }
    }
}

af_release_mapfile_pointer(af_text_cod,buf);
af_release_mapfile_pointer(af_enter_pos,buf_enter);
af_close_mapfile(af_enter_pos);
af_close_mapfile(af_text_cod);
free_text_tokens(tok);
u_fclose(text);
free_alphabet(alph);

u_printf("\nDone.\n");
return SUCCESS_RETURN_CODE;
}
Пример #4
0
int main_Extract(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}


int val,index=-1;
char extract_matching_units=1;
char text_name[FILENAME_MAX]="";
char concord_ind[FILENAME_MAX]="";
char output[FILENAME_MAX]="";
Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_Extract,lopts_Extract,&index,vars))) {
   switch(val) {
   case 'y': extract_matching_units=1; break;
   case 'n': extract_matching_units=0; break;
   case 'o': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty output file name\n");
             }
             strcpy(output,vars->optarg);
             break;
   case 'i': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty concordance file name\n");
             }
             strcpy(concord_ind,vars->optarg);
             break;
   case 'h': usage(); return 0;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_Extract[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   }
   index=-1;
}

if (output[0]=='\0') {
   fatal_error("You must specify the output text file\n");
}
if (vars->optind!=argc-1) {
   fatal_error("Invalid arguments: rerun with --help\n");
}
strcpy(text_name,argv[vars->optind]);

struct snt_files* snt_files=new_snt_files(text_name);
ABSTRACTMAPFILE* text=af_open_mapfile(snt_files->text_cod,MAPFILE_OPTION_READ,0);
if (text==NULL) {
   error("Cannot open %s\n",snt_files->text_cod);
   return 1;
}
struct text_tokens* tok=load_text_tokens(snt_files->tokens_txt,mask_encoding_compatibility_input);
if (tok==NULL) {
   error("Cannot load token list %s\n",snt_files->tokens_txt);
   af_close_mapfile(text);
   return 1;
}
if (tok->SENTENCE_MARKER==-1) {
   error("The text does not contain any sentence marker {S}\n");
   af_close_mapfile(text);
   free_text_tokens(tok);
   return 1;
}

if (concord_ind[0]=='\0') {
   char tmp[FILENAME_MAX];
   get_extension(text_name,tmp);
   if (strcmp(tmp,"snt")) {
      fatal_error("Unable to find the concord.ind file. Please explicit it\n");
   }
   remove_extension(text_name,concord_ind);
   strcat(concord_ind,"_snt");
   strcat(concord_ind,PATH_SEPARATOR_STRING);
   strcat(concord_ind,"concord.ind");
}
U_FILE* concord=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,concord_ind,U_READ);
if (concord==NULL) {
   error("Cannot open concordance %s\n",concord_ind);
   af_close_mapfile(text);
   free_text_tokens(tok);
   return 1;
}
U_FILE* result=u_fopen_creating_versatile_encoding(encoding_output,bom_output,output,U_WRITE);
if (result==NULL) {
   error("Cannot write output file %s\n",output);
   af_close_mapfile(text);
   u_fclose(concord);
   free_text_tokens(tok);
   return 1;
}
free_snt_files(snt_files);
extract_units(extract_matching_units,text,tok,concord,result);
af_close_mapfile(text);
u_fclose(concord);
u_fclose(result);
free_text_tokens(tok);
free_OptVars(vars);
u_printf("Done.\n");
return 0;
}
Пример #5
0
/**
 * The same than main, but no call to setBufferMode.
 */
int main_Concord(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

int val,index=-1;
struct conc_opt* concord_options = new_conc_opt();
char foo;
VersatileEncodingConfig vec=VEC_DEFAULT;
int ret;
char offset_file[FILENAME_MAX]="";
char PRLG[FILENAME_MAX]="";
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_Concord,lopts_Concord,&index))) {
   switch(val) {
   case 'f': if (options.vars()->optarg[0]=='\0') {
                error("Empty font name argument\n");
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;
             }
             concord_options->fontname=strdup(options.vars()->optarg);
             if (concord_options->fontname==NULL) {
                alloc_error("main_Concord");
                free_conc_opt(concord_options);
                return ALLOC_ERROR_CODE;
             }
             break;
   case 's': if (1!=sscanf(options.vars()->optarg,"%d%c",&(concord_options->fontsize),&foo)) {
                /* foo is used to check that the font size is not like "45gjh" */
                error("Invalid font size argument: %s\n",options.vars()->optarg);
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;
             }
             break;
   case 'l': ret=sscanf(options.vars()->optarg,"%d%c%c",&(concord_options->left_context),&foo,&foo);
             if (ret==0 || ret==3 || (ret==2 && foo!='s') || concord_options->left_context<0) {
                error("Invalid left context argument: %s\n",options.vars()->optarg);
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;                
             }
             if (ret==2) {
                concord_options->left_context_until_eos=1;
             }
             break;
   case 'r': ret=sscanf(options.vars()->optarg,"%d%c%c",&(concord_options->right_context),&foo,&foo);
             if (ret==0 || ret==3 || (ret==2 && foo!='s') || concord_options->right_context<0) {
                error("Invalid right context argument: %s\n",options.vars()->optarg);
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;                
             }
             if (ret==2) {
                concord_options->right_context_until_eos=1;
             }
             break;
   case 'L': concord_options->convLFtoCRLF=0; break;
   case 0: concord_options->sort_mode=TEXT_ORDER; break;
   case 1: concord_options->sort_mode=LEFT_CENTER; break;
   case 2: concord_options->sort_mode=LEFT_RIGHT; break;
   case 3: concord_options->sort_mode=CENTER_LEFT; break;
   case 4: concord_options->sort_mode=CENTER_RIGHT; break;
   case 5: concord_options->sort_mode=RIGHT_LEFT; break;
   case 6: concord_options->sort_mode=RIGHT_CENTER; break;
   case 7: concord_options->result_mode=DIFF_; break;
   case 8: concord_options->only_ambiguous=1; break;
   case 9: {
     strcpy(PRLG,options.vars()->optarg);
     char* pos=strchr(PRLG,',');
     if (pos==NULL || pos==PRLG || *(pos+1)=='\0') {
       error("Invalid argument for option --PRLG: %s\n",options.vars()->optarg);
       free_conc_opt(concord_options);
       return USAGE_ERROR_CODE;
     }
     *pos='\0';
     strcpy(offset_file,pos+1);
     break;
   }
   case 10: concord_options->only_matches=1; break;
   case 11: concord_options->result_mode=LEMMATIZE_; break;
   case 12: concord_options->result_mode=CSV_; break;
   case 'H': concord_options->result_mode=HTML_; break;
   case 't': {
     concord_options->result_mode=TEXT_;
     if (options.vars()->optarg!=NULL) {
       strcpy(concord_options->output,options.vars()->optarg);
     }
     break;
   }
   case 'g': concord_options->result_mode=GLOSSANET_;
             if (options.vars()->optarg[0]=='\0') {
                error("Empty glossanet script argument\n");
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;                   
             }
             concord_options->script=strdup(options.vars()->optarg);
             if (concord_options->script==NULL) {
                alloc_error("main_Concord");
                free_conc_opt(concord_options);
                return ALLOC_ERROR_CODE;                   
             }
             break;
   case 'p': concord_options->result_mode=SCRIPT_;
             if (options.vars()->optarg[0]=='\0') {
                error("Empty script argument\n");
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;                
             }
             concord_options->script=strdup(options.vars()->optarg);
             if (concord_options->script==NULL) {
                alloc_error("main_Concord");
                free_conc_opt(concord_options);
                return ALLOC_ERROR_CODE;                 
             }
             break;
   case 'i': concord_options->result_mode=INDEX_; break;
   case 'u': concord_options->result_mode=UIMA_;
             if (options.vars()->optarg!=NULL) {
                strcpy(offset_file,options.vars()->optarg);
             }
             concord_options->original_file_offsets=1;
             break;
   case 'e': concord_options->result_mode=XML_;
             if (options.vars()->optarg!=NULL) {
                strcpy(offset_file, options.vars()->optarg);
                concord_options->original_file_offsets=1;
             }
             break;
   case 'w': concord_options->result_mode=XML_WITH_HEADER_; 
             if (options.vars()->optarg!=NULL) {
                strcpy(offset_file, options.vars()->optarg);
                concord_options->original_file_offsets = 1;
             }
             break;
   case '$': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_offsets argument\n");
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;
             }
             strcpy(concord_options->input_offsets,options.vars()->optarg);
             break;
   case '@': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_offsets argument\n");
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;
             }
             strcpy(concord_options->output_offsets,options.vars()->optarg);
             break;
   case 'A': concord_options->result_mode=AXIS_; break;
   case 'x': concord_options->result_mode=XALIGN_; break;
   case 'm': concord_options->result_mode=MERGE_;
             if (options.vars()->optarg[0]=='\0') {
                error("Empty output file name argument\n");
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;
             }
             strcpy(concord_options->output,options.vars()->optarg);
             break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("Empty alphabet argument\n");
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;                
             }
             concord_options->sort_alphabet=strdup(options.vars()->optarg);
             if (concord_options->sort_alphabet==NULL) {
                alloc_error("main_Concord");
                free_conc_opt(concord_options);
                return ALLOC_ERROR_CODE;            }
             break;
   case 'T': concord_options->thai_mode=1; break;
   case 'd': if (options.vars()->optarg[0]=='\0') {
                error("Empty snt directory argument\n");
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;
             }
             strcpy(concord_options->working_directory,options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;             
   case 'h': usage();
             free_conc_opt(concord_options); 
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_Concord[index].name);
             free_conc_opt(concord_options);
             return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             free_conc_opt(concord_options);
             return USAGE_ERROR_CODE;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   free_conc_opt(concord_options);
   return USAGE_ERROR_CODE;
}
if (concord_options->fontname==NULL || concord_options->fontsize<=0) {
   if (concord_options->result_mode==HTML_ || concord_options->result_mode==GLOSSANET_) {
      error("The specified output mode is an HTML file: you must specify font parameters\n");
      free_conc_opt(concord_options);
      return USAGE_ERROR_CODE;      
   }
}

if (only_verify_arguments) {
  // freeing all allocated memory
  free_conc_opt(concord_options);
  return SUCCESS_RETURN_CODE;
}

U_FILE* concor=u_fopen(&vec,argv[options.vars()->optind],U_READ);
if (concor==NULL) {
   error("Cannot open concordance index file %s\n",argv[options.vars()->optind]);
   free_conc_opt(concord_options);
   return DEFAULT_ERROR_CODE;   
}

if (concord_options->working_directory[0]=='\0') {
   get_path(argv[options.vars()->optind],concord_options->working_directory);
}
if (concord_options->only_matches) {
  concord_options->left_context=0;
  concord_options->right_context=0;
}
/* We compute the name of the files associated to the text */
struct snt_files* snt_files=new_snt_files_from_path(concord_options->working_directory);
ABSTRACTMAPFILE* text=af_open_mapfile(snt_files->text_cod,MAPFILE_OPTION_READ,0);
if (text==NULL) {
  error("Cannot open file %s\n",snt_files->text_cod);
  free_snt_files(snt_files);
  u_fclose(concor);
  free_conc_opt(concord_options);
  return DEFAULT_ERROR_CODE;
}
struct text_tokens* tok=load_text_tokens(&vec,snt_files->tokens_txt);
if (tok==NULL) {
  error("Cannot load text token file %s\n",snt_files->tokens_txt);
  af_close_mapfile(text);
  free_snt_files(snt_files);
  u_fclose(concor);
  free_conc_opt(concord_options);
  return DEFAULT_ERROR_CODE;
}

U_FILE* f_enter=u_fopen(BINARY,snt_files->enter_pos,U_READ);
int n_enter_char=0;
int* enter_pos=NULL;
/* New lines are encoded in 'enter.pos' files. Those files will disappear in the future */
if (f_enter==NULL) {
  error("Cannot open file %s\n",snt_files->enter_pos);
}
else {
  long size=get_file_size(f_enter);
  enter_pos=(int*)malloc(size);
  if (enter_pos==NULL) {
    alloc_error("main_Concord");
    u_fclose(f_enter);
    free_text_tokens(tok);
    af_close_mapfile(text);
    free_snt_files(snt_files);
    u_fclose(concor);
    free_conc_opt(concord_options);
    return ALLOC_ERROR_CODE;     
  }
  n_enter_char=(int)fread(enter_pos,sizeof(int),size/sizeof(int),f_enter);
  if (n_enter_char!=(int)(size/sizeof(int))) {
    error("Read error on enter.pos file in main_Concord\n");
    u_fclose(f_enter);
    free(enter_pos);
    free_text_tokens(tok);
    af_close_mapfile(text);
    free_snt_files(snt_files);
    u_fclose(concor);
    free_conc_opt(concord_options);
    return DEFAULT_ERROR_CODE;
  }
  u_fclose(f_enter);
}
if (concord_options->result_mode==INDEX_ || concord_options->result_mode==UIMA_ || 
    concord_options->result_mode==XML_ || concord_options->result_mode==XML_WITH_HEADER_ ||
    concord_options->result_mode==AXIS_) {
   /* We force some options for index, uima and axis files */
   concord_options->left_context=0;
   concord_options->right_context=0;
   concord_options->sort_mode=TEXT_ORDER;
}
if (concord_options->only_ambiguous && concord_options->result_mode!=LEMMATIZE_) {
  /* We force text order when displaying only ambiguous outputs */
  concord_options->sort_mode=TEXT_ORDER;
}
if (concord_options->result_mode==HTML_ || concord_options->result_mode==DIFF_ || concord_options->result_mode==LEMMATIZE_) {
  /* We need the offset file if and only if we have to produce
   * an html concordance with positions in .snt file */
  concord_options->snt_offsets=load_snt_offsets(snt_files->snt_offsets_pos);
  if (concord_options->snt_offsets==NULL) {
    error("Cannot read snt offset file %s\n",snt_files->snt_offsets_pos);
    free(enter_pos);    
    free_text_tokens(tok);
    af_close_mapfile(text);
    free_snt_files(snt_files);
    u_fclose(concor);
    free_conc_opt(concord_options);
    return DEFAULT_ERROR_CODE;    
  }
}
if (offset_file[0]!='\0') {
  concord_options->uima_offsets=load_uima_offsets(&vec,offset_file);
  if (concord_options->uima_offsets==NULL) {
    error("Cannot read offset file %s\n",offset_file);
    free(enter_pos); 
    free_text_tokens(tok);
    af_close_mapfile(text);
    free_snt_files(snt_files);
    u_fclose(concor);
    free_conc_opt(concord_options);
    return DEFAULT_ERROR_CODE;    
  }
}
if (PRLG[0]!='\0') {
  concord_options->PRLG_data=load_PRLG_data(&vec,PRLG);
  if (concord_options->PRLG_data==NULL) {
    error("Cannot read PRLG file %s\n",PRLG);
    free(enter_pos);
    free_text_tokens(tok);
    af_close_mapfile(text);
    free_snt_files(snt_files);
    u_fclose(concor);
    free_conc_opt(concord_options);
    return DEFAULT_ERROR_CODE;    
  }
}
if (concord_options->result_mode==CSV_) {
  concord_options->sort_mode=TEXT_ORDER;
  concord_options->only_matches=1;
}

/* Once we have set all parameters, we call the function that
 * will actually create the concordance. */
create_concordance(&vec,concor,text,tok,n_enter_char,enter_pos,concord_options);

free(enter_pos);
free_text_tokens(tok);
af_close_mapfile(text);
free_snt_files(snt_files);
u_fclose(concor);
free_conc_opt(concord_options);

u_printf("Done.\n");

return SUCCESS_RETURN_CODE;
}