// // this function analyses russian compound words // void analyse_compounds(const Alphabet* alph, Dictionary* d, U_FILE* words, U_FILE* result, U_FILE* debug, U_FILE* new_unknown_words,struct utags UTAG) { bool* prefix; bool* suffix; vector_ptr* rules=new_vector_ptr(16); vector_ptr* entries=new_vector_ptr(16); init_tableaux(d->inf,&prefix,&suffix,UTAG); analyse_word_list(d,words,result,debug,new_unknown_words,alph,prefix,suffix,UTAG,rules,entries); free_tableaux(prefix,suffix); free_vector_ptr(rules); free_vector_ptr(entries); }
// // this function analyses russian compound words // void analyse_compounds(const Alphabet* alph, const unsigned char* bin, const struct INF_codes* inf, U_FILE* words, U_FILE* result, U_FILE* debug, U_FILE* new_unknown_words,struct utags UTAG) { bool* prefix; bool* suffix; vector_ptr* rules=new_vector_ptr(16); vector_ptr* entries=new_vector_ptr(16); init_tableaux(inf,&prefix,&suffix,UTAG); analyse_word_list(bin,inf,words,result,debug,new_unknown_words,alph,prefix,suffix,UTAG,rules,entries); free_tableaux(prefix,suffix); free_vector_ptr(rules); free_vector_ptr(entries); }
/** * This function explores the partial matches that constitute the given match in order to produce * one or all possible outputs, depending on infos->ambiguous_output_policy. * The output(s) is(are) then used to add matches to the infos->matches list. */ void explore_match_to_get_outputs(struct locate_tfst_infos* infos,struct tfst_match* m, struct tfst_simple_match_list* element) { /* As m is a reversed list, we first need to get its elements in the right order */ vector_ptr* items=new_vector_ptr(16); fill_vector(items,m); Ustring* s=new_Ustring(1024); /* In MERGE/REPLACE mode, we have to explore the combination of partial matches */ struct list_pointer* ptr=NULL; explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,0,s,-1,&ptr); free_list_pointer(ptr); free_Ustring(s); free_vector_ptr(items); }
/**
 * Allocates, initializes and returns a new program invoker bound to the given
 * main function and program name. Aborts via fatal_error/fatal_alloc_error on
 * a NULL name or allocation failure.
 */
ProgramInvoker* new_ProgramInvoker(MAIN_FUNCTION f,const char* name) {
if (name==NULL) {
   fatal_error("NULL program name in new_ProgramInvoker\n");
}
ProgramInvoker* invoker=(ProgramInvoker*)malloc(sizeof(ProgramInvoker));
if (invoker==NULL) {
   fatal_alloc_error("new_ProgramInvoker");
}
invoker->main=f;
invoker->args=new_vector_ptr(16);
/* The program name is always argument #0, as in a real argv */
add_argument(invoker,name);
return invoker;
}
/**
 * Builds an array of single keywords (lists of only one element), sorted by
 * descending weight. Lemmatized sub-keywords and NULL sequences are skipped.
 */
vector_ptr* sort_keywords(struct string_hash_ptr* keywords) {
vector_ptr* sorted=new_vector_ptr();
for (int i=0;i<keywords->size;i++) {
   /* Walk the keyword list stored in bucket i */
   for (KeyWord* k=(KeyWord*)(keywords->value[i]);k!=NULL;k=k->next) {
      if (k->sequence==NULL || k->lemmatized==PART_OF_A_LEMMATIZED_KEYWORD) {
         continue;
      }
      vector_ptr_add(sorted,new_KeyWord(k->weight,k->sequence,NULL));
   }
}
qsort(sorted->tab,sorted->nbelems,sizeof(KeyWord*),(int(*)(const void*,const void*))cmp_keywords);
return sorted;
}
/* * This function behaves in the same way that a main one, except that it does * not invoke the setBufferMode function. */ int main_LocateTfst(int argc,char* const argv[]) { if (argc==1) { usage(); return SUCCESS_RETURN_CODE; } VersatileEncodingConfig vec=VEC_DEFAULT; int val,index=-1; char text[FILENAME_MAX]=""; char alphabet[FILENAME_MAX]=""; int is_korean=0; int tilde_negation_operator=1; int selected_negation_operator=0; int tagging=0; int single_tags_only=0; int match_word_boundaries=1; MatchPolicy match_policy=LONGEST_MATCHES; OutputPolicy output_policy=IGNORE_OUTPUTS; AmbiguousOutputPolicy ambiguous_output_policy=ALLOW_AMBIGUOUS_OUTPUTS; VariableErrorPolicy variable_error_policy=IGNORE_VARIABLE_ERRORS; int search_limit=NO_MATCH_LIMIT; char foo; vector_ptr* injected=new_vector_ptr(); bool only_verify_arguments = false; UnitexGetOpt options; while (EOF!=(val=options.parse_long(argc,argv,optstring_LocateTfst,lopts_LocateTfst,&index))) { switch(val) { case 't': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty .tfst name\n"); free_vector_ptr(injected); return USAGE_ERROR_CODE; } strcpy(text,options.vars()->optarg); break; case 'a': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty alphabet name\n"); free_vector_ptr(injected); return USAGE_ERROR_CODE; } strcpy(alphabet,options.vars()->optarg); break; case 'K': is_korean=1; match_word_boundaries=0; break; case 'l': search_limit=NO_MATCH_LIMIT; break; case 'g': if (options.vars()->optarg[0]=='\0') { error("You must specify an argument for negation operator\n"); free_vector_ptr(injected); return USAGE_ERROR_CODE; } selected_negation_operator=1; if ((strcmp(options.vars()->optarg,"minus")==0) || (strcmp(options.vars()->optarg,"-")==0)) { tilde_negation_operator=0; } else if ((strcmp(options.vars()->optarg,"tilde")!=0) && (strcmp(options.vars()->optarg,"~")!=0)) { error("You must specify a valid argument for negation operator\n"); free_vector_ptr(injected); return 
USAGE_ERROR_CODE; } break; case 'n': if (1!=sscanf(options.vars()->optarg,"%d%c",&search_limit,&foo) || search_limit<=0) { /* foo is used to check that the search limit is not like "45gjh" */ error("Invalid search limit argument: %s\n",options.vars()->optarg); free_vector_ptr(injected); return USAGE_ERROR_CODE; } break; case 'S': match_policy=SHORTEST_MATCHES; break; case 'L': match_policy=LONGEST_MATCHES; break; case 'A': match_policy=ALL_MATCHES; break; case 'I': output_policy=IGNORE_OUTPUTS; break; case 'M': output_policy=MERGE_OUTPUTS; break; case 'R': output_policy=REPLACE_OUTPUTS; break; case 'X': variable_error_policy=EXIT_ON_VARIABLE_ERRORS; break; case 'Y': variable_error_policy=IGNORE_VARIABLE_ERRORS; break; case 'Z': variable_error_policy=BACKTRACK_ON_VARIABLE_ERRORS; break; case 'b': ambiguous_output_policy=ALLOW_AMBIGUOUS_OUTPUTS; break; case 'z': ambiguous_output_policy=IGNORE_AMBIGUOUS_OUTPUTS; break; case 'V': only_verify_arguments = true; break; case 'h': usage(); return SUCCESS_RETURN_CODE; case 1: tagging=1; break; case 2: single_tags_only=1; break; case 3: match_word_boundaries=0; break; case 'k': if (options.vars()->optarg[0]=='\0') { error("Empty input_encoding argument\n"); free_vector_ptr(injected); return USAGE_ERROR_CODE; } decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg); break; case 'q': if (options.vars()->optarg[0]=='\0') { error("Empty output_encoding argument\n"); free_vector_ptr(injected); return USAGE_ERROR_CODE; } decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg); break; case 'v': { unichar* key=u_strdup(options.vars()->optarg); unichar* value=u_strchr(key,'='); if (value==NULL) { error("Invalid variable injection: %s\n",options.vars()->optarg); free_vector_ptr(injected); return USAGE_ERROR_CODE; } (*value)='\0'; value++; value=u_strdup(value); vector_ptr_add(injected,key); vector_ptr_add(injected,value); break; } case ':': index==-1 
? error("Missing argument for option -%c\n",options.vars()->optopt) : error("Missing argument for option --%s\n",lopts_LocateTfst[index].name); free_vector_ptr(injected); return USAGE_ERROR_CODE; case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) : error("Invalid option --%s\n",options.vars()->optarg); free_vector_ptr(injected); return USAGE_ERROR_CODE; break; } index=-1; } if (options.vars()->optind!=argc-1) { error("Invalid arguments: rerun with --help\n"); free_vector_ptr(injected); return USAGE_ERROR_CODE; } if (only_verify_arguments) { // freeing all allocated memory free_vector_ptr(injected); return SUCCESS_RETURN_CODE; } if (selected_negation_operator==0) { get_graph_compatibility_mode_by_file(&vec,&tilde_negation_operator); } char grammar[FILENAME_MAX]; char output[FILENAME_MAX]; strcpy(grammar,argv[options.vars()->optind]); get_path(text,output); strcat(output,"concord.ind"); int OK=locate_tfst(text, grammar, alphabet, output, &vec, match_policy, output_policy, ambiguous_output_policy, variable_error_policy, search_limit, is_korean, tilde_negation_operator, injected, tagging, single_tags_only, match_word_boundaries); free_vector_ptr(injected); return (!OK); }
/**
 * Allocates, initializes and returns a new locate_parameters structure.
 * Every field is set to a neutral default (NULL pointer, 0 or -1 counter,
 * default policy); the caller configures the actual locate run afterwards.
 * Aborts via fatal_alloc_error on allocation failure.
 */
struct locate_parameters* new_locate_parameters() {
struct locate_parameters* p=(struct locate_parameters*)malloc(sizeof(struct locate_parameters));
if (p==NULL) {
   fatal_alloc_error("new_locate_parameters");
}
p->tilde_negation_operator=1;
p->useLocateCache=1;
p->token_control=NULL;
p->matching_patterns=NULL;
p->current_compound_pattern=0;
p->pattern_tree_root=NULL;
/* We use -1 because there may be no space, {S} or {STOP} in the text */
p->SPACE=-1;
p->SENTENCE=-1;
p->STOP=-1;
p->tag_token_list=NULL;
#ifdef TRE_WCHAR
p->filters=NULL;
p->filter_match_index=NULL;
#endif
p->DLC_tree=NULL;
p->optimized_states=NULL;
p->fst2=NULL;
p->tokens=NULL;
/* -1: no token is being processed yet */
p->current_origin=-1;
p->max_count_call=0;
p->max_count_call_warning=0;
p->buffer=NULL;
p->tokenization_policy=WORD_BY_WORD_TOKENIZATION;
p->space_policy=DONT_START_WITH_SPACE;
p->matching_units=0;
p->match_policy=LONGEST_MATCHES;
p->output_policy=IGNORE_OUTPUTS;
p->ambiguous_output_policy=ALLOW_AMBIGUOUS_OUTPUTS;
p->variable_error_policy=IGNORE_VARIABLE_ERRORS;
p->match_list=NULL;
p->number_of_matches=0;
p->number_of_outputs=0;
/* -1: nothing has been printed yet */
p->start_position_last_printed_match=-1;
p->end_position_last_printed_match=-1;
p->search_limit=0;
p->input_variables=NULL;
p->output_variables=NULL;
p->nb_output_variables=0;
p->stack=new_stack_unichar(TRANSDUCTION_STACK_SIZE);
p->alphabet=NULL;
/* Morphological-mode dictionaries: none loaded yet */
p->morpho_dic_inf=NULL;
p->morpho_dic_inf_free=NULL;
p->morpho_dic_bin=NULL;
p->morpho_dic_bin_free=NULL;
p->n_morpho_dics=0;
p->dic_variables=NULL;
p->left_ctx_shift=0;
p->left_ctx_base=0;
p->protect_dic_chars=0;
p->graph_depth=0;
p->korean=NULL;
p->jamo_tags=NULL;
p->mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
/* Reusable scratch buffers, allocated once here so later code can avoid
 * per-call allocation */
p->recyclable_wchart_buffer=(wchar_t*)malloc(sizeof(wchar_t)*SIZE_RECYCLABLE_WCHAR_T_BUFFER);
if (p->recyclable_wchart_buffer==NULL) {
   fatal_alloc_error("new_locate_parameters");
}
p->recyclable_unichar_buffer=(unichar*)malloc(sizeof(unichar)*SIZE_RECYCLABLE_UNICHAR_BUFFER);
if (p->recyclable_unichar_buffer==NULL) {
   fatal_alloc_error("new_locate_parameters");
}
p->size_recyclable_unichar_buffer = SIZE_RECYCLABLE_UNICHAR_BUFFER;
p->failfast=NULL;
/* Match cache, empty at start */
p->match_cache_first=NULL;
p->match_cache_last=NULL;
p->match_cache=NULL;
p->prv_alloc=NULL;
p->prv_alloc_recycle=NULL;
/* Token-level error-reporting context */
p->token_error_ctx.last_length=0;
p->token_error_ctx.last_start=0;
p->token_error_ctx.n_errors=0;
p->token_error_ctx.n_matches_at_token_pos__locate=0;
p->token_error_ctx.n_matches_at_token_pos__morphological_locate=0;
p->counting_step.count_call=0;
p->counting_step.count_cancel_trying=0;
p->explore_depth=0;
p->backup_memory_reserve=NULL;
p->cached_match_vector=new_vector_ptr(16);
/* Optional tracing hooks, disabled by default */
p->fnc_locate_trace_step=NULL;
p->private_param_locate_trace=NULL;
memset(&(p->arabic),0,sizeof(ArabicTypoRules));
p->is_in_cancel_state = 0;
p->is_in_trace_state = 0;
p->counting_step_count_cancel_trying_real_in_debug_or_trace = 0;
return p;
}
/**
 * Entry point of the Tokenize program. Parses the command line, opens the
 * input text and the output files (text.cod, tokens.txt, enter.pos),
 * tokenizes the text either word-by-word (NORMAL) or char-by-char, then
 * writes token statistics and frequency/alphabetical token listings.
 * Returns 0 on success, 1 on a recoverable error; fatal_error aborts the
 * whole process on unrecoverable option errors.
 *
 * NOTE(review): 'vars' is only freed on the success path; the early
 * 'return 1' error paths below leak it — TODO confirm and fix.
 */
int main_Tokenize(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}
char alphabet[FILENAME_MAX]="";
char token_file[FILENAME_MAX]="";
Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
int val,index=-1;
int mode=NORMAL;
struct OptVars* vars=new_OptVars();
/* Option parsing loop */
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_Tokenize,lopts_Tokenize,&index,vars))) {
   switch(val) {
   case 'a': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty alphabet file name\n");
             }
             strcpy(alphabet,vars->optarg);
             break;
   case 'c': mode=CHAR_BY_CHAR; break;
   case 'w': mode=NORMAL; break;
   case 't': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty token file name\n");
             }
             strcpy(token_file,vars->optarg);
             break;
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   case 'h': usage(); return 0;
   /* NOTE(review): ':' falls through into '?', but fatal_error presumably
    * never returns, so the fallthrough should be unreachable — confirm */
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_Tokenize[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   }
   index=-1;
}
/* Exactly one positional argument (the text file) must remain */
if (vars->optind!=argc-1) {
   fatal_error("Invalid arguments: rerun with --help\n");
}
U_FILE* text;
U_FILE* out;
U_FILE* output;
U_FILE* enter;
char tokens_txt[FILENAME_MAX];
char text_cod[FILENAME_MAX];
char enter_pos[FILENAME_MAX];
Alphabet* alph=NULL;
/* Build the output file names inside the .snt directory of the text */
get_snt_path(argv[vars->optind],text_cod);
strcat(text_cod,"text.cod");
get_snt_path(argv[vars->optind],tokens_txt);
strcat(tokens_txt,"tokens.txt");
get_snt_path(argv[vars->optind],enter_pos);
strcat(enter_pos,"enter.pos");
text=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ);
if (text==NULL) {
   fatal_error("Cannot open text file %s\n",argv[vars->optind]);
}
/* The alphabet is optional; without it, tokenization runs alphabet-free */
if (alphabet[0]!='\0') {
   alph=load_alphabet(alphabet);
   if (alph==NULL) {
      error("Cannot load alphabet file %s\n",alphabet);
      u_fclose(text);
      return 1;
   }
}
out=u_fopen(BINARY,text_cod,U_WRITE);
if (out==NULL) {
   error("Cannot create file %s\n",text_cod);
   u_fclose(text);
   if (alph!=NULL) {
      free_alphabet(alph);
   }
   return 1;
}
enter=u_fopen(BINARY,enter_pos,U_WRITE);
if (enter==NULL) {
   error("Cannot create file %s\n",enter_pos);
   u_fclose(text);
   u_fclose(out);
   if (alph!=NULL) {
      free_alphabet(alph);
   }
   return 1;
}
/* Token table, occurrence counts and new-line position counts */
vector_ptr* tokens=new_vector_ptr(4096);
vector_int* n_occur=new_vector_int(4096);
vector_int* n_enter_pos=new_vector_int(4096);
struct hash_table* hashtable=new_hash_table((HASH_FUNCTION)hash_unichar,(EQUAL_FUNCTION)u_equal,
        (FREE_FUNCTION)free,NULL,(KEYCOPY_FUNCTION)keycopy);
/* Optionally preload a token file so its tokens keep their numbering */
if (token_file[0]!='\0') {
   load_token_file(token_file,mask_encoding_compatibility_input,tokens,hashtable,n_occur);
}
output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE);
if (output==NULL) {
   error("Cannot create file %s\n",tokens_txt);
   u_fclose(text);
   u_fclose(out);
   u_fclose(enter);
   if (alph!=NULL) {
      free_alphabet(alph);
   }
   free_hash_table(hashtable);
   free_vector_ptr(tokens,free);
   free_vector_int(n_occur);
   free_vector_int(n_enter_pos);
   return 1;
}
/* Placeholder token count; overwritten later by write_number_of_tokens */
u_fprintf(output,"0000000000\n");
int SENTENCES=0;
int TOKENS_TOTAL=0;
int WORDS_TOTAL=0;
int DIGITS_TOTAL=0;
u_printf("Tokenizing text...\n");
if (mode==NORMAL) {
   normal_tokenization(text,out,output,alph,tokens,hashtable,n_occur,n_enter_pos,
        &SENTENCES,&TOKENS_TOTAL,&WORDS_TOTAL,&DIGITS_TOTAL);
} else {
   char_by_char_tokenization(text,out,output,alph,tokens,hashtable,n_occur,n_enter_pos,
        &SENTENCES,&TOKENS_TOTAL,&WORDS_TOTAL,&DIGITS_TOTAL);
}
u_printf("\nDone.\n");
save_new_line_positions(enter,n_enter_pos);
u_fclose(enter);
u_fclose(text);
u_fclose(out);
u_fclose(output);
/* Patch the real token count over the placeholder written above */
write_number_of_tokens(tokens_txt,encoding_output,bom_output,tokens->nbelems);
// we compute some statistics
get_snt_path(argv[vars->optind],tokens_txt);
strcat(tokens_txt,"stats.n");
output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE);
if (output==NULL) {
   error("Cannot write %s\n",tokens_txt);
}
else {
   compute_statistics(output,tokens,alph,SENTENCES,TOKENS_TOTAL,WORDS_TOTAL,DIGITS_TOTAL);
   u_fclose(output);
}
// we save the tokens by frequence
get_snt_path(argv[vars->optind],tokens_txt);
strcat(tokens_txt,"tok_by_freq.txt");
output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE);
if (output==NULL) {
   error("Cannot write %s\n",tokens_txt);
}
else {
   sort_and_save_by_frequence(output,tokens,n_occur);
   u_fclose(output);
}
// we save the tokens by alphabetical order
get_snt_path(argv[vars->optind],tokens_txt);
strcat(tokens_txt,"tok_by_alph.txt");
output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE);
if (output==NULL) {
   error("Cannot write %s\n",tokens_txt);
}
else {
   sort_and_save_by_alph_order(output,tokens,n_occur);
   u_fclose(output);
}
/* Final cleanup on the success path */
free_hash_table(hashtable);
free_vector_ptr(tokens,free);
free_vector_int(n_occur);
free_vector_int(n_enter_pos);
if (alph!=NULL) {
   free_alphabet(alph);
}
free_OptVars(vars);
return 0;
}