// // this function analyses russian compound words // void analyse_compounds(const Alphabet* alph, Dictionary* d, U_FILE* words, U_FILE* result, U_FILE* debug, U_FILE* new_unknown_words,struct utags UTAG) { bool* prefix; bool* suffix; vector_ptr* rules=new_vector_ptr(16); vector_ptr* entries=new_vector_ptr(16); init_tableaux(d->inf,&prefix,&suffix,UTAG); analyse_word_list(d,words,result,debug,new_unknown_words,alph,prefix,suffix,UTAG,rules,entries); free_tableaux(prefix,suffix); free_vector_ptr(rules); free_vector_ptr(entries); }
// // this function analyses russian compound words // void analyse_compounds(const Alphabet* alph, const unsigned char* bin, const struct INF_codes* inf, U_FILE* words, U_FILE* result, U_FILE* debug, U_FILE* new_unknown_words,struct utags UTAG) { bool* prefix; bool* suffix; vector_ptr* rules=new_vector_ptr(16); vector_ptr* entries=new_vector_ptr(16); init_tableaux(inf,&prefix,&suffix,UTAG); analyse_word_list(bin,inf,words,result,debug,new_unknown_words,alph,prefix,suffix,UTAG,rules,entries); free_tableaux(prefix,suffix); free_vector_ptr(rules); free_vector_ptr(entries); }
/**
 * Releases a locate_parameters structure together with the buffers it owns:
 * the two recyclable buffers (when present) and the cached match vector.
 * The elements of the cached match vector are not freed (NULL release
 * function). Passing NULL is allowed and does nothing.
 */
void free_locate_parameters(struct locate_parameters* p) {
    if (p!=NULL) {
        if (p->recyclable_wchart_buffer) {
            free(p->recyclable_wchart_buffer);
        }
        if (p->recyclable_unichar_buffer) {
            free(p->recyclable_unichar_buffer);
        }
        free_vector_ptr(p->cached_match_vector,NULL);
        free(p);
    }
}
/** * This function explores the partial matches that constitute the given match in order to produce * one or all possible outputs, depending on infos->ambiguous_output_policy. * The output(s) is(are) then used to add matches to the infos->matches list. */ void explore_match_to_get_outputs(struct locate_tfst_infos* infos,struct tfst_match* m, struct tfst_simple_match_list* element) { /* As m is a reversed list, we first need to get its elements in the right order */ vector_ptr* items=new_vector_ptr(16); fill_vector(items,m); Ustring* s=new_Ustring(1024); /* In MERGE/REPLACE mode, we have to explore the combination of partial matches */ struct list_pointer* ptr=NULL; explore_match_for_MERGE_or_REPLACE_mode(infos,element,items,0,s,-1,&ptr); free_list_pointer(ptr); free_Ustring(s); free_vector_ptr(items); }
/* * This function behaves in the same way that a main one, except that it does * not invoke the setBufferMode function. */ int main_LocateTfst(int argc,char* const argv[]) { if (argc==1) { usage(); return SUCCESS_RETURN_CODE; } VersatileEncodingConfig vec=VEC_DEFAULT; int val,index=-1; char text[FILENAME_MAX]=""; char alphabet[FILENAME_MAX]=""; int is_korean=0; int tilde_negation_operator=1; int selected_negation_operator=0; int tagging=0; int single_tags_only=0; int match_word_boundaries=1; MatchPolicy match_policy=LONGEST_MATCHES; OutputPolicy output_policy=IGNORE_OUTPUTS; AmbiguousOutputPolicy ambiguous_output_policy=ALLOW_AMBIGUOUS_OUTPUTS; VariableErrorPolicy variable_error_policy=IGNORE_VARIABLE_ERRORS; int search_limit=NO_MATCH_LIMIT; char foo; vector_ptr* injected=new_vector_ptr(); bool only_verify_arguments = false; UnitexGetOpt options; while (EOF!=(val=options.parse_long(argc,argv,optstring_LocateTfst,lopts_LocateTfst,&index))) { switch(val) { case 't': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty .tfst name\n"); free_vector_ptr(injected); return USAGE_ERROR_CODE; } strcpy(text,options.vars()->optarg); break; case 'a': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty alphabet name\n"); free_vector_ptr(injected); return USAGE_ERROR_CODE; } strcpy(alphabet,options.vars()->optarg); break; case 'K': is_korean=1; match_word_boundaries=0; break; case 'l': search_limit=NO_MATCH_LIMIT; break; case 'g': if (options.vars()->optarg[0]=='\0') { error("You must specify an argument for negation operator\n"); free_vector_ptr(injected); return USAGE_ERROR_CODE; } selected_negation_operator=1; if ((strcmp(options.vars()->optarg,"minus")==0) || (strcmp(options.vars()->optarg,"-")==0)) { tilde_negation_operator=0; } else if ((strcmp(options.vars()->optarg,"tilde")!=0) && (strcmp(options.vars()->optarg,"~")!=0)) { error("You must specify a valid argument for negation operator\n"); free_vector_ptr(injected); return 
USAGE_ERROR_CODE; } break; case 'n': if (1!=sscanf(options.vars()->optarg,"%d%c",&search_limit,&foo) || search_limit<=0) { /* foo is used to check that the search limit is not like "45gjh" */ error("Invalid search limit argument: %s\n",options.vars()->optarg); free_vector_ptr(injected); return USAGE_ERROR_CODE; } break; case 'S': match_policy=SHORTEST_MATCHES; break; case 'L': match_policy=LONGEST_MATCHES; break; case 'A': match_policy=ALL_MATCHES; break; case 'I': output_policy=IGNORE_OUTPUTS; break; case 'M': output_policy=MERGE_OUTPUTS; break; case 'R': output_policy=REPLACE_OUTPUTS; break; case 'X': variable_error_policy=EXIT_ON_VARIABLE_ERRORS; break; case 'Y': variable_error_policy=IGNORE_VARIABLE_ERRORS; break; case 'Z': variable_error_policy=BACKTRACK_ON_VARIABLE_ERRORS; break; case 'b': ambiguous_output_policy=ALLOW_AMBIGUOUS_OUTPUTS; break; case 'z': ambiguous_output_policy=IGNORE_AMBIGUOUS_OUTPUTS; break; case 'V': only_verify_arguments = true; break; case 'h': usage(); return SUCCESS_RETURN_CODE; case 1: tagging=1; break; case 2: single_tags_only=1; break; case 3: match_word_boundaries=0; break; case 'k': if (options.vars()->optarg[0]=='\0') { error("Empty input_encoding argument\n"); free_vector_ptr(injected); return USAGE_ERROR_CODE; } decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg); break; case 'q': if (options.vars()->optarg[0]=='\0') { error("Empty output_encoding argument\n"); free_vector_ptr(injected); return USAGE_ERROR_CODE; } decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg); break; case 'v': { unichar* key=u_strdup(options.vars()->optarg); unichar* value=u_strchr(key,'='); if (value==NULL) { error("Invalid variable injection: %s\n",options.vars()->optarg); free_vector_ptr(injected); return USAGE_ERROR_CODE; } (*value)='\0'; value++; value=u_strdup(value); vector_ptr_add(injected,key); vector_ptr_add(injected,value); break; } case ':': index==-1 
? error("Missing argument for option -%c\n",options.vars()->optopt) : error("Missing argument for option --%s\n",lopts_LocateTfst[index].name); free_vector_ptr(injected); return USAGE_ERROR_CODE; case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) : error("Invalid option --%s\n",options.vars()->optarg); free_vector_ptr(injected); return USAGE_ERROR_CODE; break; } index=-1; } if (options.vars()->optind!=argc-1) { error("Invalid arguments: rerun with --help\n"); free_vector_ptr(injected); return USAGE_ERROR_CODE; } if (only_verify_arguments) { // freeing all allocated memory free_vector_ptr(injected); return SUCCESS_RETURN_CODE; } if (selected_negation_operator==0) { get_graph_compatibility_mode_by_file(&vec,&tilde_negation_operator); } char grammar[FILENAME_MAX]; char output[FILENAME_MAX]; strcpy(grammar,argv[options.vars()->optind]); get_path(text,output); strcat(output,"concord.ind"); int OK=locate_tfst(text, grammar, alphabet, output, &vec, match_policy, output_policy, ambiguous_output_policy, variable_error_policy, search_limit, is_korean, tilde_negation_operator, injected, tagging, single_tags_only, match_word_boundaries); free_vector_ptr(injected); return (!OK); }
/** * The same than main, but no call to setBufferMode. */ int main_KeyWords(int argc,char* const argv[]) { if (argc==1) { usage(); return SUCCESS_RETURN_CODE; } VersatileEncodingConfig vec=VEC_DEFAULT; char tokens[FILENAME_MAX]; char output[FILENAME_MAX]=""; char alph[FILENAME_MAX]=""; char cdic[FILENAME_MAX]=""; unichar* code=u_strdup("XXX"); int val,index=-1; bool only_verify_arguments = false; UnitexGetOpt options; while (EOF!=(val=options.parse_long(argc,argv,optstring_KeyWords,lopts_KeyWords,&index))) { switch(val) { case 'o': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty output\n"); free(code); return USAGE_ERROR_CODE; } strcpy(output,options.vars()->optarg); break; case 'a': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty alphabet file name\n"); free(code); return USAGE_ERROR_CODE; } strcpy(alph,options.vars()->optarg); break; case 'f': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty forbidden code\n"); free(code); return USAGE_ERROR_CODE; } free(code); code=u_strdup(options.vars()->optarg); break; case 'c': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty file name\n"); free(code); return USAGE_ERROR_CODE; } strcpy(cdic,options.vars()->optarg); break; case 'V': only_verify_arguments = true; break; case 'h': usage(); free(code); return SUCCESS_RETURN_CODE; case 'k': if (options.vars()->optarg[0]=='\0') { error("Empty input_encoding argument\n"); free(code); return USAGE_ERROR_CODE; } decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg); break; case 'q': if (options.vars()->optarg[0]=='\0') { error("Empty output_encoding argument\n"); free(code); return USAGE_ERROR_CODE; } decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg); break; case ':': index==-1 ? 
error("Missing argument for option -%c\n",options.vars()->optopt) : error("Missing argument for option --%s\n",lopts_KeyWords[index].name); free(code); return USAGE_ERROR_CODE; break; case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) : error("Invalid option --%s\n",options.vars()->optarg); free(code); return USAGE_ERROR_CODE; break; } index=-1; } if (options.vars()->optind==argc || options.vars()->optind==argc-1) { error("Invalid arguments: rerun with --help\n"); free(code); return USAGE_ERROR_CODE; } if (only_verify_arguments) { // freeing all allocated memory free(code); return SUCCESS_RETURN_CODE; } Alphabet* alphabet=NULL; if (alph[0]!='\0') { alphabet=load_alphabet(&vec,alph); if (alphabet==NULL) { error("Cannot load alphabet file %s\n",alph); free(code); return DEFAULT_ERROR_CODE; } } strcpy(tokens,argv[(options.vars()->optind++)]); if (output[0]=='\0') { get_path(tokens,output); strcat(output,"keywords.txt"); } struct string_hash_ptr* keywords=load_tokens_by_freq(tokens,&vec); filter_non_letter_keywords(keywords,alphabet); if (cdic[0]!='\0') { load_compound_words(cdic,&vec,keywords); } for (;options.vars()->optind!=argc;(options.vars()->optind)++) { filter_keywords_with_dic(keywords,argv[options.vars()->optind],&vec,alphabet); } merge_case_equivalent_unknown_words(keywords,alphabet); struct string_hash* forbidden_lemmas=compute_forbidden_lemmas(keywords,code); remove_keywords_with_forbidden_lemma(keywords,forbidden_lemmas); free_string_hash(forbidden_lemmas); vector_ptr* sorted=sort_keywords(keywords); U_FILE* f_output=u_fopen(&vec,output,U_WRITE); if (f_output==NULL) { error("Cannot write in file %s\n",output); free_vector_ptr(sorted,(void(*)(void*))free_KeyWord_list); free_string_hash_ptr(keywords,(void(*)(void*))free_KeyWord_list); free_alphabet(alphabet); free(code); return DEFAULT_ERROR_CODE; } dump_keywords(sorted,f_output); u_fclose(f_output); free_vector_ptr(sorted,(void(*)(void*))free_KeyWord_list); 
free_string_hash_ptr(keywords,(void(*)(void*))free_KeyWord_list); free_alphabet(alphabet); free(code); return SUCCESS_RETURN_CODE; }
int main_Tokenize(int argc,char* const argv[]) { if (argc==1) { usage(); return 0; } char alphabet[FILENAME_MAX]=""; char token_file[FILENAME_MAX]=""; Encoding encoding_output = DEFAULT_ENCODING_OUTPUT; int bom_output = DEFAULT_BOM_OUTPUT; int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT; int val,index=-1; int mode=NORMAL; struct OptVars* vars=new_OptVars(); while (EOF!=(val=getopt_long_TS(argc,argv,optstring_Tokenize,lopts_Tokenize,&index,vars))) { switch(val) { case 'a': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty alphabet file name\n"); } strcpy(alphabet,vars->optarg); break; case 'c': mode=CHAR_BY_CHAR; break; case 'w': mode=NORMAL; break; case 't': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty token file name\n"); } strcpy(token_file,vars->optarg); break; case 'k': if (vars->optarg[0]=='\0') { fatal_error("Empty input_encoding argument\n"); } decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg); break; case 'q': if (vars->optarg[0]=='\0') { fatal_error("Empty output_encoding argument\n"); } decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg); break; case 'h': usage(); return 0; case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt); else fatal_error("Missing argument for option --%s\n",lopts_Tokenize[index].name); case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt); else fatal_error("Invalid option --%s\n",vars->optarg); break; } index=-1; } if (vars->optind!=argc-1) { fatal_error("Invalid arguments: rerun with --help\n"); } U_FILE* text; U_FILE* out; U_FILE* output; U_FILE* enter; char tokens_txt[FILENAME_MAX]; char text_cod[FILENAME_MAX]; char enter_pos[FILENAME_MAX]; Alphabet* alph=NULL; get_snt_path(argv[vars->optind],text_cod); strcat(text_cod,"text.cod"); get_snt_path(argv[vars->optind],tokens_txt); strcat(tokens_txt,"tokens.txt"); 
get_snt_path(argv[vars->optind],enter_pos); strcat(enter_pos,"enter.pos"); text=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ); if (text==NULL) { fatal_error("Cannot open text file %s\n",argv[vars->optind]); } if (alphabet[0]!='\0') { alph=load_alphabet(alphabet); if (alph==NULL) { error("Cannot load alphabet file %s\n",alphabet); u_fclose(text); return 1; } } out=u_fopen(BINARY,text_cod,U_WRITE); if (out==NULL) { error("Cannot create file %s\n",text_cod); u_fclose(text); if (alph!=NULL) { free_alphabet(alph); } return 1; } enter=u_fopen(BINARY,enter_pos,U_WRITE); if (enter==NULL) { error("Cannot create file %s\n",enter_pos); u_fclose(text); u_fclose(out); if (alph!=NULL) { free_alphabet(alph); } return 1; } vector_ptr* tokens=new_vector_ptr(4096); vector_int* n_occur=new_vector_int(4096); vector_int* n_enter_pos=new_vector_int(4096); struct hash_table* hashtable=new_hash_table((HASH_FUNCTION)hash_unichar,(EQUAL_FUNCTION)u_equal, (FREE_FUNCTION)free,NULL,(KEYCOPY_FUNCTION)keycopy); if (token_file[0]!='\0') { load_token_file(token_file,mask_encoding_compatibility_input,tokens,hashtable,n_occur); } output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE); if (output==NULL) { error("Cannot create file %s\n",tokens_txt); u_fclose(text); u_fclose(out); u_fclose(enter); if (alph!=NULL) { free_alphabet(alph); } free_hash_table(hashtable); free_vector_ptr(tokens,free); free_vector_int(n_occur); free_vector_int(n_enter_pos); return 1; } u_fprintf(output,"0000000000\n"); int SENTENCES=0; int TOKENS_TOTAL=0; int WORDS_TOTAL=0; int DIGITS_TOTAL=0; u_printf("Tokenizing text...\n"); if (mode==NORMAL) { normal_tokenization(text,out,output,alph,tokens,hashtable,n_occur,n_enter_pos, &SENTENCES,&TOKENS_TOTAL,&WORDS_TOTAL,&DIGITS_TOTAL); } else { char_by_char_tokenization(text,out,output,alph,tokens,hashtable,n_occur,n_enter_pos, &SENTENCES,&TOKENS_TOTAL,&WORDS_TOTAL,&DIGITS_TOTAL); } 
u_printf("\nDone.\n"); save_new_line_positions(enter,n_enter_pos); u_fclose(enter); u_fclose(text); u_fclose(out); u_fclose(output); write_number_of_tokens(tokens_txt,encoding_output,bom_output,tokens->nbelems); // we compute some statistics get_snt_path(argv[vars->optind],tokens_txt); strcat(tokens_txt,"stats.n"); output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE); if (output==NULL) { error("Cannot write %s\n",tokens_txt); } else { compute_statistics(output,tokens,alph,SENTENCES,TOKENS_TOTAL,WORDS_TOTAL,DIGITS_TOTAL); u_fclose(output); } // we save the tokens by frequence get_snt_path(argv[vars->optind],tokens_txt); strcat(tokens_txt,"tok_by_freq.txt"); output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE); if (output==NULL) { error("Cannot write %s\n",tokens_txt); } else { sort_and_save_by_frequence(output,tokens,n_occur); u_fclose(output); } // we save the tokens by alphabetical order get_snt_path(argv[vars->optind],tokens_txt); strcat(tokens_txt,"tok_by_alph.txt"); output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE); if (output==NULL) { error("Cannot write %s\n",tokens_txt); } else { sort_and_save_by_alph_order(output,tokens,n_occur); u_fclose(output); } free_hash_table(hashtable); free_vector_ptr(tokens,free); free_vector_int(n_occur); free_vector_int(n_enter_pos); if (alph!=NULL) { free_alphabet(alph); } free_OptVars(vars); return 0; }
/**
 * Releases the given invoker: its argument vector, each argument being
 * freed with free(), and then the structure itself. A NULL invoker is
 * silently ignored.
 */
void free_ProgramInvoker(ProgramInvoker* i) {
    if (i!=NULL) {
        free_vector_ptr(i->args,free);
        free(i);
    }
}