/**
 * Builds an Elag_Tfst_file_in for the .tfst file 'fname'.
 *
 * The text automaton is opened with 'open_text_automaton' using the encoding
 * configuration 'vec', and the given language description is stored next to
 * it. The 'language' pointer is stored as is, not copied.
 *
 * A failed allocation is reported through 'fatal_alloc_error'. The result of
 * 'open_text_automaton' is stored without being checked here.
 */
Elag_Tfst_file_in* load_tfst_file(const VersatileEncodingConfig* vec,const char* fname,language_t* language) {
    Elag_Tfst_file_in* result=(Elag_Tfst_file_in*)malloc(sizeof(*result));
    if (!result) {
        fatal_alloc_error("load_tfst_file");
    }
    result->language=language;
    result->tfst=open_text_automaton(vec,fname);
    return result;
}
int main_Evamb(int argc,char* const argv[]) { if (argc==1) { usage(); return 0; } int val,index=-1; int sentence_number=-1; const char* outfilename=NULL; char output_name_buffer[FILENAME_MAX]=""; Encoding encoding_output = DEFAULT_ENCODING_OUTPUT; int bom_output = DEFAULT_BOM_OUTPUT; int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT; struct OptVars* vars=new_OptVars(); while (EOF!=(val=getopt_long_TS(argc,argv,optstring_Evamb,lopts_Evamb,&index,vars))) { switch(val) { case 's': { char c_foo; if (1!=sscanf(vars->optarg,"%d%c",&sentence_number,&c_foo) || sentence_number<=0) { /* foo is used to check that the sentence number is not like "45gjh" */ fatal_error("Invalid sentence number: %s\n",vars->optarg); } } break; case 'o': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty output file name\n"); } strcpy(output_name_buffer,vars->optarg); outfilename=output_name_buffer; break; case 'k': if (vars->optarg[0]=='\0') { fatal_error("Empty input_encoding argument\n"); } decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg); break; case 'q': if (vars->optarg[0]=='\0') { fatal_error("Empty output_encoding argument\n"); } decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg); break; case 'h': usage(); return 0; case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt); else fatal_error("Missing argument for option --%s\n",lopts_Evamb[index].name); case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt); else fatal_error("Invalid option --%s\n",vars->optarg); break; } index=-1; } if (vars->optind!=argc-1) { fatal_error("Invalid arguments: rerun with --help\n"); } u_printf("Loading '%s'...\n",argv[vars->optind]); Tfst* tfst=open_text_automaton(argv[vars->optind]); if (tfst==NULL) { fatal_error("Unable to load '%s'\n",argv[vars->optind]); } if (sentence_number>tfst->N) { fatal_error("Invalid sentence number %d: should be in 
[1;%d]\n",sentence_number,tfst->N); } U_FILE* outfile = (outfilename == NULL) ? U_STDOUT : u_fopen_creating_versatile_encoding(encoding_output,bom_output, outfilename, U_WRITE); if (outfile==NULL) { close_text_automaton(tfst); free_OptVars(vars); error("Cannot create file %s\n",outfilename); return 1; } if (sentence_number==-1) { /* If we have to evaluate the ambiguity rate of the whole automaton */ double lognp_total=0.0; double lmoy_total=0.0; double maxlogamb=0.0; double minlogamb=(double)INT_MAX; /* This is the number of bad automata in the text .fst2 */ int n_bad_automata=0; int maxambno=-1; int minambno=-1; for (sentence_number=1;sentence_number<=tfst->N;sentence_number++) { load_sentence(tfst,sentence_number); SingleGraph graph=tfst->automaton; if (graph->number_of_states==0 || graph->states[0]->outgoing_transitions==NULL) { n_bad_automata++; error("Sentence %d: empty automaton\n",sentence_number); } else { /* log(number of paths) */ double lognp; /* minimum/maximum path length */ int lmin,lmax; /* Approximation of the sentence length */ double lmoy; /* log(ambiguity rate) */ double logamb; lognp=evaluate_ambiguity(graph,&lmin,&lmax); lmoy=(double)(lmin+lmax)/2.0; logamb=lognp/lmoy; if (maxlogamb<logamb) { maxlogamb=logamb; maxambno=sentence_number; } if (minlogamb>logamb) { minlogamb=logamb; minambno=sentence_number; } u_printf("Sentence %d \r",sentence_number); lognp_total=lognp_total+lognp; lmoy_total=lmoy_total+lmoy; } } if (n_bad_automata>=tfst->N) { error("No stats to print because no non-empty sentence automata were found.\n"); } else { u_fprintf(outfile,"%d/%d sentence%s taken into account\n",tfst->N-n_bad_automata,tfst->N,(tfst->N>1)?"s":""); u_fprintf(outfile,"Average ambiguity rate=%.3f\n",exp(lognp_total/lmoy_total)); u_fprintf(outfile,"Minimum ambiguity rate=%.3f (sentence %d)\n",exp(minlogamb),minambno); u_fprintf(outfile,"Maximum ambiguity rate=%.3f (sentence %d)\n",exp(maxlogamb),maxambno); } } else { /* If we have to evaluate the ambiguity 
rate of a single sentence automaton */ load_sentence(tfst,sentence_number); SingleGraph graph=tfst->automaton; if (graph->number_of_states==0) { error("Sentence %d: empty automaton\n",sentence_number); } else { int min; int max; double lognp=evaluate_ambiguity(graph,&min,&max); double lmoy=(double)(min+max)/2.0; u_fprintf(outfile,"Sentence %d: ambiguity rate=%.3f\n",sentence_number,exp(lognp/lmoy)); } } if (outfile!=U_STDOUT) u_fclose(outfile); close_text_automaton(tfst); free_OptVars(vars); return 0; }
int main_RebuildTfst(int argc,char* const argv[]) { if (argc==1) { usage(); return SUCCESS_RETURN_CODE; } VersatileEncodingConfig vec=VEC_DEFAULT; int val, index=-1; bool only_verify_arguments = false; UnitexGetOpt options; int save_statistics=1; while (EOF!=(val=options.parse_long(argc,argv,optstring_RebuildTfst,lopts_RebuildTfst,&index))) { switch (val) { case 'k': if (options.vars()->optarg[0]=='\0') { error("Empty input_encoding argument\n"); return USAGE_ERROR_CODE; } decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg); break; case 'q': if (options.vars()->optarg[0]=='\0') { error("Empty output_encoding argument\n"); return USAGE_ERROR_CODE; } decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg); break; case 'S': save_statistics = 0; break; case 'V': only_verify_arguments = true; break; case 'h': usage(); return SUCCESS_RETURN_CODE; case ':': index==-1 ? error("Missing argument for option -%c\n", options.vars()->optopt) : error("Missing argument for option --%s\n", lopts_RebuildTfst[index].name); return USAGE_ERROR_CODE; case '?': index==-1 ? 
error("Invalid option -%c\n", options.vars()->optopt) : error("Invalid option --%s\n", options.vars()->optarg); return USAGE_ERROR_CODE; } index=-1; } if (options.vars()->optind!=argc-1) { error("Invalid arguments: rerun with --help\n"); return USAGE_ERROR_CODE; } if (only_verify_arguments) { // freeing all allocated memory return SUCCESS_RETURN_CODE; } char input_tfst[FILENAME_MAX]; char input_tind[FILENAME_MAX]; strcpy(input_tfst,argv[options.vars()->optind]); remove_extension(input_tfst,input_tind); strcat(input_tind,".tind"); u_printf("Loading %s...\n",input_tfst); Tfst* tfst = open_text_automaton(&vec,input_tfst); if (tfst==NULL) { error("Unable to load %s automaton\n",input_tfst); return DEFAULT_ERROR_CODE; } char basedir[FILENAME_MAX]; get_path(input_tfst,basedir); char output_tfst[FILENAME_MAX]; sprintf(output_tfst, "%s.new.tfst",input_tfst); char output_tind[FILENAME_MAX]; sprintf(output_tind, "%s.new.tind",input_tfst); U_FILE* f_tfst; if ((f_tfst = u_fopen(&vec,output_tfst,U_WRITE)) == NULL) { error("Unable to open %s for writing\n", output_tfst); close_text_automaton(tfst); return DEFAULT_ERROR_CODE; } U_FILE* f_tind; if ((f_tind = u_fopen(BINARY,output_tind,U_WRITE)) == NULL) { u_fclose(f_tfst); close_text_automaton(tfst); error("Unable to open %s for writing\n", output_tind); return DEFAULT_ERROR_CODE; } /* We use this hash table to rebuild files tfst_tags_by_freq/alph.txt */ struct hash_table* form_frequencies=new_hash_table((HASH_FUNCTION)hash_unichar,(EQUAL_FUNCTION)u_equal, (FREE_FUNCTION)free,NULL,(KEYCOPY_FUNCTION)keycopy); u_fprintf(f_tfst,"%010d\n",tfst->N); for (int i = 1; i <= tfst->N; i++) { if ((i % 100) == 0) { u_printf("%d/%d sentences rebuilt...\n", i, tfst->N); } load_sentence(tfst,i); char grfname[FILENAME_MAX]; sprintf(grfname, "%ssentence%d.grf", basedir, i); unichar** tags=NULL; int n_tags=-1; if (fexists(grfname)) { /* If there is a .grf for the current sentence, then we must * take it into account */ if 
(0==pseudo_main_Grf2Fst2(&vec,grfname,0,NULL,1,1,NULL,NULL,0)) { /* We proceed only if the graph compilation was a success */ char fst2name[FILENAME_MAX]; sprintf(fst2name, "%ssentence%d.fst2", basedir, i); struct FST2_free_info fst2_free; Fst2* fst2=load_abstract_fst2(&vec,fst2name,0,&fst2_free); af_remove(fst2name); free_SingleGraph(tfst->automaton,NULL); tfst->automaton=create_copy_of_fst2_subgraph(fst2,1); tags=create_tfst_tags(fst2,&n_tags); free_abstract_Fst2(fst2,&fst2_free); } else { error("Error: %s is not a valid sentence automaton\n",grfname); } } save_current_sentence(tfst,f_tfst,f_tind,tags,n_tags,form_frequencies); if (tags!=NULL) { /* If necessary, we free the tags we created */ for (int count_tags=0;count_tags<n_tags;count_tags++) { free(tags[count_tags]); } free(tags); } } u_printf("Text automaton rebuilt.\n"); u_fclose(f_tind); u_fclose(f_tfst); close_text_automaton(tfst); /* Finally, we save statistics */ if (save_statistics) { char tfst_tags_by_freq[FILENAME_MAX]; char tfst_tags_by_alph[FILENAME_MAX]; strcpy(tfst_tags_by_freq, basedir); strcat(tfst_tags_by_freq, "tfst_tags_by_freq.txt"); strcpy(tfst_tags_by_alph, basedir); strcat(tfst_tags_by_alph, "tfst_tags_by_alph.txt"); U_FILE* f_tfst_tags_by_freq = u_fopen(&vec, tfst_tags_by_freq, U_WRITE); if (f_tfst_tags_by_freq == NULL) { error("Cannot open %s\n", tfst_tags_by_freq); } U_FILE* f_tfst_tags_by_alph = u_fopen(&vec, tfst_tags_by_alph, U_WRITE); if (f_tfst_tags_by_alph == NULL) { error("Cannot open %s\n", tfst_tags_by_alph); } sort_and_save_tfst_stats(form_frequencies, f_tfst_tags_by_freq, f_tfst_tags_by_alph); u_fclose(f_tfst_tags_by_freq); u_fclose(f_tfst_tags_by_alph); } free_hash_table(form_frequencies); /* make a backup and replace old automaton with new */ char backup_tfst[FILENAME_MAX]; char backup_tind[FILENAME_MAX]; sprintf(backup_tfst,"%s.bck",input_tfst); sprintf(backup_tind,"%s.bck",input_tind); /* We remove the existing backup files, if any */ af_remove(backup_tfst); 
af_remove(backup_tind); af_rename(input_tfst,backup_tfst); af_rename(input_tind,backup_tind); af_rename(output_tfst,input_tfst); af_rename(output_tind,input_tind); u_printf("\nYou can find a backup of the original files in:\n %s\nand %s\n", backup_tfst,backup_tind); return SUCCESS_RETURN_CODE; }