/**
 * Builds an Elag_Tfst_file_in for the text automaton stored in 'fname'.
 *
 * The structure is heap-allocated with malloc; an allocation failure is
 * reported through fatal_alloc_error. Its 'tfst' field receives whatever
 * open_text_automaton(vec,fname) returns (that result is not checked
 * here) and its 'language' field receives the given language description.
 */
Elag_Tfst_file_in* load_tfst_file(const VersatileEncodingConfig* vec,const char* fname,language_t* language) {
Elag_Tfst_file_in* result=(Elag_Tfst_file_in*)malloc(sizeof(*result));
if (!result) {
   fatal_alloc_error("load_tfst_file");
}
/* The two fields are independent of each other */
result->language=language;
result->tfst=open_text_automaton(vec,fname);
return result;
}
Example #2
0
File: Evamb.cpp Project: adri87/Q-A
int main_Evamb(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}


int val,index=-1;
int sentence_number=-1;
const char* outfilename=NULL;
char output_name_buffer[FILENAME_MAX]="";

Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;


struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_Evamb,lopts_Evamb,&index,vars))) {
   switch(val) {
   case 's':   {  char c_foo;
                  if (1!=sscanf(vars->optarg,"%d%c",&sentence_number,&c_foo) || sentence_number<=0) {
                    /* foo is used to check that the sentence number is not like "45gjh" */
                    fatal_error("Invalid sentence number: %s\n",vars->optarg);
                  }
                }
                break;
      case 'o': if (vars->optarg[0]=='\0') {
                   fatal_error("You must specify a non empty output file name\n");
                }
                strcpy(output_name_buffer,vars->optarg);
                outfilename=output_name_buffer;
                break;
      case 'k': if (vars->optarg[0]=='\0') {
                  fatal_error("Empty input_encoding argument\n");
                }
                decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
                break;
      case 'q': if (vars->optarg[0]=='\0') {
                  fatal_error("Empty output_encoding argument\n");
                }
                decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
                break;
   case 'h': usage(); return 0;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_Evamb[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   }
   index=-1;
}

if (vars->optind!=argc-1) {
   fatal_error("Invalid arguments: rerun with --help\n");
}

u_printf("Loading '%s'...\n",argv[vars->optind]);
Tfst* tfst=open_text_automaton(argv[vars->optind]);
if (tfst==NULL) {
   fatal_error("Unable to load '%s'\n",argv[vars->optind]);
}
if (sentence_number>tfst->N) {
   fatal_error("Invalid sentence number %d: should be in [1;%d]\n",sentence_number,tfst->N);
}
U_FILE* outfile = (outfilename == NULL) ? U_STDOUT : u_fopen_creating_versatile_encoding(encoding_output,bom_output, outfilename, U_WRITE);
if (outfile==NULL) {
    close_text_automaton(tfst);
    free_OptVars(vars);
    error("Cannot create file %s\n",outfilename);
    return 1;
}
if (sentence_number==-1) {
   /* If we have to evaluate the ambiguity rate of the whole automaton */
   double lognp_total=0.0;
   double lmoy_total=0.0;
   double maxlogamb=0.0;
   double minlogamb=(double)INT_MAX;
   /* This is the number of bad automata in the text .fst2 */
   int n_bad_automata=0;
   int maxambno=-1;
   int minambno=-1;
   for (sentence_number=1;sentence_number<=tfst->N;sentence_number++) {
      load_sentence(tfst,sentence_number);
      SingleGraph graph=tfst->automaton;
      if (graph->number_of_states==0 || graph->states[0]->outgoing_transitions==NULL) {
         n_bad_automata++;
         error("Sentence %d: empty automaton\n",sentence_number);
      } else {
         /* log(number of paths) */
         double lognp;
         /* minimum/maximum path length */
         int lmin,lmax;
         /* Approximation of the sentence length */
         double lmoy;
         /* log(ambiguity rate) */
         double logamb;
         lognp=evaluate_ambiguity(graph,&lmin,&lmax);
         lmoy=(double)(lmin+lmax)/2.0;
         logamb=lognp/lmoy;
         if (maxlogamb<logamb) {
            maxlogamb=logamb;
            maxambno=sentence_number;
         }
         if (minlogamb>logamb) {
            minlogamb=logamb;
            minambno=sentence_number;
         }
         u_printf("Sentence %d            \r",sentence_number);
         lognp_total=lognp_total+lognp;
         lmoy_total=lmoy_total+lmoy;
      }
   }
   if (n_bad_automata>=tfst->N) {
      error("No stats to print because no non-empty sentence automata were found.\n");
   } else {
      u_fprintf(outfile,"%d/%d sentence%s taken into account\n",tfst->N-n_bad_automata,tfst->N,(tfst->N>1)?"s":"");
      u_fprintf(outfile,"Average ambiguity rate=%.3f\n",exp(lognp_total/lmoy_total));
      u_fprintf(outfile,"Minimum ambiguity rate=%.3f (sentence %d)\n",exp(minlogamb),minambno);
      u_fprintf(outfile,"Maximum ambiguity rate=%.3f (sentence %d)\n",exp(maxlogamb),maxambno);
   }
} else {
   /* If we have to evaluate the ambiguity rate of a single sentence automaton */
   load_sentence(tfst,sentence_number);
   SingleGraph graph=tfst->automaton;
   if (graph->number_of_states==0) {
      error("Sentence %d: empty automaton\n",sentence_number);
   } else {
      int min;
      int max;
      double lognp=evaluate_ambiguity(graph,&min,&max);
      double lmoy=(double)(min+max)/2.0;
      u_fprintf(outfile,"Sentence %d: ambiguity rate=%.3f\n",sentence_number,exp(lognp/lmoy));
   }
}
if (outfile!=U_STDOUT)
  u_fclose(outfile);

close_text_automaton(tfst);
free_OptVars(vars);
return 0;
}
int main_RebuildTfst(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

VersatileEncodingConfig vec=VEC_DEFAULT;
int val, index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;
int save_statistics=1;
while (EOF!=(val=options.parse_long(argc,argv,optstring_RebuildTfst,lopts_RebuildTfst,&index))) {
   switch (val) {
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case 'S': save_statistics = 0;
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h':
      usage();
      return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n", options.vars()->optopt) :
                         error("Missing argument for option --%s\n", lopts_RebuildTfst[index].name);
     return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n", options.vars()->optopt) :
                         error("Invalid option --%s\n", options.vars()->optarg);
     return USAGE_ERROR_CODE;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

char input_tfst[FILENAME_MAX];
char input_tind[FILENAME_MAX];
strcpy(input_tfst,argv[options.vars()->optind]);
remove_extension(input_tfst,input_tind);
strcat(input_tind,".tind");

u_printf("Loading %s...\n",input_tfst);

Tfst* tfst = open_text_automaton(&vec,input_tfst);
if (tfst==NULL) {
   error("Unable to load %s automaton\n",input_tfst);
   return DEFAULT_ERROR_CODE;
}

char basedir[FILENAME_MAX];
get_path(input_tfst,basedir);
char output_tfst[FILENAME_MAX];
sprintf(output_tfst, "%s.new.tfst",input_tfst);
char output_tind[FILENAME_MAX];
sprintf(output_tind, "%s.new.tind",input_tfst);

U_FILE* f_tfst;
if ((f_tfst = u_fopen(&vec,output_tfst,U_WRITE)) == NULL) {
   error("Unable to open %s for writing\n", output_tfst);
   close_text_automaton(tfst);
   return DEFAULT_ERROR_CODE;
}

U_FILE* f_tind;
if ((f_tind = u_fopen(BINARY,output_tind,U_WRITE)) == NULL) {
   u_fclose(f_tfst);
   close_text_automaton(tfst);
   error("Unable to open %s for writing\n", output_tind);
   return DEFAULT_ERROR_CODE;
}
/* We use this hash table to rebuild files tfst_tags_by_freq/alph.txt */
struct hash_table* form_frequencies=new_hash_table((HASH_FUNCTION)hash_unichar,(EQUAL_FUNCTION)u_equal,
        (FREE_FUNCTION)free,NULL,(KEYCOPY_FUNCTION)keycopy);

u_fprintf(f_tfst,"%010d\n",tfst->N);
for (int i = 1; i <= tfst->N; i++) {
   if ((i % 100) == 0) {
      u_printf("%d/%d sentences rebuilt...\n", i, tfst->N);
   }
   load_sentence(tfst,i);

   char grfname[FILENAME_MAX];
   sprintf(grfname, "%ssentence%d.grf", basedir, i);
   unichar** tags=NULL;
   int n_tags=-1;
   if (fexists(grfname)) {
      /* If there is a .grf for the current sentence, then we must
       * take it into account */
      if (0==pseudo_main_Grf2Fst2(&vec,grfname,0,NULL,1,1,NULL,NULL,0)) {
         /* We proceed only if the graph compilation was a success */
         char fst2name[FILENAME_MAX];
         sprintf(fst2name, "%ssentence%d.fst2", basedir, i);
         struct FST2_free_info fst2_free;
         Fst2* fst2=load_abstract_fst2(&vec,fst2name,0,&fst2_free);
         af_remove(fst2name);
         free_SingleGraph(tfst->automaton,NULL);
         tfst->automaton=create_copy_of_fst2_subgraph(fst2,1);
         tags=create_tfst_tags(fst2,&n_tags);
         free_abstract_Fst2(fst2,&fst2_free);
      } else {
         error("Error: %s is not a valid sentence automaton\n",grfname);
      }
   }
   save_current_sentence(tfst,f_tfst,f_tind,tags,n_tags,form_frequencies);
   if (tags!=NULL) {
      /* If necessary, we free the tags we created */
      for (int count_tags=0;count_tags<n_tags;count_tags++) {
         free(tags[count_tags]);
      }
      free(tags);
   }
}

u_printf("Text automaton rebuilt.\n");

u_fclose(f_tind);
u_fclose(f_tfst);
close_text_automaton(tfst);

/* Finally, we save statistics */
if (save_statistics) {
    char tfst_tags_by_freq[FILENAME_MAX];
    char tfst_tags_by_alph[FILENAME_MAX];
    strcpy(tfst_tags_by_freq, basedir);
    strcat(tfst_tags_by_freq, "tfst_tags_by_freq.txt");
    strcpy(tfst_tags_by_alph, basedir);
    strcat(tfst_tags_by_alph, "tfst_tags_by_alph.txt");
    U_FILE* f_tfst_tags_by_freq = u_fopen(&vec, tfst_tags_by_freq, U_WRITE);
    if (f_tfst_tags_by_freq == NULL) {
        error("Cannot open %s\n", tfst_tags_by_freq);
    }
    U_FILE* f_tfst_tags_by_alph = u_fopen(&vec, tfst_tags_by_alph, U_WRITE);
    if (f_tfst_tags_by_alph == NULL) {
        error("Cannot open %s\n", tfst_tags_by_alph);
    }
    sort_and_save_tfst_stats(form_frequencies, f_tfst_tags_by_freq, f_tfst_tags_by_alph);
    u_fclose(f_tfst_tags_by_freq);
    u_fclose(f_tfst_tags_by_alph);
}
free_hash_table(form_frequencies);

/* make a backup and replace old automaton with new */
char backup_tfst[FILENAME_MAX];
char backup_tind[FILENAME_MAX];
sprintf(backup_tfst,"%s.bck",input_tfst);
sprintf(backup_tind,"%s.bck",input_tind);
/* We remove the existing backup files, if any */
af_remove(backup_tfst);
af_remove(backup_tind);
af_rename(input_tfst,backup_tfst);
af_rename(input_tind,backup_tind);
af_rename(output_tfst,input_tfst);
af_rename(output_tind,input_tind);
u_printf("\nYou can find a backup of the original files in:\n    %s\nand %s\n",
         backup_tfst,backup_tind);

return SUCCESS_RETURN_CODE;
}