/**
 * Releases the given rule together with what it holds: its name, the
 * right and left graphs of each of its contexts, the context array,
 * its automaton and finally the rule structure itself.
 * Nothing is done if 'rule' is NULL.
 */
void free_elRule(elRule* rule) {
if (rule==NULL) {
   return;
}
if (rule->name!=NULL) {
   free(rule->name);
}
int n=0;
while (n<rule->nbContexts) {
   /* For each context, the right graph is released before the left one */
   free_SingleGraph(rule->contexts[n].right,free_symbol);
   free_SingleGraph(rule->contexts[n].left,free_symbol);
   n++;
}
free(rule->contexts);
free_Fst2Automaton(rule->automaton,free_symbol);
free(rule);
}
Пример #2
0
/**
 * Releases a GrfCheckInfo structure: the graphs_matching_E array, every
 * condition graph stored in slots 1 to fst2->number_of_graphs, the
 * condition graph array and the structure itself.
 * The fst2 field is left untouched, since we did not create it.
 * Nothing is done if 'info' is NULL.
 */
static void free_GrfCheckInfo(GrfCheckInfo* info) {
if (info==NULL) return;
free(info->graphs_matching_E);
if (info->condition_graphs==NULL) {
	/* No condition graphs were built: only the structure remains */
	free(info);
	return;
}
/* Slot 0 is never visited: the loop starts at 1 */
for (int g=1;g<=info->fst2->number_of_graphs;g++) {
	free_SingleGraph(info->condition_graphs[g],NULL);
}
free(info->condition_graphs);
free(info);
}
/**
 * This function reads a file that contains a list of Elag grammar names,
 * and it compiles them into the file 'outname'. However, if the result
 * automaton is too big, it will be saved in several automata inside
 * the output file.
 */
int compile_elag_rules(char* rulesname,char* outname, const VersatileEncodingConfig* vec,language_t* language) {
u_printf("Compilation of %s\n",rulesname);
U_FILE* f=NULL;
U_FILE* frules=u_fopen(ASCII,rulesname,U_READ);
if (frules==NULL) {
   fatal_error("Cannot open file '%s'\n",rulesname);
}
U_FILE* out=u_fopen(ASCII,outname,U_WRITE);
if (out==NULL) {
   fatal_error("cannot open file '%s'\n",outname);
}
/* Name of the file that contains the result automaton */
char fstoutname[FILENAME_MAX];
int nbRules=0;
char buf[FILENAME_MAX];
time_t start_time=time(0);
Fst2Automaton* res=NULL;
Fst2Automaton* A;
int fst_number=0;
Ustring* ustr=new_Ustring();

char buf2[FILENAME_MAX];
char directory[FILENAME_MAX];
get_path(rulesname,directory);

while (af_fgets(buf,FILENAME_MAX,frules->f)) {
   /* We read one by one the Elag grammar names in the .lst file */
   chomp(buf);
   if (*buf=='\0') {
      /* If we have an empty line */
      continue;
   }
   if (!is_absolute_path(buf)) {
      strcpy(buf2,buf);
      sprintf(buf,"%s%s",directory,buf2);
   }

   u_printf("\n%s...\n",buf);
   remove_extension(buf);
   strcat(buf,".elg");
   if ((f=u_fopen(ASCII,buf,U_READ))==NULL) {
      /* If the .elg file doesn't exist, we create one */
      remove_extension(buf);
      u_printf("Precompiling %s.fst2\n",buf);
      strcat(buf,".fst2");
      elRule* rule=new_elRule(buf,vec,language);
      if (rule==NULL) {
         fatal_error("Unable to read grammar '%s'\n",buf);
      }
      if ((A=compile_elag_rule(rule,language))==NULL) {
         fatal_error("Unable to compile rule '%s'\n",buf);
      }
      free_elRule(rule);
   } else {
      /* If there is already .elg, we use it */
      u_fclose(f);
      A=load_elag_grammar_automaton(vec,buf,language);
      if (A==NULL) {
         fatal_error("Unable to load '%s'\n",buf);
      }
   }
   if (A->automaton->number_of_states==0) {
      error("Grammar %s forbids everything!\n",buf);
   }
   if (res!=NULL) {
      /* If there is already an automaton, we intersect it with the new one */
      SingleGraph tmp=res->automaton;
      res->automaton=elag_intersection(language,tmp,A->automaton,GRAMMAR_GRAMMAR);
      free_SingleGraph(tmp,NULL);
      free_Fst2Automaton(A,NULL);
      trim(res->automaton,NULL);
   } else {
      res=A;
   }
   nbRules++;
   if (res->automaton->number_of_states>MAX_GRAM_SIZE) {
      /* If the automaton is too large, we will split the grammar
       * into several automata */
      elag_minimize(res->automaton,1);
      sprintf(fstoutname,"%s-%d.elg",outname,fst_number++);
      u_fprintf(out,"<%s>\n",fstoutname);
      u_printf("Splitting big grammar in '%s' (%d states)\n",fstoutname,res->automaton->number_of_states);
      u_sprintf(ustr,"%s: compiled elag grammar",fstoutname);
      free(res->name);
      res->name=u_strdup(ustr->str);
      save_automaton(res,fstoutname,vec,FST_GRAMMAR);
      free_Fst2Automaton(res,NULL);
      res=NULL;
   }
}
if (res!=NULL) {
   /* We save the last automaton, if any */
   sprintf(fstoutname,"%s-%d.elg",outname,fst_number++);
   u_fprintf(out,"<%s>\n",fstoutname);
   u_printf("Saving grammar in '%s'(%d states)\n",fstoutname,res->automaton->number_of_states);
   elag_minimize(res->automaton,1);
   u_sprintf(ustr,"%s: compiled elag grammar",fstoutname);
   free(res->name);
   res->name=u_strdup(ustr->str);
   save_automaton(res,fstoutname,vec,FST_GRAMMAR);
   free_Fst2Automaton(res,free_symbol);
}
time_t end_time=time(0);
u_fclose(frules);
u_fclose(out);
free_Ustring(ustr);
u_printf("\nDone.\nElapsed time: %.0f s\n",difftime(end_time,start_time));
u_printf("\n%d rule%s from %s compiled in %s (%d automat%s)\n",
         nbRules,(nbRules>1)?"s":"",rulesname,outname,fst_number,
         (fst_number>1)?"a":"on");
return 0;
}
Пример #4
0
int main_RebuildTfst(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

VersatileEncodingConfig vec=VEC_DEFAULT;
int val, index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;
int save_statistics=1;
while (EOF!=(val=options.parse_long(argc,argv,optstring_RebuildTfst,lopts_RebuildTfst,&index))) {
   switch (val) {
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case 'S': save_statistics = 0;
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h':
      usage();
      return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n", options.vars()->optopt) :
                         error("Missing argument for option --%s\n", lopts_RebuildTfst[index].name);
     return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n", options.vars()->optopt) :
                         error("Invalid option --%s\n", options.vars()->optarg);
     return USAGE_ERROR_CODE;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

char input_tfst[FILENAME_MAX];
char input_tind[FILENAME_MAX];
strcpy(input_tfst,argv[options.vars()->optind]);
remove_extension(input_tfst,input_tind);
strcat(input_tind,".tind");

u_printf("Loading %s...\n",input_tfst);

Tfst* tfst = open_text_automaton(&vec,input_tfst);
if (tfst==NULL) {
   error("Unable to load %s automaton\n",input_tfst);
   return DEFAULT_ERROR_CODE;
}

char basedir[FILENAME_MAX];
get_path(input_tfst,basedir);
char output_tfst[FILENAME_MAX];
sprintf(output_tfst, "%s.new.tfst",input_tfst);
char output_tind[FILENAME_MAX];
sprintf(output_tind, "%s.new.tind",input_tfst);

U_FILE* f_tfst;
if ((f_tfst = u_fopen(&vec,output_tfst,U_WRITE)) == NULL) {
   error("Unable to open %s for writing\n", output_tfst);
   close_text_automaton(tfst);
   return DEFAULT_ERROR_CODE;
}

U_FILE* f_tind;
if ((f_tind = u_fopen(BINARY,output_tind,U_WRITE)) == NULL) {
   u_fclose(f_tfst);
   close_text_automaton(tfst);
   error("Unable to open %s for writing\n", output_tind);
   return DEFAULT_ERROR_CODE;
}
/* We use this hash table to rebuild files tfst_tags_by_freq/alph.txt */
struct hash_table* form_frequencies=new_hash_table((HASH_FUNCTION)hash_unichar,(EQUAL_FUNCTION)u_equal,
        (FREE_FUNCTION)free,NULL,(KEYCOPY_FUNCTION)keycopy);

u_fprintf(f_tfst,"%010d\n",tfst->N);
for (int i = 1; i <= tfst->N; i++) {
   if ((i % 100) == 0) {
      u_printf("%d/%d sentences rebuilt...\n", i, tfst->N);
   }
   load_sentence(tfst,i);

   char grfname[FILENAME_MAX];
   sprintf(grfname, "%ssentence%d.grf", basedir, i);
   unichar** tags=NULL;
   int n_tags=-1;
   if (fexists(grfname)) {
      /* If there is a .grf for the current sentence, then we must
       * take it into account */
      if (0==pseudo_main_Grf2Fst2(&vec,grfname,0,NULL,1,1,NULL,NULL,0)) {
         /* We proceed only if the graph compilation was a success */
         char fst2name[FILENAME_MAX];
         sprintf(fst2name, "%ssentence%d.fst2", basedir, i);
         struct FST2_free_info fst2_free;
         Fst2* fst2=load_abstract_fst2(&vec,fst2name,0,&fst2_free);
         af_remove(fst2name);
         free_SingleGraph(tfst->automaton,NULL);
         tfst->automaton=create_copy_of_fst2_subgraph(fst2,1);
         tags=create_tfst_tags(fst2,&n_tags);
         free_abstract_Fst2(fst2,&fst2_free);
      } else {
         error("Error: %s is not a valid sentence automaton\n",grfname);
      }
   }
   save_current_sentence(tfst,f_tfst,f_tind,tags,n_tags,form_frequencies);
   if (tags!=NULL) {
      /* If necessary, we free the tags we created */
      for (int count_tags=0;count_tags<n_tags;count_tags++) {
         free(tags[count_tags]);
      }
      free(tags);
   }
}

u_printf("Text automaton rebuilt.\n");

u_fclose(f_tind);
u_fclose(f_tfst);
close_text_automaton(tfst);

/* Finally, we save statistics */
if (save_statistics) {
    char tfst_tags_by_freq[FILENAME_MAX];
    char tfst_tags_by_alph[FILENAME_MAX];
    strcpy(tfst_tags_by_freq, basedir);
    strcat(tfst_tags_by_freq, "tfst_tags_by_freq.txt");
    strcpy(tfst_tags_by_alph, basedir);
    strcat(tfst_tags_by_alph, "tfst_tags_by_alph.txt");
    U_FILE* f_tfst_tags_by_freq = u_fopen(&vec, tfst_tags_by_freq, U_WRITE);
    if (f_tfst_tags_by_freq == NULL) {
        error("Cannot open %s\n", tfst_tags_by_freq);
    }
    U_FILE* f_tfst_tags_by_alph = u_fopen(&vec, tfst_tags_by_alph, U_WRITE);
    if (f_tfst_tags_by_alph == NULL) {
        error("Cannot open %s\n", tfst_tags_by_alph);
    }
    sort_and_save_tfst_stats(form_frequencies, f_tfst_tags_by_freq, f_tfst_tags_by_alph);
    u_fclose(f_tfst_tags_by_freq);
    u_fclose(f_tfst_tags_by_alph);
}
free_hash_table(form_frequencies);

/* make a backup and replace old automaton with new */
char backup_tfst[FILENAME_MAX];
char backup_tind[FILENAME_MAX];
sprintf(backup_tfst,"%s.bck",input_tfst);
sprintf(backup_tind,"%s.bck",input_tind);
/* We remove the existing backup files, if any */
af_remove(backup_tfst);
af_remove(backup_tind);
af_rename(input_tfst,backup_tfst);
af_rename(input_tind,backup_tind);
af_rename(output_tfst,input_tfst);
af_rename(output_tind,input_tind);
u_printf("\nYou can find a backup of the original files in:\n    %s\nand %s\n",
         backup_tfst,backup_tind);

return SUCCESS_RETURN_CODE;
}
Пример #5
0
/**
 * Replaces every condition graph of 'chk', for graph numbers 1 to
 * fst2->number_of_graphs: the current graph is released with
 * free_SingleGraph, then the slot receives the result of
 * create_condition_graph for the same graph number.
 */
static void rebuild_condition_graphs(GrfCheckInfo* chk) {
int graph=1;
while (graph<=chk->fst2->number_of_graphs) {
	free_SingleGraph(chk->condition_graphs[graph],NULL);
	chk->condition_graphs[graph]=create_condition_graph(chk->fst2,graph,0);
	graph++;
}
}
Пример #6
0
/**
 * Releases the given automaton: its name, its graph and the structure
 * itself. The 'free_elag_symbol' callback is handed unchanged to
 * free_SingleGraph; presumably the symbols are only released when a
 * non-NULL callback is given (to be confirmed against free_SingleGraph).
 * Nothing is done if 'A' is NULL.
 */
void free_Fst2Automaton(Fst2Automaton* A,void (*free_elag_symbol)(symbol_t*)) {
if (A!=NULL) {
   if (A->name!=NULL) {
      free(A->name);
   }
   free_SingleGraph(A->automaton,free_elag_symbol);
   free(A);
}
}