/**
 * Returns 1 if the given .fst2 is OK to be used by the Locate program; 0 otherwise.
 * Conditions are:
 *
 * 1) no left recursion
 * 2) no loop that can recognize the empty word (<E> with an output or subgraph
 *    that can match the empty word).
 */
int OK_for_Locate_write_error(const VersatileEncodingConfig* vec,const char* name,char no_empty_graph_warning,U_FILE* ferr) {
int RESULT=1;
struct FST2_free_info fst2_free;
Fst2* fst2=load_abstract_fst2(vec,name,1,&fst2_free);
if (fst2==NULL) {
	fatal_error("Cannot load graph %s\n",name);
}
u_printf("Creating condition sets...\n");
GrfCheckInfo* chk=new_GrfCheckInfo(fst2);
/* Now, we look for a fix point in the condition graphs */
struct list_int* list=NULL;
/* To do that, we start by creating a list of all the graphs we are sure about */
int unknown=0;
for (int i=1;i<fst2->number_of_graphs+1;i++) {
	if (chk->graphs_matching_E[i]!=CHK_DONT_KNOW) {
		list=new_list_int(i,list);
	} else {
		unknown++;
	}
}
/* While there is something to do for E matching */
u_printf("Checking empty word matching...\n");
while (resolve_all_conditions(chk,&list,&unknown)) {}
if (chk->graphs_matching_E[1]==CHK_MATCHES_E) {
	if (!no_empty_graph_warning) {
       error("ERROR: the main graph %S recognizes <E>\n",fst2->graph_names[1]);
       if (ferr!=NULL) {
    	   u_fprintf(ferr,"ERROR: the main graph %S recognizes <E>\n",fst2->graph_names[1]);
	   }
	}
	goto evil_goto;
}
if (!no_empty_graph_warning) {
	for (int i=2;i<fst2->number_of_graphs+1;i++) {
		if (chk->graphs_matching_E[i]==CHK_MATCHES_E) {
			error("WARNING: the graph %S recognizes <E>\n",fst2->graph_names[i]);
			if (ferr!=NULL) {
				u_fprintf(ferr,"WARNING: the graph %S recognizes <E>\n",fst2->graph_names[i]);
			}
		}
	}
}
/* Now, we look for E loops and left recursions. And to do that, we need a new version
 * of the condition graphs, because a graph that does not match E would have been emptied.
 * And obviously, we can not deduce anything from an empty graph. */
rebuild_condition_graphs(chk);
u_printf("Checking E loops...\n");
if (is_any_E_loop(chk)) {
	/* Error messages have already been printed */
	goto evil_goto;
}
u_printf("Checking left recursions...\n");
if (is_any_left_recursion(chk)) {
	/* Error messages have already been printed */
	goto evil_goto;
}
evil_goto:
/* There may be something unused in the list that we need to free */
free_list_int(list);
free_GrfCheckInfo(chk);
free_abstract_Fst2(fst2,&fst2_free);
return RESULT;
}
示例#2
0
int main_RebuildTfst(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

VersatileEncodingConfig vec=VEC_DEFAULT;
int val, index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_RebuildTfst,lopts_RebuildTfst,&index))) {
   switch (val) {
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;             
   case 'h':
      usage();
      return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n", options.vars()->optopt) :
                         error("Missing argument for option --%s\n", lopts_RebuildTfst[index].name);
     return USAGE_ERROR_CODE;                    
   case '?': index==-1 ? error("Invalid option -%c\n", options.vars()->optopt) :
                         error("Invalid option --%s\n", options.vars()->optarg);
     return USAGE_ERROR_CODE;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

char input_tfst[FILENAME_MAX];
char input_tind[FILENAME_MAX];
strcpy(input_tfst,argv[options.vars()->optind]);
remove_extension(input_tfst,input_tind);
strcat(input_tind,".tind");

u_printf("Loading %s...\n",input_tfst);

Tfst* tfst = open_text_automaton(&vec,input_tfst);
if (tfst==NULL) {
   error("Unable to load %s automaton\n",input_tfst);
   return DEFAULT_ERROR_CODE;
}

char basedir[FILENAME_MAX];
get_path(input_tfst,basedir);
char output_tfst[FILENAME_MAX];
sprintf(output_tfst, "%s.new.tfst",input_tfst);
char output_tind[FILENAME_MAX];
sprintf(output_tind, "%s.new.tind",input_tfst);

U_FILE* f_tfst;
if ((f_tfst = u_fopen(&vec,output_tfst,U_WRITE)) == NULL) {
   error("Unable to open %s for writing\n", output_tfst);
   close_text_automaton(tfst);
   return DEFAULT_ERROR_CODE;
}

U_FILE* f_tind;
if ((f_tind = u_fopen(BINARY,output_tind,U_WRITE)) == NULL) {
   u_fclose(f_tfst);
   close_text_automaton(tfst);
   error("Unable to open %s for writing\n", output_tind);
   return DEFAULT_ERROR_CODE;
}
/* We use this hash table to rebuild files tfst_tags_by_freq/alph.txt */
struct hash_table* form_frequencies=new_hash_table((HASH_FUNCTION)hash_unichar,(EQUAL_FUNCTION)u_equal,
        (FREE_FUNCTION)free,NULL,(KEYCOPY_FUNCTION)keycopy);

u_fprintf(f_tfst,"%010d\n",tfst->N);
for (int i = 1; i <= tfst->N; i++) {
   if ((i % 100) == 0) {
      u_printf("%d/%d sentences rebuilt...\n", i, tfst->N);
   }
   load_sentence(tfst,i);

   char grfname[FILENAME_MAX];
   sprintf(grfname, "%ssentence%d.grf", basedir, i);
   unichar** tags=NULL;
   int n_tags=-1;
   if (fexists(grfname)) {
      /* If there is a .grf for the current sentence, then we must
       * take it into account */
      if (0==pseudo_main_Grf2Fst2(&vec,grfname,0,NULL,1,1,NULL,NULL,0)) {
         /* We proceed only if the graph compilation was a success */
         char fst2name[FILENAME_MAX];
         sprintf(fst2name, "%ssentence%d.fst2", basedir, i);
         struct FST2_free_info fst2_free;
         Fst2* fst2=load_abstract_fst2(&vec,fst2name,0,&fst2_free);
         af_remove(fst2name);
         free_SingleGraph(tfst->automaton,NULL);
         tfst->automaton=create_copy_of_fst2_subgraph(fst2,1);
         tags=create_tfst_tags(fst2,&n_tags);
         free_abstract_Fst2(fst2,&fst2_free);
      } else {
         error("Error: %s is not a valid sentence automaton\n",grfname);
      }
   }
   save_current_sentence(tfst,f_tfst,f_tind,tags,n_tags,form_frequencies);
   if (tags!=NULL) {
      /* If necessary, we free the tags we created */
      for (int count_tags=0;count_tags<n_tags;count_tags++) {
         free(tags[count_tags]);
      }
      free(tags);
   }
}

u_printf("Text automaton rebuilt.\n");

u_fclose(f_tind);
u_fclose(f_tfst);
close_text_automaton(tfst);

/* Finally, we save statistics */
char tfst_tags_by_freq[FILENAME_MAX];
char tfst_tags_by_alph[FILENAME_MAX];
strcpy(tfst_tags_by_freq,basedir);
strcat(tfst_tags_by_freq,"tfst_tags_by_freq.txt");
strcpy(tfst_tags_by_alph,basedir);
strcat(tfst_tags_by_alph,"tfst_tags_by_alph.txt");
U_FILE* f_tfst_tags_by_freq=u_fopen(&vec,tfst_tags_by_freq,U_WRITE);
if (f_tfst_tags_by_freq==NULL) {
	error("Cannot open %s\n",tfst_tags_by_freq);
}
U_FILE* f_tfst_tags_by_alph=u_fopen(&vec,tfst_tags_by_alph,U_WRITE);
if (f_tfst_tags_by_alph==NULL) {
	error("Cannot open %s\n",tfst_tags_by_alph);
}
sort_and_save_tfst_stats(form_frequencies,f_tfst_tags_by_freq,f_tfst_tags_by_alph);
u_fclose(f_tfst_tags_by_freq);
u_fclose(f_tfst_tags_by_alph);
free_hash_table(form_frequencies);

/* make a backup and replace old automaton with new */
char backup_tfst[FILENAME_MAX];
char backup_tind[FILENAME_MAX];
sprintf(backup_tfst,"%s.bck",input_tfst);
sprintf(backup_tind,"%s.bck",input_tind);
/* We remove the existing backup files, if any */
af_remove(backup_tfst);
af_remove(backup_tind);
af_rename(input_tfst,backup_tfst);
af_rename(input_tind,backup_tind);
af_rename(output_tfst,input_tfst);
af_rename(output_tind,input_tind);
u_printf("\nYou can find a backup of the original files in:\n    %s\nand %s\n",
         backup_tfst,backup_tind);

return SUCCESS_RETURN_CODE;
}
示例#3
0
/**
 * Returns 1 if the given .fst2 is OK to be used by the Locate program; 0 otherwise. 
 * Conditions are:
 * 
 * 1) no left recursion
 * 2) no loop that can recognize the empty word (<E> with an output or subgraph
 *    that can match the empty word).
 */
int OK_for_Locate_write_error(const char* name,char no_empty_graph_warning,U_FILE* ferr) {
ConditionList* conditions;
ConditionList* conditions_for_state;
int i,j;
int ERROR=0;
struct FST2_free_info fst2_free;
Fst2* fst2=load_abstract_fst2(name,1,&fst2_free);
if (fst2==NULL) {
	fatal_error("Cannot load graph %s\n",name);
}
u_printf("Recursion detection started\n");
int* graphs_matching_E=(int*)malloc(sizeof(int)*(fst2->number_of_graphs+1));
conditions=(ConditionList*)malloc(sizeof(ConditionList)*(fst2->number_of_graphs+1));
if (graphs_matching_E==NULL || conditions==NULL) {
   fatal_alloc_error("OK_for_Locate");
}
for (i=0;i<fst2->number_of_graphs+1;i++) {
   graphs_matching_E[i]=0;
   conditions[i]=NULL;
}
/* First, we look for tags that match the empty word <E> */
for (i=0;i<fst2->number_of_tags;i++) {
   check_epsilon_tag(fst2->tags[i]);
}
/* Then, we look for graphs that match <E> with or without conditions */
for (i=1;i<=fst2->number_of_graphs;i++) {
   conditions_for_state=(ConditionList*)malloc(sizeof(ConditionList)*fst2->number_of_states_per_graphs[i]);
   if (conditions_for_state==NULL) {
      fatal_alloc_error("OK_for_Locate");
   }
   for (j=0;j<fst2->number_of_states_per_graphs[i];j++) {
      conditions_for_state[j]=NULL;
   }
   graphs_matching_E[i]=graph_matches_E(fst2->initial_states[i],fst2->initial_states[i],
  								      fst2->states,fst2->tags,i,fst2->graph_names,
  								      conditions_for_state,&conditions[i]);
   /* If any, we remove the temp conditions */                        
   if (conditions[i]!=NULL) free_ConditionList(conditions[i]);
   /* And we way that the conditions for the current graph are its initial
    * state's ones. */
   conditions[i]=conditions_for_state[0];
   /* Then we perform cleaning */
   conditions_for_state[0]=NULL;
   for (j=1;j<fst2->number_of_states_per_graphs[i];j++) {
      free_ConditionList(conditions_for_state[j]);
   }
   free(conditions_for_state); 
}
/* Then, we use all our condition lists to determine which graphs match <E>.
 * We iterate until we find a fixed point. If some conditions remain non null
 * after this loop, it means that there are <E> dependencies between graphs
 * and this case will be dealt with later. */
u_printf("Resolving <E> conditions\n");
while (resolve_conditions(conditions,fst2->number_of_graphs,
							fst2->states,fst2->initial_states,ferr)) {}
if (is_bit_mask_set(fst2->states[fst2->initial_states[1]]->control,UNCONDITIONAL_E_MATCH)) {
   /* If the main graph matches <E> */
   if (!no_empty_graph_warning) {
       error("ERROR: the main graph %S recognizes <E>\n",fst2->graph_names[1]);
       if (ferr != NULL)
         u_fprintf(ferr,"ERROR: the main graph %S recognizes <E>\n",fst2->graph_names[1]);
   }
   ERROR=1;
}
if (!ERROR) {
   for (i=1;i<fst2->number_of_graphs+1;i++) {
      if (is_bit_mask_set(fst2->states[fst2->initial_states[i]]->control,UNCONDITIONAL_E_MATCH)) {
         /* If the graph matches <E> */
         if (!no_empty_graph_warning) {
             error("WARNING: the graph %S recognizes <E>\n",fst2->graph_names[i]);
             if (ferr != NULL)
                u_fprintf(ferr,"WARNING: the graph %S recognizes <E>\n",fst2->graph_names[i]);
         }
      }
   }
}
clean_controls(fst2,graphs_matching_E);
if (!ERROR) {
   u_printf("Looking for <E> loops\n");
   for (i=1;!ERROR && i<fst2->number_of_graphs+1;i++) {
      ERROR=look_for_E_loops(i,fst2,graphs_matching_E,ferr);
   }
}
clean_controls(fst2,NULL);
if (!ERROR) {
   u_printf("Looking for infinite recursions\n");
   for (i=1;!ERROR && i<fst2->number_of_graphs+1;i++) {
      ERROR=look_for_recursion(i,NULL,fst2,graphs_matching_E,ferr);
   }
}
for (i=1;i<fst2->number_of_graphs+1;i++) {
   free_ConditionList(conditions[i]);
}
free_abstract_Fst2(fst2,&fst2_free);
u_printf("Recursion detection completed\n");
free(conditions);
free(graphs_matching_E);
if (ERROR) return LEFT_RECURSION;
return NO_LEFT_RECURSION;
}