/** * Returns 1 if the given .fst2 is OK to be used by the Locate program; 0 otherwise. * Conditions are: * * 1) no left recursion * 2) no loop that can recognize the empty word (<E> with an output or subgraph * that can match the empty word). */ int OK_for_Locate_write_error(const VersatileEncodingConfig* vec,const char* name,char no_empty_graph_warning,U_FILE* ferr) { int RESULT=1; struct FST2_free_info fst2_free; Fst2* fst2=load_abstract_fst2(vec,name,1,&fst2_free); if (fst2==NULL) { fatal_error("Cannot load graph %s\n",name); } u_printf("Creating condition sets...\n"); GrfCheckInfo* chk=new_GrfCheckInfo(fst2); /* Now, we look for a fix point in the condition graphs */ struct list_int* list=NULL; /* To do that, we start by creating a list of all the graphs we are sure about */ int unknown=0; for (int i=1;i<fst2->number_of_graphs+1;i++) { if (chk->graphs_matching_E[i]!=CHK_DONT_KNOW) { list=new_list_int(i,list); } else { unknown++; } } /* While there is something to do for E matching */ u_printf("Checking empty word matching...\n"); while (resolve_all_conditions(chk,&list,&unknown)) {} if (chk->graphs_matching_E[1]==CHK_MATCHES_E) { if (!no_empty_graph_warning) { error("ERROR: the main graph %S recognizes <E>\n",fst2->graph_names[1]); if (ferr!=NULL) { u_fprintf(ferr,"ERROR: the main graph %S recognizes <E>\n",fst2->graph_names[1]); } } goto evil_goto; } if (!no_empty_graph_warning) { for (int i=2;i<fst2->number_of_graphs+1;i++) { if (chk->graphs_matching_E[i]==CHK_MATCHES_E) { error("WARNING: the graph %S recognizes <E>\n",fst2->graph_names[i]); if (ferr!=NULL) { u_fprintf(ferr,"WARNING: the graph %S recognizes <E>\n",fst2->graph_names[i]); } } } } /* Now, we look for E loops and left recursions. And to do that, we need a new version * of the condition graphs, because a graph that does not match E would have been emptied. * And obviously, we can not deduce anything from an empty graph. */ rebuild_condition_graphs(chk); u_printf("Checking E loops...\n"); if (is_any_E_loop(chk)) { /* Error messages have already been printed */ goto evil_goto; } u_printf("Checking left recursions...\n"); if (is_any_left_recursion(chk)) { /* Error messages have already been printed */ goto evil_goto; } evil_goto: /* There may be something unused in the list that we need to free */ free_list_int(list); free_GrfCheckInfo(chk); free_abstract_Fst2(fst2,&fst2_free); return RESULT; }
int main_RebuildTfst(int argc,char* const argv[]) { if (argc==1) { usage(); return SUCCESS_RETURN_CODE; } VersatileEncodingConfig vec=VEC_DEFAULT; int val, index=-1; bool only_verify_arguments = false; UnitexGetOpt options; while (EOF!=(val=options.parse_long(argc,argv,optstring_RebuildTfst,lopts_RebuildTfst,&index))) { switch (val) { case 'k': if (options.vars()->optarg[0]=='\0') { error("Empty input_encoding argument\n"); return USAGE_ERROR_CODE; } decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg); break; case 'q': if (options.vars()->optarg[0]=='\0') { error("Empty output_encoding argument\n"); return USAGE_ERROR_CODE; } decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg); break; case 'V': only_verify_arguments = true; break; case 'h': usage(); return SUCCESS_RETURN_CODE; case ':': index==-1 ? error("Missing argument for option -%c\n", options.vars()->optopt) : error("Missing argument for option --%s\n", lopts_RebuildTfst[index].name); return USAGE_ERROR_CODE; case '?': index==-1 ? error("Invalid option -%c\n", options.vars()->optopt) : error("Invalid option --%s\n", options.vars()->optarg); return USAGE_ERROR_CODE; } index=-1; } if (options.vars()->optind!=argc-1) { error("Invalid arguments: rerun with --help\n"); return USAGE_ERROR_CODE; } if (only_verify_arguments) { // freeing all allocated memory return SUCCESS_RETURN_CODE; } char input_tfst[FILENAME_MAX]; char input_tind[FILENAME_MAX]; strcpy(input_tfst,argv[options.vars()->optind]); remove_extension(input_tfst,input_tind); strcat(input_tind,".tind"); u_printf("Loading %s...\n",input_tfst); Tfst* tfst = open_text_automaton(&vec,input_tfst); if (tfst==NULL) { error("Unable to load %s automaton\n",input_tfst); return DEFAULT_ERROR_CODE; } char basedir[FILENAME_MAX]; get_path(input_tfst,basedir); char output_tfst[FILENAME_MAX]; sprintf(output_tfst, "%s.new.tfst",input_tfst); char output_tind[FILENAME_MAX]; sprintf(output_tind, "%s.new.tind",input_tfst); U_FILE* f_tfst; if ((f_tfst = u_fopen(&vec,output_tfst,U_WRITE)) == NULL) { error("Unable to open %s for writing\n", output_tfst); close_text_automaton(tfst); return DEFAULT_ERROR_CODE; } U_FILE* f_tind; if ((f_tind = u_fopen(BINARY,output_tind,U_WRITE)) == NULL) { u_fclose(f_tfst); close_text_automaton(tfst); error("Unable to open %s for writing\n", output_tind); return DEFAULT_ERROR_CODE; } /* We use this hash table to rebuild files tfst_tags_by_freq/alph.txt */ struct hash_table* form_frequencies=new_hash_table((HASH_FUNCTION)hash_unichar,(EQUAL_FUNCTION)u_equal, (FREE_FUNCTION)free,NULL,(KEYCOPY_FUNCTION)keycopy); u_fprintf(f_tfst,"%010d\n",tfst->N); for (int i = 1; i <= tfst->N; i++) { if ((i % 100) == 0) { u_printf("%d/%d sentences rebuilt...\n", i, tfst->N); } load_sentence(tfst,i); char grfname[FILENAME_MAX]; sprintf(grfname, "%ssentence%d.grf", basedir, i); unichar** tags=NULL; int n_tags=-1; if (fexists(grfname)) { /* If there is a .grf for the current sentence, then we must * take it into account */ if (0==pseudo_main_Grf2Fst2(&vec,grfname,0,NULL,1,1,NULL,NULL,0)) { /* We proceed only if the graph compilation was a success */ char fst2name[FILENAME_MAX]; sprintf(fst2name, "%ssentence%d.fst2", basedir, i); struct FST2_free_info fst2_free; Fst2* fst2=load_abstract_fst2(&vec,fst2name,0,&fst2_free); af_remove(fst2name); free_SingleGraph(tfst->automaton,NULL); tfst->automaton=create_copy_of_fst2_subgraph(fst2,1); tags=create_tfst_tags(fst2,&n_tags); free_abstract_Fst2(fst2,&fst2_free); } else { error("Error: %s is not a valid sentence automaton\n",grfname); } } save_current_sentence(tfst,f_tfst,f_tind,tags,n_tags,form_frequencies); if (tags!=NULL) { /* If necessary, we free the tags we created */ for (int count_tags=0;count_tags<n_tags;count_tags++) { free(tags[count_tags]); } free(tags); } } u_printf("Text automaton rebuilt.\n"); u_fclose(f_tind); u_fclose(f_tfst); close_text_automaton(tfst); /* Finally, we save statistics */ char tfst_tags_by_freq[FILENAME_MAX]; char tfst_tags_by_alph[FILENAME_MAX]; strcpy(tfst_tags_by_freq,basedir); strcat(tfst_tags_by_freq,"tfst_tags_by_freq.txt"); strcpy(tfst_tags_by_alph,basedir); strcat(tfst_tags_by_alph,"tfst_tags_by_alph.txt"); U_FILE* f_tfst_tags_by_freq=u_fopen(&vec,tfst_tags_by_freq,U_WRITE); if (f_tfst_tags_by_freq==NULL) { error("Cannot open %s\n",tfst_tags_by_freq); } U_FILE* f_tfst_tags_by_alph=u_fopen(&vec,tfst_tags_by_alph,U_WRITE); if (f_tfst_tags_by_alph==NULL) { error("Cannot open %s\n",tfst_tags_by_alph); } sort_and_save_tfst_stats(form_frequencies,f_tfst_tags_by_freq,f_tfst_tags_by_alph); u_fclose(f_tfst_tags_by_freq); u_fclose(f_tfst_tags_by_alph); free_hash_table(form_frequencies); /* make a backup and replace old automaton with new */ char backup_tfst[FILENAME_MAX]; char backup_tind[FILENAME_MAX]; sprintf(backup_tfst,"%s.bck",input_tfst); sprintf(backup_tind,"%s.bck",input_tind); /* We remove the existing backup files, if any */ af_remove(backup_tfst); af_remove(backup_tind); af_rename(input_tfst,backup_tfst); af_rename(input_tind,backup_tind); af_rename(output_tfst,input_tfst); af_rename(output_tind,input_tind); u_printf("\nYou can find a backup of the original files in:\n %s\nand %s\n", backup_tfst,backup_tind); return SUCCESS_RETURN_CODE; }
/** * Returns 1 if the given .fst2 is OK to be used by the Locate program; 0 otherwise. * Conditions are: * * 1) no left recursion * 2) no loop that can recognize the empty word (<E> with an output or subgraph * that can match the empty word). */ int OK_for_Locate_write_error(const char* name,char no_empty_graph_warning,U_FILE* ferr) { ConditionList* conditions; ConditionList* conditions_for_state; int i,j; int ERROR=0; struct FST2_free_info fst2_free; Fst2* fst2=load_abstract_fst2(name,1,&fst2_free); if (fst2==NULL) { fatal_error("Cannot load graph %s\n",name); } u_printf("Recursion detection started\n"); int* graphs_matching_E=(int*)malloc(sizeof(int)*(fst2->number_of_graphs+1)); conditions=(ConditionList*)malloc(sizeof(ConditionList)*(fst2->number_of_graphs+1)); if (graphs_matching_E==NULL || conditions==NULL) { fatal_alloc_error("OK_for_Locate"); } for (i=0;i<fst2->number_of_graphs+1;i++) { graphs_matching_E[i]=0; conditions[i]=NULL; } /* First, we look for tags that match the empty word <E> */ for (i=0;i<fst2->number_of_tags;i++) { check_epsilon_tag(fst2->tags[i]); } /* Then, we look for graphs that match <E> with or without conditions */ for (i=1;i<=fst2->number_of_graphs;i++) { conditions_for_state=(ConditionList*)malloc(sizeof(ConditionList)*fst2->number_of_states_per_graphs[i]); if (conditions_for_state==NULL) { fatal_alloc_error("OK_for_Locate"); } for (j=0;j<fst2->number_of_states_per_graphs[i];j++) { conditions_for_state[j]=NULL; } graphs_matching_E[i]=graph_matches_E(fst2->initial_states[i],fst2->initial_states[i], fst2->states,fst2->tags,i,fst2->graph_names, conditions_for_state,&conditions[i]); /* If any, we remove the temp conditions */ if (conditions[i]!=NULL) free_ConditionList(conditions[i]); /* And we way that the conditions for the current graph are its initial * state's ones. */ conditions[i]=conditions_for_state[0]; /* Then we perform cleaning */ conditions_for_state[0]=NULL; for (j=1;j<fst2->number_of_states_per_graphs[i];j++) { free_ConditionList(conditions_for_state[j]); } free(conditions_for_state); } /* Then, we use all our condition lists to determine which graphs match <E>. * We iterate until we find a fixed point. If some conditions remain non null * after this loop, it means that there are <E> dependencies between graphs * and this case will be dealt with later. */ u_printf("Resolving <E> conditions\n"); while (resolve_conditions(conditions,fst2->number_of_graphs, fst2->states,fst2->initial_states,ferr)) {} if (is_bit_mask_set(fst2->states[fst2->initial_states[1]]->control,UNCONDITIONAL_E_MATCH)) { /* If the main graph matches <E> */ if (!no_empty_graph_warning) { error("ERROR: the main graph %S recognizes <E>\n",fst2->graph_names[1]); if (ferr != NULL) u_fprintf(ferr,"ERROR: the main graph %S recognizes <E>\n",fst2->graph_names[1]); } ERROR=1; } if (!ERROR) { for (i=1;i<fst2->number_of_graphs+1;i++) { if (is_bit_mask_set(fst2->states[fst2->initial_states[i]]->control,UNCONDITIONAL_E_MATCH)) { /* If the graph matches <E> */ if (!no_empty_graph_warning) { error("WARNING: the graph %S recognizes <E>\n",fst2->graph_names[i]); if (ferr != NULL) u_fprintf(ferr,"WARNING: the graph %S recognizes <E>\n",fst2->graph_names[i]); } } } } clean_controls(fst2,graphs_matching_E); if (!ERROR) { u_printf("Looking for <E> loops\n"); for (i=1;!ERROR && i<fst2->number_of_graphs+1;i++) { ERROR=look_for_E_loops(i,fst2,graphs_matching_E,ferr); } } clean_controls(fst2,NULL); if (!ERROR) { u_printf("Looking for infinite recursions\n"); for (i=1;!ERROR && i<fst2->number_of_graphs+1;i++) { ERROR=look_for_recursion(i,NULL,fst2,graphs_matching_E,ferr); } } for (i=1;i<fst2->number_of_graphs+1;i++) { free_ConditionList(conditions[i]); } free_abstract_Fst2(fst2,&fst2_free); u_printf("Recursion detection completed\n"); free(conditions); free(graphs_matching_E); if (ERROR) return LEFT_RECURSION; return NO_LEFT_RECURSION; }