/**
 * Writes into 'result' the name of the "_0_0" file located inside the
 * cassys directory associated with 'filename'.
 *
 * The string is assembled as:
 *   get_path(filename) + base name + CASSYS_DIRECTORY_EXTENSION
 *   + PATH_SEPARATOR_STRING + base name + "_0_0" + extension
 * where "base name" is what remove_path_and_extension() yields and
 * "extension" is what get_extension() yields.
 *
 * @param filename  name the path is derived from
 * @param result    caller-provided output buffer; no size is passed in, so
 *                  the caller must make it large enough for the whole string
 */
void get_csc_wd_path(const char* filename, char* result) {
    /* Extract the name pieces before 'result' is touched. */
    char base_name[FILENAME_MAX];
    char suffix[FILENAME_MAX];
    remove_path_and_extension(filename, base_name);
    get_extension(filename, suffix);

    /* Directory part: <path><base name><cassys extension><separator>. */
    get_path(filename, result);
    char* tail = result + strlen(result);
    remove_path_and_extension(filename, tail);
    strcat(result, CASSYS_DIRECTORY_EXTENSION);
    strcat(result, PATH_SEPARATOR_STRING);

    /* File part, appended right after the separator. */
    tail = result + strlen(result);
    sprintf(tail, "%s_0_0%s", base_name, suffix);
}
/**
 * Prepares the cassys working directory of 'text' and seeds it with the
 * label-0 copy of the text and the content of its snt directory.
 *
 * Steps, in order:
 *   1. build "<path><base name><CASSYS_DIRECTORY_EXTENSION><separator>";
 *   2. create that directory when 'must_create_directory' is non-zero;
 *   3. copy 'text' to "<working dir><base name>_0<extension>";
 *   4. compute the snt path of that copy (creating it when
 *      'must_create_directory' is non-zero);
 *   5. call copy_directory_snt_content() with the new snt directory and the
 *      snt directory of the original text.
 *
 * @param text                   name of the text to work on
 * @param must_create_directory  non-zero to create the directories
 * @return always 0; results of the helper calls are not inspected
 */
int initialize_working_directory(const char *text,int must_create_directory){
    char dir_part[FILENAME_MAX];
    char base_name[FILENAME_MAX];
    char suffix[FILENAME_MAX];
    get_path(text, dir_part);
    remove_path_and_extension(text, base_name);
    get_extension(text, suffix);

    char wd[FILENAME_MAX];
    sprintf(wd, "%s%s%s%c", dir_part, base_name, CASSYS_DIRECTORY_EXTENSION, PATH_SEPARATOR_CHAR);
    if (must_create_directory) {
        make_directory(wd);
    }

    /* copy_file() is called destination-first here: 'text' is the const input. */
    char seeded_text[FILENAME_MAX];
    sprintf(seeded_text, "%s%s_0%s", wd, base_name, suffix);
    copy_file(seeded_text, text);

    char seeded_snt_dir[FILENAME_MAX];
    get_snt_path(seeded_text, seeded_snt_dir);
    if (must_create_directory) {
        make_directory(seeded_snt_dir);
    }

    char source_snt_dir[FILENAME_MAX];
    get_snt_path(text, source_snt_dir);
    copy_directory_snt_content(seeded_snt_dir, source_snt_dir);

    return 0;
}
/**
 * Builds the name of a graph file stored in the snt directory that belongs
 * to a given transducer label and iteration of the cassys working directory.
 *
 * The resulting string is:
 *   <path><base name><CASSYS_DIRECTORY_EXTENSION><sep>
 *   <base name>_<next_transducer_label>_<next_iteration>_snt<sep>
 *   <graph_name><ext>
 *
 * @param text                   text name the working directory derives from
 * @param next_transducer_label  label number inserted in the directory name
 * @param next_iteration         iteration number inserted in the directory name
 * @param graph_name             graph name appended after the directory;
 *                               must not be NULL
 * @param ext                    extension appended after 'graph_name';
 *                               must not be NULL
 * @return a newly malloc'ed string that the caller must free(); the function
 *         does not return if the allocation fails
 */
char* create_updated_graph_filename(const char *text, int next_transducer_label, int next_iteration, const char* graph_name, const char* ext) {
    char path[FILENAME_MAX];
    get_path(text, path);

    char canonical_text_name[FILENAME_MAX];
    remove_path_and_extension(text, canonical_text_name);

    char working[FILENAME_MAX];
    sprintf(working, "%s%s%s%c%s_%d_%d_snt%c",
            path, canonical_text_name, CASSYS_DIRECTORY_EXTENSION, PATH_SEPARATOR_CHAR,
            canonical_text_name, next_transducer_label, next_iteration, PATH_SEPARATOR_CHAR);
    strcat(working, graph_name);
    strcat(working, ext);

    char* full_graph_name = (char*)malloc(sizeof(char)*(strlen(working) + 1));
    /* Test the pointer that was just allocated: the former check looked at
     * 'graph_name', so an allocation failure went undetected and strcpy()
     * below wrote through NULL. */
    if (full_graph_name == NULL) {
        fatal_alloc_error("create_updated_graph_filename");
        exit(1);
    }
    strcpy(full_graph_name, working);
    return full_graph_name;
}
/**
 * Writes into 'result' the cassys directory name associated with 'filename':
 *   get_path(filename) + base name + CASSYS_DIRECTORY_EXTENSION
 *   + PATH_SEPARATOR_STRING
 * where "base name" is what remove_path_and_extension() yields.
 *
 * @param filename  name the directory is derived from
 * @param result    caller-provided output buffer; no size is passed in, so
 *                  the caller must make it large enough
 */
void get_csc_path(const char* filename, char* result) {
    get_path(filename, result);

    /* The base name is written directly after the path already in 'result'. */
    char* name_start = result + strlen(result);
    remove_path_and_extension(filename, name_start);

    strcat(result, CASSYS_DIRECTORY_EXTENSION);
    strcat(result, PATH_SEPARATOR_STRING);
}
char* create_labeled_files_and_directory(const char *text, int next_transducer_label,int must_create_directory,int must_copy_file) { char path[FILENAME_MAX]; get_path(text, path); char canonical_text_name[FILENAME_MAX]; remove_path_and_extension(text, canonical_text_name); char extension[FILENAME_MAX]; get_extension(text, extension); char working_directory[FILENAME_MAX]; sprintf(working_directory, "%s%s%s%c", path, canonical_text_name, CASSYS_DIRECTORY_EXTENSION, PATH_SEPARATOR_CHAR); // copy the text label i- to i char old_labeled_text_name[FILENAME_MAX]; sprintf(old_labeled_text_name, "%s%s_%d%s", working_directory, canonical_text_name, next_transducer_label - 1, extension); char new_labeled_text_name[FILENAME_MAX]; sprintf(new_labeled_text_name, "%s%s_%d%s", working_directory, canonical_text_name, next_transducer_label, extension); char new_labeled_snt_directory[FILENAME_MAX]; get_snt_path(new_labeled_text_name, new_labeled_snt_directory); if (must_create_directory != 0) { make_directory(new_labeled_snt_directory); } if (must_copy_file != 0) { copy_file(new_labeled_text_name, old_labeled_text_name); // create snt directory labeled i char old_labeled_snt_directory[FILENAME_MAX]; get_snt_path(old_labeled_text_name, old_labeled_snt_directory); // copy dictionary files in the new snt directory struct snt_files *old_snt_ = new_snt_files(old_labeled_text_name); struct snt_files *new_snt_ = new_snt_files(new_labeled_text_name); if (fexists(old_snt_->dlc)) { copy_file(new_snt_->dlc, old_snt_->dlc); } if (fexists(old_snt_-> dlf)) { copy_file(new_snt_->dlf, old_snt_->dlf); } if (fexists(old_snt_-> err)) { copy_file(new_snt_->err, old_snt_->err); } if (fexists(old_snt_->dlc_n)) { copy_file(new_snt_->dlc_n, old_snt_->dlc_n); } if (fexists(old_snt_->dlf_n)) { copy_file(new_snt_->dlf_n, old_snt_->dlf_n); } if (fexists(old_snt_-> err_n)) { copy_file(new_snt_->err_n, old_snt_->err_n); } if (fexists(old_snt_->stat_dic_n)) { copy_file(new_snt_->stat_dic_n, old_snt_->stat_dic_n); 
} free_snt_files(old_snt_); free_snt_files(new_snt_); } char *labeled_text_name; labeled_text_name = (char*)malloc(sizeof(char)*(strlen(new_labeled_text_name)+1)); if(labeled_text_name == NULL){ perror("malloc\n"); fprintf(stderr,"Impossible to allocate memory\n"); exit(1); } strcpy(labeled_text_name, new_labeled_text_name); return labeled_text_name; }
int main_TrainingTagger(int argc,char* const argv[]) { if (argc==1) { usage(); return 0; } int val,index=-1,binaries=1,r_forms=1,i_forms=1; int semitic=0; struct OptVars* vars=new_OptVars(); char text[FILENAME_MAX]=""; char raw_forms[FILENAME_MAX]=""; char inflected_forms[FILENAME_MAX]=""; char output[FILENAME_MAX]=""; Encoding encoding_output = DEFAULT_ENCODING_OUTPUT; int bom_output = DEFAULT_BOM_OUTPUT; int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT; while (EOF!=(val=getopt_long_TS(argc,argv,optstring_TrainingTagger,lopts_TrainingTagger,&index,vars))) { switch(val) { case 'o': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty pattern\n"); } strcpy(output,vars->optarg); break; case 'b': binaries = 1; break; case 'n': binaries = 0; break; case 'a': break; case 'c': i_forms = 0; break; case 'm': r_forms = 0; break; case 'S': semitic=1; break; case 'k': if (vars->optarg[0]=='\0') { fatal_error("Empty input_encoding argument\n"); } decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg); break; case 'q': if (vars->optarg[0]=='\0') { fatal_error("Empty output_encoding argument\n"); } decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg); break; case 'h': usage(); return 0; case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt); else fatal_error("Missing argument for option --%s\n",lopts_TrainingTagger[index].name); case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt); else fatal_error("Invalid option --%s\n",vars->optarg); break; } index=-1; } if (vars->optind!=argc-1) { free_OptVars(vars); error("Invalid arguments: rerun with --help\n"); return 1; } strcpy(text,argv[vars->optind]); U_FILE* input_text=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,text,U_READ); if (input_text==NULL) { free_OptVars(vars); fatal_error("cannot open file %s\n",text); return 1; } if(output[0]=='\0'){ 
remove_path_and_extension(text,output); } char path[FILENAME_MAX],filename[FILENAME_MAX]; get_path(text,path); if(strlen(path) == 0){ strcpy(path,"."); } /* we create files which will contain statistics extracted from the tagged corpus */ U_FILE* rforms_file = NULL, *iforms_file = NULL; if(r_forms == 1){ sprintf(filename,"%s_data_cat.dic",output); new_file(path,filename,raw_forms); rforms_file=u_fopen_creating_versatile_encoding(encoding_output,bom_output,raw_forms,U_WRITE); } if(i_forms == 1){ sprintf(filename,"%s_data_morph.dic",output); new_file(path,filename,inflected_forms); iforms_file=u_fopen_creating_versatile_encoding(encoding_output,bom_output,inflected_forms,U_WRITE); } u_printf("Gathering statistics from tagged corpus...\n"); do_training(input_text,rforms_file,iforms_file); /* we close all files and then we sort text dictionaries */ u_fclose(input_text); char disclaimer[FILENAME_MAX]; if(rforms_file != NULL){ u_fclose(rforms_file); pseudo_main_SortTxt(DEFAULT_ENCODING_OUTPUT,DEFAULT_BOM_OUTPUT,ALL_ENCODING_BOM_POSSIBLE,0,0,NULL,NULL,0,raw_forms); strcpy(disclaimer,raw_forms); remove_extension(disclaimer); strcat(disclaimer,".txt"); create_disclaimer(disclaimer); } if(iforms_file != NULL){ u_fclose(iforms_file); pseudo_main_SortTxt(DEFAULT_ENCODING_OUTPUT,DEFAULT_BOM_OUTPUT,ALL_ENCODING_BOM_POSSIBLE,0,0,NULL,NULL,0,inflected_forms); strcpy(disclaimer,inflected_forms); remove_extension(disclaimer); strcat(disclaimer,".txt"); create_disclaimer(disclaimer); } /* we compress dictionaries if option is specified by user (output is ".bin") */ if(binaries == 1){ /* simple forms dictionary */ if(r_forms == 1){ pseudo_main_Compress(DEFAULT_ENCODING_OUTPUT,DEFAULT_BOM_OUTPUT,ALL_ENCODING_BOM_POSSIBLE,0,semitic,raw_forms); } /* compound forms dictionary */ if(i_forms == 1){ pseudo_main_Compress(DEFAULT_ENCODING_OUTPUT,DEFAULT_BOM_OUTPUT,ALL_ENCODING_BOM_POSSIBLE,0,semitic,inflected_forms); } } free_OptVars(vars); u_printf("Done.\n"); return 0; }