int initialize_working_directory(const char *text,int must_create_directory){ char path[FILENAME_MAX]; get_path(text,path); char canonical_name[FILENAME_MAX]; remove_path_and_extension(text, canonical_name); char extension[FILENAME_MAX]; get_extension(text,extension); char working_directory[FILENAME_MAX]; sprintf(working_directory, "%s%s%s%c",path, canonical_name, CASSYS_DIRECTORY_EXTENSION, PATH_SEPARATOR_CHAR); if (must_create_directory != 0) { make_directory(working_directory); } char text_in_wd[FILENAME_MAX]; sprintf(text_in_wd, "%s%s_0%s",working_directory,canonical_name,extension ); copy_file(text_in_wd,text); char snt_dir_text_in_wd[FILENAME_MAX]; get_snt_path(text_in_wd, snt_dir_text_in_wd); if (must_create_directory != 0) { make_directory(snt_dir_text_in_wd); } char original_snt_dir[FILENAME_MAX]; get_snt_path(text,original_snt_dir); copy_directory_snt_content(snt_dir_text_in_wd, original_snt_dir); return 0; }
/**
 * Computes the snt path of 'text' with get_snt_path() and, when
 * 'must_create_directory' is non-zero, hands that path to
 * make_cassys_directory().
 *
 * @param text                  path of the text file
 * @param must_create_directory non-zero to request the directory creation
 * @return always 0
 */
int initialize_working_directory_before_tokenize(const char*text, int must_create_directory) {
    char snt_directory[FILENAME_MAX];
    get_snt_path(text, snt_directory);

    /* Nothing else to do when the caller does not want the directory created */
    if (must_create_directory == 0) {
        return 0;
    }

    make_cassys_directory(snt_directory);
    return 0;
}
int main_Untokenize(int argc,char* const argv[]) { if (argc==1) { usage(); return SUCCESS_RETURN_CODE; } char alphabet[FILENAME_MAX]=""; char token_file[FILENAME_MAX]=""; char dynamicSntDir[FILENAME_MAX]=""; VersatileEncodingConfig vec=VEC_DEFAULT; int val,index=-1; int range_start,range_stop,use_range; int token_step_number=0; range_start=range_stop=use_range=0; char foo=0; bool only_verify_arguments = false; UnitexGetOpt options; while (EOF!=(val=options.parse_long(argc,argv,optstring_Untokenize,lopts_Untokenize,&index))) { switch(val) { case 'a': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty alphabet file name\n"); return USAGE_ERROR_CODE; } strcpy(alphabet,options.vars()->optarg); break; case 'd': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty snt dir name\n"); return USAGE_ERROR_CODE; } strcpy(dynamicSntDir,options.vars()->optarg); break; case 't': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty token file name\n"); return USAGE_ERROR_CODE; } strcpy(token_file,options.vars()->optarg); break; case 'k': if (options.vars()->optarg[0]=='\0') { error("Empty input_encoding argument\n"); return USAGE_ERROR_CODE; } decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg); break; case 'q': if (options.vars()->optarg[0]=='\0') { error("Empty output_encoding argument\n"); return USAGE_ERROR_CODE; } decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg); break; case 'n': if (1!=sscanf(options.vars()->optarg,"%d%c",&token_step_number,&foo) || token_step_number<=0) { /* foo is used to check that the search limit is not like "45gjh" */ error("Invalid token numbering argument: %s\n",options.vars()->optarg); return USAGE_ERROR_CODE; } break; case 'r': { int param1 = 0; int param2 = 0; int ret_scan = sscanf(options.vars()->optarg,"%d,%d%c",¶m1,¶m2,&foo); if (ret_scan == 2) { range_start = param1; range_stop = 
param2; use_range=1; if (((range_start < -1)) || (range_stop < -1)) { /* foo is used to check that the search limit is not like "45gjh" */ error("Invalid stop count argument: %s\n",options.vars()->optarg); return USAGE_ERROR_CODE; } } else if (1!=sscanf(options.vars()->optarg,"%d%c",&range_start,&foo) || (range_start < -1)) { /* foo is used to check that the search limit is not like "45gjh" */ error("Invalid stop count argument: %s\n",options.vars()->optarg); return USAGE_ERROR_CODE; } use_range=1; } break; case 'V': only_verify_arguments = true; break; case 'h': usage(); return SUCCESS_RETURN_CODE; case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) : error("Missing argument for option --%s\n",lopts_Untokenize[index].name); return USAGE_ERROR_CODE; case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) : error("Invalid option --%s\n",options.vars()->optarg); return USAGE_ERROR_CODE; } index=-1; } if (options.vars()->optind!=argc-1) { error("Invalid arguments: rerun with --help\n"); return USAGE_ERROR_CODE; } if (only_verify_arguments) { // freeing all allocated memory return SUCCESS_RETURN_CODE; } char tokens_txt[FILENAME_MAX]; char text_cod[FILENAME_MAX]; char enter_pos[FILENAME_MAX]; if (dynamicSntDir[0]=='\0') { get_snt_path(argv[options.vars()->optind],dynamicSntDir); } strcpy(text_cod,dynamicSntDir); strcat(text_cod,"text.cod"); strcpy(enter_pos,dynamicSntDir); strcat(enter_pos,"enter.pos"); strcpy(tokens_txt,dynamicSntDir); strcat(tokens_txt,"tokens.txt"); Alphabet* alph=NULL; if (alphabet[0]!='\0') { alph=load_alphabet(&vec,alphabet); if (alph==NULL) { error("Cannot load alphabet file %s\n",alphabet); return DEFAULT_ERROR_CODE; } } ABSTRACTMAPFILE* af_text_cod=af_open_mapfile(text_cod,MAPFILE_OPTION_READ,0); if (af_text_cod==NULL) { error("Cannot open file %s\n",text_cod); free_alphabet(alph); return DEFAULT_ERROR_CODE; } ABSTRACTMAPFILE* af_enter_pos=af_open_mapfile(enter_pos,MAPFILE_OPTION_READ,0); 
if (af_enter_pos==NULL) { error("Cannot open file %s\n",enter_pos); af_close_mapfile(af_text_cod); free_alphabet(alph); return DEFAULT_ERROR_CODE; } U_FILE* text = u_fopen(&vec,argv[options.vars()->optind],U_WRITE); if (text==NULL) { error("Cannot create text file %s\n",argv[options.vars()->optind]); af_close_mapfile(af_enter_pos); af_close_mapfile(af_text_cod); free_alphabet(alph); return DEFAULT_ERROR_CODE; } struct text_tokens* tok=load_text_tokens(&vec,tokens_txt); u_printf("Untokenizing text...\n"); size_t nb_item = af_get_mapfile_size(af_text_cod)/sizeof(int); const int* buf=(const int*)af_get_mapfile_pointer(af_text_cod); size_t nb_item_enter_pos=0; const int* buf_enter=NULL; if (af_enter_pos!=NULL) { buf_enter=(const int*)af_get_mapfile_pointer(af_enter_pos); if (buf_enter!=NULL) { nb_item_enter_pos=af_get_mapfile_size(af_enter_pos)/sizeof(int); } } size_t count_pos=0; for (size_t i=0;i<nb_item;i++) { int is_in_range=1; if ((use_range!=0) && (i<(size_t)range_start)) { is_in_range=0; } if ((use_range!=0) && (range_stop!=0) && (i>(size_t)range_stop)) { is_in_range=0; } int is_newline=0; if (count_pos<nb_item_enter_pos) { if (i==(size_t)(*(buf_enter+count_pos))) { is_newline = 1; count_pos++; } } if (is_in_range!=0) { if (token_step_number != 0) if ((i%token_step_number)==0) u_fprintf(text,"\n\nToken %d : ", (int)i); if (is_newline!=0) { u_fprintf(text,"\n", tok->token[*(buf+i)]); } else { u_fputs(tok->token[*(buf+i)], text); } } } af_release_mapfile_pointer(af_text_cod,buf); af_release_mapfile_pointer(af_enter_pos,buf_enter); af_close_mapfile(af_enter_pos); af_close_mapfile(af_text_cod); free_text_tokens(tok); u_fclose(text); free_alphabet(alph); u_printf("\nDone.\n"); return SUCCESS_RETURN_CODE; }
int main_SpellCheck(int argc,char* const argv[]) { if (argc==1) { usage(); return SUCCESS_RETURN_CODE; } VersatileEncodingConfig vec=VEC_DEFAULT; int val,index=-1; char mode=0; char snt[FILENAME_MAX]=""; char txt[FILENAME_MAX]=""; char output[FILENAME_MAX]=""; char output_set=0; char output_op='A'; SpellCheckConfig config; config.max_errors=1; config.max_SP_INSERT=1; config.max_SP_SUPPR=1; config.max_SP_SWAP=1; config.max_SP_CHANGE=1; for (int i=0;i<N_SPSubOp;i++) { config.score[i]=default_scores[i]; } config.min_length1=4; config.min_length2=6; config.min_length3=12; config.input_op='D'; config.keyboard=NULL; config.allow_uppercase_initial=0; char foo; bool only_verify_arguments = false; UnitexGetOpt options; while (EOF!=(val=options.parse_long(argc,argv,optstring_SpellCheck,lopts_SpellCheck,&index))) { switch(val) { case 's': { strcpy(snt,options.vars()->optarg); mode='s'; break; } case 'f': { strcpy(txt,options.vars()->optarg); mode='f'; break; } case 'o': { if (options.vars()->optarg!=NULL) { strcpy(output,options.vars()->optarg); } output_set=1; break; } case 'I': { if (!strcmp(options.vars()->optarg,"D") || !strcmp(options.vars()->optarg,"M") || !strcmp(options.vars()->optarg,"U")) { config.input_op=options.vars()->optarg[0]; } else { error("Invalid argument %s for option --input-op: should in [DMU]\n",options.vars()->optarg); return USAGE_ERROR_CODE; } break; } case 'O': { if (!strcmp(options.vars()->optarg,"O") || !strcmp(options.vars()->optarg,"A")) { output_op=options.vars()->optarg[0]; } else { error("Invalid argument %s for option --output-op: should in [OA]\n",options.vars()->optarg); return USAGE_ERROR_CODE; } break; } case 1: { config.keyboard=get_Keyboard(options.vars()->optarg); if (config.keyboard==NULL) { error("Invalid argument %s for option --keyboard:\nUse --show-keyboards to see possible values\n",options.vars()->optarg); return USAGE_ERROR_CODE; } break; } case 2: { print_available_keyboards(U_STDOUT); return SUCCESS_RETURN_CODE; } case 10: 
{ if (1!=sscanf(options.vars()->optarg,"%u%c",&config.max_errors,&foo)) { error("Invalid argument %s for --max-errors: should be an integer >=0\n",options.vars()->optarg); return USAGE_ERROR_CODE; } break; } case 11: { if (1!=sscanf(options.vars()->optarg,"%u%c",&config.max_SP_INSERT,&foo)) { error("Invalid argument %s for --max-insert: should be an integer >=0\n",options.vars()->optarg); return USAGE_ERROR_CODE; } break; } case 12: { if (1!=sscanf(options.vars()->optarg,"%u%c",&config.max_SP_SUPPR,&foo)) { error("Invalid argument %s for --max-suppr: should be an integer >=0\n",options.vars()->optarg); return USAGE_ERROR_CODE; } break; } case 13: { if (1!=sscanf(options.vars()->optarg,"%u%c",&config.max_SP_CHANGE,&foo)) { error("Invalid argument %s for --max-change: should be an integer >=0\n",options.vars()->optarg); return USAGE_ERROR_CODE; } break; } case 14: { if (1!=sscanf(options.vars()->optarg,"%u%c",&config.max_SP_SWAP,&foo)) { error("Invalid argument %s for --max-swap: should be an integer >=0\n",options.vars()->optarg); return USAGE_ERROR_CODE; } break; } case 20: { int* scores=config.score; if (N_SPSubOp!=sscanf(options.vars()->optarg,"%d,%d,%d,%d,%d,%d,%d,%d,%d%c", scores,scores+1,scores+2,scores+3,scores+4,scores+5, scores+6,scores+7,scores+8,&foo)) { error("Invalid argument %s for option --scores. 
See --help-scores\n",options.vars()->optarg); return USAGE_ERROR_CODE; } break; } case 21: { usage_scores(); return SUCCESS_RETURN_CODE; } case 22: { if (3!=sscanf(options.vars()->optarg,"%u,%u,%u%c", &config.min_length1,&config.min_length2,&config.min_length3,&foo)) { error("Invalid argument %s for option --min-lengths\n",options.vars()->optarg); return USAGE_ERROR_CODE; } break; } case 23: { if (!strcmp(options.vars()->optarg,"yes")) { config.allow_uppercase_initial=1; } else if (!strcmp(options.vars()->optarg,"no")) { config.allow_uppercase_initial=0; } else { error("Invalid argument %s for option --upper-initial\n",options.vars()->optarg); return USAGE_ERROR_CODE; } break; } case 'k': if (options.vars()->optarg[0]=='\0') { error("Empty input_encoding argument\n"); return USAGE_ERROR_CODE; } decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg); break; case 'q': if (options.vars()->optarg[0]=='\0') { error("Empty output_encoding argument\n"); return USAGE_ERROR_CODE; } decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg); break; case 'V': only_verify_arguments = true; break; case 'h': usage(); return SUCCESS_RETURN_CODE; case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) : error("Missing argument for option --%s\n",lopts_SpellCheck[index].name); return USAGE_ERROR_CODE; case '?': index==-1 ? 
error("Invalid option -%c\n",options.vars()->optopt) : error("Invalid option --%s\n",options.vars()->optarg); return USAGE_ERROR_CODE; } index=-1; } if (options.vars()->optind==argc) { error("Invalid arguments: rerun with --help\n"); return USAGE_ERROR_CODE; } if (mode==0) { error("You must use either --snt or --file\n"); return USAGE_ERROR_CODE; } if (only_verify_arguments) { // freeing all allocated memory return SUCCESS_RETURN_CODE; } config.n_dics=argc-options.vars()->optind; config.dics=(Dictionary**)malloc(config.n_dics*sizeof(Dictionary*)); if (config.dics==NULL) { alloc_error("main_SpellCheck"); return ALLOC_ERROR_CODE; } for (int i=0;i<config.n_dics;i++) { config.dics[i]=new_Dictionary(&vec,argv[i+options.vars()->optind]); if (config.dics[i]==NULL) { error("Cannot load dictionary %s\n",argv[i+options.vars()->optind]); } } config.out=U_STDOUT; config.n_input_lines=0; config.n_output_lines=0; if (mode=='s') { /* When working with a .snt, we actually want to work on its err file */ get_snt_path(snt,txt); strcat(txt,"err"); /* the output must be dlf, and we note the number of lines in the existing * dlf file, if any */ get_snt_path(snt,output); strcat(output,"dlf.n"); U_FILE* f=u_fopen(&vec,output,U_READ); if (f!=NULL) { u_fscanf(f,"%d",&(config.n_output_lines)); u_fclose(f); } get_snt_path(snt,output); strcat(output,"dlf"); output_set=1; /* and we force the values for -I and -O */ config.input_op='U'; output_op='A'; } else { /* If mode=='f', we don't have anything to do since we already * defined the default output to stdout */ } if (output_set) { if (output_op=='O') { config.out=u_fopen(&vec,output,U_WRITE); } else { config.out=u_fopen(&vec,output,U_APPEND); } if (config.out==NULL) { error("Cannot open output file %s\n",output); for (int i=0;i<config.n_dics;i++) { free_Dictionary(config.dics[i]); } free(config.dics); return DEFAULT_ERROR_CODE; } } config.modified_input=NULL; char modified_input[FILENAME_MAX]=""; if (config.input_op!='D') { 
strcpy(modified_input,txt); strcat(modified_input,".tmp"); config.modified_input=u_fopen(&vec,modified_input,U_WRITE); if (config.modified_input==NULL) { error("Cannot open tmp file %s\n",modified_input); if (config.out!=U_STDOUT) { u_fclose(config.out); } for (int i=0;i<config.n_dics;i++) { free_Dictionary(config.dics[i]); } free(config.dics); return DEFAULT_ERROR_CODE; } } config.in=u_fopen(&vec,txt,U_READ); if (config.in==NULL) { error("Cannot open file %s\n",txt); u_fclose(config.modified_input); if (config.out!=U_STDOUT) { u_fclose(config.out); } for (int i=0;i<config.n_dics;i++) { free_Dictionary(config.dics[i]); } free(config.dics); return DEFAULT_ERROR_CODE; } /* We perform spellchecking */ spellcheck(&config); /* And we clean */ u_fclose(config.in); if (config.modified_input!=NULL) { /* If we used a tmp file because the input file has to be modified, * it's now time to actually modify it */ u_fclose(config.modified_input); af_remove(txt); af_rename(modified_input,txt); } if (config.out!=U_STDOUT) { u_fclose(config.out); } for (int i=0;i<config.n_dics;i++) { free_Dictionary(config.dics[i]); } free(config.dics); /* Finally, we update the dlf.n and err.n files if mode=='s' */ if (mode=='s') { get_snt_path(snt,output); strcat(output,"err.n"); U_FILE* f=u_fopen(&vec,output,U_WRITE); if (f!=NULL) { u_fprintf(f,"%d",config.n_input_lines); u_fclose(f); } if (config.input_op!='D') { get_snt_path(snt,output); strcat(output,"dlf.n"); U_FILE* fw=u_fopen(&vec,output,U_WRITE); if (fw!=NULL) { u_fprintf(fw,"%d",config.n_output_lines); u_fclose(fw); } } } return SUCCESS_RETURN_CODE; }
char* create_labeled_files_and_directory(const char *text, int next_transducer_label,int must_create_directory,int must_copy_file) { char path[FILENAME_MAX]; get_path(text, path); char canonical_text_name[FILENAME_MAX]; remove_path_and_extension(text, canonical_text_name); char extension[FILENAME_MAX]; get_extension(text, extension); char working_directory[FILENAME_MAX]; sprintf(working_directory, "%s%s%s%c", path, canonical_text_name, CASSYS_DIRECTORY_EXTENSION, PATH_SEPARATOR_CHAR); // copy the text label i- to i char old_labeled_text_name[FILENAME_MAX]; sprintf(old_labeled_text_name, "%s%s_%d%s", working_directory, canonical_text_name, next_transducer_label - 1, extension); char new_labeled_text_name[FILENAME_MAX]; sprintf(new_labeled_text_name, "%s%s_%d%s", working_directory, canonical_text_name, next_transducer_label, extension); char new_labeled_snt_directory[FILENAME_MAX]; get_snt_path(new_labeled_text_name, new_labeled_snt_directory); if (must_create_directory != 0) { make_directory(new_labeled_snt_directory); } if (must_copy_file != 0) { copy_file(new_labeled_text_name, old_labeled_text_name); // create snt directory labeled i char old_labeled_snt_directory[FILENAME_MAX]; get_snt_path(old_labeled_text_name, old_labeled_snt_directory); // copy dictionary files in the new snt directory struct snt_files *old_snt_ = new_snt_files(old_labeled_text_name); struct snt_files *new_snt_ = new_snt_files(new_labeled_text_name); if (fexists(old_snt_->dlc)) { copy_file(new_snt_->dlc, old_snt_->dlc); } if (fexists(old_snt_-> dlf)) { copy_file(new_snt_->dlf, old_snt_->dlf); } if (fexists(old_snt_-> err)) { copy_file(new_snt_->err, old_snt_->err); } if (fexists(old_snt_->dlc_n)) { copy_file(new_snt_->dlc_n, old_snt_->dlc_n); } if (fexists(old_snt_->dlf_n)) { copy_file(new_snt_->dlf_n, old_snt_->dlf_n); } if (fexists(old_snt_-> err_n)) { copy_file(new_snt_->err_n, old_snt_->err_n); } if (fexists(old_snt_->stat_dic_n)) { copy_file(new_snt_->stat_dic_n, old_snt_->stat_dic_n); 
} free_snt_files(old_snt_); free_snt_files(new_snt_); } char *labeled_text_name; labeled_text_name = (char*)malloc(sizeof(char)*(strlen(new_labeled_text_name)+1)); if(labeled_text_name == NULL){ perror("malloc\n"); fprintf(stderr,"Impossible to allocate memory\n"); exit(1); } strcpy(labeled_text_name, new_labeled_text_name); return labeled_text_name; }
int main_Tokenize(int argc,char* const argv[]) { if (argc==1) { usage(); return 0; } char alphabet[FILENAME_MAX]=""; char token_file[FILENAME_MAX]=""; Encoding encoding_output = DEFAULT_ENCODING_OUTPUT; int bom_output = DEFAULT_BOM_OUTPUT; int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT; int val,index=-1; int mode=NORMAL; struct OptVars* vars=new_OptVars(); while (EOF!=(val=getopt_long_TS(argc,argv,optstring_Tokenize,lopts_Tokenize,&index,vars))) { switch(val) { case 'a': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty alphabet file name\n"); } strcpy(alphabet,vars->optarg); break; case 'c': mode=CHAR_BY_CHAR; break; case 'w': mode=NORMAL; break; case 't': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty token file name\n"); } strcpy(token_file,vars->optarg); break; case 'k': if (vars->optarg[0]=='\0') { fatal_error("Empty input_encoding argument\n"); } decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg); break; case 'q': if (vars->optarg[0]=='\0') { fatal_error("Empty output_encoding argument\n"); } decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg); break; case 'h': usage(); return 0; case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt); else fatal_error("Missing argument for option --%s\n",lopts_Tokenize[index].name); case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt); else fatal_error("Invalid option --%s\n",vars->optarg); break; } index=-1; } if (vars->optind!=argc-1) { fatal_error("Invalid arguments: rerun with --help\n"); } U_FILE* text; U_FILE* out; U_FILE* output; U_FILE* enter; char tokens_txt[FILENAME_MAX]; char text_cod[FILENAME_MAX]; char enter_pos[FILENAME_MAX]; Alphabet* alph=NULL; get_snt_path(argv[vars->optind],text_cod); strcat(text_cod,"text.cod"); get_snt_path(argv[vars->optind],tokens_txt); strcat(tokens_txt,"tokens.txt"); 
get_snt_path(argv[vars->optind],enter_pos); strcat(enter_pos,"enter.pos"); text=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ); if (text==NULL) { fatal_error("Cannot open text file %s\n",argv[vars->optind]); } if (alphabet[0]!='\0') { alph=load_alphabet(alphabet); if (alph==NULL) { error("Cannot load alphabet file %s\n",alphabet); u_fclose(text); return 1; } } out=u_fopen(BINARY,text_cod,U_WRITE); if (out==NULL) { error("Cannot create file %s\n",text_cod); u_fclose(text); if (alph!=NULL) { free_alphabet(alph); } return 1; } enter=u_fopen(BINARY,enter_pos,U_WRITE); if (enter==NULL) { error("Cannot create file %s\n",enter_pos); u_fclose(text); u_fclose(out); if (alph!=NULL) { free_alphabet(alph); } return 1; } vector_ptr* tokens=new_vector_ptr(4096); vector_int* n_occur=new_vector_int(4096); vector_int* n_enter_pos=new_vector_int(4096); struct hash_table* hashtable=new_hash_table((HASH_FUNCTION)hash_unichar,(EQUAL_FUNCTION)u_equal, (FREE_FUNCTION)free,NULL,(KEYCOPY_FUNCTION)keycopy); if (token_file[0]!='\0') { load_token_file(token_file,mask_encoding_compatibility_input,tokens,hashtable,n_occur); } output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE); if (output==NULL) { error("Cannot create file %s\n",tokens_txt); u_fclose(text); u_fclose(out); u_fclose(enter); if (alph!=NULL) { free_alphabet(alph); } free_hash_table(hashtable); free_vector_ptr(tokens,free); free_vector_int(n_occur); free_vector_int(n_enter_pos); return 1; } u_fprintf(output,"0000000000\n"); int SENTENCES=0; int TOKENS_TOTAL=0; int WORDS_TOTAL=0; int DIGITS_TOTAL=0; u_printf("Tokenizing text...\n"); if (mode==NORMAL) { normal_tokenization(text,out,output,alph,tokens,hashtable,n_occur,n_enter_pos, &SENTENCES,&TOKENS_TOTAL,&WORDS_TOTAL,&DIGITS_TOTAL); } else { char_by_char_tokenization(text,out,output,alph,tokens,hashtable,n_occur,n_enter_pos, &SENTENCES,&TOKENS_TOTAL,&WORDS_TOTAL,&DIGITS_TOTAL); } 
u_printf("\nDone.\n"); save_new_line_positions(enter,n_enter_pos); u_fclose(enter); u_fclose(text); u_fclose(out); u_fclose(output); write_number_of_tokens(tokens_txt,encoding_output,bom_output,tokens->nbelems); // we compute some statistics get_snt_path(argv[vars->optind],tokens_txt); strcat(tokens_txt,"stats.n"); output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE); if (output==NULL) { error("Cannot write %s\n",tokens_txt); } else { compute_statistics(output,tokens,alph,SENTENCES,TOKENS_TOTAL,WORDS_TOTAL,DIGITS_TOTAL); u_fclose(output); } // we save the tokens by frequence get_snt_path(argv[vars->optind],tokens_txt); strcat(tokens_txt,"tok_by_freq.txt"); output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE); if (output==NULL) { error("Cannot write %s\n",tokens_txt); } else { sort_and_save_by_frequence(output,tokens,n_occur); u_fclose(output); } // we save the tokens by alphabetical order get_snt_path(argv[vars->optind],tokens_txt); strcat(tokens_txt,"tok_by_alph.txt"); output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE); if (output==NULL) { error("Cannot write %s\n",tokens_txt); } else { sort_and_save_by_alph_order(output,tokens,n_occur); u_fclose(output); } free_hash_table(hashtable); free_vector_ptr(tokens,free); free_vector_int(n_occur); free_vector_int(n_enter_pos); if (alph!=NULL) { free_alphabet(alph); } free_OptVars(vars); return 0; }