/**
 * Reads the start and end positions of each token stored in the file
 * produced by Tokenize's --output_offsets option.
 *
 * Every line of the file is parsed as three integers. The first one is
 * ignored; the second and the third are appended, in that order, to the
 * returned vector, so the result holds two values per line read.
 *
 * @param vec  encoding configuration passed through to u_fopen
 * @param name path of the offset file to read
 * @return the vector of offsets (cast to vector_uima_offset*), or NULL if
 *         the file cannot be opened or if a line does not contain three
 *         integers
 */
vector_uima_offset* load_uima_offsets(const VersatileEncodingConfig* vec,const char* name) {
    U_FILE* f=u_fopen(vec,name,U_READ);
    if (f==NULL) {
        return NULL;
    }
    vector_int* v=new_vector_int();
    Ustring* line=new_Ustring();
    int a,b,c;
    int malformed=0;
    while (EOF!=readline(line,f)) {
        /* NOTE(review): assumes u_sscanf returns the number of converted
         * fields, as sscanf does. Without this check a short line would
         * push uninitialized (or stale) values into the vector. */
        if (3!=u_sscanf(line->str,"%d%d%d",&a,&b,&c)) {
            malformed=1;
            break;
        }
        vector_int_add(v,b);
        vector_int_add(v,c);
    }
    free_Ustring(line);
    u_fclose(f);
    if (malformed) {
        free_vector_int(v);
        return NULL;
    }
    return (vector_uima_offset*)v;
}
/**
 * Loads snt offsets from the given binary file.
 *
 * The file is read as a raw array of ints. Its size must be a multiple of
 * 3*sizeof(int); otherwise the file is rejected.
 *
 * @param name path of the binary offset file
 * @return a vector holding every int of the file (empty for an empty
 *         file), or NULL if the file cannot be opened, has an invalid
 *         size, or cannot be read completely
 */
vector_int* load_snt_offsets(const char* name) {
    U_FILE* f=u_fopen(BINARY,name,U_READ);
    if (f==NULL) return NULL;
    long size=get_file_size(f);
    /* A negative size is rejected explicitly instead of relying on the
     * signed-to-unsigned conversion performed by the modulo below. */
    if (size<0 || size%(3*sizeof(int))!=0) {
        u_fclose(f);
        return NULL;
    }
    int expected=(int)(size/sizeof(int));
    vector_int* v=new_vector_int(expected);
    int n_read=0;
    if (size!=0) {
        n_read=(int)fread(v->tab,sizeof(int),size/sizeof(int),f);
    }
    /* The file is closed on every path, including the empty-file one. */
    u_fclose(f);
    if (n_read!=expected) {
        free_vector_int(v);
        return NULL;
    }
    if (size!=0) {
        /* NOTE(review): relies on v->size being the capacity requested
         * from new_vector_int -- confirm against its implementation. */
        v->nbelems=v->size;
    }
    return v;
}
int main_Tokenize(int argc,char* const argv[]) { if (argc==1) { usage(); return 0; } char alphabet[FILENAME_MAX]=""; char token_file[FILENAME_MAX]=""; Encoding encoding_output = DEFAULT_ENCODING_OUTPUT; int bom_output = DEFAULT_BOM_OUTPUT; int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT; int val,index=-1; int mode=NORMAL; struct OptVars* vars=new_OptVars(); while (EOF!=(val=getopt_long_TS(argc,argv,optstring_Tokenize,lopts_Tokenize,&index,vars))) { switch(val) { case 'a': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty alphabet file name\n"); } strcpy(alphabet,vars->optarg); break; case 'c': mode=CHAR_BY_CHAR; break; case 'w': mode=NORMAL; break; case 't': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty token file name\n"); } strcpy(token_file,vars->optarg); break; case 'k': if (vars->optarg[0]=='\0') { fatal_error("Empty input_encoding argument\n"); } decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg); break; case 'q': if (vars->optarg[0]=='\0') { fatal_error("Empty output_encoding argument\n"); } decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg); break; case 'h': usage(); return 0; case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt); else fatal_error("Missing argument for option --%s\n",lopts_Tokenize[index].name); case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt); else fatal_error("Invalid option --%s\n",vars->optarg); break; } index=-1; } if (vars->optind!=argc-1) { fatal_error("Invalid arguments: rerun with --help\n"); } U_FILE* text; U_FILE* out; U_FILE* output; U_FILE* enter; char tokens_txt[FILENAME_MAX]; char text_cod[FILENAME_MAX]; char enter_pos[FILENAME_MAX]; Alphabet* alph=NULL; get_snt_path(argv[vars->optind],text_cod); strcat(text_cod,"text.cod"); get_snt_path(argv[vars->optind],tokens_txt); strcat(tokens_txt,"tokens.txt"); 
get_snt_path(argv[vars->optind],enter_pos); strcat(enter_pos,"enter.pos"); text=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ); if (text==NULL) { fatal_error("Cannot open text file %s\n",argv[vars->optind]); } if (alphabet[0]!='\0') { alph=load_alphabet(alphabet); if (alph==NULL) { error("Cannot load alphabet file %s\n",alphabet); u_fclose(text); return 1; } } out=u_fopen(BINARY,text_cod,U_WRITE); if (out==NULL) { error("Cannot create file %s\n",text_cod); u_fclose(text); if (alph!=NULL) { free_alphabet(alph); } return 1; } enter=u_fopen(BINARY,enter_pos,U_WRITE); if (enter==NULL) { error("Cannot create file %s\n",enter_pos); u_fclose(text); u_fclose(out); if (alph!=NULL) { free_alphabet(alph); } return 1; } vector_ptr* tokens=new_vector_ptr(4096); vector_int* n_occur=new_vector_int(4096); vector_int* n_enter_pos=new_vector_int(4096); struct hash_table* hashtable=new_hash_table((HASH_FUNCTION)hash_unichar,(EQUAL_FUNCTION)u_equal, (FREE_FUNCTION)free,NULL,(KEYCOPY_FUNCTION)keycopy); if (token_file[0]!='\0') { load_token_file(token_file,mask_encoding_compatibility_input,tokens,hashtable,n_occur); } output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE); if (output==NULL) { error("Cannot create file %s\n",tokens_txt); u_fclose(text); u_fclose(out); u_fclose(enter); if (alph!=NULL) { free_alphabet(alph); } free_hash_table(hashtable); free_vector_ptr(tokens,free); free_vector_int(n_occur); free_vector_int(n_enter_pos); return 1; } u_fprintf(output,"0000000000\n"); int SENTENCES=0; int TOKENS_TOTAL=0; int WORDS_TOTAL=0; int DIGITS_TOTAL=0; u_printf("Tokenizing text...\n"); if (mode==NORMAL) { normal_tokenization(text,out,output,alph,tokens,hashtable,n_occur,n_enter_pos, &SENTENCES,&TOKENS_TOTAL,&WORDS_TOTAL,&DIGITS_TOTAL); } else { char_by_char_tokenization(text,out,output,alph,tokens,hashtable,n_occur,n_enter_pos, &SENTENCES,&TOKENS_TOTAL,&WORDS_TOTAL,&DIGITS_TOTAL); } 
u_printf("\nDone.\n"); save_new_line_positions(enter,n_enter_pos); u_fclose(enter); u_fclose(text); u_fclose(out); u_fclose(output); write_number_of_tokens(tokens_txt,encoding_output,bom_output,tokens->nbelems); // we compute some statistics get_snt_path(argv[vars->optind],tokens_txt); strcat(tokens_txt,"stats.n"); output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE); if (output==NULL) { error("Cannot write %s\n",tokens_txt); } else { compute_statistics(output,tokens,alph,SENTENCES,TOKENS_TOTAL,WORDS_TOTAL,DIGITS_TOTAL); u_fclose(output); } // we save the tokens by frequence get_snt_path(argv[vars->optind],tokens_txt); strcat(tokens_txt,"tok_by_freq.txt"); output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE); if (output==NULL) { error("Cannot write %s\n",tokens_txt); } else { sort_and_save_by_frequence(output,tokens,n_occur); u_fclose(output); } // we save the tokens by alphabetical order get_snt_path(argv[vars->optind],tokens_txt); strcat(tokens_txt,"tok_by_alph.txt"); output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE); if (output==NULL) { error("Cannot write %s\n",tokens_txt); } else { sort_and_save_by_alph_order(output,tokens,n_occur); u_fclose(output); } free_hash_table(hashtable); free_vector_ptr(tokens,free); free_vector_int(n_occur); free_vector_int(n_enter_pos); if (alph!=NULL) { free_alphabet(alph); } free_OptVars(vars); return 0; }