void main(int argc, char *argv[]) { int i,j; char *vocab_filename; FILE *tempfile; char tempfiles_directory[1000]; int vocab_size; FILE *vocab_file; int verbosity; int buffer_size; int position_in_buffer; int number_of_tempfiles; int max_files; int fof_size; unsigned short *buffer; unsigned short *placeholder; unsigned short *temp_ngram; int temp_count; char temp_word[500]; char temp_word2[500]; char *temp_file_root; char *temp_file_ext; char *host_name; int proc_id; struct utsname uname_info; flag write_ascii; /* Vocab hash table things */ struct hash_table vocabulary; unsigned long hash_size; unsigned long M; tempfile = NULL; /* Just to prevent compilation warnings. */ report_version(&argc,argv); verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY); /* Process command line */ if (pc_flagarg( &argc, argv,"-help") || argc==1) { fprintf(stderr,"text2idngram - Convert a text stream to an id n-gram stream.\n"); fprintf(stderr,"Usage : text2idngram -vocab .vocab \n"); fprintf(stderr," [ -buffer 100 ]\n"); fprintf(stderr," [ -hash %d ]\n",DEFAULT_HASH_SIZE); fprintf(stderr," [ -temp %s ]\n",DEFAULT_TEMP); fprintf(stderr," [ -files %d ]\n",DEFAULT_MAX_FILES); fprintf(stderr," [ -gzip | -compress ]\n"); fprintf(stderr," [ -verbosity %d ]\n", DEFAULT_VERBOSITY); fprintf(stderr," [ -n 3 ]\n"); fprintf(stderr," [ -write_ascii ]\n"); fprintf(stderr," [ -fof_size 10 ]\n"); exit(1); } pc_message(verbosity,2,"text2idngram\n"); n = pc_intarg( &argc, argv, "-n",DEFAULT_N); placeholder = (unsigned short *) rr_malloc(sizeof(unsigned short)*n); temp_ngram = (unsigned short *) rr_malloc(sizeof(unsigned short)*n); hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE); buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM); write_ascii = pc_flagarg(&argc,argv,"-write_ascii"); fof_size = pc_intarg(&argc,argv,"-fof_size",10); max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES); vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" )); if 
(!strcmp("",vocab_filename)) { quit(-1,"text2idngram : Error : Must specify a vocabulary file.\n"); } strcpy(tempfiles_directory,pc_stringarg( &argc, argv, "-temp", DEFAULT_TEMP)); if (pc_flagarg(&argc,argv,"-compress")) { temp_file_ext = salloc(".Z"); } else { if (pc_flagarg(&argc,argv,"-gzip")) { temp_file_ext = salloc(".gz"); } else { temp_file_ext = salloc(""); } } uname(&uname_info); host_name = salloc(uname_info.nodename); proc_id = getpid(); sprintf(temp_word,"%s%s.%d.",TEMP_FILE_ROOT,host_name,proc_id); temp_file_root = salloc(temp_word); pc_report_unk_args(&argc,argv,verbosity); /* If the last charactor in the directory name isn't a / then add one. */ if (tempfiles_directory[strlen(tempfiles_directory)-1] != '/') { strcat(tempfiles_directory,"/"); } pc_message(verbosity,2,"Vocab : %s\n",vocab_filename); pc_message(verbosity,2,"N-gram buffer size : %d\n",buffer_size); pc_message(verbosity,2,"Hash table size : %d\n",hash_size); pc_message(verbosity,2,"Temp directory : %s\n",tempfiles_directory); pc_message(verbosity,2,"Max open files : %d\n",max_files); pc_message(verbosity,2,"FOF size : %d\n",fof_size); pc_message(verbosity,2,"n : %d\n",n); buffer_size *= (1000000/(sizeof(unsigned short)*n)); /* Allocate memory for hash table */ fprintf(stderr,"Initialising hash table...\n"); M = nearest_prime(hash_size); new_hashtable(&vocabulary,M); /* Read in the vocabulary */ vocab_size = 0; vocab_file = rr_iopen(vocab_filename); pc_message(verbosity,2,"Reading vocabulary...\n"); while (fgets (temp_word, sizeof(temp_word),vocab_file)) { if (strncmp(temp_word,"##",2)==0) continue; sscanf (temp_word, "%s ",temp_word2); /* Check for repeated words in the vocabulary */ if (index2(&vocabulary,temp_word2) != 0) { fprintf(stderr,"======================================================\n"); fprintf(stderr,"WARNING: word %s is repeated in the vocabulary.\n",temp_word); fprintf(stderr,"=======================================================\n"); } if (strncmp(temp_word,"#",1)==0) 
{ fprintf(stderr,"\n\n===========================================================\n"); fprintf(stderr,":\nWARNING: line assumed NOT a comment:\n"); fprintf(stderr, ">>> %s <<<\n",temp_word); fprintf(stderr, " '%s' will be included in the vocabulary.\n",temp_word2); fprintf(stderr, " (comments must start with '##')\n"); fprintf(stderr,"===========================================================\n\n"); } vocab_size++; add_to_hashtable(&vocabulary,hash(temp_word2,M),temp_word2,vocab_size); } if (vocab_size > MAX_VOCAB_SIZE) { quit(-1,"text2idngram : Error : Vocabulary size exceeds maximum.\n"); } pc_message(verbosity,2,"Allocating memory for the n-gram buffer...\n"); buffer=(unsigned short*) rr_malloc(n*(buffer_size+1)*sizeof(unsigned short)); number_of_tempfiles = 0; /* Read text into buffer */ /* Read in the first ngram */ position_in_buffer = 0; for (i=0;i<=n-1;i++) { get_word(stdin,temp_word); add_to_buffer(index2(&vocabulary,temp_word),0,i,buffer); } while (!rr_feof(stdin)) { /* Fill up the buffer */ pc_message(verbosity,2,"Reading text into the n-gram buffer...\n"); pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n"); while ((position_in_buffer<buffer_size) && (!rr_feof(stdin))) { position_in_buffer++; if (position_in_buffer % 20000 == 0) { if (position_in_buffer % 1000000 == 0) { pc_message(verbosity,2,".\n"); } else { pc_message(verbosity,2,"."); } } for (i=1;i<=n-1;i++) { add_to_buffer(buffer_contents(position_in_buffer-1,i,buffer), position_in_buffer,i-1,buffer); } if (get_word(stdin,temp_word) == 1) { add_to_buffer(index2(&vocabulary,temp_word),position_in_buffer, n-1,buffer); } } for (i=0;i<=n-1;i++) { placeholder[i] = buffer_contents(position_in_buffer,i,buffer); } /* Sort buffer */ pc_message(verbosity,2,"\nSorting n-grams...\n"); qsort((void*) buffer,(size_t) position_in_buffer, n*sizeof(unsigned short),compare_ngrams); /* Output the buffer to temporary BINARY file */ number_of_tempfiles++; 
sprintf(temp_word,"%s%s%hu%s",tempfiles_directory,temp_file_root, number_of_tempfiles,temp_file_ext); pc_message(verbosity,2,"Writing sorted n-grams to temporary file %s\n", temp_word); tempfile = rr_oopen(temp_word); for (i=0;i<=n-1;i++) { temp_ngram[i] = buffer_contents(0,i,buffer); if (temp_ngram[i] > MAX_VOCAB_SIZE) { quit(-1,"Invalid trigram in buffer.\nAborting"); } } temp_count = 1; for (i=1;i<=position_in_buffer;i++) { if (!compare_ngrams(temp_ngram,&buffer[i*n])) { temp_count++; } else { for (j=0;j<=n-1;j++) { rr_fwrite(&temp_ngram[j],sizeof(unsigned short),1, tempfile,"temporary n-gram ids"); temp_ngram[j] = buffer_contents(i,j,buffer); } rr_fwrite(&temp_count,sizeof(int),1,tempfile, "temporary n-gram counts"); temp_count = 1; } } rr_oclose(tempfile); for (i=0;i<=n-1;i++) { add_to_buffer(placeholder[i],0,i,buffer); } position_in_buffer = 0; } /* Merge the temporary files, and output the result to standard output */ pc_message(verbosity,2,"Merging temporary files...\n"); merge_tempfiles(1, number_of_tempfiles, temp_file_root, temp_file_ext, max_files, tempfiles_directory, stdout, write_ascii, fof_size); pc_message(verbosity,0,"text2idngram : Done.\n"); exit(0); }
/*
 * Compute the perplexity of a language model (binary `ng` or ARPA-format
 * `arpa_ng`, selected by `arpa_lm`) over the word stream in
 * text_stream_filename.  Optionally writes per-word probabilities,
 * annotation, and OOV lists to the given files; the various backoff_* flags
 * force inclusive/exclusive back-off from OOVs and context cues.  Prints
 * perplexity/entropy and per-order hit statistics to stdout.
 *
 * Fixes vs. previous revision:
 *   - restored `&current_id` in both sih_lookup calls (the source had been
 *     corrupted to the mojibake "¤t_id" and did not compile);
 *   - the "returned value from sih_lookup" error now reports current_id
 *     instead of the stale context[i];
 *   - prev_words is freed on the normal exit path (was leaked).
 */
void compute_perplexity(ng_t *ng,
			arpa_lm_t *arpa_ng,
			char *text_stream_filename,
			char *probs_stream_filename,
			char *annotation_filename,
			char *oov_filename,
			char *fb_list_filename,
			flag backoff_from_unk_inc,
			flag backoff_from_unk_exc,
			flag backoff_from_ccs_inc,
			flag backoff_from_ccs_exc,
			flag arpa_lm,
			flag include_unks,
			double log_base) {

  fb_info *fb_list;
  FILE *temp_fp;
  FILE *text_stream_fp;
  FILE *probs_stream_fp;
  FILE *annotation_fp;
  FILE *oov_fp;
  flag out_probs;
  flag annotate;
  flag out_oovs;
  flag found_unk_wrongly;   /* OOV seen under a closed-vocabulary model */
  double prob;
  double sum_log_prob;
  int total_words;
  int excluded_unks;
  int excluded_ccs;
  char current_word[1000];  /* Hope that's big enough */
  char **prev_words;
  vocab_sz_t current_id;
  id__t short_current_id;
  id__t *context;
  int context_length;
  int i;
  int bo_case;
  int actual_context_length;
  int *ngrams_hit;
  int n;

  /* Initialise file pointers to prevent warnings from the compiler. */
  probs_stream_fp = NULL;
  annotation_fp = NULL;
  oov_fp = NULL;

  short_current_id = 0;
  found_unk_wrongly = 0;
  annotate = 0;
  bo_case = 0;

  if (arpa_lm) {
    n = arpa_ng->n;
    fb_list = gen_fb_list(arpa_ng->vocab_ht,
			  (int) arpa_ng->vocab_size,
			  arpa_ng->vocab,
			  arpa_ng->context_cue,
			  backoff_from_unk_inc,
			  backoff_from_unk_exc,
			  backoff_from_ccs_inc,
			  backoff_from_ccs_exc,
			  fb_list_filename);
  }else {
    n = ng->n;
    fb_list = gen_fb_list(ng->vocab_ht,
			  (int) ng->vocab_size,
			  ng->vocab,
			  ng->context_cue,
			  backoff_from_unk_inc,
			  backoff_from_unk_exc,
			  backoff_from_ccs_inc,
			  backoff_from_ccs_exc,
			  fb_list_filename);
  }

  ngrams_hit = (int *) rr_calloc(n,sizeof(int));
  prev_words = (char **) rr_malloc(sizeof(char *)*n);
  for (i=0;i<=n-1;i++)
    prev_words[i] = (char *) rr_malloc(sizeof(char)*1000);

  /* Check that text_stream_filename and probs_stream_filename (if
     specified) are valid.  Note that the checks employed by the standard
     rr_fopen tools are not suitable here, since we don't want the program
     to terminate if the paths are not found. */
  if (!strcmp(text_stream_filename,"")) {
    printf("Error : Must specify a text file. Use the -text switch.\n");
    return;
  }
  if (!rr_fexists(text_stream_filename) && strcmp(text_stream_filename,"-")) {
    printf("Error : Can't open file %s for reading.\n",text_stream_filename);
    return;
  }

  out_probs = strcmp(probs_stream_filename,"");
  annotate  = strcmp(annotation_filename,"");
  out_oovs  = strcmp(oov_filename,"");

  printf("Computing perplexity of the language model with respect\n");
  printf("   to the text %s\n",text_stream_filename);
  if (out_probs)
    printf("Probability stream will be written to file %s\n",
	   probs_stream_filename);
  if (annotate)
    printf("Annotation will be written to file %s\n",
	   annotation_filename);
  if (out_oovs)
    printf("Out of vocabulary words will be written to file %s\n",
	   oov_filename);
  if (backoff_from_unk_inc)
    printf("Will force inclusive back-off from OOVs.\n");
  if (backoff_from_unk_exc)
    printf("Will force exclusive back-off from OOVs.\n");
  if (backoff_from_ccs_inc)
    printf("Will force inclusive back-off from context cues.\n");
  if (backoff_from_ccs_exc)
    printf("Will force exclusive back-off from context cues.\n");
  if (strcmp(fb_list_filename,""))
    printf("Will force back-off according to the contents of %s\n",
	   fb_list_filename);
  if (include_unks)
    printf("Perplexity calculation will include OOVs.\n");

  /* Check for existance of files, as rr functions will quit, which isn't
     what we want */
  if (out_probs && strcmp(probs_stream_filename,"-")) {
    if ((temp_fp = fopen(probs_stream_filename,"w")) == NULL) {
      printf("Error : Can't open file %s for writing.\n",probs_stream_filename);
      return;
    }
    fclose(temp_fp);
  }
  if (annotate && strcmp(annotation_filename,"-")) {
    if ((temp_fp = fopen(annotation_filename,"w")) == NULL) {
      printf("Error : Can't open file %s for writing.\n",annotation_filename);
      return;
    }
    fclose(temp_fp);
  }
  if (out_oovs && strcmp(oov_filename,"-")) {
    if ((temp_fp = fopen(oov_filename,"w")) == NULL) {
      printf("Error : Can't open file %s for writing.\n",oov_filename);
      return;
    }
    fclose(temp_fp);
  }

  text_stream_fp = rr_iopen(text_stream_filename);
  if (out_probs)
    probs_stream_fp = rr_oopen(probs_stream_filename);
  if (annotate)
    annotation_fp = rr_oopen(annotation_filename);
  if (out_oovs)
    oov_fp = rr_oopen(oov_filename);

  context = (id__t *) rr_malloc(sizeof(id__t)*(n-1));

  sum_log_prob = 0.0;
  total_words = 0;
  excluded_unks = 0;
  excluded_ccs = 0;

  while (!rr_feof(text_stream_fp)) {

    /* Maintain a sliding window of the previous word strings (used only
       for human-readable annotation/warnings). */
    if (total_words > 0) {
      if (total_words < n)
	strcpy(prev_words[total_words-1],current_word);
      else {
	for (i=0;i<=n-3;i++)
	  strcpy(prev_words[i],prev_words[i+1]);
	if (n>1)
	  strcpy(prev_words[n-2],current_word);
      }
    }

    if (total_words < (n-1))
      context_length = total_words;
    else
      context_length = n-1;

    /* Fill context with right stuff */
    if (total_words > (n-1)) {
      for (i=0;i<=context_length-2;i++)
	context[i] = context[i+1];
    }
    if (context_length != 0)
      context[context_length-1] = short_current_id;

    if (fscanf(text_stream_fp,"%s",current_word) != 1) {
      if (!rr_feof(text_stream_fp)) {
	printf("Error reading text file.\n");
	return;
      }
    }

    if (!rr_feof(text_stream_fp)) {

      if (arpa_lm) {
	/* was mojibake "¤t_id" — restored to &current_id */
	sih_lookup(arpa_ng->vocab_ht,current_word,&current_id);
	if (arpa_ng->vocab_type == CLOSED_VOCAB && current_id == 0) {
	  found_unk_wrongly = 1;
	  printf("Error : %s is not in the vocabulary, and this is a closed \nvocabulary model.\n",current_word);
	}
	if (current_id > arpa_ng->vocab_size)
	  /* was context[i] — i is stale here; report the offending id */
	  quit(-1,"Error : returned value from sih_lookup (%d) is too high.\n",(int) current_id);
	else
	  short_current_id = current_id;
      }else {
	/* was mojibake "¤t_id" — restored to &current_id */
	sih_lookup(ng->vocab_ht,current_word,&current_id);
	if (ng->vocab_type == CLOSED_VOCAB && current_id == 0) {
	  found_unk_wrongly = 1;
	  printf("Error : %s is not in the vocabulary, and this is a closed \nvocabulary model.\n",current_word);
	}
	if (current_id > ng->vocab_size)
	  quit(-1,"Error : returned value from sih_lookup (%d) is too high.\n",(int) current_id);
	else
	  short_current_id = current_id;
      }

      if (!found_unk_wrongly) {

	if (current_id == 0 && out_oovs)
	  fprintf(oov_fp,"%s\n",current_word);

	if ((arpa_lm && (!(arpa_ng->context_cue[current_id]))) ||
	    ((!arpa_lm) && (!(ng->context_cue[current_id])))) {

	  if (include_unks || current_id != 0) {

	    prob = calc_prob_of(short_current_id,
				context,
				context_length,
				ng,
				arpa_ng,
				fb_list,
				&bo_case,
				&actual_context_length,
				arpa_lm);

	    if (prob<= 0.0 || prob > 1.0) {
	      fprintf(stderr,"Warning : ");
	      if (short_current_id == 0)
		fprintf(stderr,"P( <UNK> | ");
	      else
		fprintf(stderr,"P( %s | ",current_word);
	      for (i=0;i<=actual_context_length-1;i++) {
		if (context[i+context_length-actual_context_length] == 0)
		  fprintf(stderr,"<UNK> ");
		else
		  fprintf(stderr,"%s ",prev_words[i]);
	      }
	      fprintf(stderr,") = %g logprob = %g \n ",prob,log(prob)/log(log_base));
	      fprintf(stderr,"bo_case == 0x%dx, actual_context_length == %d\n",
		      bo_case, actual_context_length);
	    }

	    if (annotate) {
	      if (short_current_id == 0)
		fprintf(annotation_fp,"P( <UNK> | ");
	      else
		fprintf(annotation_fp,"P( %s | ",current_word);
	      for (i=0;i<=actual_context_length-1;i++) {
		if (context[i+context_length-actual_context_length] == 0)
		  fprintf(annotation_fp,"<UNK> ");
		else {
		  if (arpa_lm)
		    fprintf(annotation_fp,"%s ",arpa_ng->vocab[context[i+context_length-actual_context_length]]);
		  else
		    fprintf(annotation_fp,"%s ",ng->vocab[context[i+context_length-actual_context_length]]);
		}
	      }
	      fprintf(annotation_fp,") = %g logprob = %f bo_case = ",prob,log(prob)/log(log_base));
	      decode_bo_case(bo_case,actual_context_length,annotation_fp);
	    }

	    /* Calculate level to which we backed off */
	    for (i=actual_context_length-1;i>=0;i--) {
	      int four_raise_i = 1<<(2*i);  /* PWP */

	      /*
	       * PWP: This was "if ((bo_case / (int) pow(3,i)) == 0)"
	       * but was getting a divide-by-zero error on an Alpha
	       * (it isn't clear to me why it should ever have done so)
	       * Anyway, it is much faster to do in base-4.
	       */
	      if ((bo_case == 0) || ((bo_case / four_raise_i) == 0)) {
		ngrams_hit[i+1]++;
		i = -2;   /* flag: hit recorded (loop decrement makes i == -3) */
	      }else
		bo_case -= ((bo_case / four_raise_i) * four_raise_i);
	    }

	    /* i == -3 means a higher-order hit was recorded above;
	       otherwise this word was only a unigram hit. */
	    if (i != -3)
	      ngrams_hit[0]++;

	    if (out_probs)
	      fprintf(probs_stream_fp,"%g\n",prob);

	    sum_log_prob += log10(prob);
	  }

	  if (current_id == 0 && !include_unks)
	    excluded_unks++;
	}
	else {
	  if (((!arpa_lm) && ng->context_cue[current_id]) ||
	      (arpa_lm && arpa_ng->context_cue[current_id]))
	    excluded_ccs++;
	}
	total_words++;
      }
    }
  }

  if (!found_unk_wrongly) {
    /* pow(x,y) = e**(y ln(x)) */
    printf("Perplexity = %.2f, Entropy = %.2f bits\n",
	   exp(-sum_log_prob/(total_words-excluded_ccs-excluded_unks) *
	       log(10.0)),
	   (-sum_log_prob/(total_words-excluded_ccs-excluded_unks) *
	    log(10.0) / log(2.0)));
    printf("Computation based on %d words.\n",
	   total_words-excluded_ccs-excluded_unks);
    for(i=n;i>=1;i--) {
      printf("Number of %d-grams hit = %d  (%.2f%%)\n",i,ngrams_hit[i-1],
	     (float) 100*ngrams_hit[i-1]/(total_words-excluded_ccs-excluded_unks) );
    }
    printf("%d OOVs (%.2f%%) and %d context cues were removed from the calculation.\n",
	   excluded_unks,
	   (float) 100*excluded_unks/(total_words-excluded_ccs),excluded_ccs);
  }

  rr_iclose(text_stream_fp);
  if (out_probs)
    rr_oclose(probs_stream_fp);
  if (annotate)
    rr_oclose(annotation_fp);
  if (out_oovs)
    rr_oclose(oov_fp);

  free (fb_list);
  free (context);
  free (ngrams_hit);
  /* prev_words was previously leaked */
  for (i=0;i<=n-1;i++)
    free (prev_words[i]);
  free (prev_words);
}
int main (int argc, char **argv) { int n; int verbosity; int max_files; int max_words; int max_chars; int current_word; int current_char; int start_char; /* start boundary (possibly > than 0) */ int no_of_spaces; int pos_in_string; int i; char *current_string; char current_temp_filename[500]; int current_file_number; FILE *temp_file; flag text_buffer_full; char *text_buffer; char **pointers; char current_ngram[500]; int current_count; int counter; char temp_directory[1000]; char *temp_file_ext; flag words_set; flag chars_set; /* Process command line */ verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY); pc_message(verbosity,2,"text2wngram\n"); report_version(&argc,argv); if (pc_flagarg( &argc, argv,"-help")) { help_message(); exit(1); } n = pc_intarg(&argc, argv,"-n",DEFAULT_N); /* max_words = pc_intarg(&argc, argv,"-words",STD_MEM*1000000/11); max_chars = pc_intarg(&argc, argv,"-chars",STD_MEM*7000000/11); */ max_words = pc_intarg(&argc, argv,"-words",-1); max_chars = pc_intarg(&argc, argv,"-chars",-1); if (max_words == -1) { words_set = 0; max_words = STD_MEM*1000000/11; }else words_set = 1; if (max_chars == -1) { chars_set = 0; max_chars = STD_MEM*7000000/11; }else chars_set = 1; max_files = pc_intarg(&argc, argv,"-files",DEFAULT_MAX_FILES); if (pc_flagarg(&argc,argv,"-compress")) temp_file_ext = salloc(".Z"); else { if (pc_flagarg(&argc,argv,"-gzip")) temp_file_ext = salloc(".gz"); else temp_file_ext = salloc(""); } strcpy(temp_directory, "cmuclmtk-XXXXXX"); if (mkdtemp(temp_directory) == NULL) { quit(-1, "Failed to create temporary folder: %s\n", strerror(errno)); } pc_report_unk_args(&argc,argv,verbosity); if (words_set && !chars_set) max_chars = max_words * 7; if (!words_set && chars_set) max_words = max_chars / 7; /* If the last charactor in the directory name isn't a / then add one. 
*/ pc_message(verbosity,2,"n = %d\n",n); pc_message(verbosity,2,"Number of words in buffer = %d\n",max_words); pc_message(verbosity,2,"Number of chars in buffer = %d\n",max_chars); pc_message(verbosity,2,"Max number of files open at once = %d\n",max_files); pc_message(verbosity,2,"Temporary directory = %s\n",temp_directory); /* Allocate memory for the buffers */ text_buffer = (char *) rr_malloc(sizeof(char)*max_chars); pc_message(verbosity,2,"Allocated %d bytes to text buffer.\n", sizeof(char)*max_chars); pointers = (char **) rr_malloc(sizeof(char *)*max_words); pc_message(verbosity,2,"Allocated %d bytes to pointer array.\n", sizeof(char *)*max_words); current_file_number = 0; current_word = 1; start_char = 0; current_char = 0; counter = 0; pointers[0] = text_buffer; while (!feof(stdin)) { current_file_number++; /* Read text into buffer */ pc_message(verbosity,2,"Reading text into buffer...\n"); pc_message(verbosity,2,"Reading text into the n-gram buffer...\n"); pc_message(verbosity,2,"20,000 words processed for each \".\", 1,000,000 for each line.\n"); pointers[0] = text_buffer; while ((!rr_feof(stdin)) && (current_word < max_words) && (current_char < max_chars)) { text_buffer[current_char] = getchar(); if (text_buffer[current_char] == '\n' || text_buffer[current_char] == '\t' ) { text_buffer[current_char] = ' '; } if (text_buffer[current_char] == ' ') { if (current_char > start_char) { if (text_buffer[current_char-1] == ' ') { current_word--; current_char--; } pointers[current_word] = &(text_buffer[current_char+1]); current_word++; counter++; if (counter % 20000 == 0) { if (counter % 1000000 == 0) pc_message(verbosity,2,"\n"); else pc_message(verbosity,2,"."); } } } if (text_buffer[current_char] != ' ' || current_char > start_char) current_char++; } text_buffer[current_char]='\0'; if (current_word == max_words || rr_feof(stdin)) { for (i=current_char+1;i<=max_chars-1;i++) text_buffer[i] = ' '; text_buffer_full = 0; }else text_buffer_full = 1; /* Sort buffer */ 
pc_message(verbosity,2,"\nSorting pointer array...\n"); qsort((void *) pointers,(size_t) current_word-n,sizeof(char *),cmp_strings); /* Write out temporary file */ sprintf(current_temp_filename,"%s/%hu%s",temp_directory, current_file_number, temp_file_ext); pc_message(verbosity,2,"Writing out temporary file %s...\n",current_temp_filename); temp_file = rr_oopen(current_temp_filename); text_buffer[current_char] = ' '; current_count = 0; strcpy(current_ngram,""); for (i = 0; i <= current_word-n; i++) { current_string = pointers[i]; /* Find the nth space */ no_of_spaces = 0; pos_in_string = 0; while (no_of_spaces < n) { if (current_string[pos_in_string] == ' ') no_of_spaces++; pos_in_string++; } if (!strncmp(current_string,current_ngram,pos_in_string)) current_count++; else { if (strcmp(current_ngram,"")) if (fprintf(temp_file,"%s %d\n",current_ngram,current_count) < 0) quit(-1,"Error writing to temporary file %s\n",current_temp_filename); current_count = 1; strncpy(current_ngram,current_string,pos_in_string); current_ngram[pos_in_string] = '\0'; } } rr_oclose(temp_file); /* Move the last n-1 words to the beginning of the buffer, and set correct current_word and current_char things */ strcpy(text_buffer,pointers[current_word-n]); pointers[0]=text_buffer; /* Find the (n-1)th space */ no_of_spaces=0; pos_in_string=0; if (!text_buffer_full){ while (no_of_spaces<(n-1)) { if (pointers[0][pos_in_string]==' ') { no_of_spaces++; pointers[no_of_spaces] = &pointers[0][pos_in_string+1]; } pos_in_string++; } }else { while (no_of_spaces<n) { if (pointers[0][pos_in_string]==' ') { no_of_spaces++; pointers[no_of_spaces] = &pointers[0][pos_in_string+1]; } pos_in_string++; } pos_in_string--; } current_char = pos_in_string; current_word = n; /* mark boundary beyond which counting pass cannot backup */ start_char = current_char; } /* Merge temporary files */ pc_message(verbosity,2,"Merging temporary files...\n"); merge_tempfiles(1, current_file_number, temp_directory, temp_file_ext, 
max_files, stdout, n, verbosity); rmdir(temp_directory); pc_message(verbosity,0,"text2wngram : Done.\n"); return 0; }
/*
 * Merge sorted temporary id-n-gram files [start_file .. end_file] into
 * `outfile`, summing counts of identical n-grams across files.  If more
 * than max_files inputs are given, recursively merges in two batches via
 * an intermediate temporary file.  Output is ASCII when write_ascii is
 * set, binary otherwise.  When fof_size > 0 and n > 1, also accumulates
 * frequency-of-frequency statistics and prints them to stderr.
 *
 * Fixes vs. previous revision:
 *   - restored `&current_ngram...` / `&current_ngram_count...` in the
 *     rr_fread calls (the source had been corrupted to the mojibake
 *     "¤t_..." and did not compile);
 *   - both temp-file-name sprintf calls used %hu with int arguments
 *     (end_file+1, i+start_file) — UB — and now use %d.
 */
void merge_idngramfiles (int start_file,
			 int end_file,
			 char *temp_file_root,
			 char *temp_file_ext,
			 int max_files,
			 FILE *outfile,
			 flag write_ascii,
			 int fof_size,
			 int n_order) {

  FILE *new_temp_file;
  char temp_string[1000];
  char *new_temp_filename;

  FILE **temp_file;
  char **temp_filename;
  wordid_t **current_ngram;
  wordid_t *smallest_ngram;
  wordid_t *previous_ngram;

  int *current_ngram_count;
  flag *finished;
  flag all_finished;
  int temp_count;
  int i,j;
  flag first_ngram;
  fof_t **fof_array;
  ngram_sz_t *num_kgrams;
  int *ng_count;
  int pos_of_novelty;

  n = n_order;
  pos_of_novelty = n; /* Simply for warning-free compilation */
  num_kgrams = (ngram_sz_t *) rr_calloc(n-1,sizeof(ngram_sz_t));
  ng_count = (int *) rr_calloc(n-1,sizeof(int));
  first_ngram = 1;

  previous_ngram = (wordid_t *) rr_calloc(n,sizeof(wordid_t));
  temp_file = (FILE **) rr_malloc(sizeof(FILE *) * (end_file-start_file+1));
  temp_filename = (char **) rr_malloc(sizeof(char *) *
				      (end_file-start_file+1));

  /* should change to 2d array*/
  current_ngram = (wordid_t **) rr_malloc(sizeof(wordid_t *) *
					  (end_file-start_file+1));
  for (i=0;i<=end_file-start_file;i++)
    current_ngram[i] = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);

  current_ngram_count = (int *) rr_malloc(sizeof(int)*(end_file-start_file+1));

  finished = (flag *) rr_malloc(sizeof(flag)*(end_file-start_file+1));
  smallest_ngram = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);

  /* should change to 2d array*/
  fof_array = (fof_t **) rr_malloc(sizeof(fof_t *)*(n-1));
  for (i=0;i<=n-2;i++)
    fof_array[i] = (fof_t *) rr_calloc(fof_size+1,sizeof(fof_t));

  if (end_file-start_file+1 > max_files) {
    /* Too many inputs: merge the first max_files into a new temp file
       numbered end_file+1, then merge the rest plus that file. */
    /* was %hu with an int argument — UB; end_file+1 is an int */
    sprintf(temp_string,"%s/%d%s",temp_file_root,
	    end_file+1,temp_file_ext);
    new_temp_filename = salloc(temp_string);
    new_temp_file = rr_oopen(new_temp_filename);
    merge_tempfiles(start_file,start_file+max_files-1,
		    temp_file_root,temp_file_ext,max_files,
		    new_temp_file,write_ascii,0);
    merge_tempfiles(start_file+max_files,end_file+1,
		    temp_file_root,temp_file_ext,max_files,
		    outfile,write_ascii,0);
  }else {

    /* Open all the temp files for reading */
    for (i=0;i<=end_file-start_file;i++) {
      /* was %hu with an int argument — UB; i+start_file is an int */
      sprintf(temp_string,"%s/%d%s",temp_file_root,
	      i+start_file,temp_file_ext);
      temp_filename[i] = salloc(temp_string);
      temp_file[i] = rr_iopen(temp_filename[i]);
    }

    /* Now go through the files simultaneously, and write out the appropriate
       ngram counts to the output file. */
    for (i=end_file-start_file;i>=0;i--) {
      finished[i] = 0;
      if (!rr_feof(temp_file[i])) {
	for (j=0;j<=n-1;j++) {
	  /* was mojibake "¤t_ngram" — restored to &current_ngram */
	  rr_fread((char*) &current_ngram[i][j], sizeof(wordid_t),1,
		   temp_file[i],"temporary n-gram ids",0);
	}
	rr_fread((char*) &current_ngram_count[i], sizeof(int),1,
		 temp_file[i],"temporary n-gram counts",0);
      }
    }

    all_finished = 0;

    while (!all_finished) {

      /* Find the smallest current ngram */
      for (i=0;i<=n-1;i++)
	smallest_ngram[i] = MAX_WORDID;

      for (i=0;i<=end_file-start_file;i++) {
	if (!finished[i]) {
	  if (compare_ngrams3(smallest_ngram,current_ngram[i]) < 0) {
	    for (j=0;j<n;j++)
	      smallest_ngram[j] = current_ngram[i][j];
	  }
	}
      }

#if MAX_VOCAB_SIZE < 65535
      /* This check is well-meaning but completely useless since
	 smallest_ngram[i] by definition cannot contain any value greater
	 than MAX_VOCAB_SIZE (dhuggins@cs, 2006-03) */
      for (i=0;i<=n-1;i++) {
	if (smallest_ngram[i] > MAX_VOCAB_SIZE) {
	  quit(-1,"Error : Temporary files corrupted, invalid n-gram found.\n");
	}
      }
#endif

      /* For each of the files that are currently holding this ngram, add
	 its count to the temporary count, and read in a new ngram from the
	 files. */
      temp_count = 0;
      for (i=0;i<=end_file-start_file;i++) {
	if (!finished[i]) {
	  if (compare_ngrams3(smallest_ngram,current_ngram[i]) == 0) {
	    temp_count = temp_count + current_ngram_count[i];
	    if (!rr_feof(temp_file[i])) {
	      for (j=0;j<=n-1;j++) {
		/* was mojibake "¤t_ngram" — restored to &current_ngram */
		rr_fread((char*) &current_ngram[i][j],sizeof(wordid_t),1,
			 temp_file[i],"temporary n-gram ids",0);
	      }
	      rr_fread((char*) &current_ngram_count[i],sizeof(int),1,
		       temp_file[i],"temporary n-gram count",0);
	    }else {
	      finished[i] = 1;
	      all_finished = 1;
	      for (j=0;j<=end_file-start_file;j++) {
		if (!finished[j])
		  all_finished = 0;
	      }
	    }
	  }
	}
      }

      if (write_ascii) {
	for (i=0;i<=n-1;i++) {
	  if (fprintf(outfile,"%d ",smallest_ngram[i]) < 0) {
	    quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");
	  }
	}
	if (fprintf(outfile,"%d\n",temp_count) < 0)
	  quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");
      }else {
	for (i=0;i<=n-1;i++) {
	  rr_fwrite((char*)&smallest_ngram[i],sizeof(wordid_t),1,
		    outfile,"n-gram ids");
	}
	rr_fwrite((char*)&temp_count,sizeof(count_t),1,outfile,"n-gram counts");
      }

      if (fof_size > 0 && n>1) { /* Add stuff to fof arrays */

	/* Code from idngram2stats */
	pos_of_novelty = n;
	for (i=0;i<=n-1;i++) {
	  if (smallest_ngram[i] > previous_ngram[i]) {
	    pos_of_novelty = i;
	    i=n;
	  }
	}

	/* Add new N-gram */
	num_kgrams[n-2]++;
	if (temp_count <= fof_size)
	  fof_array[n-2][temp_count]++;

	if (!first_ngram) {
	  for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
	    num_kgrams[i-1]++;
	    if (ng_count[i-1] <= fof_size) {
	      fof_array[i-1][ng_count[i-1]]++;
	    }
	    ng_count[i-1] = temp_count;
	  }
	}else {
	  for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
	    ng_count[i-1] = temp_count;
	  }
	  first_ngram = 0;
	}

	for (i=0;i<=pos_of_novelty-2;i++)
	  ng_count[i] += temp_count;
	for (i=0;i<=n-1;i++)
	  previous_ngram[i]=smallest_ngram[i];
      }
    }

    /* Close and delete the now fully consumed input temp files. */
    for (i=0;i<=end_file-start_file;i++) {
      fclose(temp_file[i]);
      remove(temp_filename[i]);
    }
  }

  if (fof_size > 0 && n>1) { /* Display fof arrays */

    /* Process last ngram */
    for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
      num_kgrams[i-1]++;
      if (ng_count[i-1] <= fof_size)
	fof_array[i-1][ng_count[i-1]]++;
      ng_count[i-1] = temp_count;
    }

    for (i=0;i<=pos_of_novelty-2;i++)
      ng_count[i] += temp_count;

    display_fof_array(num_kgrams,fof_array,fof_size,stderr, n);
  }
}
void merge_tempfiles (int start_file, int end_file, char *temp_file_root, char *temp_file_ext, int max_files, FILE *outfile, int n, int verbosity) { FILE *new_temp_file; char *new_temp_filename; FILE **temp_file; char **temp_filename; char **current_ngram; char smallest_ngram[1000]; int *current_ngram_count; flag *finished; flag all_finished; int temp_count; char temp_word[500]; int i,j; pc_message(verbosity,2,"Merging temp files %d through %d...\n", start_file, end_file); /* * If we try to do more than max_files, then merge into groups, * then merge groups recursively. */ if (end_file-start_file+1 > max_files) { int new_start_file, new_end_file; int n_file_groups = 1 + (end_file-start_file)/max_files; fprintf(stderr, "%d files to do, in %d groups\n", end_file-start_file, n_file_groups); new_temp_filename = (char *) rr_malloc(300*sizeof(char)); /* * These n_file_groups sets of files will be done in groups of * max_files batches each, as temp files numbered * end_file+1 ... end_file+n_file_groups, * and then these will be merged into the final result. */ for (i = 0; i < n_file_groups; i++) { /* do files i*max_files through min((i+1)*max_files-1,end_file); */ new_start_file = start_file + (i*max_files); new_end_file = start_file + ((i+1)*max_files) - 1; if (new_end_file > end_file) new_end_file = end_file; sprintf(new_temp_filename, "%s/%hu%s", temp_file_root, end_file+i+1, temp_file_ext); new_temp_file = rr_oopen(new_temp_filename); merge_tempfiles(new_start_file, new_end_file, temp_file_root, temp_file_ext, max_files, new_temp_file, n, verbosity); rr_iclose(new_temp_file); } merge_tempfiles(end_file+1, end_file+n_file_groups, temp_file_root, temp_file_ext, max_files, outfile, n, verbosity); return; } /* * We know we are now doing <= max_files. 
*/ temp_file = (FILE **) rr_malloc((end_file+1)*sizeof(FILE *)); temp_filename = (char **) rr_malloc((end_file+1)*sizeof(char *)); for (i=start_file;i<=end_file;i++) { temp_filename[i] = (char *) rr_malloc(300*sizeof(char)); } current_ngram = (char **) rr_malloc((end_file+1)*sizeof(char *)); for (i=start_file;i<=end_file;i++) { current_ngram[i] = (char *) rr_malloc(1000*sizeof(char)); } current_ngram_count = (int *) rr_malloc((end_file+1)*sizeof(int)); finished = (flag *) rr_malloc(sizeof(flag)*(end_file+1)); /* Open all the temp files for reading */ for (i=start_file;i<=end_file;i++) { sprintf(temp_filename[i],"%s/%hu%s", temp_file_root,i,temp_file_ext); temp_file[i] = rr_iopen(temp_filename[i]); } /* Now go through the files simultaneously, and write out the appropriate ngram counts to the output file. */ for (i=start_file;i<=end_file;i++) { finished[i] = 0; if (!rr_feof(temp_file[i])) { for (j=0;j<=n-1;j++) { if (fscanf(temp_file[i],"%s",temp_word) != 1) { if (!rr_feof(temp_file[i])) quit(-1,"Error reading temp file %s\n",temp_filename[i]); }else { if (j==0) strcpy(current_ngram[i],temp_word); else { strcat(current_ngram[i]," "); strcat(current_ngram[i],temp_word); } } } if (fscanf(temp_file[i],"%d",¤t_ngram_count[i]) != 1) { if (!rr_feof(temp_file[i])) quit(-1,"Error reading temp file %s\n",temp_filename[i]); } } } all_finished = 0; while (!all_finished) { /* Find the smallest current ngram */ strcpy(smallest_ngram,""); for (i=start_file;i<=end_file;i++) { if (!finished[i]) { if (strcmp(smallest_ngram,current_ngram[i]) > 0 || (smallest_ngram[0] == '\0')) strcpy(smallest_ngram,current_ngram[i]); } } /* For each of the files that are currently holding this ngram, add its count to the temporary count, and read in a new ngram from the files. 
*/ temp_count = 0; for (i=start_file;i<=end_file;i++) { if (!finished[i]) { if (!strcmp(smallest_ngram,current_ngram[i])) { temp_count += current_ngram_count[i]; if (!rr_feof(temp_file[i])) { for (j=0;j<=n-1;j++) { if (fscanf(temp_file[i],"%s",temp_word) != 1) { if (!rr_feof(temp_file[i])) { quit(-1,"Error reading temp file %s\n",temp_filename[i]); } }else { if (j==0) strcpy(current_ngram[i],temp_word); else { strcat(current_ngram[i]," "); strcat(current_ngram[i],temp_word); } } } if (fscanf(temp_file[i],"%d",¤t_ngram_count[i]) != 1) { if (!rr_feof(temp_file[i])) { quit(-1,"Error reading temp file count %s\n", temp_filename[i]); } } } /* * PWP: Note that the fscanf may have changed the state of * temp_file[i], so we re-ask the question rather than just * doing an "else". */ if (rr_feof(temp_file[i])) { finished[i] = 1; all_finished = 1; for (j=start_file;j<=end_file;j++) { if (!finished[j]) { all_finished = 0; } } } } } } /* * PWP: We cannot conditionalize this on (!all_finished) because * if we do we may have lost the very last count. (Consider the * case when several files have ran out of data, but the last * couple have the last count in them.) */ if (fprintf(outfile,"%s %d\n",smallest_ngram,temp_count) < 0) { quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n"); } } for (i=start_file;i<=end_file;i++) { rr_iclose(temp_file[i]); remove(temp_filename[i]); } free(temp_file); for (i=start_file;i<=end_file;i++) { free(temp_filename[i]); } free(temp_filename); for (i=start_file;i<=end_file;i++) { free(current_ngram[i]); } free(current_ngram); free(current_ngram_count); free(finished); }
/*
 * read_txt2ngram_buffer:
 *   Read a word stream from infp, map each word to its id via the
 *   vocabulary hash, collect ids into an n-gram ring buffer of up to
 *   buffer_size n-grams, sort each full buffer, and write the sorted,
 *   run-length-counted id n-grams to numbered temporary BINARY files
 *   named "<temp_file_root>/<k><temp_file_ext>".
 *
 *   Side effect: sets the global `ng` to n (compare_ngrams() reads it).
 *   The temp_file parameter is used purely as scratch for the current
 *   output stream.
 *
 * @return number_of_tempfiles written.
 */
int read_txt2ngram_buffer(FILE* infp,
			  struct idngram_hash_table *vocabulary,
			  int32 verbosity,
			  wordid_t *buffer,
			  int buffer_size,
			  unsigned int n,
			  char* temp_file_root,
			  char* temp_file_ext,
			  FILE* temp_file
			  )
{
  /* Read text into buffer */
  char temp_word[MAX_WORD_LENGTH];
  int position_in_buffer;
  int number_of_tempfiles;
  unsigned int i,j;
  wordid_t *placeholder;
  wordid_t *temp_ngram;
  int temp_count;

  temp_ngram  = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);
  placeholder = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);

  ng=n;   /* global used by the compare_ngrams() qsort callback */

  position_in_buffer = 0;
  number_of_tempfiles = 0;

  //tk: looks like things may croak if the corpus has less than n words
  //not that such a corpus would be useful anyway
  for (i=0;i<=n-1;i++) {
    get_word(infp,temp_word);
    add_to_buffer(index2(vocabulary,temp_word),0,i,buffer);
  }

  while (!rr_feof(infp)) {

    /* Fill up the buffer */
    pc_message(verbosity,2,"Reading text into the n-gram buffer...\n");
    pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");

    while ((position_in_buffer<buffer_size) && (!rr_feof(infp))) {
      position_in_buffer++;
      show_idngram_nlines(position_in_buffer,verbosity);
      /* Shift the previous n-gram left by one word... */
      for (i=1;i<=n-1;i++)
	add_to_buffer(buffer_contents(position_in_buffer-1,i,buffer),
		      position_in_buffer,i-1,buffer);
      /* ...and append the next word's id, if one could be read. */
      if (get_word(infp,temp_word) == 1) {
	add_to_buffer(index2(vocabulary,temp_word),position_in_buffer,
		      n-1,buffer);
      }
    }

    /* Save the (unsorted) boundary n-gram so the next buffer can be
       seeded with it after qsort scrambles positions 0..pos-1. */
    for (i=0;i<=n-1;i++)
      placeholder[i] = buffer_contents(position_in_buffer,i,buffer);

    /* Sort buffer */
    pc_message(verbosity,2,"\nSorting n-grams...\n");
    qsort((void*) buffer,(size_t) position_in_buffer,
	  n*sizeof(wordid_t),compare_ngrams);

    /* Output the buffer to temporary BINARY file */
    number_of_tempfiles++;

    /* BUGFIX: "%d" (not "%hu") — number_of_tempfiles is an int; "%hu"
       with an int argument is a printf format/argument mismatch. The
       generated name is unchanged for all positive values. */
    sprintf(temp_word,"%s/%d%s",temp_file_root,
	    number_of_tempfiles,temp_file_ext);
    pc_message(verbosity,2,"Writing sorted n-grams to temporary file %s\n",
	       temp_word);
    temp_file = rr_oopen(temp_word);

    for (i=0;i<=n-1;i++) {
      temp_ngram[i] = buffer_contents(0,i,buffer);
#if MAX_VOCAB_SIZE < 65535
      /* This check is well-meaning but completely useless since
	 buffer_contents() can never return something greater
	 than MAX_VOCAB_SIZE (dhuggins@cs, 2006-03) */
      if (temp_ngram[i] > MAX_VOCAB_SIZE)
	quit(-1,"Invalid trigram in buffer.\nAborting");
#endif
    }
    temp_count = 1;

    /* Run-length encode the sorted buffer: each time the n-gram
       changes, flush the previous n-gram and its count. */
    for (i=1;i<=position_in_buffer;i++) {
      /* BUGFIX: compare_ngrams was called twice with identical
	 arguments (the first result, `tmpval`, was dead); call once. */
      if (!compare_ngrams(temp_ngram,&buffer[i*n]))
	temp_count++;
      else {
	for (j=0;j<=n-1;j++) {
	  rr_fwrite((char*) &temp_ngram[j],sizeof(wordid_t),1,
		    temp_file,"temporary n-gram ids");
	  temp_ngram[j] = buffer_contents(i,j,buffer);
	}
	rr_fwrite((char*)&temp_count,sizeof(int),1,temp_file,
		  "temporary n-gram counts");
	temp_count = 1;
      }
    }
    rr_oclose(temp_file);

    /* Re-seed position 0 with the saved boundary n-gram. */
    for (i=0;i<=n-1;i++)
      add_to_buffer(placeholder[i],0,i,buffer);
    position_in_buffer = 0;
  }

  /* BUGFIX: these two scratch arrays were leaked. */
  free(temp_ngram);
  free(placeholder);

  return number_of_tempfiles;
}
ng_t * init_ng( int* argc, char **argv, int verbosity ) { int i; ng_t* ng; ng=(ng_t*) rr_calloc(1,sizeof(ng_t)); ng->disc_meth=NULL; /* -n */ ng->n = pc_intarg(argc, argv,"-n",DEFAULT_N); if (ng->n<1) quit(-1,"Error: Value of n must be larger than zero.\n"); /* -cutoffs */ ng->cutoffs = (cutoff_t *) pc_shortarrayarg(argc, argv, "-cutoffs",ng->n-1,ng->n-1); if (ng->cutoffs == NULL) ng->cutoffs = (cutoff_t *) rr_calloc((ng->n-1)+1,sizeof(cutoff_t)); /* +1 for the sake of the correction in writing in write_lms.c */ for (i=0;i<=ng->n-3;i++) { if (ng->cutoffs[i+1] < ng->cutoffs[i]) { quit(-1,"Error - cutoffs for (n+1)-gram must be greater than or equal to those for \nn-gram. You have %d-gram cutoff = %d > %d-gram cutoff = %d.\n",i+2,ng->cutoffs[i],i+3,ng->cutoffs[i+1]); } } /* -min_unicount */ ng->min_unicount = pc_intarg(argc, argv, "-min_unicount",0); /* -idngram */ ng->id_gram_filename = salloc(pc_stringarg(argc, argv,"-idngram","")); if (!strcmp(ng->id_gram_filename,"")) quit(-1,"Error: id ngram file not specified. Use the -idngram flag.\n"); /* -arpa & -bin */ ng->arpa_filename = salloc(pc_stringarg(argc, argv,"-arpa","")); ng->bin_filename = salloc(pc_stringarg(argc, argv,"-binary","")); ng->write_arpa = strcmp("",ng->arpa_filename); ng->write_bin = strcmp("",ng->bin_filename); if (!(ng->write_arpa || ng->write_bin)) quit(-1,"Error : must specify either an arpa, or a binary output file.\n"); ng->count_table_size = DEFAULT_COUNT_TABLE_SIZE; /* -vocab */ ng->vocab_filename = salloc(pc_stringarg(argc,argv,"-vocab","")); if (!strcmp("",ng->vocab_filename)) quit(-1,"Error : vocabulary file not specified. 
Use the -vocab option.\n"); /* -context */ ng->context_cues_filename = salloc(pc_stringarg(argc,argv,"-context","")); ng->context_set = strcmp("", ng->context_cues_filename); /* -vocab_type */ ng->vocab_type = pc_intarg(argc,argv,"-vocab_type",1); /* -oov_fraction */ ng->oov_fraction = pc_doublearg(argc, argv,"-oov_fraction",-1.0); if (ng->oov_fraction == -1.0) ng->oov_fraction=DEFAULT_OOV_FRACTION; else { if (ng->vocab_type != 2) pc_message(verbosity,1,"Warning : OOV fraction specified, but will not be used, since vocab type is not 2.\n"); } if (ng->vocab_type == 0) ng->first_id = 1; else ng->first_id = 0; /* Allow both "min_alpha" etc and "min_bo_weight" etc as valid syntax. The "bo_weight" form is preferred, but the "alpha" form is maintained as it was present in version 2.00 */ ng->min_alpha = pc_doublearg(argc,argv,"-min_alpha",DEFAULT_MIN_ALPHA); ng->max_alpha = pc_doublearg(argc,argv,"-max_alpha",DEFAULT_MAX_ALPHA); ng->out_of_range_alphas = pc_intarg(argc,argv,"-out_of_range_alphas", DEFAULT_OUT_OF_RANGE_ALPHAS); ng->min_alpha = pc_doublearg(argc,argv,"-min_bo_weight",ng->min_alpha); ng->max_alpha = pc_doublearg(argc,argv,"-max_bo_weight",ng->max_alpha); ng->out_of_range_alphas = pc_intarg(argc,argv,"-out_of_range_bo_weights", ng->out_of_range_alphas); if (ng->min_alpha >= ng->max_alpha) quit(-1,"Error : Minimum of alpha range must be less than the maximum.\n"); init_ng_disc_method(ng, pc_flagarg(argc, argv,"-linear"), pc_flagarg(argc,argv,"-absolute"), pc_flagarg(argc,argv,"-witten_bell"), pc_flagarg(argc,argv,"-good_turing")); ng->disc_range = (unsigned short *) pc_shortarrayarg(argc, argv, "-disc_ranges",ng->n,ng->n); ng->disc_range_set = (ng->disc_range != NULL); if (ng->discounting_method == GOOD_TURING) { if (!ng->disc_range_set) { ng->disc_range = (unsigned short *) rr_malloc(sizeof(unsigned short) * ng->n); ng->disc_range[0] = DEFAULT_DISC_RANGE_1; for (i=1;i<=ng->n-1;i++) ng->disc_range[i] = DEFAULT_DISC_RANGE_REST; } ng->fof_size = (fof_sz_t *) 
rr_malloc(sizeof(fof_sz_t) * ng->n); for (i=0;i<=ng->n-1;i++) ng->fof_size[i] = ng->disc_range[i]+1; }else { if (ng->disc_range_set) pc_message(verbosity,2,"Warning : discount ranges specified will be ignored, since they only apply\nto Good Turing discounting.\n"); } ng->four_byte_alphas = !(pc_flagarg(argc, argv, "-two_byte_alphas") || pc_flagarg(argc, argv, "-two_byte_bo_weights")); ng->four_byte_counts = pc_flagarg(argc, argv, "-four_byte_counts"); if(ng->four_byte_counts){ pc_message(verbosity,2,"Using Four byte counts.\n"); } ng->zeroton_fraction = pc_doublearg(argc,argv,"-zeroton_fraction",1.0); /* Attempt to open all the files that we will need for input and output. It is better to do it here than to spend a few hours of CPU processing id-gram counts, only to find that the output path is invalid. */ ng->id_gram_fp = rr_iopen(ng->id_gram_filename); /* Vocab is read by Roni's function which does the file opening for us, so no need to do it here. Don't need to worry about time being lost if file doesn't exist, since vocab is first thing to be read anyway. */ if (ng->context_set) ng->context_cues_fp = rr_iopen(ng->context_cues_filename); if (ng->write_arpa) ng->arpa_fp = rr_oopen(ng->arpa_filename); if (ng->write_bin) ng->bin_fp = rr_oopen(ng->bin_filename); return ng; }