int main(int argc, char *argv[]) { int i,j; char *vocab_filename; FILE *tempfile; char tempfiles_directory[1000]; int vocab_size; FILE *vocab_file; int verbosity; int buffer_size; int position_in_buffer; int number_of_tempfiles; int max_files; int fof_size; unsigned short *buffer; unsigned short *placeholder; unsigned short *temp_ngram; int temp_count; char temp_word[500]; char temp_word2[500]; char *temp_file_root; char *temp_file_ext; char *host_name; int proc_id; struct utsname uname_info; flag write_ascii; /* Vocab hash table things */ struct hash_table vocabulary; unsigned long hash_size; unsigned long M; tempfile = NULL; /* Just to prevent compilation warnings. */ report_version(&argc,argv); verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY); /* Process command line */ if (pc_flagarg( &argc, argv,"-help") || argc==1) { fprintf(stderr,"text2idngram - Convert a text stream to an id n-gram stream.\n"); fprintf(stderr,"Usage : text2idngram -vocab .vocab \n"); fprintf(stderr," [ -buffer 100 ]\n"); fprintf(stderr," [ -hash %d ]\n",DEFAULT_HASH_SIZE); fprintf(stderr," [ -temp %s ]\n",DEFAULT_TEMP); fprintf(stderr," [ -files %d ]\n",DEFAULT_MAX_FILES); fprintf(stderr," [ -gzip | -compress ]\n"); fprintf(stderr," [ -verbosity %d ]\n", DEFAULT_VERBOSITY); fprintf(stderr," [ -n 3 ]\n"); fprintf(stderr," [ -write_ascii ]\n"); fprintf(stderr," [ -fof_size 10 ]\n"); exit(1); } pc_message(verbosity,2,"text2idngram\n"); n = pc_intarg( &argc, argv, "-n",DEFAULT_N); placeholder = (unsigned short *) rr_malloc(sizeof(unsigned short)*n); temp_ngram = (unsigned short *) rr_malloc(sizeof(unsigned short)*n); hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE); buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM); write_ascii = pc_flagarg(&argc,argv,"-write_ascii"); fof_size = pc_intarg(&argc,argv,"-fof_size",10); max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES); vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" )); if (!strcmp("",vocab_filename)) { quit(-1,"text2idngram : Error : Must specify a vocabulary file.\n"); } strcpy(tempfiles_directory,pc_stringarg( &argc, argv, "-temp", DEFAULT_TEMP)); if (pc_flagarg(&argc,argv,"-compress")) { temp_file_ext = salloc(".Z"); } else { if (pc_flagarg(&argc,argv,"-gzip")) { temp_file_ext = salloc(".gz"); } else { temp_file_ext = salloc(""); } } uname(&uname_info); host_name = salloc(uname_info.nodename); proc_id = getpid(); sprintf(temp_word,"%s%s.%d.",TEMP_FILE_ROOT,host_name,proc_id); temp_file_root = salloc(temp_word); pc_report_unk_args(&argc,argv,verbosity); /* If the last character in the directory name isn't a / then add one. 
*/ if (tempfiles_directory[strlen(tempfiles_directory)-1] != '/') { strcat(tempfiles_directory,"/"); } pc_message(verbosity,2,"Vocab : %s\n",vocab_filename); pc_message(verbosity,2,"N-gram buffer size : %d\n",buffer_size); pc_message(verbosity,2,"Hash table size : %d\n",hash_size); pc_message(verbosity,2,"Temp directory : %s\n",tempfiles_directory); pc_message(verbosity,2,"Max open files : %d\n",max_files); pc_message(verbosity,2,"FOF size : %d\n",fof_size); pc_message(verbosity,2,"n : %d\n",n); buffer_size *= (1000000/(sizeof(unsigned short)*n)); /* Allocate memory for hash table */ fprintf(stderr,"Initialising hash table...\n"); M = nearest_prime(hash_size); new_hashtable(&vocabulary,M); /* Read in the vocabulary */ vocab_size = 0; vocab_file = rr_iopen(vocab_filename); pc_message(verbosity,2,"Reading vocabulary...\n"); while (fgets (temp_word, sizeof(temp_word),vocab_file)) { if (strncmp(temp_word,"##",2)==0) continue; sscanf (temp_word, "%s ",temp_word2); /* Check for repeated words in the vocabulary */ if (index2(&vocabulary,temp_word2) != 0) { fprintf(stderr,"======================================================\n"); fprintf(stderr,"WARNING: word %s is repeated in the vocabulary.\n",temp_word); fprintf(stderr,"=======================================================\n"); } if (strncmp(temp_word,"#",1)==0) { fprintf(stderr,"\n\n===========================================================\n"); fprintf(stderr,":\nWARNING: line assumed NOT a comment:\n"); fprintf(stderr, ">>> %s <<<\n",temp_word); fprintf(stderr, " '%s' will be included in the vocabulary.\n",temp_word2); fprintf(stderr, " (comments must start with '##')\n"); fprintf(stderr,"===========================================================\n\n"); } vocab_size++; add_to_hashtable(&vocabulary,hash(temp_word2,M),temp_word2,vocab_size); } if (vocab_size > MAX_VOCAB_SIZE) { quit(-1,"text2idngram : Error : Vocabulary size exceeds maximum.\n"); } pc_message(verbosity,2,"Allocating memory for the n-gram buffer...\n"); buffer=(unsigned short*) rr_malloc(n*(buffer_size+1)*sizeof(unsigned short)); number_of_tempfiles = 0; /* Read text into buffer */ /* Read in the first ngram */ position_in_buffer = 0; for (i=0;i<=n-1;i++) { get_word(stdin,temp_word); add_to_buffer(index2(&vocabulary,temp_word),0,i,buffer); } while (!rr_feof(stdin)) { /* Fill up the buffer */ pc_message(verbosity,2,"Reading text into the n-gram buffer...\n"); pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n"); while ((position_in_buffer<buffer_size) && (!rr_feof(stdin))) { position_in_buffer++; if (position_in_buffer % 20000 == 0) { if (position_in_buffer % 1000000 == 0) { pc_message(verbosity,2,".\n"); } else { pc_message(verbosity,2,"."); } } for (i=1;i<=n-1;i++) { add_to_buffer(buffer_contents(position_in_buffer-1,i,buffer), position_in_buffer,i-1,buffer); } if (get_word(stdin,temp_word) == 1) { add_to_buffer(index2(&vocabulary,temp_word),position_in_buffer, n-1,buffer); } } for (i=0;i<=n-1;i++) { placeholder[i] = buffer_contents(position_in_buffer,i,buffer); } /* Sort buffer */ pc_message(verbosity,2,"\nSorting n-grams...\n"); qsort((void*) buffer,(size_t) position_in_buffer, n*sizeof(unsigned short),compare_ngrams); /* Output the buffer to temporary BINARY file */ number_of_tempfiles++; sprintf(temp_word,"%s%s%hu%s",tempfiles_directory,temp_file_root, number_of_tempfiles,temp_file_ext); pc_message(verbosity,2,"Writing sorted n-grams to temporary file %s\n", temp_word); tempfile = rr_oopen(temp_word); for (i=0;i<=n-1;i++) { 
temp_ngram[i] = buffer_contents(0,i,buffer); if (temp_ngram[i] > MAX_VOCAB_SIZE) { quit(-1,"Invalid n-gram in buffer.\nAborting"); } } temp_count = 1; for (i=1;i<=position_in_buffer;i++) { if (!compare_ngrams(temp_ngram,&buffer[i*n])) { temp_count++; } else { for (j=0;j<=n-1;j++) { rr_fwrite(&temp_ngram[j],sizeof(unsigned short),1, tempfile,"temporary n-gram ids"); temp_ngram[j] = buffer_contents(i,j,buffer); } rr_fwrite(&temp_count,sizeof(int),1,tempfile, "temporary n-gram counts"); temp_count = 1; } } rr_oclose(tempfile); for (i=0;i<=n-1;i++) { add_to_buffer(placeholder[i],0,i,buffer); } position_in_buffer = 0; } /* Merge the temporary files, and output the result to standard output */ pc_message(verbosity,2,"Merging temporary files...\n"); merge_tempfiles(1, number_of_tempfiles, temp_file_root, temp_file_ext, max_files, tempfiles_directory, stdout, write_ascii, fof_size); pc_message(verbosity,0,"text2idngram : Done.\n"); exit(0); }
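/* compute_perplexity: read a word stream from text_stream_filename and, for each word
   that is not a context cue (and, unless include_unks is set, not an OOV), compute
   P(word | context) from either the binary model (ng) or the ARPA model (arpa_ng),
   accumulating log probabilities so that perplexity, entropy and back-off statistics
   can be reported. Optional streams receive per-word probabilities, back-off
   annotations and the OOV words themselves. */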
void compute_perplexity(ng_t *ng, arpa_lm_t *arpa_ng, char *text_stream_filename, char *probs_stream_filename, char *annotation_filename, char *oov_filename, char *fb_list_filename, flag backoff_from_unk_inc, flag backoff_from_unk_exc, flag backoff_from_ccs_inc, flag backoff_from_ccs_exc, flag arpa_lm, flag include_unks, double log_base) { fb_info *fb_list; FILE *temp_fp; FILE *text_stream_fp; FILE *probs_stream_fp; FILE *annotation_fp; FILE *oov_fp; flag out_probs; flag annotate; flag out_oovs; flag found_unk_wrongly; double prob; double sum_log_prob; int total_words; int excluded_unks; int excluded_ccs; char current_word[1000]; /* Hope that's big enough */ char **prev_words; vocab_sz_t current_id; id__t short_current_id; id__t *context; int context_length; int i; int bo_case; int actual_context_length; int *ngrams_hit; int n; /* Initialise file pointers to prevent warnings from the compiler. */ probs_stream_fp = NULL; annotation_fp = NULL; oov_fp = NULL; short_current_id = 0; found_unk_wrongly = 0; annotate = 0; bo_case = 0; if (arpa_lm) { n = arpa_ng->n; fb_list = gen_fb_list(arpa_ng->vocab_ht, (int) arpa_ng->vocab_size, arpa_ng->vocab, arpa_ng->context_cue, backoff_from_unk_inc, backoff_from_unk_exc, backoff_from_ccs_inc, backoff_from_ccs_exc, fb_list_filename); }else { n = ng->n; fb_list = gen_fb_list(ng->vocab_ht, (int) ng->vocab_size, ng->vocab, ng->context_cue, backoff_from_unk_inc, backoff_from_unk_exc, backoff_from_ccs_inc, backoff_from_ccs_exc, fb_list_filename); } ngrams_hit = (int *) rr_calloc(n,sizeof(int)); prev_words = (char **) rr_malloc(sizeof(char *)*n); for (i=0;i<=n-1;i++) prev_words[i] = (char *) rr_malloc(sizeof(char)*1000); /* Check that text_stream_filename and probs_stream_filename (if specified) are valid. Note that the checks employed by the standard rr_fopen tools are not suitable here, since we don't want the program to terminate if the paths are not found. */ if (!strcmp(text_stream_filename,"")) { printf("Error : Must specify a text file. 
Use the -text switch.\n"); return; } if (!rr_fexists(text_stream_filename) && strcmp(text_stream_filename,"-")) { printf("Error : Can't open file %s for reading.\n",text_stream_filename); return; } out_probs = strcmp(probs_stream_filename,""); annotate = strcmp(annotation_filename,""); out_oovs = strcmp(oov_filename,""); printf("Computing perplexity of the language model with respect\n"); printf(" to the text %s\n",text_stream_filename); if (out_probs) printf("Probability stream will be written to file %s\n", probs_stream_filename); if (annotate) printf("Annotation will be written to file %s\n", annotation_filename); if (out_oovs) printf("Out of vocabulary words will be written to file %s\n", oov_filename); if (backoff_from_unk_inc) printf("Will force inclusive back-off from OOVs.\n"); if (backoff_from_unk_exc) printf("Will force exclusive back-off from OOVs.\n"); if (backoff_from_ccs_inc) printf("Will force inclusive back-off from context cues.\n"); if (backoff_from_ccs_exc) printf("Will force exclusive back-off from context cues.\n"); if (strcmp(fb_list_filename,"")) printf("Will force back-off according to the contents of %s\n", fb_list_filename); if (include_unks) printf("Perplexity calculation will include OOVs.\n"); /* Check for existance of files, as rr functions will quit, which isn't what we want */ if (out_probs && strcmp(probs_stream_filename,"-")) { if ((temp_fp = fopen(probs_stream_filename,"w")) == NULL) { printf("Error : Can't open file %s for writing.\n",probs_stream_filename); return; } fclose(temp_fp); } if (annotate && strcmp(annotation_filename,"-")) { if ((temp_fp = fopen(annotation_filename,"w")) == NULL) { printf("Error : Can't open file %s for writing.\n",annotation_filename); return; } fclose(temp_fp); } if (out_oovs && strcmp(oov_filename,"-")) { if ((temp_fp = fopen(oov_filename,"w")) == NULL) { printf("Error : Can't open file %s for writing.\n",oov_filename); return; } fclose(temp_fp); } text_stream_fp = rr_iopen(text_stream_filename); if (out_probs) probs_stream_fp = rr_oopen(probs_stream_filename); if (annotate) annotation_fp = rr_oopen(annotation_filename); if (out_oovs) oov_fp = rr_oopen(oov_filename); context = (id__t *) rr_malloc(sizeof(id__t)*(n-1)); sum_log_prob = 0.0; total_words = 0; excluded_unks = 0; excluded_ccs = 0; while (!rr_feof(text_stream_fp)) { if (total_words > 0) { if (total_words < n) strcpy(prev_words[total_words-1],current_word); else { for (i=0;i<=n-3;i++) strcpy(prev_words[i],prev_words[i+1]); if (n>1) strcpy(prev_words[n-2],current_word); } } if (total_words < (n-1)) context_length = total_words; else context_length = n-1; /* Fill context with right stuff */ if (total_words > (n-1)) { for (i=0;i<=context_length-2;i++) context[i] = context[i+1]; } if (context_length != 0) context[context_length-1] = short_current_id; if (fscanf(text_stream_fp,"%s",current_word) != 1) { if (!rr_feof(text_stream_fp)) { printf("Error reading text file.\n"); return; } } if (!rr_feof(text_stream_fp)) { if (arpa_lm) { sih_lookup(arpa_ng->vocab_ht,current_word,¤t_id); if (arpa_ng->vocab_type == CLOSED_VOCAB && current_id == 0) { found_unk_wrongly = 1; printf("Error : %s is not in the vocabulary, and this is a closed \nvocabulary model.\n",current_word); } if (current_id > arpa_ng->vocab_size) quit(-1,"Error : returned value from sih_lookup (%d) is too high.\n",context[i]); else short_current_id = current_id; }else { sih_lookup(ng->vocab_ht,current_word,¤t_id); if (ng->vocab_type == CLOSED_VOCAB && current_id == 0) { found_unk_wrongly = 1; printf("Error : %s 
is not in the vocabulary, and this is a closed \nvocabulary model.\n",current_word); } if (current_id > ng->vocab_size) quit(-1,"Error : returned value from sih_lookup (%d) is too high.\n",context[i]); else short_current_id = current_id; } if (!found_unk_wrongly) { if (current_id == 0 && out_oovs) fprintf(oov_fp,"%s\n",current_word); if ((arpa_lm && (!(arpa_ng->context_cue[current_id]))) || ((!arpa_lm) && (!(ng->context_cue[current_id])))) { if (include_unks || current_id != 0) { prob = calc_prob_of(short_current_id, context, context_length, ng, arpa_ng, fb_list, &bo_case, &actual_context_length, arpa_lm); if (prob<= 0.0 || prob > 1.0) { fprintf(stderr,"Warning : "); if (short_current_id == 0) fprintf(stderr,"P( <UNK> | "); else fprintf(stderr,"P( %s | ",current_word); for (i=0;i<=actual_context_length-1;i++) { if (context[i+context_length-actual_context_length] == 0) fprintf(stderr,"<UNK> "); else fprintf(stderr,"%s ",prev_words[i]); } fprintf(stderr,") = %g logprob = %g \n ",prob,log(prob)/log(log_base)); fprintf(stderr,"bo_case == 0x%dx, actual_context_length == %d\n", bo_case, actual_context_length); } if (annotate) { if (short_current_id == 0) fprintf(annotation_fp,"P( <UNK> | "); else fprintf(annotation_fp,"P( %s | ",current_word); for (i=0;i<=actual_context_length-1;i++) { if (context[i+context_length-actual_context_length] == 0) fprintf(annotation_fp,"<UNK> "); else { if (arpa_lm) fprintf(annotation_fp,"%s ",arpa_ng->vocab[context[i+context_length-actual_context_length]]); else fprintf(annotation_fp,"%s ",ng->vocab[context[i+context_length-actual_context_length]]); } } fprintf(annotation_fp,") = %g logprob = %f bo_case = ",prob,log(prob)/log(log_base)); decode_bo_case(bo_case,actual_context_length,annotation_fp); } /* Calculate level to which we backed off */ for (i=actual_context_length-1;i>=0;i--) { int four_raise_i = 1<<(2*i); /* PWP */ /* * PWP: This was "if ((bo_case / (int) pow(3,i)) == 0)" * but was getting a divide-by-zero error on an Alpha * (it isn't clear to me why it should ever have done so) * Anyway, it is much faster to do in base-4. */ if ((bo_case == 0) || ((bo_case / four_raise_i) == 0)) { ngrams_hit[i+1]++; i = -2; }else bo_case -= ((bo_case / four_raise_i) * four_raise_i); } if (i != -3) ngrams_hit[0]++; if (out_probs) fprintf(probs_stream_fp,"%g\n",prob); sum_log_prob += log10(prob); } if (current_id == 0 && !include_unks) excluded_unks++; } else { if (((!arpa_lm) && ng->context_cue[current_id]) || (arpa_lm && arpa_ng->context_cue[current_id])) excluded_ccs++; } total_words++; } } } if (!found_unk_wrongly) { /* pow(x,y) = e**(y ln(x)) */ printf("Perplexity = %.2f, Entropy = %.2f bits\n", exp(-sum_log_prob/(total_words-excluded_ccs-excluded_unks) * log(10.0)), (-sum_log_prob/(total_words-excluded_ccs-excluded_unks) * log(10.0) / log(2.0))); printf("Computation based on %d words.\n", total_words-excluded_ccs-excluded_unks); for(i=n;i>=1;i--) { printf("Number of %d-grams hit = %d (%.2f%%)\n",i,ngrams_hit[i-1], (float) 100*ngrams_hit[i-1]/(total_words-excluded_ccs-excluded_unks) ); } printf("%d OOVs (%.2f%%) and %d context cues were removed from the calculation.\n", excluded_unks, (float) 100*excluded_unks/(total_words-excluded_ccs),excluded_ccs); } rr_iclose(text_stream_fp); if (out_probs) rr_oclose(probs_stream_fp); if (annotate) rr_oclose(annotation_fp); if (out_oovs) rr_oclose(oov_fp); free (fb_list); free (context); free (ngrams_hit); }
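/* text2wngram main: read running text from stdin into a character buffer, sort an
   array of word pointers so that identical word n-grams become adjacent, write
   "n-gram count" records to temporary files, and finally merge the temporary files
   into a single word n-gram stream on stdout. */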
int main (int argc, char **argv) { int n; int verbosity; int max_files; int max_words; int max_chars; int current_word; int current_char; int start_char; /* start boundary (possibly > than 0) */ int no_of_spaces; int pos_in_string; int i; char *current_string; char current_temp_filename[500]; int current_file_number; FILE *temp_file; flag text_buffer_full; char *text_buffer; char **pointers; char current_ngram[500]; int current_count; int counter; char temp_directory[1000]; char *temp_file_ext; flag words_set; flag chars_set; /* Process command line */ verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY); pc_message(verbosity,2,"text2wngram\n"); report_version(&argc,argv); if (pc_flagarg( &argc, argv,"-help")) { help_message(); exit(1); } n = pc_intarg(&argc, argv,"-n",DEFAULT_N); /* max_words = pc_intarg(&argc, argv,"-words",STD_MEM*1000000/11); max_chars = pc_intarg(&argc, argv,"-chars",STD_MEM*7000000/11); */ max_words = pc_intarg(&argc, argv,"-words",-1); max_chars = pc_intarg(&argc, argv,"-chars",-1); if (max_words == -1) { words_set = 0; max_words = STD_MEM*1000000/11; }else words_set = 1; if (max_chars == -1) { chars_set = 0; max_chars = STD_MEM*7000000/11; }else chars_set = 1; max_files = pc_intarg(&argc, argv,"-files",DEFAULT_MAX_FILES); if (pc_flagarg(&argc,argv,"-compress")) temp_file_ext = salloc(".Z"); else { if (pc_flagarg(&argc,argv,"-gzip")) temp_file_ext = salloc(".gz"); else temp_file_ext = salloc(""); } strcpy(temp_directory, "cmuclmtk-XXXXXX"); if (mkdtemp(temp_directory) == NULL) { quit(-1, "Failed to create temporary folder: %s\n", strerror(errno)); } pc_report_unk_args(&argc,argv,verbosity); if (words_set && !chars_set) max_chars = max_words * 7; if (!words_set && chars_set) max_words = max_chars / 7; /* If the last character in the directory name isn't a / then add one. 
*/ pc_message(verbosity,2,"n = %d\n",n); pc_message(verbosity,2,"Number of words in buffer = %d\n",max_words); pc_message(verbosity,2,"Number of chars in buffer = %d\n",max_chars); pc_message(verbosity,2,"Max number of files open at once = %d\n",max_files); pc_message(verbosity,2,"Temporary directory = %s\n",temp_directory); /* Allocate memory for the buffers */ text_buffer = (char *) rr_malloc(sizeof(char)*max_chars); pc_message(verbosity,2,"Allocated %d bytes to text buffer.\n", sizeof(char)*max_chars); pointers = (char **) rr_malloc(sizeof(char *)*max_words); pc_message(verbosity,2,"Allocated %d bytes to pointer array.\n", sizeof(char *)*max_words); current_file_number = 0; current_word = 1; start_char = 0; current_char = 0; counter = 0; pointers[0] = text_buffer; while (!feof(stdin)) { current_file_number++; /* Read text into buffer */ pc_message(verbosity,2,"Reading text into buffer...\n"); pc_message(verbosity,2,"Reading text into the n-gram buffer...\n"); pc_message(verbosity,2,"20,000 words processed for each \".\", 1,000,000 for each line.\n"); pointers[0] = text_buffer; while ((!rr_feof(stdin)) && (current_word < max_words) && (current_char < max_chars)) { text_buffer[current_char] = getchar(); if (text_buffer[current_char] == '\n' || text_buffer[current_char] == '\t' ) { text_buffer[current_char] = ' '; } if (text_buffer[current_char] == ' ') { if (current_char > start_char) { if (text_buffer[current_char-1] == ' ') { current_word--; current_char--; } pointers[current_word] = &(text_buffer[current_char+1]); current_word++; counter++; if (counter % 20000 == 0) { if (counter % 1000000 == 0) pc_message(verbosity,2,"\n"); else pc_message(verbosity,2,"."); } } } if (text_buffer[current_char] != ' ' || current_char > start_char) current_char++; } text_buffer[current_char]='\0'; if (current_word == max_words || rr_feof(stdin)) { for (i=current_char+1;i<=max_chars-1;i++) text_buffer[i] = ' '; text_buffer_full = 0; }else text_buffer_full = 1; /* Sort buffer */ pc_message(verbosity,2,"\nSorting pointer array...\n"); qsort((void *) pointers,(size_t) current_word-n,sizeof(char *),cmp_strings); /* Write out temporary file */ sprintf(current_temp_filename,"%s/%hu%s",temp_directory, current_file_number, temp_file_ext); pc_message(verbosity,2,"Writing out temporary file %s...\n",current_temp_filename); temp_file = rr_oopen(current_temp_filename); text_buffer[current_char] = ' '; current_count = 0; strcpy(current_ngram,""); for (i = 0; i <= current_word-n; i++) { current_string = pointers[i]; /* Find the nth space */ no_of_spaces = 0; pos_in_string = 0; while (no_of_spaces < n) { if (current_string[pos_in_string] == ' ') no_of_spaces++; pos_in_string++; } if (!strncmp(current_string,current_ngram,pos_in_string)) current_count++; else { if (strcmp(current_ngram,"")) if (fprintf(temp_file,"%s %d\n",current_ngram,current_count) < 0) quit(-1,"Error writing to temporary file %s\n",current_temp_filename); current_count = 1; strncpy(current_ngram,current_string,pos_in_string); current_ngram[pos_in_string] = '\0'; } } rr_oclose(temp_file); /* Move the last n-1 words to the beginning of the buffer, and set correct current_word and current_char things */ strcpy(text_buffer,pointers[current_word-n]); pointers[0]=text_buffer; /* Find the (n-1)th space */ no_of_spaces=0; pos_in_string=0; if (!text_buffer_full){ while (no_of_spaces<(n-1)) { if (pointers[0][pos_in_string]==' ') { no_of_spaces++; pointers[no_of_spaces] = &pointers[0][pos_in_string+1]; } pos_in_string++; } }else { while (no_of_spaces<n) { if 
(pointers[0][pos_in_string]==' ') { no_of_spaces++; pointers[no_of_spaces] = &pointers[0][pos_in_string+1]; } pos_in_string++; } pos_in_string--; } current_char = pos_in_string; current_word = n; /* mark boundary beyond which counting pass cannot backup */ start_char = current_char; } /* Merge temporary files */ pc_message(verbosity,2,"Merging temporary files...\n"); merge_tempfiles(1, current_file_number, temp_directory, temp_file_ext, max_files, stdout, n, verbosity); rmdir(temp_directory); pc_message(verbosity,0,"text2wngram : Done.\n"); return 0; }
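/* wngram2idngram main: map a word n-gram stream read from stdin to vocabulary ids.
   N-grams consisting entirely of in-vocabulary words go straight to the first
   temporary file; n-grams containing OOVs are buffered and sorted first. The
   temporary files are then merged into the -idngram output file. */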
int main(int argc, char *argv[]) { int verbosity; int vocab_size; FILE *vocab_file; int buffer_size; flag write_ascii; int max_files; int number_of_tempfiles; char *vocab_filename; char *idngram_filename; char temp_word[MAX_WORD_LENGTH]; char temp_word2[MAX_WORD_LENGTH]; char temp_word3[MAX_WORD_LENGTH]; flag contains_unks; int position_in_buffer; FILE *outfile; FILE *tempfile; FILE *non_unk_fp; ngram_rec *buffer; flag same_ngram; int i; int j; int fof_size; int size_of_rec; char temp_directory[1000]; char *temp_file_ext; /* Vocab hash table things */ struct idngram_hash_table vocabulary; unsigned long hash_size; unsigned long M; wordid_t *current_ngram; int current_count; wordid_t *sort_ngram; int sort_count; /* Process command line */ report_version(&argc,argv); if (argc == 1 || pc_flagarg(&argc, argv,"-help")) { /* Display help message */ help_message(); exit(1); } n = pc_intarg( &argc, argv, "-n",DEFAULT_N); hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE); buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM); write_ascii = pc_flagarg(&argc,argv,"-write_ascii"); verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY); max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES); fof_size = pc_intarg(&argc,argv,"-fof_size",10); vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" )); idngram_filename = salloc(pc_stringarg( &argc, argv, "-idngram", "" )); if (!strcmp("",vocab_filename)) quit(-1,"Error : Must specify a vocabulary file.\n"); if (!strcmp("",idngram_filename)) quit(-1,"text2idngram : Error : Must specify idngram file.\n"); if (pc_flagarg(&argc,argv,"-compress")) temp_file_ext = salloc(".Z"); else { if (pc_flagarg(&argc,argv,"-gzip")) temp_file_ext = salloc(".gz"); else temp_file_ext = salloc(""); } strcpy(temp_directory, "cmuclmtk-XXXXXX"); if (mkdtemp(temp_directory) == NULL) { quit(-1, "Failed to create temporary folder: %s\n", strerror(errno)); } pc_report_unk_args(&argc,argv,verbosity); outfile = rr_fopen(idngram_filename,"wb"); pc_message(verbosity,2,"Vocab : %s\n",vocab_filename); pc_message(verbosity,2,"Output idngram : %s\n",idngram_filename); pc_message(verbosity,2,"Buffer size : %d\n",buffer_size); pc_message(verbosity,2,"Hash table size : %d\n",hash_size); pc_message(verbosity,2,"Max open files : %d\n",max_files); pc_message(verbosity,2,"n : %d\n",n); pc_message(verbosity,2,"FOF size : %d\n",fof_size); size_of_rec = (sizeof(wordid_t) * n) + 16 - (( n* sizeof(wordid_t)) % 16); buffer_size *= (1000000/((sizeof(ngram_rec) + size_of_rec))); fprintf(stderr,"buffer size = %d\n",buffer_size); /* Allocate memory for hash table */ fprintf(stderr,"Initialising hash table...\n"); M = nearest_prime(hash_size); new_idngram_hashtable(&vocabulary,M); /* Read in the vocabulary */ vocab_size = 0; vocab_file = rr_iopen(vocab_filename); pc_message(verbosity,2,"Reading vocabulary...\n"); while (fgets (temp_word, sizeof(temp_word),vocab_file)) { if (strncmp(temp_word,"##",2)==0) continue; sscanf (temp_word, "%s ",temp_word2); /* Check for vocabulary order */ if (vocab_size > 0 && strcmp(temp_word2,temp_word3)<0) quit(-1,"wngram2idngram : Error : Vocabulary is not alphabetically ordered.\n"); /* Check for repeated words in the vocabulary */ if (index2(&vocabulary,temp_word2) != 0) warn_on_repeated_words(temp_word); warn_on_wrong_vocab_comments(temp_word); vocab_size++; add_to_idngram_hashtable(&vocabulary,idngram_hash(temp_word2,M),temp_word2,vocab_size); strcpy(temp_word3,temp_word2); } if (vocab_size > MAX_VOCAB_SIZE) quit(-1,"Error : 
Vocabulary size exceeds maximum.\n"); pc_message(verbosity,2,"Allocating memory for the buffer...\n"); buffer=(ngram_rec *) rr_malloc((buffer_size+1)*sizeof(ngram_rec)); for (i=0;i<=buffer_size;i++) buffer[i].word = (wordid_t *) rr_malloc(n*sizeof(wordid_t)); /* Open the "non-OOV" tempfile */ sprintf(temp_word, "%s/1%s", temp_directory, temp_file_ext); non_unk_fp = rr_fopen(temp_word,"w"); pc_message(verbosity,2,"Writing non-OOV counts to temporary file %s\n", temp_word); number_of_tempfiles = 1; current_ngram = (wordid_t *) rr_malloc(n*sizeof(wordid_t)); sort_ngram = (wordid_t *) rr_malloc(n*sizeof(wordid_t)); /* Read text into buffer */ position_in_buffer = 0; while (!rr_feof(stdin)) { for (i=0;i<=n-1;i++) { get_word(stdin,temp_word); current_ngram[i]=index2(&vocabulary,temp_word); } if (scanf("%d",&current_count) != 1) if (!rr_feof(stdin)) quit(-1,"Error reading n-gram count from stdin.\n"); if (!rr_feof(stdin)) { contains_unks = 0; for (i=0;i<=n-1;i++) { if (!current_ngram[i]) contains_unks = 1; } if (contains_unks) { /* Write to buffer */ position_in_buffer++; if (position_in_buffer >= buffer_size) { /* Sort buffer */ pc_message(verbosity,2, "Sorting n-grams which include an OOV word...\n"); qsort((void*) buffer,(size_t) position_in_buffer, sizeof(ngram_rec),compare_ngrams2); pc_message(verbosity,2,"Done.\n"); /* Write buffer to temporary file */ number_of_tempfiles++; sprintf(temp_word,"%s/%d%s", temp_directory, number_of_tempfiles,temp_file_ext); pc_message(verbosity,2, "Writing sorted OOV-counts buffer to temporary file %s\n", temp_word); tempfile = rr_fopen(temp_word,"w"); for (i=0;i<=n-1;i++) sort_ngram[i] = buffer[0].word[i]; sort_count = buffer[0].count; for (i=0;i<=position_in_buffer-2;i++) { same_ngram = 1; for (j=n-1;j>=0;j--) { if (buffer[i].word[j] != sort_ngram[j]) { same_ngram = 0; j = -1; } } if (same_ngram) sort_count += buffer[i].count; else { for (j=0;j<=n-1;j++) { rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1, tempfile,"temporary n-gram ids"); sort_ngram[j] = buffer[i].word[j]; } rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile, "temporary n-gram counts"); sort_count = buffer[i].count; } } for (j=0;j<=n-1;j++) rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1, tempfile,"temporary n-gram ids"); rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile, "temporary n-gram counts"); rr_oclose(tempfile); position_in_buffer = 1; } for (i=0;i<=n-1;i++) buffer[position_in_buffer-1].word[i] = current_ngram[i]; buffer[position_in_buffer-1].count = current_count; }else { /* Write to temporary file */ for (i=0;i<=n-1;i++) rr_fwrite((char*)&current_ngram[i],sizeof(wordid_t),1, non_unk_fp,"temporary n-gram ids"); rr_fwrite((char*)&current_count,sizeof(int),1,non_unk_fp, "temporary n-gram counts"); } } } if (position_in_buffer > 0) { /* Only do this bit if we have actually seen some OOVs */ /* Sort final buffer */ pc_message(verbosity,2,"Sorting final buffer...\n"); qsort((void*) buffer,(size_t) position_in_buffer, sizeof(ngram_rec),compare_ngrams2); /* Write final buffer */ number_of_tempfiles++; sprintf(temp_word,"%s/%d%s", temp_directory, number_of_tempfiles,temp_file_ext); pc_message(verbosity,2,"Writing sorted buffer to temporary file %s\n", temp_word); tempfile = rr_fopen(temp_word,"w"); for (i=0;i<=n-1;i++) sort_ngram[i] = buffer[0].word[i]; sort_count = buffer[0].count; for (i=1;i<=position_in_buffer-1;i++) { same_ngram = 1; for (j=n-1;j>=0;j--) { if (buffer[i].word[j] != sort_ngram[j]) { same_ngram = 0; j = -1; } } if (same_ngram) sort_count += buffer[i].count; else { for 
(j=0;j<=n-1;j++) { rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1, tempfile,"temporary n-gram ids"); sort_ngram[j] = buffer[i].word[j]; } rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile, "temporary n-gram counts"); sort_count = buffer[i].count; } } for (j=0;j<=n-1;j++) rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1, tempfile,"temporary n-gram ids"); rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile, "temporary n-gram counts"); fclose(tempfile); } /* Merge the temporary files, and output the result */ fclose(non_unk_fp); pc_message(verbosity,2,"Merging temporary files...\n"); merge_idngramfiles(1, number_of_tempfiles, temp_directory, temp_file_ext, max_files, outfile, write_ascii, fof_size, n); fclose(outfile); rmdir(temp_directory); pc_message(verbosity,0,"wngram2idngram : Done.\n"); return 0; }
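/* read_txt2ngram_buffer: fill the n-gram buffer from the word stream in infp, sort it,
   and write one temporary file of sorted (id n-gram, count) records per buffer-load,
   returning the number of temporary files created. */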
/* @return number_of_tempfiles */ int read_txt2ngram_buffer(FILE* infp, struct idngram_hash_table *vocabulary, int32 verbosity, wordid_t *buffer, int buffer_size, unsigned int n, char* temp_file_root, char* temp_file_ext, FILE* temp_file ) { /* Read text into buffer */ char temp_word[MAX_WORD_LENGTH]; int position_in_buffer; int number_of_tempfiles; unsigned int i,j; wordid_t *placeholder; wordid_t *temp_ngram; int temp_count; #if 1 int tmpval; #endif temp_ngram = (wordid_t *) rr_malloc(sizeof(wordid_t)*n); placeholder = (wordid_t *) rr_malloc(sizeof(wordid_t)*n); ng=n; position_in_buffer = 0; number_of_tempfiles = 0; //tk: looks like things may croak if the corpus has less than n words //not that such a corpus would be useful anyway for (i=0;i<=n-1;i++) { get_word(infp,temp_word); /* fprintf(stderr,"%s \n",temp_word); fprintf(stderr,"%d \n",index2(vocabulary,temp_word)); fflush(stderr); */ add_to_buffer(index2(vocabulary,temp_word),0,i,buffer); } while (!rr_feof(infp)) { /* Fill up the buffer */ pc_message(verbosity,2,"Reading text into the n-gram buffer...\n"); pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n"); while ((position_in_buffer<buffer_size) && (!rr_feof(infp))) { position_in_buffer++; show_idngram_nlines(position_in_buffer,verbosity); for (i=1;i<=n-1;i++) add_to_buffer(buffer_contents(position_in_buffer-1,i,buffer), position_in_buffer,i-1,buffer); if (get_word(infp,temp_word) == 1) { /* fprintf(stderr,"%s \n",temp_word); fprintf(stderr,"%d \n",index2(vocabulary,temp_word)); fflush(stderr); */ add_to_buffer(index2(vocabulary,temp_word),position_in_buffer, n-1,buffer); } } for (i=0;i<=n-1;i++) placeholder[i] = buffer_contents(position_in_buffer,i,buffer); /* Sort buffer */ pc_message(verbosity,2,"\nSorting n-grams...\n"); qsort((void*) buffer,(size_t) position_in_buffer,n*sizeof(wordid_t),compare_ngrams); /* Output the buffer to temporary BINARY file */ number_of_tempfiles++; sprintf(temp_word,"%s/%hu%s",temp_file_root, number_of_tempfiles,temp_file_ext); pc_message(verbosity,2,"Writing sorted n-grams to temporary file %s\n", temp_word); temp_file = rr_oopen(temp_word); for (i=0;i<=n-1;i++) { temp_ngram[i] = buffer_contents(0,i,buffer); #if MAX_VOCAB_SIZE < 65535 /* This check is well-meaning but completely useless since buffer_contents() can never return something greater than MAX_VOCAB_SIZE (dhuggins@cs, 2006-03) */ if (temp_ngram[i] > MAX_VOCAB_SIZE) quit(-1,"Invalid trigram in buffer.\nAborting"); #endif } temp_count = 1; for (i=1;i<=position_in_buffer;i++) { tmpval=compare_ngrams(temp_ngram,&buffer[i*n]); /* for(k=0;k<=n-1;k++){ fprintf(stderr, "tmpval: %d k %d, temp_ngram %d, &buffer[i*n] %d\n",tmpval, k, temp_ngram[k], (&buffer[i*n])[k]); }*/ if (!compare_ngrams(temp_ngram,&buffer[i*n])) temp_count++; else { /* printf("Have been here?\n");*/ for (j=0;j<=n-1;j++) { rr_fwrite((char*) &temp_ngram[j],sizeof(wordid_t),1, temp_file,"temporary n-gram ids"); temp_ngram[j] = buffer_contents(i,j,buffer); } rr_fwrite((char*)&temp_count,sizeof(int),1,temp_file, "temporary n-gram counts"); /* for(j=0 ; j<=n-1;j++) fprintf(stderr,"%d ",temp_ngram[j]); fprintf(stderr,"%d\n",temp_count);*/ temp_count = 1; } } rr_oclose(temp_file); for (i=0;i<=n-1;i++) add_to_buffer(placeholder[i],0,i,buffer); position_in_buffer = 0; } return number_of_tempfiles; }
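/* write_bin_lm: dump an ng_t language model to ng->bin_fp in the toolkit's binary
   format: scalar parameters, the vocabulary hash table, count and alpha tables,
   unigram statistics, discounting information, and finally the tree structure
   (word ids, counts, back-off weights and indices), written in chunks to keep
   individual writes small. */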
void write_bin_lm(ng_t *ng,int verbosity) { int l_chunk; int from_rec; int i; pc_message(verbosity,1,"Binary %d-gram language model will be written to %s\n",ng->n,ng->bin_filename); ng->version = BBO_FILE_VERSION; /* Scalar parameters */ rr_fwrite((char*)&ng->version,sizeof(int),1,ng->bin_fp,"version"); rr_fwrite((char*)&ng->n,sizeof(unsigned short),1,ng->bin_fp,"n"); rr_fwrite((char*)&ng->vocab_size,sizeof(wordid_t),1,ng->bin_fp,"vocab_size"); rr_fwrite((char*)&ng->no_of_ccs,sizeof(unsigned short),1,ng->bin_fp,"no_of_ccs"); rr_fwrite((char*)&ng->vocab_type,sizeof(unsigned short),1,ng->bin_fp,"vocab_type"); rr_fwrite((char*)&ng->count_table_size,sizeof(count_ind_t),1,ng->bin_fp,"count_table_size"); rr_fwrite((char*)&ng->discounting_method,sizeof(unsigned short),1,ng->bin_fp,"discounting_method"); rr_fwrite((char*)&ng->min_alpha,sizeof(double),1,ng->bin_fp,"min_alpha"); rr_fwrite((char*)&ng->max_alpha,sizeof(double),1,ng->bin_fp,"max_alpha"); rr_fwrite((char*)&ng->out_of_range_alphas,sizeof(unsigned short),1,ng->bin_fp,"out_of_range_alphas"); rr_fwrite((char*)&ng->size_of_alpha_array,sizeof(unsigned short),1,ng->bin_fp,"size_of_alpha_array"); rr_fwrite((char*)&ng->n_unigrams,sizeof(ngram_sz_t),1,ng->bin_fp,"n_unigrams"); rr_fwrite((char*)&ng->zeroton_fraction,sizeof(double),1,ng->bin_fp,"zeroton_fraction"); rr_fwrite((char*)&ng->oov_fraction,sizeof(double),1,ng->bin_fp,"oov_fraction"); rr_fwrite((char*)&ng->four_byte_counts,sizeof(flag),1,ng->bin_fp,"four_byte_counts"); rr_fwrite((char*)&ng->four_byte_alphas,sizeof(flag),1,ng->bin_fp,"four_byte_alphas"); rr_fwrite((char*)&ng->first_id,sizeof(unsigned short),1, ng->bin_fp,"first_id"); /* Short and shortish arrays */ sih_val_write_to_file(ng->vocab_ht,ng->bin_fp,ng->bin_filename,0); /* (ng->vocab is not stored in file - will be derived from ng->vocab_ht) */ if (ng->four_byte_counts==1) { assert(ng->marg_counts4); rr_fwrite((char*)ng->marg_counts4,sizeof(count_t), ng->vocab_size+1,ng->bin_fp,"marg_counts"); }else { assert(ng->marg_counts); rr_fwrite((char*)ng->marg_counts,sizeof(count_ind_t), ng->vocab_size+1,ng->bin_fp,"marg_counts"); } rr_fwrite((char*)ng->alpha_array,sizeof(double), ng->size_of_alpha_array,ng->bin_fp,"alpha_array"); if (!ng->four_byte_counts) { for (i=0;i<=ng->n-1;i++) rr_fwrite((char*)ng->count_table[i],sizeof(count_t), ng->count_table_size+1,ng->bin_fp,"count_table"); } /* Could write count_table as one block, but better to be safe and do it in chunks. For motivation, see comments about writing tree info. 
*/ rr_fwrite((char*)ng->ptr_table_size,sizeof(ptr_tab_sz_t),ng->n,ng->bin_fp,"ptr_table_size"); for (i=0;i<=ng->n-1;i++) rr_fwrite((char*)ng->ptr_table[i],sizeof(ptr_tab_t),ng->ptr_table_size[i],ng->bin_fp,"ptr_table"); /* Unigram statistics */ rr_fwrite((char*)ng->uni_probs,sizeof(uni_probs_t), ng->vocab_size+1, ng->bin_fp,"uni_probs"); rr_fwrite((char*)ng->uni_log_probs,sizeof(uni_probs_t),ng->vocab_size+1, ng->bin_fp,"uni_log_probs"); rr_fwrite((char*)ng->context_cue,sizeof(flag),ng->vocab_size+1, ng->bin_fp,"context_cue"); rr_fwrite((char*)ng->cutoffs,sizeof(cutoff_t),ng->n,ng->bin_fp,"cutoffs"); switch (ng->discounting_method) { case GOOD_TURING: rr_fwrite((char*)ng->fof_size,sizeof(fof_sz_t),ng->n,ng->bin_fp,"fof_size"); rr_fwrite((char*)ng->disc_range,sizeof(unsigned short),ng->n, ng->bin_fp,"disc_range"); for (i=0;i<=ng->n-1;i++) { rr_fwrite((char*)ng->freq_of_freq[i],sizeof(fof_t), ng->fof_size[i]+1,ng->bin_fp,"freq_of_freq"); } for (i=0;i<=ng->n-1;i++) { rr_fwrite((char*)ng->gt_disc_ratio[i],sizeof(disc_val_t), ng->disc_range[i]+1,ng->bin_fp,"gt_disc_ratio"); } case WITTEN_BELL: break; case LINEAR: rr_fwrite((char*)ng->lin_disc_ratio,sizeof(disc_val_t), ng->n,ng->bin_fp,"lin_disc_ratio"); break; case ABSOLUTE: rr_fwrite((char*)ng->abs_disc_const,sizeof(double), ng->n,ng->bin_fp,"abs_disc_const"); break; } /* Tree information */ /* Unigram stuff first, since can be dumped all in one go */ rr_fwrite((char*)ng->num_kgrams,sizeof(ngram_sz_t),ng->n,ng->bin_fp,"num_kgrams"); if (ng->four_byte_counts) rr_fwrite((char*)ng->count4[0],sizeof(count_t),ng->vocab_size+1, ng->bin_fp,"unigram counts"); else rr_fwrite((char*)ng->count[0],sizeof(count_ind_t),ng->vocab_size+1, ng->bin_fp,"unigram counts"); if (ng->four_byte_alphas) rr_fwrite((char*)ng->bo_weight4[0],sizeof(four_byte_t),ng->vocab_size+1, ng->bin_fp,"unigram backoff weights"); else rr_fwrite((char*)ng->bo_weight[0],sizeof(bo_weight_t),ng->vocab_size+1, ng->bin_fp,"unigram backoff weights"); if (ng->n > 1) rr_fwrite((char*)ng->ind[0],sizeof(index__t),ng->vocab_size+1, ng->bin_fp,"unigram -> bigram pointers"); /* Write the rest of the tree structure in chunks, otherwise the kernel buffers are too big. 
*/ /* Need to do byte swapping */ swap_struct(ng); for (i=1;i<=ng->n-1;i++) { from_rec = 0; l_chunk = 100000; while(from_rec < ng->num_kgrams[i]) { if (from_rec+l_chunk > ng->num_kgrams[i]) l_chunk = ng->num_kgrams[i] - from_rec; rr_fwrite((char*)&ng->word_id[i][from_rec],1,sizeof(id__t)*l_chunk,ng->bin_fp,"word ids"); from_rec += l_chunk; } } for (i=1;i<=ng->n-1;i++) { from_rec = 0; l_chunk = 100000; while(from_rec < ng->num_kgrams[i]) { if (from_rec+l_chunk > ng->num_kgrams[i]) l_chunk = ng->num_kgrams[i] - from_rec; if (ng->four_byte_counts) rr_fwrite((char*)&ng->count4[i][from_rec],1,sizeof(count_t)*l_chunk,ng->bin_fp,"counts"); else rr_fwrite((char*)&ng->count[i][from_rec],1,sizeof(count_ind_t)*l_chunk,ng->bin_fp,"counts"); from_rec += l_chunk; } } for (i=1;i<=ng->n-2;i++) { from_rec = 0; l_chunk = 100000; while(from_rec < ng->num_kgrams[i]) { if (from_rec+l_chunk > ng->num_kgrams[i]) l_chunk = ng->num_kgrams[i] - from_rec; if (ng->four_byte_alphas) rr_fwrite((char*)&ng->bo_weight4[i][from_rec],1,sizeof(four_byte_t)*l_chunk, ng->bin_fp,"backoff weights"); else rr_fwrite((char*)&ng->bo_weight[i][from_rec],1,sizeof(bo_weight_t)*l_chunk, ng->bin_fp,"backoff weights"); from_rec += l_chunk; } } for (i=1;i<=ng->n-2;i++) { from_rec = 0; l_chunk = 100000; while(from_rec < ng->num_kgrams[i]) { if (from_rec+l_chunk > ng->num_kgrams[i]) l_chunk = ng->num_kgrams[i] - from_rec; rr_fwrite((char*)&ng->ind[i][from_rec],1,sizeof(index__t)*l_chunk,ng->bin_fp, "indices"); from_rec += l_chunk; } } rr_oclose(ng->bin_fp); /* Swap back */ swap_struct(ng); }
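/* write_arpa_lm: write the model in ARPA text format. Unigrams are listed first with
   their log10 probabilities and back-off weights; higher-order n-grams are then walked
   in lexicographic order via current_pos/end_pos, discounted, and printed one k-gram
   section at a time. */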
void write_arpa_lm(ng_t *ng,int verbosity) { int *current_pos; int *end_pos; ngram_sz_t i; double log_10_of_e = 1.0 / log(10.0); /* HEADER */ pc_message(verbosity,1,"ARPA-style %d-gram will be written to %s\n",ng->n,ng->arpa_filename); write_arpa_copyright(ng->arpa_fp,ng->n,ng->vocab_size, ng->vocab[1],ng->vocab[2],ng->vocab[3]); display_vocabtype(ng->vocab_type,ng->oov_fraction, ng->arpa_fp); display_discounting_method(ng,ng->arpa_fp); write_arpa_format(ng->arpa_fp,ng->n); write_arpa_num_grams(ng->arpa_fp,ng,NULL,0); write_arpa_k_gram_header(ng->arpa_fp,1); for (i=ng->first_id; i<= (int) ng->vocab_size;i++) { double log10_uniprob; double log10_alpha; double alpha; log10_uniprob = ng->uni_log_probs[i]*log_10_of_e; if (ng->uni_probs[i]<=0.0) log10_uniprob = BAD_LOG_PROB; alpha=ng_double_alpha(ng,0,i); if(alpha > 0.0) log10_alpha = log10(alpha); else log10_alpha = BAD_LOG_PROB; fprintf(ng->arpa_fp,"%.4f %s",log10_uniprob,ng->vocab[i]); if (ng->n>1) fprintf(ng->arpa_fp,"\t%.4f\n",log10_alpha); else fprintf(ng->arpa_fp,"\n"); } current_pos = (int *) rr_malloc(ng->n*sizeof(int)); end_pos = (int *) rr_malloc(ng->n*sizeof(int)); /* Print 2-gram, ... n-gram info. */ for (i=1;i<=ng->n-1;i++) { /* Print out the (i+1)-gram */ int current_table, j; count_t ngcount, marg_count; double discounted_ngcount; double ngprob, log_10_ngprob, ngalpha, log_10_ngalpha; /* Initialise variables for the sake of warning-free compilation */
#ifdef STATICANALYZEDEPENDENCIES
#define __clang_analyzer__ 1
#endif
#if !defined(__clang_analyzer__) || defined(STATICANALYZEDEPENDENCIES)
#undef __clang_analyzer__
discounted_ngcount = 0.0; log_10_ngalpha = 0.0;
#endif
write_arpa_k_gram_header(ng->arpa_fp,i+1); /* Go through the n-gram list in order */ for (j=0;j<=ng->n-1;j++) { current_pos[j] = 0; end_pos[j] = 0; } for (current_pos[0]=ng->first_id; current_pos[0]<=(int) ng->vocab_size; current_pos[0]++) { if (return_count(ng->four_byte_counts, ng->count_table[0], ng->marg_counts, ng->marg_counts4, current_pos[0]) > 0) { current_table = 1; if (current_pos[0] == (int) ng->vocab_size) end_pos[1] = (int ) ng->num_kgrams[1]-1; else { end_pos[1] = get_full_index(ng->ind[0][current_pos[0]+1], ng->ptr_table[0], ng->ptr_table_size[0], current_pos[0]+1)-1; } while (current_table > 0) { /* fprintf(stderr, "i %d, current_pos[i] %d, end_pos[i] %d\n", i, current_pos[i], end_pos[i]); fflush(stderr);*/ if (current_table == i) { if (current_pos[i] <= end_pos[i]) { /* fprintf(stderr, "%d\n",ng->count[i][current_pos[i]]); fprintf(stderr, "%d\n",ng->count_table[i][ng->count[i][current_pos[i]]]);*/ ngcount = return_count(ng->four_byte_counts, ng->count_table[i], ng->count[i], ng->count4[i], current_pos[i]); if (i==1) { marg_count = return_count(ng->four_byte_counts, ng->count_table[0], ng->marg_counts, ng->marg_counts4, current_pos[0]); }else { marg_count = return_count(ng->four_byte_counts, ng->count_table[i-1], ng->count[i-1], ng->count4[i-1], current_pos[i-1]); } if(ng->disc_meth==NULL) ng->disc_meth=(disc_meth_t*) disc_meth_init(ng->discounting_method); assert(ng->disc_meth); discounted_ngcount = NG_DISC_METH(ng)->dump_discounted_ngram_count(ng,i,ngcount,marg_count,current_pos); ngprob = (double) discounted_ngcount / marg_count; if (ngprob > 1.0) { fprintf(stderr, "discounted_ngcount = %f marg_count = %d %d %d %d\n", discounted_ngcount,marg_count,current_pos[0], current_pos[1],current_pos[2]); quit(-1,"Error : probability of ngram is greater than one.\n"); } if (ngprob > 0.0) log_10_ngprob = log10(ngprob); else log_10_ngprob = BAD_LOG_PROB; 
if (i <= ng->n-2) { ngalpha = ng_double_alpha(ng, i, current_pos[i]); if (ngalpha > 0.0) log_10_ngalpha = log10(ngalpha); else log_10_ngalpha = BAD_LOG_PROB; } // BEGIN HLW VERSION if(((strstr (ng->vocab[current_pos[0]],"</s>")) == NULL)&&((i <= 1) || ((i > 1) && ((strstr (ng->vocab[(unsigned int) ng->word_id[i][current_pos[i]]],"<s>")) == NULL)))) { // if the overall entry is a trigram and it's going to end with <s>, skip it -- HLW fprintf(ng->arpa_fp,"%.4f ",log_10_ngprob); fprintf(ng->arpa_fp,"%s ",ng->vocab[current_pos[0]]); for (j=1;j<=i;j++){ fprintf(ng->arpa_fp,"%s ",ng->vocab[(unsigned int) ng->word_id[j][current_pos[j]]]); } if (i <= ng->n-2){ fprintf(ng->arpa_fp,"%.4f\n",log_10_ngalpha); } else{ fprintf(ng->arpa_fp,"\n"); } } else { // something is being skipped -- HLW if(i==0) { skipped_unigrams++; } else if(i==1) { skipped_bigrams++; } else if (i==2) { skipped_trigrams++; } } // END HLW VERSION // PREVIOUS VERSION: /* if (i <= ng->n-2) { ngalpha = ng_double_alpha(ng, i, current_pos[i]); if (ngalpha > 0.0) log_10_ngalpha = log10(ngalpha); else log_10_ngalpha = BAD_LOG_PROB; } fprintf(ng->arpa_fp,"%.4f ",log_10_ngprob); fprintf(ng->arpa_fp,"%s ",ng->vocab[current_pos[0]]); for (j=1;j<=i;j++){ // fprintf(stderr, "j %d, ng->wordid[j] %u, current_pos[j] %d, ng->word_id[j][current_pos[j]] %u\n",j, ng->word_id[j], current_pos[j], ng->word_id[j][current_pos[j]]); fprintf(ng->arpa_fp,"%s ",ng->vocab[(unsigned int) ng->word_id[j][current_pos[j]]]); } if (i <= ng->n-2) fprintf(ng->arpa_fp,"%.4f\n",log_10_ngalpha); else fprintf(ng->arpa_fp,"\n"); */ current_pos[i]++; }else { current_table--; if (current_table > 0) current_pos[current_table]++; } }else { if (current_pos[current_table] <= end_pos[current_table]) { current_table++; if (current_pos[current_table-1] == (int) ng->num_kgrams[current_table-1]-1) end_pos[current_table] = (int) ng->num_kgrams[current_table]-1; else { end_pos[current_table] = get_full_index(ng->ind[current_table-1][current_pos[current_table-1]+1], ng->ptr_table[current_table-1], ng->ptr_table_size[current_table-1], current_pos[current_table-1]+1) - 1; } }else { current_table--; if (current_table > 0) current_pos[current_table]++; } } } } } } free(current_pos); free(end_pos); fprintf(ng->arpa_fp,"\n\\end\\\n"); rr_oclose(ng->arpa_fp); // BEGIN HLW ADDITION // Now that the file is complete, let's go back and replace the placeholder ngram counts with the real final counts -- HLW final_ngram_count_replacement(ng->n,ng); unigram_count = 0; bigram_count = 0; trigram_count = 0; skipped_unigrams = 0; skipped_bigrams = 0; skipped_trigrams = 0; // END HLW ADDITION }