/*
 * Write one n-gram record to id_ngram_fp, in either ascii or binary.
 *
 * id_ngram_fp : stream the record is written to.
 * ng          : record to write; the first n entries of ng->id_array are
 *               emitted, followed by ng->count (n is the file-level n-gram
 *               order).
 * ascii       : non-zero selects the textual form "id id ... count\n";
 *               zero selects raw binary ids followed by the raw count.
 *
 * Any write failure is fatal (reported through quit() / rr_fwrite()).
 *
 * The ascii branch used to write to stdout regardless of id_ngram_fp; it
 * now honours the stream it is given, like the binary branch.  Callers that
 * pass stdout see no change.
 */
void write_ngram( FILE *id_ngram_fp, ngram *ng, flag ascii )
{
  int i;

  if ( ascii ) {
    for ( i = 0; i < n; i++ ) {
      if ( fprintf( id_ngram_fp, "%d ", ng->id_array[i] ) < 0 )
        quit( -1, "error writing ascii ngram\n" );
    }
    if ( fprintf( id_ngram_fp, "%d\n", ng->count ) < 0 )
      quit( -1, "error writing ascii ngram\n" );
  }
  else {
    for ( i = 0; i < n; i++ ) {
      rr_fwrite( (char*) &ng->id_array[i], sizeof( id__t ), 1,
                 id_ngram_fp, "binary ngram" );
    }
    rr_fwrite( (char*) &ng->count, sizeof( int ), 1,
               id_ngram_fp, "binary ngram" );
  }
}
// Returns guest instr count (in old replay counting mode) static RR_prog_point copy_entry(void) { // Code copied from rr_log.c. // Copy entry. RR_log_entry item; rr_fread(&item.header.prog_point, sizeof(item.header.prog_point), 1, oldlog); if (item.header.prog_point.guest_instr_count > end_count) { // We don't want to copy this one. return item.header.prog_point; } //ph Fix up instruction count RR_prog_point original_prog_point = item.header.prog_point; item.header.prog_point.guest_instr_count -= actual_start_count; rr_fwrite(&item.header.prog_point, sizeof(item.header.prog_point), 1, newlog); #define RR_COPY_ITEM(field) rr_fcopy(&(field), sizeof(field), 1, oldlog, newlog) RR_COPY_ITEM(item.header.kind); RR_COPY_ITEM(item.header.callsite_loc); //mz read the rest of the item switch (item.header.kind) { case RR_INPUT_1: RR_COPY_ITEM(item.variant.input_1); break; case RR_INPUT_2: RR_COPY_ITEM(item.variant.input_2); break; case RR_INPUT_4: RR_COPY_ITEM(item.variant.input_4); break; case RR_INPUT_8: RR_COPY_ITEM(item.variant.input_8); break; case RR_INTERRUPT_REQUEST: RR_COPY_ITEM(item.variant.interrupt_request); break; case RR_EXIT_REQUEST: RR_COPY_ITEM(item.variant.exit_request); break; case RR_SKIPPED_CALL: { RR_skipped_call_args *args = &item.variant.call_args; //mz read kind first! 
RR_COPY_ITEM(args->kind); switch(args->kind) { case RR_CALL_CPU_MEM_RW: RR_COPY_ITEM(args->variant.cpu_mem_rw_args); args->variant.cpu_mem_rw_args.buf = g_malloc(args->variant.cpu_mem_rw_args.len); rr_fcopy(args->variant.cpu_mem_rw_args.buf, 1, args->variant.cpu_mem_rw_args.len, oldlog, newlog); break; case RR_CALL_CPU_MEM_UNMAP: RR_COPY_ITEM(args->variant.cpu_mem_unmap); args->variant.cpu_mem_unmap.buf = g_malloc(args->variant.cpu_mem_unmap.len); rr_fcopy(args->variant.cpu_mem_unmap.buf, 1, args->variant.cpu_mem_unmap.len, oldlog, newlog); break; case RR_CALL_MEM_REGION_CHANGE: RR_COPY_ITEM(args->variant.mem_region_change_args); args->variant.mem_region_change_args.name = g_malloc0(args->variant.mem_region_change_args.len + 1); rr_fcopy(args->variant.mem_region_change_args.name, 1, args->variant.mem_region_change_args.len, oldlog, newlog); break; case RR_CALL_HD_TRANSFER: RR_COPY_ITEM(args->variant.hd_transfer_args); break; case RR_CALL_NET_TRANSFER: RR_COPY_ITEM(args->variant.net_transfer_args); break; case RR_CALL_HANDLE_PACKET: RR_COPY_ITEM(args->variant.handle_packet_args); args->variant.handle_packet_args.buf = g_malloc(args->variant.handle_packet_args.size); rr_fcopy(args->variant.handle_packet_args.buf, args->variant.handle_packet_args.size, 1, oldlog, newlog); break; default: //mz unimplemented sassert(0, 3); } } break; case RR_LAST: //mz nothing to read //ph We don't copy RR_LAST here; write out afterwards. break; default: //mz unimplemented sassert(0, 4); } return original_prog_point; }
/*
 * Pass `nmemb` elements of `size` bytes each from oldlog to newlog,
 * staging them through the caller-supplied buffer `ptr`.
 *
 * After the call `ptr` still holds the data that was transferred, so the
 * caller can inspect it (e.g. to learn the length of a following payload).
 */
static INLINEIT void
rr_fcopy(void *ptr, size_t size, size_t nmemb, FILE *oldlog, FILE *newlog)
{
    /* Pull the bytes in from the source log ... */
    rr_fread(ptr, size, nmemb, oldlog);

    /* ... and push the very same bytes out to the destination log. */
    rr_fwrite(ptr, size, nmemb, newlog);
}
void main(int argc, char *argv[]) { int i,j; char *vocab_filename; FILE *tempfile; char tempfiles_directory[1000]; int vocab_size; FILE *vocab_file; int verbosity; int buffer_size; int position_in_buffer; int number_of_tempfiles; int max_files; int fof_size; unsigned short *buffer; unsigned short *placeholder; unsigned short *temp_ngram; int temp_count; char temp_word[500]; char temp_word2[500]; char *temp_file_root; char *temp_file_ext; char *host_name; int proc_id; struct utsname uname_info; flag write_ascii; /* Vocab hash table things */ struct hash_table vocabulary; unsigned long hash_size; unsigned long M; tempfile = NULL; /* Just to prevent compilation warnings. */ report_version(&argc,argv); verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY); /* Process command line */ if (pc_flagarg( &argc, argv,"-help") || argc==1) { fprintf(stderr,"text2idngram - Convert a text stream to an id n-gram stream.\n"); fprintf(stderr,"Usage : text2idngram -vocab .vocab \n"); fprintf(stderr," [ -buffer 100 ]\n"); fprintf(stderr," [ -hash %d ]\n",DEFAULT_HASH_SIZE); fprintf(stderr," [ -temp %s ]\n",DEFAULT_TEMP); fprintf(stderr," [ -files %d ]\n",DEFAULT_MAX_FILES); fprintf(stderr," [ -gzip | -compress ]\n"); fprintf(stderr," [ -verbosity %d ]\n", DEFAULT_VERBOSITY); fprintf(stderr," [ -n 3 ]\n"); fprintf(stderr," [ -write_ascii ]\n"); fprintf(stderr," [ -fof_size 10 ]\n"); exit(1); } pc_message(verbosity,2,"text2idngram\n"); n = pc_intarg( &argc, argv, "-n",DEFAULT_N); placeholder = (unsigned short *) rr_malloc(sizeof(unsigned short)*n); temp_ngram = (unsigned short *) rr_malloc(sizeof(unsigned short)*n); hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE); buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM); write_ascii = pc_flagarg(&argc,argv,"-write_ascii"); fof_size = pc_intarg(&argc,argv,"-fof_size",10); max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES); vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" )); if 
(!strcmp("",vocab_filename)) { quit(-1,"text2idngram : Error : Must specify a vocabulary file.\n"); } strcpy(tempfiles_directory,pc_stringarg( &argc, argv, "-temp", DEFAULT_TEMP)); if (pc_flagarg(&argc,argv,"-compress")) { temp_file_ext = salloc(".Z"); } else { if (pc_flagarg(&argc,argv,"-gzip")) { temp_file_ext = salloc(".gz"); } else { temp_file_ext = salloc(""); } } uname(&uname_info); host_name = salloc(uname_info.nodename); proc_id = getpid(); sprintf(temp_word,"%s%s.%d.",TEMP_FILE_ROOT,host_name,proc_id); temp_file_root = salloc(temp_word); pc_report_unk_args(&argc,argv,verbosity); /* If the last charactor in the directory name isn't a / then add one. */ if (tempfiles_directory[strlen(tempfiles_directory)-1] != '/') { strcat(tempfiles_directory,"/"); } pc_message(verbosity,2,"Vocab : %s\n",vocab_filename); pc_message(verbosity,2,"N-gram buffer size : %d\n",buffer_size); pc_message(verbosity,2,"Hash table size : %d\n",hash_size); pc_message(verbosity,2,"Temp directory : %s\n",tempfiles_directory); pc_message(verbosity,2,"Max open files : %d\n",max_files); pc_message(verbosity,2,"FOF size : %d\n",fof_size); pc_message(verbosity,2,"n : %d\n",n); buffer_size *= (1000000/(sizeof(unsigned short)*n)); /* Allocate memory for hash table */ fprintf(stderr,"Initialising hash table...\n"); M = nearest_prime(hash_size); new_hashtable(&vocabulary,M); /* Read in the vocabulary */ vocab_size = 0; vocab_file = rr_iopen(vocab_filename); pc_message(verbosity,2,"Reading vocabulary...\n"); while (fgets (temp_word, sizeof(temp_word),vocab_file)) { if (strncmp(temp_word,"##",2)==0) continue; sscanf (temp_word, "%s ",temp_word2); /* Check for repeated words in the vocabulary */ if (index2(&vocabulary,temp_word2) != 0) { fprintf(stderr,"======================================================\n"); fprintf(stderr,"WARNING: word %s is repeated in the vocabulary.\n",temp_word); fprintf(stderr,"=======================================================\n"); } if (strncmp(temp_word,"#",1)==0) 
{ fprintf(stderr,"\n\n===========================================================\n"); fprintf(stderr,":\nWARNING: line assumed NOT a comment:\n"); fprintf(stderr, ">>> %s <<<\n",temp_word); fprintf(stderr, " '%s' will be included in the vocabulary.\n",temp_word2); fprintf(stderr, " (comments must start with '##')\n"); fprintf(stderr,"===========================================================\n\n"); } vocab_size++; add_to_hashtable(&vocabulary,hash(temp_word2,M),temp_word2,vocab_size); } if (vocab_size > MAX_VOCAB_SIZE) { quit(-1,"text2idngram : Error : Vocabulary size exceeds maximum.\n"); } pc_message(verbosity,2,"Allocating memory for the n-gram buffer...\n"); buffer=(unsigned short*) rr_malloc(n*(buffer_size+1)*sizeof(unsigned short)); number_of_tempfiles = 0; /* Read text into buffer */ /* Read in the first ngram */ position_in_buffer = 0; for (i=0;i<=n-1;i++) { get_word(stdin,temp_word); add_to_buffer(index2(&vocabulary,temp_word),0,i,buffer); } while (!rr_feof(stdin)) { /* Fill up the buffer */ pc_message(verbosity,2,"Reading text into the n-gram buffer...\n"); pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n"); while ((position_in_buffer<buffer_size) && (!rr_feof(stdin))) { position_in_buffer++; if (position_in_buffer % 20000 == 0) { if (position_in_buffer % 1000000 == 0) { pc_message(verbosity,2,".\n"); } else { pc_message(verbosity,2,"."); } } for (i=1;i<=n-1;i++) { add_to_buffer(buffer_contents(position_in_buffer-1,i,buffer), position_in_buffer,i-1,buffer); } if (get_word(stdin,temp_word) == 1) { add_to_buffer(index2(&vocabulary,temp_word),position_in_buffer, n-1,buffer); } } for (i=0;i<=n-1;i++) { placeholder[i] = buffer_contents(position_in_buffer,i,buffer); } /* Sort buffer */ pc_message(verbosity,2,"\nSorting n-grams...\n"); qsort((void*) buffer,(size_t) position_in_buffer, n*sizeof(unsigned short),compare_ngrams); /* Output the buffer to temporary BINARY file */ number_of_tempfiles++; 
sprintf(temp_word,"%s%s%hu%s",tempfiles_directory,temp_file_root, number_of_tempfiles,temp_file_ext); pc_message(verbosity,2,"Writing sorted n-grams to temporary file %s\n", temp_word); tempfile = rr_oopen(temp_word); for (i=0;i<=n-1;i++) { temp_ngram[i] = buffer_contents(0,i,buffer); if (temp_ngram[i] > MAX_VOCAB_SIZE) { quit(-1,"Invalid trigram in buffer.\nAborting"); } } temp_count = 1; for (i=1;i<=position_in_buffer;i++) { if (!compare_ngrams(temp_ngram,&buffer[i*n])) { temp_count++; } else { for (j=0;j<=n-1;j++) { rr_fwrite(&temp_ngram[j],sizeof(unsigned short),1, tempfile,"temporary n-gram ids"); temp_ngram[j] = buffer_contents(i,j,buffer); } rr_fwrite(&temp_count,sizeof(int),1,tempfile, "temporary n-gram counts"); temp_count = 1; } } rr_oclose(tempfile); for (i=0;i<=n-1;i++) { add_to_buffer(placeholder[i],0,i,buffer); } position_in_buffer = 0; } /* Merge the temporary files, and output the result to standard output */ pc_message(verbosity,2,"Merging temporary files...\n"); merge_tempfiles(1, number_of_tempfiles, temp_file_root, temp_file_ext, max_files, tempfiles_directory, stdout, write_ascii, fof_size); pc_message(verbosity,0,"text2idngram : Done.\n"); exit(0); }
int main(int argc, char *argv[]) { int verbosity; int vocab_size; FILE *vocab_file; int buffer_size; flag write_ascii; int max_files; int number_of_tempfiles; char *vocab_filename; char *idngram_filename; char temp_word[MAX_WORD_LENGTH]; char temp_word2[MAX_WORD_LENGTH]; char temp_word3[MAX_WORD_LENGTH]; flag contains_unks; int position_in_buffer; FILE *outfile; FILE *tempfile; FILE *non_unk_fp; ngram_rec *buffer; flag same_ngram; int i; int j; int fof_size; int size_of_rec; char temp_directory[1000]; char *temp_file_ext; /* Vocab hash table things */ struct idngram_hash_table vocabulary; unsigned long hash_size; unsigned long M; wordid_t *current_ngram; int current_count; wordid_t *sort_ngram; int sort_count; /* Process command line */ report_version(&argc,argv); if (argc == 1 || pc_flagarg(&argc, argv,"-help")) { /* Display help message */ help_message(); exit(1); } n = pc_intarg( &argc, argv, "-n",DEFAULT_N); hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE); buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM); write_ascii = pc_flagarg(&argc,argv,"-write_ascii"); verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY); max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES); fof_size = pc_intarg(&argc,argv,"-fof_size",10); vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" )); idngram_filename = salloc(pc_stringarg( &argc, argv, "-idngram", "" )); if (!strcmp("",vocab_filename)) quit(-1,"Error : Must specify a vocabulary file.\n"); if (!strcmp("",idngram_filename)) quit(-1,"text2idngram : Error : Must specify idngram file.\n"); if (pc_flagarg(&argc,argv,"-compress")) temp_file_ext = salloc(".Z"); else { if (pc_flagarg(&argc,argv,"-gzip")) temp_file_ext = salloc(".gz"); else temp_file_ext = salloc(""); } strcpy(temp_directory, "cmuclmtk-XXXXXX"); if (mkdtemp(temp_directory) == NULL) { quit(-1, "Failed to create temporary folder: %s\n", strerror(errno)); } pc_report_unk_args(&argc,argv,verbosity); outfile = 
rr_fopen(idngram_filename,"wb"); pc_message(verbosity,2,"Vocab : %s\n",vocab_filename); pc_message(verbosity,2,"Output idngram : %s\n",idngram_filename); pc_message(verbosity,2,"Buffer size : %d\n",buffer_size); pc_message(verbosity,2,"Hash table size : %d\n",hash_size); pc_message(verbosity,2,"Max open files : %d\n",max_files); pc_message(verbosity,2,"n : %d\n",n); pc_message(verbosity,2,"FOF size : %d\n",fof_size); size_of_rec = (sizeof(wordid_t) * n) + 16 - (( n* sizeof(wordid_t)) % 16); buffer_size *= (1000000/((sizeof(ngram_rec) + size_of_rec))); fprintf(stderr,"buffer size = %d\n",buffer_size); /* Allocate memory for hash table */ fprintf(stderr,"Initialising hash table...\n"); M = nearest_prime(hash_size); new_idngram_hashtable(&vocabulary,M); /* Read in the vocabulary */ vocab_size = 0; vocab_file = rr_iopen(vocab_filename); pc_message(verbosity,2,"Reading vocabulary...\n"); while (fgets (temp_word, sizeof(temp_word),vocab_file)) { if (strncmp(temp_word,"##",2)==0) continue; sscanf (temp_word, "%s ",temp_word2); /* Check for vocabulary order */ if (vocab_size > 0 && strcmp(temp_word2,temp_word3)<0) quit(-1,"wngram2idngram : Error : Vocabulary is not alphabetically ordered.\n"); /* Check for repeated words in the vocabulary */ if (index2(&vocabulary,temp_word2) != 0) warn_on_repeated_words(temp_word); warn_on_wrong_vocab_comments(temp_word); vocab_size++; add_to_idngram_hashtable(&vocabulary,idngram_hash(temp_word2,M),temp_word2,vocab_size); strcpy(temp_word3,temp_word2); } if (vocab_size > MAX_VOCAB_SIZE) quit(-1,"Error : Vocabulary size exceeds maximum.\n"); pc_message(verbosity,2,"Allocating memory for the buffer...\n"); buffer=(ngram_rec *) rr_malloc((buffer_size+1)*sizeof(ngram_rec)); for (i=0;i<=buffer_size;i++) buffer[i].word = (wordid_t *) rr_malloc(n*sizeof(wordid_t)); /* Open the "non-OOV" tempfile */ sprintf(temp_word, "%s/1%s", temp_directory, temp_file_ext); non_unk_fp = rr_fopen(temp_word,"w"); pc_message(verbosity,2,"Writing non-OOV counts to 
temporary file %s\n", temp_word); number_of_tempfiles = 1; current_ngram = (wordid_t *) rr_malloc(n*sizeof(wordid_t)); sort_ngram = (wordid_t *) rr_malloc(n*sizeof(wordid_t)); /* Read text into buffer */ position_in_buffer = 0; while (!rr_feof(stdin)) { for (i=0;i<=n-1;i++) { get_word(stdin,temp_word); current_ngram[i]=index2(&vocabulary,temp_word); } if (scanf("%d",¤t_count) != 1) if (!rr_feof(stdin)) quit(-1,"Error reading n-gram count from stdin.\n"); if (!rr_feof(stdin)) { contains_unks = 0; for (i=0;i<=n-1;i++) { if (!current_ngram[i]) contains_unks = 1; } if (contains_unks) { /* Write to buffer */ position_in_buffer++; if (position_in_buffer >= buffer_size) { /* Sort buffer */ pc_message(verbosity,2, "Sorting n-grams which include an OOV word...\n"); qsort((void*) buffer,(size_t) position_in_buffer, sizeof(ngram_rec),compare_ngrams2); pc_message(verbosity,2,"Done.\n"); /* Write buffer to temporary file */ number_of_tempfiles++; sprintf(temp_word,"%s/%hu%s", temp_directory, number_of_tempfiles,temp_file_ext); pc_message(verbosity,2, "Writing sorted OOV-counts buffer to temporary file %s\n", temp_word); tempfile = rr_fopen(temp_word,"w"); for (i=0;i<=n-1;i++) sort_ngram[i] = buffer[0].word[i]; sort_count = buffer[0].count; for (i=0;i<=position_in_buffer-2;i++) { same_ngram = 1; for (j=n-1;j>=0;j--) { if (buffer[i].word[j] != sort_ngram[j]) { same_ngram = 0; j = -1; } } if (same_ngram) sort_count += buffer[i].count; else { for (j=0;j<=n-1;j++) { rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1, tempfile,"temporary n-gram ids"); sort_ngram[j] = buffer[i].word[j]; } rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile, "temporary n-gram counts"); sort_count = buffer[i].count; } } for (j=0;j<=n-1;j++) rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1, tempfile,"temporary n-gram ids"); rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile, "temporary n-gram counts"); rr_oclose(tempfile); position_in_buffer = 1; } for (i=0;i<=n-1;i++) 
buffer[position_in_buffer-1].word[i] = current_ngram[i]; buffer[position_in_buffer-1].count = current_count; }else { /* Write to temporary file */ for (i=0;i<=n-1;i++) rr_fwrite((char*)¤t_ngram[i],sizeof(wordid_t),1, non_unk_fp,"temporary n-gram ids"); rr_fwrite((char*)¤t_count,sizeof(int),1,non_unk_fp, "temporary n-gram counts"); } } } if (position_in_buffer > 0) { /* Only do this bit if we have actually seen some OOVs */ /* Sort final buffer */ pc_message(verbosity,2,"Sorting final buffer...\n"); qsort((void*) buffer,(size_t) position_in_buffer, sizeof(ngram_rec),compare_ngrams2); /* Write final buffer */ number_of_tempfiles++; sprintf(temp_word,"%s/%hu%s", temp_directory, number_of_tempfiles,temp_file_ext); pc_message(verbosity,2,"Writing sorted buffer to temporary file %s\n", temp_word); tempfile = rr_fopen(temp_word,"w"); for (i=0;i<=n-1;i++) sort_ngram[i] = buffer[0].word[i]; sort_count = buffer[0].count; for (i=1;i<=position_in_buffer-1;i++) { same_ngram = 1; for (j=n-1;j>=0;j--) { if (buffer[i].word[j] != sort_ngram[j]) { same_ngram = 0; j = -1; } } if (same_ngram) sort_count += buffer[i].count; else { for (j=0;j<=n-1;j++) { rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1, tempfile,"temporary n-gram ids"); sort_ngram[j] = buffer[i].word[j]; } rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile, "temporary n-gram counts"); sort_count = buffer[i].count; } } for (j=0;j<=n-1;j++) rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1, tempfile,"temporary n-gram ids"); rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile, "temporary n-gram counts"); fclose(tempfile); } /* Merge the temporary files, and output the result */ fclose(non_unk_fp); pc_message(verbosity,2,"Merging temporary files...\n"); merge_idngramfiles(1, number_of_tempfiles, temp_directory, temp_file_ext, max_files, outfile, write_ascii, fof_size, n); fclose(outfile); rmdir(temp_directory); pc_message(verbosity,0,"wngram2idngram : Done.\n"); return 0; }
void main(int argc, char *argv[]) { int verbosity; int n; int m; int i; int input_type; int storage_type; unsigned short *current_ngram_int; unsigned short *previous_ngram_int; char **current_ngram_text; char **previous_ngram_text; int current_count; int running_total; flag same; flag first_one; flag got_to_eof; running_total = 0; report_version(&argc,argv); if (pc_flagarg( &argc, argv,"-help") || argc==1) { fprintf(stderr,"ngram2mgram - Convert an n-gram file to an m-gram file, where m<n\n"); fprintf(stderr,"Usage : ngram2mgram -n N -m M\n"); fprintf(stderr," [ -binary | -ascii | -words ]\n"); fprintf(stderr," < .ngram > .mgram\n"); exit(1); } n = pc_intarg( &argc, argv,"-n",0); m = pc_intarg( &argc, argv,"-m",0); verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY); input_type = 0; if (pc_flagarg( &argc, argv,"-binary")) { input_type = BINARY; } if (pc_flagarg( &argc, argv,"-ascii")) { if (input_type != 0) { quit(-1,"Error : more than one file format specified.\n"); } input_type = ASCII; } if (pc_flagarg( &argc, argv,"-words")) { if (input_type != 0) { quit(-1,"Error : more than one file format specified.\n"); } input_type = WORDS; } if (input_type == 0) { pc_message(verbosity,2,"Warning : no input type specified. Defaulting to binary.\n"); input_type = BINARY; } if (n == 0) { quit(-1,"Must specify a value for n. Use the -n switch.\n"); } if (m == 0) { quit(-1,"Must specify a value for m. Use the -m switch.\n"); } if (n<=m) { quit(-1,"n must be greater than m.\n"); } pc_report_unk_args(&argc,argv,verbosity); if (input_type == BINARY || input_type == ASCII) { storage_type = NUMERIC; } else { storage_type = ALPHA; } if (storage_type == NUMERIC) { current_ngram_int = (unsigned short *) rr_malloc(n*sizeof(unsigned short)); previous_ngram_int = (unsigned short *) rr_malloc(n*sizeof(unsigned short)); /* And to prevent compiler warnings ... 
*/ current_ngram_text = NULL; previous_ngram_text = NULL; } else { current_ngram_text = (char **) rr_malloc(n*sizeof(char *)); previous_ngram_text = (char **) rr_malloc(n*sizeof(char *)); for (i=0;i<=n-1;i++) { current_ngram_text[i] = (char *) rr_malloc(MAX_WORD_LENGTH*sizeof(char)); previous_ngram_text[i] = (char *) rr_malloc(MAX_WORD_LENGTH*sizeof(char)); } /* And to prevent compiler warnings ... */ current_ngram_int = NULL; previous_ngram_int = NULL; } got_to_eof = 0; first_one = 1; while (!rr_feof(stdin)) { /* Store previous n-gram */ if (!first_one) { if (storage_type == NUMERIC) { for (i=0;i<=n-1;i++) { previous_ngram_int[i] = current_ngram_int[i]; } } else { for (i=0;i<=n-1;i++) { strcpy(previous_ngram_text[i],current_ngram_text[i]); } } } /* Read new n-gram */ switch(input_type) { case BINARY: for (i=0;i<=n-1;i++) { rr_fread(¤t_ngram_int[i],sizeof(id__t),1,stdin, "from id_ngrams at stdin",0); } rr_fread(¤t_count,sizeof(count_t),1,stdin, "from id_ngrams file at stdin",0); break; case ASCII: for (i=0;i<=n-1;i++) { if (fscanf(stdin,"%hu",¤t_ngram_int[i]) != 1) { if (!rr_feof(stdin)) { quit(-1,"Error reading id_ngram.\n"); } else { got_to_eof = 1; } } } if (fscanf(stdin,"%d",¤t_count) != 1) { if (!rr_feof(stdin)) { quit(-1,"Error reading id_ngram.\n"); } else { got_to_eof = 1; } } break; case WORDS: for (i=0;i<=n-1;i++) { if (fscanf(stdin,"%s",current_ngram_text[i]) != 1) { if (!rr_feof(stdin)) { quit(-1,"Error reading id_ngram.\n"); } else { got_to_eof = 1; } } } if (fscanf(stdin,"%d",¤t_count) != 1) { if (!rr_feof(stdin)) { quit(-1,"Error reading id_ngram.\n"); } else { got_to_eof = 1; } } break; } if (!got_to_eof) { /* Check for correct sorting */ if (!first_one) { switch(storage_type) { case NUMERIC: for (i=0;i<=n-1;i++) { if (current_ngram_int[i]<previous_ngram_int[i]) { quit(-1,"Error : ngrams not correctly sorted.\n"); } else { if (current_ngram_int[i]>previous_ngram_int[i]) { i=n; } } } break; case ALPHA: for (i=0;i<=n-1;i++) { if 
(strcmp(current_ngram_text[i],previous_ngram_text[i])<0) { quit(-1,"Error : ngrams not correctly sorted.\n"); } else { if (strcmp(current_ngram_text[i],previous_ngram_text[i])>0) { i=n; } } } break; } } /* Compare this m-gram with previous m-gram */ if (!first_one) { switch(storage_type) { case NUMERIC: same = 1; for (i=0;i<=m-1;i++) { if (current_ngram_int[i] != previous_ngram_int[i]) { same = 0; } } if (same) { running_total += current_count; } else { if (input_type == ASCII) { for (i=0;i<=m-1;i++) { printf("%d ",previous_ngram_int[i]); } printf("%d\n",running_total); } else { for (i=0;i<=m-1;i++) { rr_fwrite(&previous_ngram_int[i],sizeof(id__t),1,stdout, "to id_ngrams at stdout"); } rr_fwrite(&running_total,sizeof(count_t),1,stdout, "to id n-grams at stdout"); } running_total = current_count; } break; case ALPHA: same = 1; for (i=0;i<=m-1;i++) { if (strcmp(current_ngram_text[i],previous_ngram_text[i])) { same = 0; } } if (same) { running_total += current_count; } else { for (i=0;i<=m-1;i++) { printf("%s ",previous_ngram_text[i]); } printf("%d\n",running_total); running_total = current_count; } break; } } else { running_total = current_count; } first_one = 0; } } /* Write out final m-gram */ switch(input_type) { case BINARY: break; case ASCII: for (i=0;i<=m-1;i++) { printf("%d ",previous_ngram_int[i]); } printf("%d\n",running_total); break; case WORDS: for (i=0;i<=m-1;i++) { printf("%s ",previous_ngram_text[i]); } printf("%d\n",running_total); break; } pc_message(verbosity,0,"ngram2mgram : Done.\n"); exit(0); }
/*
 * Merge the sorted temporary id n-gram files numbered start_file ..
 * end_file into outfile, summing the counts of n-grams that occur in more
 * than one file, and optionally collecting frequency-of-frequency
 * statistics.
 *
 * start_file, end_file : inclusive range of temporary file numbers.
 * temp_file_root       : directory holding the files; file k is named
 *                        "<temp_file_root>/<k><temp_file_ext>".
 * temp_file_ext        : extension appended to every temporary file name.
 * max_files            : largest number of files merged in one pass; a
 *                        bigger range is split (see the first branch).
 * outfile              : destination stream.
 * write_ascii          : non-zero writes "id ... count\n" text records,
 *                        zero writes binary records.
 * fof_size             : when > 0 (and n > 1) frequency-of-frequency
 *                        tables up to this count are built and displayed
 *                        on stderr.
 * n_order              : n-gram order; also stored in the file-level `n`.
 *
 * Every temporary file opened by the single-pass branch is closed and
 * removed afterwards.
 */
void merge_idngramfiles (int start_file,
                         int end_file,
                         char *temp_file_root,
                         char *temp_file_ext,
                         int max_files,
                         FILE *outfile,
                         flag write_ascii,
                         int fof_size,
                         int n_order)
{
  FILE *new_temp_file;
  char temp_string[1000];
  char *new_temp_filename;

  FILE **temp_file;
  char **temp_filename;
  wordid_t **current_ngram;
  wordid_t *smallest_ngram;
  wordid_t *previous_ngram;

  int *current_ngram_count;
  flag *finished;
  flag all_finished;
  int temp_count;
  int i,j;

  flag first_ngram;
  fof_t **fof_array;
  ngram_sz_t *num_kgrams;
  int *ng_count;
  int pos_of_novelty;

  n = n_order;

  pos_of_novelty = n; /* Simply for warning-free compilation */
  temp_count = 0;     /* read after the merge even if nothing was merged */
  num_kgrams = (ngram_sz_t *) rr_calloc(n-1,sizeof(ngram_sz_t));
  ng_count = (int *) rr_calloc(n-1,sizeof(int));
  first_ngram = 1;

  previous_ngram = (wordid_t *) rr_calloc(n,sizeof(wordid_t));
  temp_file = (FILE **) rr_malloc(sizeof(FILE *) * (end_file-start_file+1));
  temp_filename = (char **) rr_malloc(sizeof(char *) *
                                      (end_file-start_file+1));

  /* should change to 2d array*/
  current_ngram = (wordid_t **) rr_malloc(sizeof(wordid_t *) *
                                          (end_file-start_file+1));
  for (i=0;i<=end_file-start_file;i++)
    current_ngram[i] = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);

  current_ngram_count = (int *) rr_malloc(sizeof(int)*(end_file-start_file+1));

  finished = (flag *) rr_malloc(sizeof(flag)*(end_file-start_file+1));
  smallest_ngram = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);

  /* should change to 2d array*/
  fof_array = (fof_t **) rr_malloc(sizeof(fof_t *)*(n-1));
  for (i=0;i<=n-2;i++)
    fof_array[i] = (fof_t *) rr_calloc(fof_size+1,sizeof(fof_t));

  if (end_file-start_file+1 > max_files) {

    /* Too many files for one pass: merge the first max_files of them into
       a new temporary file numbered end_file+1, then merge the remainder
       together with that new file.
       NOTE(review): merge_tempfiles is defined outside this block and is
       called here with 8 arguments - confirm it is the intended callee. */
    sprintf(temp_string,"%s/%hu%s",temp_file_root,
            end_file+1,temp_file_ext);
    new_temp_filename = salloc(temp_string);
    new_temp_file = rr_oopen(new_temp_filename);
    merge_tempfiles(start_file,start_file+max_files-1,
                    temp_file_root,temp_file_ext,max_files,
                    new_temp_file,write_ascii,0);
    merge_tempfiles(start_file+max_files,end_file+1,
                    temp_file_root,temp_file_ext,max_files,
                    outfile,write_ascii,0);

  }else {

    /* Open all the temp files for reading */
    for (i=0;i<=end_file-start_file;i++) {
      sprintf(temp_string,"%s/%hu%s",temp_file_root,
              i+start_file,temp_file_ext);
      temp_filename[i] = salloc(temp_string);
      temp_file[i] = rr_iopen(temp_filename[i]);
    }

    /* Now go through the files simultaneously, and write out the appropriate
       ngram counts to the output file. */

    /* Prime every file with its first record.  A file that is empty has
       nothing to contribute and is marked finished straight away;
       otherwise its (uninitialised) "current" n-gram would be merged. */
    all_finished = 1;

    for (i=end_file-start_file;i>=0;i--) {
      if (!rr_feof(temp_file[i])) {
        finished[i] = 0;
        all_finished = 0;
        for (j=0;j<=n-1;j++) {
          rr_fread((char*) &current_ngram[i][j], sizeof(wordid_t),1,
                   temp_file[i],"temporary n-gram ids",0);
        }
        rr_fread((char*) &current_ngram_count[i], sizeof(int),1,
                 temp_file[i],"temporary n-gram counts",0);
      }
      else {
        finished[i] = 1;
      }
    }

    while (!all_finished) {

      /* Find the smallest current ngram */
      for (i=0;i<=n-1;i++)
        smallest_ngram[i] = MAX_WORDID;

      for (i=0;i<=end_file-start_file;i++) {
        if (!finished[i]) {
          if (compare_ngrams3(smallest_ngram,current_ngram[i]) < 0) {
            for (j=0;j<n;j++)
              smallest_ngram[j] = current_ngram[i][j];
          }
        }
      }

#if MAX_VOCAB_SIZE < 65535
      /* This check is well-meaning but completely useless since
         smallest_ngram[i] by definition cannot contain any value
         greater than MAX_VOCAB_SIZE (dhuggins@cs, 2006-03) */
      for (i=0;i<=n-1;i++) {
        if (smallest_ngram[i] > MAX_VOCAB_SIZE) {
          quit(-1,"Error : Temporary files corrupted, invalid n-gram found.\n");
        }
      }
#endif

      /* For each of the files that are currently holding this ngram,
         add its count to the temporary count, and read in a new ngram
         from the files. */

      temp_count = 0;

      for (i=0;i<=end_file-start_file;i++) {
        if (!finished[i]) {
          if (compare_ngrams3(smallest_ngram,current_ngram[i]) == 0) {
            temp_count = temp_count + current_ngram_count[i];
            if (!rr_feof(temp_file[i])) {
              for (j=0;j<=n-1;j++) {
                rr_fread((char*) &current_ngram[i][j],sizeof(wordid_t),1,
                         temp_file[i],"temporary n-gram ids",0);
              }
              rr_fread((char*)&current_ngram_count[i],sizeof(int),1,
                       temp_file[i],"temporary n-gram count",0);
            }else {
              /* This file is exhausted; stop once they all are. */
              finished[i] = 1;
              all_finished = 1;
              for (j=0;j<=end_file-start_file;j++) {
                if (!finished[j])
                  all_finished = 0;
              }
            }
          }
        }
      }

      if (write_ascii) {
        for (i=0;i<=n-1;i++) {
          if (fprintf(outfile,"%d ",smallest_ngram[i]) < 0) {
            quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");
          }
        }
        if (fprintf(outfile,"%d\n",temp_count) < 0)
          quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");

      }else {
        for (i=0;i<=n-1;i++) {
          rr_fwrite((char*)&smallest_ngram[i],sizeof(wordid_t),1,
                    outfile,"n-gram ids");
        }
        rr_fwrite((char*)&temp_count,sizeof(count_t),1,outfile,"n-gram counts");
      }

      if (fof_size > 0 && n>1) { /* Add stuff to fof arrays */

        /* Code from idngram2stats */
        /* First position at which this n-gram exceeds the previous one. */
        pos_of_novelty = n;
        for (i=0;i<=n-1;i++) {
          if (smallest_ngram[i] > previous_ngram[i]) {
            pos_of_novelty = i;
            i=n;
          }
        }

        /* Add new N-gram */

        num_kgrams[n-2]++;
        if (temp_count <= fof_size)
          fof_array[n-2][temp_count]++;

        if (!first_ngram) {
          for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
            num_kgrams[i-1]++;
            if (ng_count[i-1] <= fof_size) {
              fof_array[i-1][ng_count[i-1]]++;
            }
            ng_count[i-1] = temp_count;
          }
        }else {
          for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
            ng_count[i-1] = temp_count;
          }
          first_ngram = 0;
        }

        for (i=0;i<=pos_of_novelty-2;i++)
          ng_count[i] += temp_count;

        for (i=0;i<=n-1;i++)
          previous_ngram[i]=smallest_ngram[i];

      }
    }

    for (i=0;i<=end_file-start_file;i++) {
      fclose(temp_file[i]);
      remove(temp_filename[i]);
    }
  }

  if (fof_size > 0 && n>1) { /* Display fof arrays */

    /* Process last ngram */
    for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
      num_kgrams[i-1]++;
      if (ng_count[i-1] <= fof_size)
        fof_array[i-1][ng_count[i-1]]++;

      ng_count[i-1] = temp_count;
    }

    for (i=0;i<=pos_of_novelty-2;i++)
      ng_count[i] += temp_count;

    display_fof_array(num_kgrams,fof_array,fof_size,stderr, n);
  }
}
/*
 * Read text from infp, convert it to overlapping id n-grams and write
 * them, sorted and with identical n-grams collapsed to a single
 * (ids, count) record, to a series of temporary files.
 *
 * infp           : text stream to read words from.
 * vocabulary     : hash table used to map each word to its id.
 * verbosity      : message level passed on to pc_message().
 * buffer         : caller-allocated n-gram buffer; entries 0 .. buffer_size
 *                  are used, so it must hold buffer_size+1 n-grams.
 * buffer_size    : number of n-grams sorted and written per temporary file;
 *                  must be at least 1.
 * n              : n-gram order.
 * temp_file_root : directory for the temporary files; file k is named
 *                  "<temp_file_root>/<k><temp_file_ext>".
 * temp_file_ext  : extension appended to every temporary file name.
 * temp_file      : used only as a local file handle; the value passed in
 *                  is ignored.
 *
 * Buffer layout: entries 0 .. position_in_buffer-1 are the complete
 * n-grams of the current buffer; entry position_in_buffer is the n-gram
 * under construction, which is carried over as entry 0 of the next buffer.
 *
 * @return number_of_tempfiles
 */
int read_txt2ngram_buffer(FILE* infp,
                          struct idngram_hash_table *vocabulary,
                          int32 verbosity,
                          wordid_t *buffer,
                          int buffer_size,
                          unsigned int n,
                          char* temp_file_root,
                          char* temp_file_ext,
                          FILE* temp_file
                          )
{
  /* Read text into buffer */
  char temp_word[MAX_WORD_LENGTH];
  int position_in_buffer;
  int number_of_tempfiles;
  int name_len;
  unsigned int i,j;
  wordid_t *placeholder;
  wordid_t *temp_ngram;
  int temp_count;

  /* With an empty buffer the fill loop below could never make progress. */
  if (buffer_size < 1)
    quit(-1,"Error : n-gram buffer size must be at least 1.\n");

  temp_ngram = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);
  placeholder = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);

  /* NOTE(review): `ng` is not declared in this function; presumably a
     file-level n-gram order read by compare_ngrams() - confirm. */
  ng=n;

  position_in_buffer = 0;
  number_of_tempfiles = 0;

  //tk: looks like things may croak if the corpus has less than n words
  //not that such a corpus would be useful anyway
  for (i=0;i<=n-1;i++) {
    get_word(infp,temp_word);
    add_to_buffer(index2(vocabulary,temp_word),0,i,buffer);
  }

  while (!rr_feof(infp)) {
    /* Fill up the buffer */

    pc_message(verbosity,2,"Reading text into the n-gram buffer...\n");
    pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");
    while ((position_in_buffer<buffer_size) && (!rr_feof(infp))) {
      position_in_buffer++;
      show_idngram_nlines(position_in_buffer,verbosity);

      /* New n-gram = previous n-gram shifted left by one word ... */
      for (i=1;i<=n-1;i++)
        add_to_buffer(buffer_contents(position_in_buffer-1,i,buffer),
                      position_in_buffer,i-1,buffer);

      /* ... plus the next word of the text. */
      if (get_word(infp,temp_word) == 1) {
        add_to_buffer(index2(vocabulary,temp_word),position_in_buffer,
                      n-1,buffer);
      }
    }

    /* Save the n-gram under construction; qsort below only touches
       entries 0 .. position_in_buffer-1. */
    for (i=0;i<=n-1;i++)
      placeholder[i] = buffer_contents(position_in_buffer,i,buffer);

    /* Sort buffer */

    pc_message(verbosity,2,"\nSorting n-grams...\n");

    qsort((void*) buffer,(size_t) position_in_buffer,n*sizeof(wordid_t),compare_ngrams);

    /* Output the buffer to temporary BINARY file */

    number_of_tempfiles++;

    name_len = snprintf(temp_word,sizeof(temp_word),"%s/%hu%s",temp_file_root,
                        number_of_tempfiles,temp_file_ext);
    if (name_len < 0 || (size_t) name_len >= sizeof(temp_word))
      quit(-1,"Error : Temporary file name is too long.\n");

    pc_message(verbosity,2,"Writing sorted n-grams to temporary file %s\n",
               temp_word);

    temp_file = rr_oopen(temp_word);

    for (i=0;i<=n-1;i++) {
      temp_ngram[i] = buffer_contents(0,i,buffer);
#if MAX_VOCAB_SIZE < 65535
      /* This check is well-meaning but completely useless since
         buffer_contents() can never return something greater than
         MAX_VOCAB_SIZE (dhuggins@cs, 2006-03) */
      if (temp_ngram[i] > MAX_VOCAB_SIZE)
        quit(-1,"Invalid trigram in buffer.\nAborting");
#endif
    }
    temp_count = 1;

    /* Collapse runs of identical n-grams among the SORTED entries only.
       The previous code also compared against the unsorted carry-over
       entry and relied on it being different in order to flush the last
       run; when it happened to be equal, that run was silently lost. */
    for (i=1;i<(unsigned int) position_in_buffer;i++) {

      if (!compare_ngrams(temp_ngram,&buffer[i*n]))
        temp_count++;
      else {
        for (j=0;j<=n-1;j++) {
          rr_fwrite((char*) &temp_ngram[j],sizeof(wordid_t),1,
                    temp_file,"temporary n-gram ids");
          temp_ngram[j] = buffer_contents(i,j,buffer);
        }
        rr_fwrite((char*)&temp_count,sizeof(int),1,temp_file,
                  "temporary n-gram counts");

        temp_count = 1;
      }
    }

    /* Flush the last run unconditionally. */
    for (j=0;j<=n-1;j++)
      rr_fwrite((char*) &temp_ngram[j],sizeof(wordid_t),1,
                temp_file,"temporary n-gram ids");
    rr_fwrite((char*)&temp_count,sizeof(int),1,temp_file,
              "temporary n-gram counts");

    rr_oclose(temp_file);

    /* The carried-over n-gram starts the next buffer. */
    for (i=0;i<=n-1;i++)
      add_to_buffer(placeholder[i],0,i,buffer);

    position_in_buffer = 0;
  }

  return number_of_tempfiles;
}
/**
 * Dump the language model held in `ng` to ng->bin_fp in binary form and
 * close that stream.
 *
 * The output is, in order: the scalar header fields, the vocabulary hash
 * table, the marginal counts, alpha array, count tables (two-byte-count
 * models only), pointer tables, unigram statistics, cutoffs, the
 * discounting-method specific data, and finally the n-gram tree.  The
 * sequence of writes defines the file layout, so it must not be reordered.
 *
 * @param ng         model to write; ng->version is set to BBO_FILE_VERSION
 *                   as a side effect
 * @param verbosity  verbosity level passed to pc_message()
 */
void write_bin_lm(ng_t *ng,int verbosity)
{
  const int chunk_limit = 100000;   /* max records per tree write */
  int span;                         /* records in the current write */
  int start;                        /* first record of the current write */
  int level;                        /* 0-based n-gram level / table index */

  pc_message(verbosity,1,"Binary %d-gram language model will be written to %s\n",ng->n,ng->bin_filename);

  ng->version = BBO_FILE_VERSION;

  /* ---- Scalar parameters ---- */

  rr_fwrite((char*)&ng->version,sizeof(int),1,ng->bin_fp,"version");
  rr_fwrite((char*)&ng->n,sizeof(unsigned short),1,ng->bin_fp,"n");
  rr_fwrite((char*)&ng->vocab_size,sizeof(wordid_t),1,ng->bin_fp,"vocab_size");
  rr_fwrite((char*)&ng->no_of_ccs,sizeof(unsigned short),1,ng->bin_fp,"no_of_ccs");
  rr_fwrite((char*)&ng->vocab_type,sizeof(unsigned short),1,ng->bin_fp,"vocab_type");
  rr_fwrite((char*)&ng->count_table_size,sizeof(count_ind_t),1,ng->bin_fp,"count_table_size");
  rr_fwrite((char*)&ng->discounting_method,sizeof(unsigned short),1,ng->bin_fp,"discounting_method");
  rr_fwrite((char*)&ng->min_alpha,sizeof(double),1,ng->bin_fp,"min_alpha");
  rr_fwrite((char*)&ng->max_alpha,sizeof(double),1,ng->bin_fp,"max_alpha");
  rr_fwrite((char*)&ng->out_of_range_alphas,sizeof(unsigned short),1,ng->bin_fp,"out_of_range_alphas");
  rr_fwrite((char*)&ng->size_of_alpha_array,sizeof(unsigned short),1,ng->bin_fp,"size_of_alpha_array");
  rr_fwrite((char*)&ng->n_unigrams,sizeof(ngram_sz_t),1,ng->bin_fp,"n_unigrams");
  rr_fwrite((char*)&ng->zeroton_fraction,sizeof(double),1,ng->bin_fp,"zeroton_fraction");
  rr_fwrite((char*)&ng->oov_fraction,sizeof(double),1,ng->bin_fp,"oov_fraction");
  rr_fwrite((char*)&ng->four_byte_counts,sizeof(flag),1,ng->bin_fp,"four_byte_counts");
  rr_fwrite((char*)&ng->four_byte_alphas,sizeof(flag),1,ng->bin_fp,"four_byte_alphas");
  rr_fwrite((char*)&ng->first_id,sizeof(unsigned short),1,
	    ng->bin_fp,"first_id");

  /* ---- Short and shortish arrays ---- */

  /* ng->vocab itself is not stored; it is to be derived from ng->vocab_ht. */
  sih_val_write_to_file(ng->vocab_ht,ng->bin_fp,ng->bin_filename,0);

  /* Marginal counts: which array is used depends on the count width. */
  if (ng->four_byte_counts==1) {
    assert(ng->marg_counts4);
    rr_fwrite((char*)ng->marg_counts4,sizeof(count_t),
	      ng->vocab_size+1,ng->bin_fp,"marg_counts");
  }
  else {
    assert(ng->marg_counts);
    rr_fwrite((char*)ng->marg_counts,sizeof(count_ind_t),
	      ng->vocab_size+1,ng->bin_fp,"marg_counts");
  }

  rr_fwrite((char*)ng->alpha_array,sizeof(double),
	    ng->size_of_alpha_array,ng->bin_fp,"alpha_array");

  /* The count tables are only written when four-byte counts are not in
     use.  They could go out as one block, but are written one table at a
     time to be safe (see the remarks on the tree data further down). */
  if (!ng->four_byte_counts) {
    for (level=0;level<=ng->n-1;level++) {
      rr_fwrite((char*)ng->count_table[level],sizeof(count_t),
		ng->count_table_size+1,ng->bin_fp,"count_table");
    }
  }

  rr_fwrite((char*)ng->ptr_table_size,sizeof(ptr_tab_sz_t),ng->n,ng->bin_fp,"ptr_table_size");

  for (level=0;level<=ng->n-1;level++) {
    rr_fwrite((char*)ng->ptr_table[level],sizeof(ptr_tab_t),ng->ptr_table_size[level],ng->bin_fp,"ptr_table");
  }

  /* ---- Unigram statistics ---- */

  rr_fwrite((char*)ng->uni_probs,sizeof(uni_probs_t), ng->vocab_size+1,
	    ng->bin_fp,"uni_probs");
  rr_fwrite((char*)ng->uni_log_probs,sizeof(uni_probs_t),ng->vocab_size+1,
	    ng->bin_fp,"uni_log_probs");
  rr_fwrite((char*)ng->context_cue,sizeof(flag),ng->vocab_size+1,
	    ng->bin_fp,"context_cue");

  rr_fwrite((char*)ng->cutoffs,sizeof(cutoff_t),ng->n,ng->bin_fp,"cutoffs");

  /* ---- Discounting-method specific data ---- */

  switch (ng->discounting_method) {
  case GOOD_TURING:
    rr_fwrite((char*)ng->fof_size,sizeof(fof_sz_t),ng->n,ng->bin_fp,"fof_size");
    rr_fwrite((char*)ng->disc_range,sizeof(unsigned short),ng->n,
	      ng->bin_fp,"disc_range");
    for (level=0;level<=ng->n-1;level++) {
      rr_fwrite((char*)ng->freq_of_freq[level],sizeof(fof_t),
		ng->fof_size[level]+1,ng->bin_fp,"freq_of_freq");
    }
    for (level=0;level<=ng->n-1;level++) {
      rr_fwrite((char*)ng->gt_disc_ratio[level],sizeof(disc_val_t),
		ng->disc_range[level]+1,ng->bin_fp,"gt_disc_ratio");
    }
    break;
  case WITTEN_BELL:
    /* nothing extra is written for this method */
    break;
  case LINEAR:
    rr_fwrite((char*)ng->lin_disc_ratio,sizeof(disc_val_t),
	      ng->n,ng->bin_fp,"lin_disc_ratio");
    break;
  case ABSOLUTE:
    rr_fwrite((char*)ng->abs_disc_const,sizeof(double),
	      ng->n,ng->bin_fp,"abs_disc_const");
    break;
  }

  /* ---- Tree information ---- */

  /* Unigram level first: each array is dumped with a single call. */

  rr_fwrite((char*)ng->num_kgrams,sizeof(ngram_sz_t),ng->n,ng->bin_fp,"num_kgrams");

  if (ng->four_byte_counts) {
    rr_fwrite((char*)ng->count4[0],sizeof(count_t),ng->vocab_size+1,
	      ng->bin_fp,"unigram counts");
  }
  else {
    rr_fwrite((char*)ng->count[0],sizeof(count_ind_t),ng->vocab_size+1,
	      ng->bin_fp,"unigram counts");
  }

  if (ng->four_byte_alphas) {
    rr_fwrite((char*)ng->bo_weight4[0],sizeof(four_byte_t),ng->vocab_size+1,
	      ng->bin_fp,"unigram backoff weights");
  }
  else {
    rr_fwrite((char*)ng->bo_weight[0],sizeof(bo_weight_t),ng->vocab_size+1,
	      ng->bin_fp,"unigram backoff weights");
  }

  if (ng->n > 1) {
    rr_fwrite((char*)ng->ind[0],sizeof(index__t),ng->vocab_size+1,
	      ng->bin_fp,"unigram -> bigram pointers");
  }

  /* The higher levels are written in pieces of at most chunk_limit
     records, otherwise the kernel buffers are too big.  These writes pass
     the data as plain bytes (element size 1), so the byte swapping is done
     on the structure itself beforehand and undone at the very end. */

  swap_struct(ng);

  /* Word ids, levels 1 .. n-1. */
  for (level=1;level<=ng->n-1;level++) {
    for (start=0;start<ng->num_kgrams[level];start+=span) {
      span = chunk_limit;
      if (start+span > ng->num_kgrams[level])
	span = ng->num_kgrams[level] - start;
      rr_fwrite((char*)&ng->word_id[level][start],1,sizeof(id__t)*span,ng->bin_fp,"word ids");
    }
  }

  /* Counts, levels 1 .. n-1. */
  for (level=1;level<=ng->n-1;level++) {
    for (start=0;start<ng->num_kgrams[level];start+=span) {
      span = chunk_limit;
      if (start+span > ng->num_kgrams[level])
	span = ng->num_kgrams[level] - start;
      if (ng->four_byte_counts) {
	rr_fwrite((char*)&ng->count4[level][start],1,sizeof(count_t)*span,ng->bin_fp,"counts");
      }
      else {
	rr_fwrite((char*)&ng->count[level][start],1,sizeof(count_ind_t)*span,ng->bin_fp,"counts");
      }
    }
  }

  /* Back-off weights, levels 1 .. n-2 (the last level is not written). */
  for (level=1;level<=ng->n-2;level++) {
    for (start=0;start<ng->num_kgrams[level];start+=span) {
      span = chunk_limit;
      if (start+span > ng->num_kgrams[level])
	span = ng->num_kgrams[level] - start;
      if (ng->four_byte_alphas) {
	rr_fwrite((char*)&ng->bo_weight4[level][start],1,sizeof(four_byte_t)*span,
		  ng->bin_fp,"backoff weights");
      }
      else {
	rr_fwrite((char*)&ng->bo_weight[level][start],1,sizeof(bo_weight_t)*span,
		  ng->bin_fp,"backoff weights");
      }
    }
  }

  /* Indices, levels 1 .. n-2. */
  for (level=1;level<=ng->n-2;level++) {
    for (start=0;start<ng->num_kgrams[level];start+=span) {
      span = chunk_limit;
      if (start+span > ng->num_kgrams[level])
	span = ng->num_kgrams[level] - start;
      rr_fwrite((char*)&ng->ind[level][start],1,sizeof(index__t)*span,ng->bin_fp,
		"indices");
    }
  }

  rr_oclose(ng->bin_fp);

  /* Swap back */
  swap_struct(ng);
}