Beispiel #1
0
/* write ngram in either ascii or binary */
void write_ngram( FILE *id_ngram_fp, ngram *ng, flag ascii )
{
  int i;
  
  if ( ascii ) {
    for( i = 0; i < n; i++ ) {
      if ( fprintf( stdout, "%d ", ng->id_array[i] ) < 0 ) 
	quit( -1, "error writing ascii ngram\n" );
    }
    if ( fprintf( stdout, "%d\n", ng->count ) < 0 )
      quit( -1, "error writing ascii ngram\n" );
  }else {
    for ( i = 0; i < n; i++ ) {
      rr_fwrite((char*) &ng->id_array[i], sizeof( id__t ), 1, id_ngram_fp,
		 "binary ngram" );
    }
    rr_fwrite( (char*) &ng->count, sizeof( int ), 1, id_ngram_fp,
	       "binary ngram" );
  }
}
Beispiel #2
0
// Returns guest instr count (in old replay counting mode)
static RR_prog_point copy_entry(void) {
    // Code copied from rr_log.c.
    // Copy entry.
    RR_log_entry item;

    rr_fread(&item.header.prog_point, sizeof(item.header.prog_point), 1, oldlog);
    if (item.header.prog_point.guest_instr_count > end_count) {
        // We don't want to copy this one.
        return item.header.prog_point;
    }

    //ph Fix up instruction count
    RR_prog_point original_prog_point = item.header.prog_point;
    item.header.prog_point.guest_instr_count -= actual_start_count;
    rr_fwrite(&item.header.prog_point, sizeof(item.header.prog_point), 1, newlog);

#define RR_COPY_ITEM(field) rr_fcopy(&(field), sizeof(field), 1, oldlog, newlog)
    RR_COPY_ITEM(item.header.kind);
    RR_COPY_ITEM(item.header.callsite_loc);

    //mz read the rest of the item
    switch (item.header.kind) {
        case RR_INPUT_1:
            RR_COPY_ITEM(item.variant.input_1);
            break;
        case RR_INPUT_2:
            RR_COPY_ITEM(item.variant.input_2);
            break;
        case RR_INPUT_4:
            RR_COPY_ITEM(item.variant.input_4);
            break;
        case RR_INPUT_8:
            RR_COPY_ITEM(item.variant.input_8);
            break;
        case RR_INTERRUPT_REQUEST:
            RR_COPY_ITEM(item.variant.interrupt_request);
            break;
        case RR_EXIT_REQUEST:
            RR_COPY_ITEM(item.variant.exit_request);
            break;
        case RR_SKIPPED_CALL: {
            RR_skipped_call_args *args = &item.variant.call_args;
            //mz read kind first!
            RR_COPY_ITEM(args->kind);
            switch(args->kind) {
                case RR_CALL_CPU_MEM_RW:
                    RR_COPY_ITEM(args->variant.cpu_mem_rw_args);
                    args->variant.cpu_mem_rw_args.buf =
                        g_malloc(args->variant.cpu_mem_rw_args.len);
                    rr_fcopy(args->variant.cpu_mem_rw_args.buf, 1,
                            args->variant.cpu_mem_rw_args.len,
                            oldlog, newlog);
                    break;
                case RR_CALL_CPU_MEM_UNMAP:
                    RR_COPY_ITEM(args->variant.cpu_mem_unmap);
                    args->variant.cpu_mem_unmap.buf =
                        g_malloc(args->variant.cpu_mem_unmap.len);
                    rr_fcopy(args->variant.cpu_mem_unmap.buf, 1,
                                args->variant.cpu_mem_unmap.len,
                                oldlog, newlog);
                    break;
                case RR_CALL_MEM_REGION_CHANGE:
                    RR_COPY_ITEM(args->variant.mem_region_change_args);
                    args->variant.mem_region_change_args.name =
                        g_malloc0(args->variant.mem_region_change_args.len + 1);
                    rr_fcopy(args->variant.mem_region_change_args.name, 1,
                            args->variant.mem_region_change_args.len,
                            oldlog, newlog);
                    break;
                case RR_CALL_HD_TRANSFER:
                    RR_COPY_ITEM(args->variant.hd_transfer_args);
                    break;
                case RR_CALL_NET_TRANSFER:
                    RR_COPY_ITEM(args->variant.net_transfer_args);
                    break;
                case RR_CALL_HANDLE_PACKET:
                    RR_COPY_ITEM(args->variant.handle_packet_args);
                    args->variant.handle_packet_args.buf =
                        g_malloc(args->variant.handle_packet_args.size);
                    rr_fcopy(args->variant.handle_packet_args.buf,
                            args->variant.handle_packet_args.size, 1,
                            oldlog, newlog);
                    break;
                default:
                    //mz unimplemented
                    sassert(0, 3);
            }
        } break;
        case RR_LAST:
            //mz nothing to read
            //ph We don't copy RR_LAST here; write out afterwards.
            break;
        default:
            //mz unimplemented
            sassert(0, 4);
    }

    return original_prog_point;
}
Beispiel #3
0
static INLINEIT void rr_fcopy(void *ptr, size_t size, size_t nmemb, FILE *oldlog, FILE *newlog) {
    rr_fread(ptr, size, nmemb, oldlog);
    rr_fwrite(ptr, size, nmemb, newlog);
}
Beispiel #4
0
void main(int argc, char *argv[]) {

  int i,j;

  char *vocab_filename;
  FILE *tempfile;
  char tempfiles_directory[1000];
  int vocab_size;
  FILE *vocab_file;

  int verbosity;

  int buffer_size;
  int position_in_buffer;
  int number_of_tempfiles;
  int max_files;
  int fof_size;

  unsigned short *buffer;
  unsigned short *placeholder;
  unsigned short *temp_ngram;
  int temp_count;
  
  char temp_word[500];
  char temp_word2[500];

  char *temp_file_root;
  char *temp_file_ext;
  char *host_name;
  int proc_id;
  struct utsname uname_info;

  flag write_ascii;

  /* Vocab hash table things */

  struct hash_table vocabulary;
  unsigned long hash_size;
  unsigned long M;

  tempfile = NULL; /* Just to prevent compilation warnings. */

  report_version(&argc,argv);

  verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);

  /* Process command line */
  
  if (pc_flagarg( &argc, argv,"-help") || argc==1) {
    fprintf(stderr,"text2idngram - Convert a text stream to an id n-gram stream.\n");
    fprintf(stderr,"Usage : text2idngram  -vocab .vocab \n");
    fprintf(stderr,"                    [ -buffer 100 ]\n");
    fprintf(stderr,"                    [ -hash %d ]\n",DEFAULT_HASH_SIZE);
    fprintf(stderr,"                    [ -temp %s ]\n",DEFAULT_TEMP);
    fprintf(stderr,"                    [ -files %d ]\n",DEFAULT_MAX_FILES);
    fprintf(stderr,"                    [ -gzip | -compress ]\n");
    fprintf(stderr,"                    [ -verbosity %d ]\n",
	    DEFAULT_VERBOSITY);
    fprintf(stderr,"                    [ -n 3 ]\n");
    fprintf(stderr,"                    [ -write_ascii ]\n");
    fprintf(stderr,"                    [ -fof_size 10 ]\n");
    exit(1);
  }

  pc_message(verbosity,2,"text2idngram\n");

  n = pc_intarg( &argc, argv, "-n",DEFAULT_N);

  placeholder = (unsigned short *) rr_malloc(sizeof(unsigned short)*n);
  temp_ngram = (unsigned short *) rr_malloc(sizeof(unsigned short)*n);
  hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE);
  buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM);

  write_ascii = pc_flagarg(&argc,argv,"-write_ascii");

  fof_size = pc_intarg(&argc,argv,"-fof_size",10);

  max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES);

  vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" ));
  
  if (!strcmp("",vocab_filename)) {
    quit(-1,"text2idngram : Error : Must specify a vocabulary file.\n");
  }
    
  strcpy(tempfiles_directory,pc_stringarg( &argc, argv, "-temp", 
					   DEFAULT_TEMP));

  if (pc_flagarg(&argc,argv,"-compress")) {
    temp_file_ext = salloc(".Z");
  }
  else {
    if (pc_flagarg(&argc,argv,"-gzip")) {
      temp_file_ext = salloc(".gz");
    }
    else {
      temp_file_ext = salloc("");
    }
  }

  uname(&uname_info);

  host_name = salloc(uname_info.nodename);

  proc_id = getpid();

  sprintf(temp_word,"%s%s.%d.",TEMP_FILE_ROOT,host_name,proc_id);

  temp_file_root = salloc(temp_word);

  pc_report_unk_args(&argc,argv,verbosity);
  
  /* If the last charactor in the directory name isn't a / then add one. */
  
  if (tempfiles_directory[strlen(tempfiles_directory)-1] != '/') {
    strcat(tempfiles_directory,"/");
  }
  
  pc_message(verbosity,2,"Vocab                  : %s\n",vocab_filename);
  pc_message(verbosity,2,"N-gram buffer size     : %d\n",buffer_size);
  pc_message(verbosity,2,"Hash table size        : %d\n",hash_size);
  pc_message(verbosity,2,"Temp directory         : %s\n",tempfiles_directory);
  pc_message(verbosity,2,"Max open files         : %d\n",max_files);
  pc_message(verbosity,2,"FOF size               : %d\n",fof_size);  
  pc_message(verbosity,2,"n                      : %d\n",n);

  buffer_size *= (1000000/(sizeof(unsigned short)*n));

  /* Allocate memory for hash table */

  fprintf(stderr,"Initialising hash table...\n");

  M = nearest_prime(hash_size);

  new_hashtable(&vocabulary,M);

  /* Read in the vocabulary */

  vocab_size = 0;

  vocab_file = rr_iopen(vocab_filename);

  pc_message(verbosity,2,"Reading vocabulary...\n");

  while (fgets (temp_word, sizeof(temp_word),vocab_file)) {
    if (strncmp(temp_word,"##",2)==0) continue;
    sscanf (temp_word, "%s ",temp_word2);

    /* Check for repeated words in the vocabulary */

    if (index2(&vocabulary,temp_word2) != 0) {
      fprintf(stderr,"======================================================\n");
      fprintf(stderr,"WARNING: word %s is repeated in the vocabulary.\n",temp_word);
      fprintf(stderr,"=======================================================\n");
    }
    if (strncmp(temp_word,"#",1)==0) {
      fprintf(stderr,"\n\n===========================================================\n");
      fprintf(stderr,":\nWARNING: line assumed NOT a comment:\n");
      fprintf(stderr,     ">>> %s <<<\n",temp_word);
      fprintf(stderr,     "         '%s' will be included in the vocabulary.\n",temp_word2);
      fprintf(stderr,     "         (comments must start with '##')\n");
      fprintf(stderr,"===========================================================\n\n");
    }
    vocab_size++;
    add_to_hashtable(&vocabulary,hash(temp_word2,M),temp_word2,vocab_size);
  }

  if (vocab_size > MAX_VOCAB_SIZE) {
    quit(-1,"text2idngram : Error : Vocabulary size exceeds maximum.\n");
  }   
  
  pc_message(verbosity,2,"Allocating memory for the n-gram buffer...\n");

  buffer=(unsigned short*) rr_malloc(n*(buffer_size+1)*sizeof(unsigned short));

  number_of_tempfiles = 0;

  /* Read text into buffer */

  /* Read in the first ngram */

  position_in_buffer = 0;

  for (i=0;i<=n-1;i++) {
    get_word(stdin,temp_word);
    add_to_buffer(index2(&vocabulary,temp_word),0,i,buffer);
  }

  while (!rr_feof(stdin)) {

    /* Fill up the buffer */

    pc_message(verbosity,2,"Reading text into the n-gram buffer...\n");
    pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");
    while ((position_in_buffer<buffer_size) && (!rr_feof(stdin))) {
      position_in_buffer++;
      if (position_in_buffer % 20000 == 0) {
	if (position_in_buffer % 1000000 == 0) {
	  pc_message(verbosity,2,".\n");
	}
	else {
	  pc_message(verbosity,2,".");
	}
      }
      for (i=1;i<=n-1;i++) {
	add_to_buffer(buffer_contents(position_in_buffer-1,i,buffer),
		      position_in_buffer,i-1,buffer);
      }
      if (get_word(stdin,temp_word) == 1) {
	add_to_buffer(index2(&vocabulary,temp_word),position_in_buffer,
		      n-1,buffer);
      }
    }

    for (i=0;i<=n-1;i++) {
      placeholder[i] = buffer_contents(position_in_buffer,i,buffer);
    }

    /* Sort buffer */
    
    pc_message(verbosity,2,"\nSorting n-grams...\n");
    
    qsort((void*) buffer,(size_t) position_in_buffer,
	  n*sizeof(unsigned short),compare_ngrams);

    /* Output the buffer to temporary BINARY file */
    
    number_of_tempfiles++;

    sprintf(temp_word,"%s%s%hu%s",tempfiles_directory,temp_file_root,
	    number_of_tempfiles,temp_file_ext);

    pc_message(verbosity,2,"Writing sorted n-grams to temporary file %s\n",
	       temp_word);

    tempfile = rr_oopen(temp_word);

    for (i=0;i<=n-1;i++) {
      temp_ngram[i] = buffer_contents(0,i,buffer);
      if (temp_ngram[i] > MAX_VOCAB_SIZE) {
	quit(-1,"Invalid trigram in buffer.\nAborting");

      }
    }
    temp_count = 1;

    for (i=1;i<=position_in_buffer;i++) {
 
      if (!compare_ngrams(temp_ngram,&buffer[i*n])) {
	temp_count++;
      }
      else {
	for (j=0;j<=n-1;j++) {
	  rr_fwrite(&temp_ngram[j],sizeof(unsigned short),1,
		    tempfile,"temporary n-gram ids");
	  temp_ngram[j] = buffer_contents(i,j,buffer);
	}
	rr_fwrite(&temp_count,sizeof(int),1,tempfile,
		  "temporary n-gram counts");
	temp_count = 1;
      }
    }
    
    rr_oclose(tempfile);

    for (i=0;i<=n-1;i++) {
      add_to_buffer(placeholder[i],0,i,buffer);
    }

    position_in_buffer = 0;

  }

  /* Merge the temporary files, and output the result to standard output */

  pc_message(verbosity,2,"Merging temporary files...\n");
  
  merge_tempfiles(1,
		  number_of_tempfiles,
		  temp_file_root,
		  temp_file_ext,
		  max_files,
		  tempfiles_directory,
		  stdout,
		  write_ascii,
		  fof_size); 

  pc_message(verbosity,0,"text2idngram : Done.\n");

  exit(0);
  
}
Beispiel #5
0
int main(int argc, char *argv[]) {

  int verbosity;
  int vocab_size;
  FILE *vocab_file;
  int buffer_size;
  flag write_ascii;
  int max_files;
  int number_of_tempfiles;
  char *vocab_filename;
  char *idngram_filename;
  char temp_word[MAX_WORD_LENGTH];
  char temp_word2[MAX_WORD_LENGTH];
  char temp_word3[MAX_WORD_LENGTH];
  flag contains_unks;
  int position_in_buffer;
  FILE *outfile;
  FILE *tempfile;
  FILE *non_unk_fp;
  ngram_rec *buffer;
  flag same_ngram;
  int i;
  int j;
  int fof_size;
  int size_of_rec;

  char temp_directory[1000];
  char *temp_file_ext;

  /* Vocab hash table things */

  struct idngram_hash_table vocabulary;
  unsigned long hash_size;
  unsigned long M;

  wordid_t *current_ngram;
  int current_count;
  wordid_t *sort_ngram;
  int sort_count;
  
  /* Process command line */

  report_version(&argc,argv);
  
  if (argc == 1 || pc_flagarg(&argc, argv,"-help")) {    
    /* Display help message */    
    help_message();
    exit(1);
  }


  n = pc_intarg( &argc, argv, "-n",DEFAULT_N);
  hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE);
  buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM);
  write_ascii = pc_flagarg(&argc,argv,"-write_ascii");
  verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);
  max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES);
  fof_size = pc_intarg(&argc,argv,"-fof_size",10);
  vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" ));
  idngram_filename = salloc(pc_stringarg( &argc, argv, "-idngram", "" ));
  
  if (!strcmp("",vocab_filename)) 
    quit(-1,"Error : Must specify a vocabulary file.\n");

  if (!strcmp("",idngram_filename)) 
    quit(-1,"text2idngram : Error : Must specify idngram file.\n");
    
  if (pc_flagarg(&argc,argv,"-compress")) 
    temp_file_ext = salloc(".Z");
  else {
    if (pc_flagarg(&argc,argv,"-gzip")) 
      temp_file_ext = salloc(".gz");
    else 
      temp_file_ext = salloc("");
  }

  strcpy(temp_directory, "cmuclmtk-XXXXXX");
  if (mkdtemp(temp_directory) == NULL) {
     quit(-1, "Failed to create temporary folder: %s\n", strerror(errno));
  }

  pc_report_unk_args(&argc,argv,verbosity);

  outfile = rr_fopen(idngram_filename,"wb");
  
  pc_message(verbosity,2,"Vocab           : %s\n",vocab_filename);
  pc_message(verbosity,2,"Output idngram  : %s\n",idngram_filename);
  pc_message(verbosity,2,"Buffer size     : %d\n",buffer_size);
  pc_message(verbosity,2,"Hash table size : %d\n",hash_size);
  pc_message(verbosity,2,"Max open files  : %d\n",max_files);
  pc_message(verbosity,2,"n               : %d\n",n);
  pc_message(verbosity,2,"FOF size               : %d\n",fof_size);  

  size_of_rec = (sizeof(wordid_t) * n) + 16 - (( n* sizeof(wordid_t)) % 16);
  buffer_size *= (1000000/((sizeof(ngram_rec) + size_of_rec)));
  fprintf(stderr,"buffer size = %d\n",buffer_size);

  /* Allocate memory for hash table */

  fprintf(stderr,"Initialising hash table...\n");

  M = nearest_prime(hash_size);

  new_idngram_hashtable(&vocabulary,M);

  /* Read in the vocabulary */

  vocab_size = 0;

  vocab_file = rr_iopen(vocab_filename);

  pc_message(verbosity,2,"Reading vocabulary...\n");

  while (fgets (temp_word, sizeof(temp_word),vocab_file)) {
    if (strncmp(temp_word,"##",2)==0) continue;
    sscanf (temp_word, "%s ",temp_word2);

    /* Check for vocabulary order */
    if (vocab_size > 0 && strcmp(temp_word2,temp_word3)<0) 
      quit(-1,"wngram2idngram : Error : Vocabulary is not alphabetically ordered.\n");

    /* Check for repeated words in the vocabulary */

    if (index2(&vocabulary,temp_word2) != 0) 
      warn_on_repeated_words(temp_word);

    warn_on_wrong_vocab_comments(temp_word);

    vocab_size++;
    
    add_to_idngram_hashtable(&vocabulary,idngram_hash(temp_word2,M),temp_word2,vocab_size);
    strcpy(temp_word3,temp_word2);
  }

  if (vocab_size > MAX_VOCAB_SIZE) 
    quit(-1,"Error : Vocabulary size exceeds maximum.\n");
  
  pc_message(verbosity,2,"Allocating memory for the buffer...\n");

  buffer=(ngram_rec *) rr_malloc((buffer_size+1)*sizeof(ngram_rec));
  
  for (i=0;i<=buffer_size;i++) 
    buffer[i].word = (wordid_t *) rr_malloc(n*sizeof(wordid_t));

  /* Open the "non-OOV" tempfile */

  sprintf(temp_word, "%s/1%s", temp_directory, temp_file_ext);
  
  non_unk_fp = rr_fopen(temp_word,"w");

  pc_message(verbosity,2,"Writing non-OOV counts to temporary file %s\n",
	     temp_word);
  number_of_tempfiles = 1;

  current_ngram = (wordid_t *) rr_malloc(n*sizeof(wordid_t));
  sort_ngram = (wordid_t *) rr_malloc(n*sizeof(wordid_t));

  /* Read text into buffer */
  position_in_buffer = 0;

  while (!rr_feof(stdin)) {
    
    for (i=0;i<=n-1;i++) {
      get_word(stdin,temp_word);
      current_ngram[i]=index2(&vocabulary,temp_word);
    }
    if (scanf("%d",&current_count) != 1) 
      if (!rr_feof(stdin)) 
	quit(-1,"Error reading n-gram count from stdin.\n");

    if (!rr_feof(stdin)) {

      contains_unks = 0;
      for (i=0;i<=n-1;i++) {
	if (!current_ngram[i]) 
	  contains_unks = 1;
      }

      if (contains_unks) {
	/* Write to buffer */

	position_in_buffer++;

	if (position_in_buffer >= buffer_size) {

	  /* Sort buffer */
	  pc_message(verbosity,2,
		     "Sorting n-grams which include an OOV word...\n");

	  qsort((void*) buffer,(size_t) position_in_buffer,
		sizeof(ngram_rec),compare_ngrams2);

	  pc_message(verbosity,2,"Done.\n");

	  /* Write buffer to temporary file */

	  number_of_tempfiles++;
	  
	  sprintf(temp_word,"%s/%hu%s", temp_directory,
		  number_of_tempfiles,temp_file_ext);
	  
	  pc_message(verbosity,2,
		     "Writing sorted OOV-counts buffer to temporary file %s\n",
		     temp_word);

	  tempfile = rr_fopen(temp_word,"w");
	  
	  for (i=0;i<=n-1;i++) 
	    sort_ngram[i] = buffer[0].word[i];

	  sort_count = buffer[0].count;

	  for (i=0;i<=position_in_buffer-2;i++) {
	    
	    same_ngram = 1;
	    for (j=n-1;j>=0;j--) {
	      if (buffer[i].word[j] != sort_ngram[j]) {
		same_ngram = 0;
		j = -1;
	      }
	    }

	    if (same_ngram) 
	      sort_count += buffer[i].count;
	    else {
	      for (j=0;j<=n-1;j++) {
		rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1,
			  tempfile,"temporary n-gram ids");
		sort_ngram[j] = buffer[i].word[j];
	      }
	      rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile,
			"temporary n-gram counts");
	      sort_count = buffer[i].count;
	    }
	  }	    
	  for (j=0;j<=n-1;j++) 
	    rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1,
		      tempfile,"temporary n-gram ids");

	  rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile,
		    "temporary n-gram counts");
	  rr_oclose(tempfile);
	  position_in_buffer = 1;

	}
	
	for (i=0;i<=n-1;i++) 
	  buffer[position_in_buffer-1].word[i] = current_ngram[i];

	buffer[position_in_buffer-1].count = current_count;

      }else {
	/* Write to temporary file */
	for (i=0;i<=n-1;i++) 
	  rr_fwrite((char*)&current_ngram[i],sizeof(wordid_t),1,
		    non_unk_fp,"temporary n-gram ids");

	rr_fwrite((char*)&current_count,sizeof(int),1,non_unk_fp,
		  "temporary n-gram counts");
      }
    }
  }

  if (position_in_buffer > 0) {

    /* Only do this bit if we have actually seen some OOVs */
    /* Sort final buffer */    
    pc_message(verbosity,2,"Sorting final buffer...\n");

    qsort((void*) buffer,(size_t) position_in_buffer,
	  sizeof(ngram_rec),compare_ngrams2);
    
    /* Write final buffer */
    
    number_of_tempfiles++;
  
    sprintf(temp_word,"%s/%hu%s", temp_directory,
	    number_of_tempfiles,temp_file_ext);
    
    pc_message(verbosity,2,"Writing sorted buffer to temporary file %s\n", temp_word);

    tempfile = rr_fopen(temp_word,"w");
    
    for (i=0;i<=n-1;i++) 
      sort_ngram[i] = buffer[0].word[i];

    sort_count = buffer[0].count;
    
    for (i=1;i<=position_in_buffer-1;i++) {
      
      same_ngram = 1;
      for (j=n-1;j>=0;j--) {
	if (buffer[i].word[j] != sort_ngram[j]) {
	  same_ngram = 0;
	  j = -1;
	}
      }
      
      if (same_ngram) 
	sort_count += buffer[i].count;
      else {
	for (j=0;j<=n-1;j++) {
	  rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1,
		    tempfile,"temporary n-gram ids");
	  sort_ngram[j] = buffer[i].word[j];
	}
	rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile,
		  "temporary n-gram counts");
	sort_count = buffer[i].count;
      }
    }	    
    for (j=0;j<=n-1;j++) 
      rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1,
		tempfile,"temporary n-gram ids");

    rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile,
	      "temporary n-gram counts");
    fclose(tempfile);
    

  }
  

  /* Merge the temporary files, and output the result */
  fclose(non_unk_fp);
  pc_message(verbosity,2,"Merging temporary files...\n");
  merge_idngramfiles(1,
		     number_of_tempfiles,
		     temp_directory,
		     temp_file_ext,
		     max_files,
		     outfile,
		     write_ascii,
		     fof_size,
		     n);

  fclose(outfile);

  rmdir(temp_directory);
  pc_message(verbosity,0,"wngram2idngram : Done.\n");

  return 0;
}
Beispiel #6
0
void main(int argc, char *argv[]) {

  int verbosity;
  int n;
  int m;
  int i;
  int input_type;
  int storage_type;
  unsigned short *current_ngram_int;
  unsigned short *previous_ngram_int;
  char **current_ngram_text;
  char **previous_ngram_text;
  int current_count;
  int running_total;
  flag same;
  flag first_one;
  flag got_to_eof;
   
  running_total = 0;

  report_version(&argc,argv);

  if (pc_flagarg( &argc, argv,"-help") || argc==1) {
    fprintf(stderr,"ngram2mgram - Convert an n-gram file to an m-gram file, where m<n\n");
    fprintf(stderr,"Usage : ngram2mgram   -n N -m M\n");
    fprintf(stderr,"                    [ -binary | -ascii | -words ]\n");
    fprintf(stderr,"                    < .ngram > .mgram\n");
    exit(1);
  }
 
  n = pc_intarg( &argc, argv,"-n",0);
  m = pc_intarg( &argc, argv,"-m",0);
  verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);
  

  input_type = 0;
  
  if (pc_flagarg( &argc, argv,"-binary")) {
    input_type = BINARY;
  }

  if (pc_flagarg( &argc, argv,"-ascii")) {
    if (input_type != 0) {
      quit(-1,"Error : more than one file format specified.\n");
    }
    input_type = ASCII;
  }

  if (pc_flagarg( &argc, argv,"-words")) {  
    if (input_type != 0) {
      quit(-1,"Error : more than one file format specified.\n");
    }
    input_type = WORDS;
  }    

  if (input_type == 0) {
    pc_message(verbosity,2,"Warning : no input type specified. Defaulting to binary.\n");
    input_type = BINARY;
  }

  if (n == 0) {
    quit(-1,"Must specify a value for n. Use the -n switch.\n");
  }

  if (m == 0) {
    quit(-1,"Must specify a value for m. Use the -m switch.\n");
  }
  
  if (n<=m) {
    quit(-1,"n must be greater than m.\n");
  }

  pc_report_unk_args(&argc,argv,verbosity);

  if (input_type == BINARY || input_type == ASCII) {
    storage_type = NUMERIC;
  }
  else {
    storage_type = ALPHA;
  }

  if (storage_type == NUMERIC) {
    current_ngram_int = (unsigned short *) 
      rr_malloc(n*sizeof(unsigned short));
    previous_ngram_int = (unsigned short *) 
      rr_malloc(n*sizeof(unsigned short));

    /* And to prevent compiler warnings ... */

    current_ngram_text = NULL;
    previous_ngram_text = NULL;
  }
  else {
    current_ngram_text = (char **) rr_malloc(n*sizeof(char *));
    previous_ngram_text = (char **) rr_malloc(n*sizeof(char *));
    for (i=0;i<=n-1;i++) {
      current_ngram_text[i] = (char *) rr_malloc(MAX_WORD_LENGTH*sizeof(char));
      previous_ngram_text[i] = (char *) rr_malloc(MAX_WORD_LENGTH*sizeof(char));
    }

    /* And to prevent compiler warnings ... */

    current_ngram_int = NULL;
    previous_ngram_int = NULL;

  }

  got_to_eof = 0;
  first_one = 1;

  while (!rr_feof(stdin)) {

    /* Store previous n-gram */

    if (!first_one) {

      if (storage_type == NUMERIC) {
	for (i=0;i<=n-1;i++) {
	  previous_ngram_int[i] = current_ngram_int[i];
	}
      }
      else {
	for (i=0;i<=n-1;i++) {
	  strcpy(previous_ngram_text[i],current_ngram_text[i]);
	}
      }

    }

    /* Read new n-gram */

    switch(input_type) {
    case BINARY:
      for (i=0;i<=n-1;i++) {
	rr_fread(&current_ngram_int[i],sizeof(id__t),1,stdin,
		 "from id_ngrams at stdin",0);
      }
      rr_fread(&current_count,sizeof(count_t),1,stdin,
	       "from id_ngrams file at stdin",0);
      break;
    case ASCII:
      for (i=0;i<=n-1;i++) {
	if (fscanf(stdin,"%hu",&current_ngram_int[i]) != 1) {
	  if (!rr_feof(stdin)) {
	    quit(-1,"Error reading id_ngram.\n");
	  }
	  else {
	    got_to_eof = 1;
	  }
	}
      }
      if (fscanf(stdin,"%d",&current_count) != 1) {
	if (!rr_feof(stdin)) {
	  quit(-1,"Error reading id_ngram.\n");
	}
	else {
	  got_to_eof = 1;
	}
      }
      break;
    case WORDS:
      for (i=0;i<=n-1;i++) {
	if (fscanf(stdin,"%s",current_ngram_text[i]) != 1) {
	  if (!rr_feof(stdin)) {
	    quit(-1,"Error reading id_ngram.\n");
	  }
	  else {
	    got_to_eof = 1;
	  }
	}
      }
      if (fscanf(stdin,"%d",&current_count) != 1) {
	if (!rr_feof(stdin)) {
	  quit(-1,"Error reading id_ngram.\n");
	}
	else {
	  got_to_eof = 1;
	}
      }
      break;
    }

    if (!got_to_eof) {

      /* Check for correct sorting */

      if (!first_one) {

	switch(storage_type) {
	case NUMERIC:
	  for (i=0;i<=n-1;i++) {
	    if (current_ngram_int[i]<previous_ngram_int[i]) {
	      quit(-1,"Error : ngrams not correctly sorted.\n");
	    }
	    else {
	      if (current_ngram_int[i]>previous_ngram_int[i]) {
		i=n;
	      }
	    }
	  }
	  break;
	case ALPHA:
	  for (i=0;i<=n-1;i++) {
	    if (strcmp(current_ngram_text[i],previous_ngram_text[i])<0) {
	      quit(-1,"Error : ngrams not correctly sorted.\n");
	    }
	    else {
	      if (strcmp(current_ngram_text[i],previous_ngram_text[i])>0) {
		i=n;
	      }
	    }
	  }
	  break;
	}
      }

      /* Compare this m-gram with previous m-gram */

      if (!first_one) {

	switch(storage_type) {
	case NUMERIC:
	  same = 1;
	  for (i=0;i<=m-1;i++) {
	    if (current_ngram_int[i] != previous_ngram_int[i]) {
	      same = 0;
	    }
	  }
	  if (same) {
	    running_total += current_count;
	  }
	  else {
	    if (input_type == ASCII) {
	      for (i=0;i<=m-1;i++) {
		printf("%d ",previous_ngram_int[i]);
	      }
	      printf("%d\n",running_total);
	    }
	    else {
	      for (i=0;i<=m-1;i++) {
		rr_fwrite(&previous_ngram_int[i],sizeof(id__t),1,stdout,
			  "to id_ngrams at stdout");
	      }
	      rr_fwrite(&running_total,sizeof(count_t),1,stdout,
			"to id n-grams at stdout");
	    }
	    running_total = current_count;
	  }
	  break;
	case ALPHA:
	  same = 1;
	  for (i=0;i<=m-1;i++) {
	    if (strcmp(current_ngram_text[i],previous_ngram_text[i])) {
	      same = 0;
	    }
	  }
	  if (same) {
	    running_total += current_count;
	  }
	  else {
	    for (i=0;i<=m-1;i++) {
	      printf("%s ",previous_ngram_text[i]);
	    }
	    printf("%d\n",running_total);
	    running_total = current_count;
	  
	  }
	  break;
	}
      
      }
      else {
	running_total = current_count;
      } 
    
      first_one = 0;
    
    }
  }

  /* Write out final m-gram */

  switch(input_type) {
  case BINARY:
    break;
  case ASCII:
    for (i=0;i<=m-1;i++) {
      printf("%d ",previous_ngram_int[i]);
    }
    printf("%d\n",running_total);
    break;
  case WORDS:
    for (i=0;i<=m-1;i++) {
      printf("%s ",previous_ngram_text[i]);
    }
    printf("%d\n",running_total);
    break;
  } 

  pc_message(verbosity,0,"ngram2mgram : Done.\n");

  exit(0);

}	  
Beispiel #7
0
void merge_idngramfiles (int start_file, 
		      int end_file, 
		      char *temp_file_root,
		      char *temp_file_ext,
		      int max_files,
		      FILE *outfile,
		      flag write_ascii,
		      int fof_size,
		      int n_order) {
  FILE *new_temp_file;
  char temp_string[1000];
  char *new_temp_filename;
  
  FILE **temp_file;
  char **temp_filename;
  wordid_t **current_ngram;
  wordid_t *smallest_ngram;
  wordid_t *previous_ngram;

  int *current_ngram_count;
  flag *finished;
  flag all_finished;
  int temp_count;
  int i,j;
  flag first_ngram;
  fof_t **fof_array;
  ngram_sz_t *num_kgrams;
  int *ng_count;
  int pos_of_novelty;
  
  n = n_order;
  
  pos_of_novelty = n; /* Simply for warning-free compilation */
  num_kgrams = (ngram_sz_t *) rr_calloc(n-1,sizeof(ngram_sz_t));
  ng_count = (int *) rr_calloc(n-1,sizeof(int));
  first_ngram = 1;
  
  previous_ngram = (wordid_t *) rr_calloc(n,sizeof(wordid_t));
  temp_file = (FILE **) rr_malloc(sizeof(FILE *) * (end_file-start_file+1));
  temp_filename = (char **) rr_malloc(sizeof(char *) * 
				      (end_file-start_file+1));

  /* should change to 2d array*/
  current_ngram = (wordid_t **) rr_malloc(sizeof(wordid_t *) * 
						(end_file-start_file+1));
  for (i=0;i<=end_file-start_file;i++) 
    current_ngram[i] = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);

  current_ngram_count = (int *) rr_malloc(sizeof(int)*(end_file-start_file+1));

  finished = (flag *) rr_malloc(sizeof(flag)*(end_file-start_file+1));
  smallest_ngram = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);

  /* should change to 2d array*/
  fof_array = (fof_t **) rr_malloc(sizeof(fof_t *)*(n-1));
  for (i=0;i<=n-2;i++) 
    fof_array[i] = (fof_t *) rr_calloc(fof_size+1,sizeof(fof_t));

  if (end_file-start_file+1 > max_files) {
    sprintf(temp_string,"%s/%hu%s",temp_file_root,
	    end_file+1,temp_file_ext);
    new_temp_filename = salloc(temp_string);
    new_temp_file = rr_oopen(new_temp_filename);
    merge_tempfiles(start_file,start_file+max_files-1,
		    temp_file_root,temp_file_ext,max_files,
		    new_temp_file,write_ascii,0);
    merge_tempfiles(start_file+max_files,end_file+1,
		    temp_file_root,temp_file_ext,max_files,
		    outfile,write_ascii,0);
  }else {

    /* Open all the temp files for reading */
    for (i=0;i<=end_file-start_file;i++) {
      sprintf(temp_string,"%s/%hu%s",temp_file_root,
	      i+start_file,temp_file_ext);
      temp_filename[i] = salloc(temp_string);
      temp_file[i] = rr_iopen(temp_filename[i]);
    }
    
    /* Now go through the files simultaneously, and write out the appropriate
       ngram counts to the output file. */

    for (i=end_file-start_file;i>=0;i--) {
      finished[i] = 0;
      if (!rr_feof(temp_file[i])) {
	for (j=0;j<=n-1;j++) {
	  rr_fread((char*) &current_ngram[i][j], sizeof(wordid_t),1,
		   temp_file[i],"temporary n-gram ids",0);
	}    
	rr_fread((char*) &current_ngram_count[i], sizeof(int),1,
		 temp_file[i],"temporary n-gram counts",0);
      }
    }
    
    all_finished = 0;

    while (!all_finished) {

      /* Find the smallest current ngram */
      for (i=0;i<=n-1;i++) 
	smallest_ngram[i] = MAX_WORDID;

      for (i=0;i<=end_file-start_file;i++) {
	if (!finished[i]) {
	  if (compare_ngrams3(smallest_ngram,current_ngram[i]) < 0) {
	    for (j=0;j<n;j++)
	      smallest_ngram[j] = current_ngram[i][j];
	  }
	}
      }

#if MAX_VOCAB_SIZE < 65535
      /* This check is well-meaning but completely useless since
	 smallest_ngram[i] by definition cannot contain any value
	 greater than MAX_VOCAB_SIZE (dhuggins@cs, 2006-03) */
      for (i=0;i<=n-1;i++) {
	if (smallest_ngram[i] > MAX_VOCAB_SIZE) {
	  quit(-1,"Error : Temporary files corrupted, invalid n-gram found.\n");
	}
      }
#endif
	  
      /* For each of the files that are currently holding this ngram,
	 add its count to the temporary count, and read in a new ngram
	 from the files. */

      temp_count = 0;

      for (i=0;i<=end_file-start_file;i++) {
	if (!finished[i]) {
	  if (compare_ngrams3(smallest_ngram,current_ngram[i]) == 0) {
	    temp_count = temp_count + current_ngram_count[i];
	    if (!rr_feof(temp_file[i])) {
	      for (j=0;j<=n-1;j++) {
		rr_fread((char*) &current_ngram[i][j],sizeof(wordid_t),1,
			 temp_file[i],"temporary n-gram ids",0);
	      }
	      rr_fread((char*)&current_ngram_count[i],sizeof(int),1,
		       temp_file[i],"temporary n-gram count",0);
	    }else {
	      finished[i] = 1;
	      all_finished = 1;
	      for (j=0;j<=end_file-start_file;j++) {
		if (!finished[j]) 
		  all_finished = 0;
	      }
	    }
	  }
	}
      }
      
      if (write_ascii) {
	for (i=0;i<=n-1;i++) {

	  if (fprintf(outfile,"%d ",smallest_ngram[i]) < 0) 
	    {
	      quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");
	    }
	}
	if (fprintf(outfile,"%d\n",temp_count) < 0)  
	  quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");

      }else {
	for (i=0;i<=n-1;i++) {
	  rr_fwrite((char*)&smallest_ngram[i],sizeof(wordid_t),1,
		    outfile,"n-gram ids");
	}
	rr_fwrite((char*)&temp_count,sizeof(count_t),1,outfile,"n-gram counts");		   
      }

      if (fof_size > 0 && n>1) { /* Add stuff to fof arrays */
	
	/* Code from idngram2stats */	
	pos_of_novelty = n;
	for (i=0;i<=n-1;i++) {
	  if (smallest_ngram[i] > previous_ngram[i]) {
	    pos_of_novelty = i;
	    i=n;
	  }
	}
	  
	/* Add new N-gram */
	  
	num_kgrams[n-2]++;
	if (temp_count <= fof_size)
	  fof_array[n-2][temp_count]++;

	if (!first_ngram) {
	  for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
	    num_kgrams[i-1]++;
	    if (ng_count[i-1] <= fof_size) {
	      fof_array[i-1][ng_count[i-1]]++;
	    }
	    ng_count[i-1] = temp_count;
	  }
	}else {
	  for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
	    ng_count[i-1] = temp_count;
	  }
	  first_ngram = 0;
	}
	  
	for (i=0;i<=pos_of_novelty-2;i++)
	  ng_count[i] += temp_count;

	for (i=0;i<=n-1;i++)
	  previous_ngram[i]=smallest_ngram[i];

      }
    }
    
    for (i=0;i<=end_file-start_file;i++) {
      fclose(temp_file[i]);
      remove(temp_filename[i]); 
    }
    
  }    

  if (fof_size > 0 && n>1) { /* Display fof arrays */

    /* Process last ngram */
    for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
      num_kgrams[i-1]++;
      if (ng_count[i-1] <= fof_size)
	fof_array[i-1][ng_count[i-1]]++;

      ng_count[i-1] = temp_count;
    }
    
    for (i=0;i<=pos_of_novelty-2;i++)
      ng_count[i] += temp_count;

    display_fof_array(num_kgrams,fof_array,fof_size,stderr, n);

  }

}
Beispiel #8
0
/*
  @return number_of_tempfiles
 */
int  read_txt2ngram_buffer(FILE* infp, 
			   struct idngram_hash_table *vocabulary, 
			   int32 verbosity,
			   wordid_t *buffer,
			   int buffer_size,
			   unsigned int n,
			   char* temp_file_root,
			   char* temp_file_ext,
			   FILE* temp_file
			   )
{
  /* Read text into buffer */
  char temp_word[MAX_WORD_LENGTH];
  int position_in_buffer;
  int number_of_tempfiles;
  unsigned int i,j;
  wordid_t *placeholder;
  wordid_t *temp_ngram;
  int temp_count;

#if 1
  int tmpval;
#endif

  temp_ngram  = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);
  placeholder = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);

  ng=n;

  position_in_buffer = 0;
  number_of_tempfiles = 0;

  //tk: looks like things may croak if the corpus has less than n words
  //not that such a corpus would be useful anyway
  for (i=0;i<=n-1;i++) {
    get_word(infp,temp_word);
    /*
        fprintf(stderr,"%s \n",temp_word);
	fprintf(stderr,"%d \n",index2(vocabulary,temp_word));
        fflush(stderr);
    */
    add_to_buffer(index2(vocabulary,temp_word),0,i,buffer);
  }

  while (!rr_feof(infp)) {
    /* Fill up the buffer */
    pc_message(verbosity,2,"Reading text into the n-gram buffer...\n");
    pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");

    while ((position_in_buffer<buffer_size) && (!rr_feof(infp))) {
      position_in_buffer++;
      show_idngram_nlines(position_in_buffer,verbosity);

      for (i=1;i<=n-1;i++) 
	add_to_buffer(buffer_contents(position_in_buffer-1,i,buffer),
		      position_in_buffer,i-1,buffer);
      
      if (get_word(infp,temp_word) == 1) {
      /*
	fprintf(stderr,"%s \n",temp_word);
	fprintf(stderr,"%d \n",index2(vocabulary,temp_word));
	fflush(stderr);
      */
	add_to_buffer(index2(vocabulary,temp_word),position_in_buffer,
		      n-1,buffer);
      }
    }

    for (i=0;i<=n-1;i++) 
      placeholder[i] = buffer_contents(position_in_buffer,i,buffer);

    /* Sort buffer */
    
    pc_message(verbosity,2,"\nSorting n-grams...\n");    
    
    qsort((void*) buffer,(size_t) position_in_buffer,n*sizeof(wordid_t),compare_ngrams);

    /* Output the buffer to temporary BINARY file */    
    number_of_tempfiles++;

    sprintf(temp_word,"%s/%hu%s",temp_file_root,
	    number_of_tempfiles,temp_file_ext);

    pc_message(verbosity,2,"Writing sorted n-grams to temporary file %s\n",
	       temp_word);

    temp_file = rr_oopen(temp_word);

    for (i=0;i<=n-1;i++) {
      temp_ngram[i] = buffer_contents(0,i,buffer);
#if MAX_VOCAB_SIZE < 65535
      /* This check is well-meaning but completely useless since
	 buffer_contents() can never return something greater than
	 MAX_VOCAB_SIZE (dhuggins@cs, 2006-03) */
      if (temp_ngram[i] > MAX_VOCAB_SIZE)
	quit(-1,"Invalid trigram in buffer.\nAborting");
#endif
    }
    temp_count = 1;

    for (i=1;i<=position_in_buffer;i++) {

      tmpval=compare_ngrams(temp_ngram,&buffer[i*n]);

      /*      for(k=0;k<=n-1;k++){
	fprintf(stderr, "tmpval: %d k %d, temp_ngram %d, &buffer[i*n] %d\n",tmpval, k, temp_ngram[k], (&buffer[i*n])[k]);
	}*/

      if (!compare_ngrams(temp_ngram,&buffer[i*n])) 
	temp_count++;
      else {
	/*	printf("Have been here?\n");*/
	for (j=0;j<=n-1;j++) {
	  rr_fwrite((char*) &temp_ngram[j],sizeof(wordid_t),1,
		    temp_file,"temporary n-gram ids");
	  temp_ngram[j] = buffer_contents(i,j,buffer);
	}
	rr_fwrite((char*)&temp_count,sizeof(int),1,temp_file,
		  "temporary n-gram counts");

	/*	for(j=0 ; j<=n-1;j++)
	  fprintf(stderr,"%d ",temp_ngram[j]);
	  fprintf(stderr,"%d\n",temp_count);*/

	temp_count = 1;
      }
    }
    
    rr_oclose(temp_file);

    for (i=0;i<=n-1;i++) 
      add_to_buffer(placeholder[i],0,i,buffer);

    position_in_buffer = 0;

  }

  return number_of_tempfiles;
}
Beispiel #9
0
void write_bin_lm(ng_t *ng,int verbosity) {
    
    int l_chunk;
    int from_rec;
    int i;
    
    pc_message(verbosity,1,"Binary %d-gram language model will be written to %s\n",ng->n,ng->bin_filename);
    
    ng->version = BBO_FILE_VERSION;
    
    /* Scalar parameters */
    
    rr_fwrite((char*)&ng->version,sizeof(int),1,ng->bin_fp,"version");
    rr_fwrite((char*)&ng->n,sizeof(unsigned short),1,ng->bin_fp,"n");
    
    rr_fwrite((char*)&ng->vocab_size,sizeof(wordid_t),1,ng->bin_fp,"vocab_size");
    rr_fwrite((char*)&ng->no_of_ccs,sizeof(unsigned short),1,ng->bin_fp,"no_of_ccs");
    rr_fwrite((char*)&ng->vocab_type,sizeof(unsigned short),1,ng->bin_fp,"vocab_type");
    
    rr_fwrite((char*)&ng->count_table_size,sizeof(count_ind_t),1,ng->bin_fp,"count_table_size");
    rr_fwrite((char*)&ng->discounting_method,sizeof(unsigned short),1,ng->bin_fp,"discounting_method");
    
    rr_fwrite((char*)&ng->min_alpha,sizeof(double),1,ng->bin_fp,"min_alpha");
    rr_fwrite((char*)&ng->max_alpha,sizeof(double),1,ng->bin_fp,"max_alpha");
    rr_fwrite((char*)&ng->out_of_range_alphas,sizeof(unsigned short),1,ng->bin_fp,"out_of_range_alphas");
    rr_fwrite((char*)&ng->size_of_alpha_array,sizeof(unsigned short),1,ng->bin_fp,"size_of_alpha_array");  
    
    rr_fwrite((char*)&ng->n_unigrams,sizeof(ngram_sz_t),1,ng->bin_fp,"n_unigrams");
    rr_fwrite((char*)&ng->zeroton_fraction,sizeof(double),1,ng->bin_fp,"zeroton_fraction");
    rr_fwrite((char*)&ng->oov_fraction,sizeof(double),1,ng->bin_fp,"oov_fraction");
    rr_fwrite((char*)&ng->four_byte_counts,sizeof(flag),1,ng->bin_fp,"four_byte_counts");
    rr_fwrite((char*)&ng->four_byte_alphas,sizeof(flag),1,ng->bin_fp,"four_byte_alphas");
    
    rr_fwrite((char*)&ng->first_id,sizeof(unsigned short),1,
              ng->bin_fp,"first_id");
    
    /* Short and shortish arrays */
    
    sih_val_write_to_file(ng->vocab_ht,ng->bin_fp,ng->bin_filename,0);
    
    /* (ng->vocab is not stored in file - will be derived from ng->vocab_ht) */
    
    if (ng->four_byte_counts==1) {
        assert(ng->marg_counts4);
        rr_fwrite((char*)ng->marg_counts4,sizeof(count_t),
                  ng->vocab_size+1,ng->bin_fp,"marg_counts");
    }else {
        assert(ng->marg_counts);
        rr_fwrite((char*)ng->marg_counts,sizeof(count_ind_t),
                  ng->vocab_size+1,ng->bin_fp,"marg_counts");
    }
    
    rr_fwrite((char*)ng->alpha_array,sizeof(double),
              ng->size_of_alpha_array,ng->bin_fp,"alpha_array");
    
    if (!ng->four_byte_counts) {
        for (i=0;i<=ng->n-1;i++)
            rr_fwrite((char*)ng->count_table[i],sizeof(count_t),
                      ng->count_table_size+1,ng->bin_fp,"count_table");
    }
    
    /* Could write count_table as one block, but better to be safe and
     do it in chunks. For motivation, see comments about writing tree
     info. */
    
    rr_fwrite((char*)ng->ptr_table_size,sizeof(ptr_tab_sz_t),ng->n,ng->bin_fp,"ptr_table_size");
    
    for (i=0;i<=ng->n-1;i++)
        rr_fwrite((char*)ng->ptr_table[i],sizeof(ptr_tab_t),ng->ptr_table_size[i],ng->bin_fp,"ptr_table");
    
    /* Unigram statistics */
    
    rr_fwrite((char*)ng->uni_probs,sizeof(uni_probs_t), ng->vocab_size+1,
              ng->bin_fp,"uni_probs");
    rr_fwrite((char*)ng->uni_log_probs,sizeof(uni_probs_t),ng->vocab_size+1,
              ng->bin_fp,"uni_log_probs");
    rr_fwrite((char*)ng->context_cue,sizeof(flag),ng->vocab_size+1,
              ng->bin_fp,"context_cue");
    
    rr_fwrite((char*)ng->cutoffs,sizeof(cutoff_t),ng->n,ng->bin_fp,"cutoffs");
    
    switch (ng->discounting_method) {
        case GOOD_TURING:
            rr_fwrite((char*)ng->fof_size,sizeof(fof_sz_t),ng->n,ng->bin_fp,"fof_size");
            rr_fwrite((char*)ng->disc_range,sizeof(unsigned short),ng->n,
                      ng->bin_fp,"disc_range");
            for (i=0;i<=ng->n-1;i++) {
                rr_fwrite((char*)ng->freq_of_freq[i],sizeof(fof_t),
                          ng->fof_size[i]+1,ng->bin_fp,"freq_of_freq");
            }    
            for (i=0;i<=ng->n-1;i++) {
                rr_fwrite((char*)ng->gt_disc_ratio[i],sizeof(disc_val_t),
                          ng->disc_range[i]+1,ng->bin_fp,"gt_disc_ratio");
            }    
        case WITTEN_BELL:
            break;
        case LINEAR:
            rr_fwrite((char*)ng->lin_disc_ratio,sizeof(disc_val_t),
                      ng->n,ng->bin_fp,"lin_disc_ratio");
            break;
        case ABSOLUTE:
            rr_fwrite((char*)ng->abs_disc_const,sizeof(double),
                      ng->n,ng->bin_fp,"abs_disc_const");
            break;
    }
    
    /* Tree information */
    
    /* Unigram stuff first, since can be dumped all in one go */
    
    rr_fwrite((char*)ng->num_kgrams,sizeof(ngram_sz_t),ng->n,ng->bin_fp,"num_kgrams");
    
    if (ng->four_byte_counts)
        rr_fwrite((char*)ng->count4[0],sizeof(count_t),ng->vocab_size+1,
                  ng->bin_fp,"unigram counts");
    else 
        rr_fwrite((char*)ng->count[0],sizeof(count_ind_t),ng->vocab_size+1,
                  ng->bin_fp,"unigram counts");
    
    if (ng->four_byte_alphas)
        rr_fwrite((char*)ng->bo_weight4[0],sizeof(four_byte_t),ng->vocab_size+1,
                  ng->bin_fp,"unigram backoff weights");
    else
        rr_fwrite((char*)ng->bo_weight[0],sizeof(bo_weight_t),ng->vocab_size+1,
                  ng->bin_fp,"unigram backoff weights");
    
    if (ng->n > 1) 
        rr_fwrite((char*)ng->ind[0],sizeof(index__t),ng->vocab_size+1,
                  ng->bin_fp,"unigram -> bigram pointers");
    
    /* Write the rest of the tree structure in chunks, otherwise the
     kernel buffers are too big. */
    
    /* Need to do byte swapping */
    swap_struct(ng);
    
    
    for (i=1;i<=ng->n-1;i++) {
        from_rec = 0;
        l_chunk = 100000;
        while(from_rec < ng->num_kgrams[i]) {
            if (from_rec+l_chunk > ng->num_kgrams[i]) 
                l_chunk = ng->num_kgrams[i] - from_rec;
            
            rr_fwrite((char*)&ng->word_id[i][from_rec],1,sizeof(id__t)*l_chunk,ng->bin_fp,"word ids");
            
            from_rec += l_chunk;
        }   
    }
    
    for (i=1;i<=ng->n-1;i++) {
        
        from_rec = 0;
        l_chunk = 100000;
        while(from_rec < ng->num_kgrams[i]) {
            if (from_rec+l_chunk > ng->num_kgrams[i])
                l_chunk = ng->num_kgrams[i] - from_rec;
            
            if (ng->four_byte_counts)
                rr_fwrite((char*)&ng->count4[i][from_rec],1,sizeof(count_t)*l_chunk,ng->bin_fp,"counts");
            else
                rr_fwrite((char*)&ng->count[i][from_rec],1,sizeof(count_ind_t)*l_chunk,ng->bin_fp,"counts");
            
            from_rec += l_chunk;
        }    
    }
    
    for (i=1;i<=ng->n-2;i++) {
        from_rec = 0;
        l_chunk = 100000;
        while(from_rec < ng->num_kgrams[i]) {
            if (from_rec+l_chunk > ng->num_kgrams[i]) 
                l_chunk = ng->num_kgrams[i] - from_rec;
            
            if (ng->four_byte_alphas)
                rr_fwrite((char*)&ng->bo_weight4[i][from_rec],1,sizeof(four_byte_t)*l_chunk,
                          ng->bin_fp,"backoff weights");
            else
                rr_fwrite((char*)&ng->bo_weight[i][from_rec],1,sizeof(bo_weight_t)*l_chunk,
                          ng->bin_fp,"backoff weights");
            from_rec += l_chunk;
        }
    }
    
    for (i=1;i<=ng->n-2;i++) {
        from_rec = 0;
        l_chunk = 100000;
        while(from_rec < ng->num_kgrams[i]) {
            if (from_rec+l_chunk > ng->num_kgrams[i])
                l_chunk = ng->num_kgrams[i] - from_rec;
            
            rr_fwrite((char*)&ng->ind[i][from_rec],1,sizeof(index__t)*l_chunk,ng->bin_fp,
                      "indices");
            from_rec += l_chunk;
        }
    }
    
    rr_oclose(ng->bin_fp);
    
    /* Swap back */
    swap_struct(ng); 
}