Пример #1
0
void main(int argc, char *argv[]) {

  int i,j;

  char *vocab_filename;
  FILE *tempfile;
  char tempfiles_directory[1000];
  int vocab_size;
  FILE *vocab_file;

  int verbosity;

  int buffer_size;
  int position_in_buffer;
  int number_of_tempfiles;
  int max_files;
  int fof_size;

  unsigned short *buffer;
  unsigned short *placeholder;
  unsigned short *temp_ngram;
  int temp_count;
  
  char temp_word[500];
  char temp_word2[500];

  char *temp_file_root;
  char *temp_file_ext;
  char *host_name;
  int proc_id;
  struct utsname uname_info;

  flag write_ascii;

  /* Vocab hash table things */

  struct hash_table vocabulary;
  unsigned long hash_size;
  unsigned long M;

  tempfile = NULL; /* Just to prevent compilation warnings. */

  report_version(&argc,argv);

  verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);

  /* Process command line */
  
  if (pc_flagarg( &argc, argv,"-help") || argc==1) {
    fprintf(stderr,"text2idngram - Convert a text stream to an id n-gram stream.\n");
    fprintf(stderr,"Usage : text2idngram  -vocab .vocab \n");
    fprintf(stderr,"                    [ -buffer 100 ]\n");
    fprintf(stderr,"                    [ -hash %d ]\n",DEFAULT_HASH_SIZE);
    fprintf(stderr,"                    [ -temp %s ]\n",DEFAULT_TEMP);
    fprintf(stderr,"                    [ -files %d ]\n",DEFAULT_MAX_FILES);
    fprintf(stderr,"                    [ -gzip | -compress ]\n");
    fprintf(stderr,"                    [ -verbosity %d ]\n",
	    DEFAULT_VERBOSITY);
    fprintf(stderr,"                    [ -n 3 ]\n");
    fprintf(stderr,"                    [ -write_ascii ]\n");
    fprintf(stderr,"                    [ -fof_size 10 ]\n");
    exit(1);
  }

  pc_message(verbosity,2,"text2idngram\n");

  n = pc_intarg( &argc, argv, "-n",DEFAULT_N);

  placeholder = (unsigned short *) rr_malloc(sizeof(unsigned short)*n);
  temp_ngram = (unsigned short *) rr_malloc(sizeof(unsigned short)*n);
  hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE);
  buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM);

  write_ascii = pc_flagarg(&argc,argv,"-write_ascii");

  fof_size = pc_intarg(&argc,argv,"-fof_size",10);

  max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES);

  vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" ));
  
  if (!strcmp("",vocab_filename)) {
    quit(-1,"text2idngram : Error : Must specify a vocabulary file.\n");
  }
    
  strcpy(tempfiles_directory,pc_stringarg( &argc, argv, "-temp", 
					   DEFAULT_TEMP));

  if (pc_flagarg(&argc,argv,"-compress")) {
    temp_file_ext = salloc(".Z");
  }
  else {
    if (pc_flagarg(&argc,argv,"-gzip")) {
      temp_file_ext = salloc(".gz");
    }
    else {
      temp_file_ext = salloc("");
    }
  }

  uname(&uname_info);

  host_name = salloc(uname_info.nodename);

  proc_id = getpid();

  sprintf(temp_word,"%s%s.%d.",TEMP_FILE_ROOT,host_name,proc_id);

  temp_file_root = salloc(temp_word);

  pc_report_unk_args(&argc,argv,verbosity);
  
  /* If the last charactor in the directory name isn't a / then add one. */
  
  if (tempfiles_directory[strlen(tempfiles_directory)-1] != '/') {
    strcat(tempfiles_directory,"/");
  }
  
  pc_message(verbosity,2,"Vocab                  : %s\n",vocab_filename);
  pc_message(verbosity,2,"N-gram buffer size     : %d\n",buffer_size);
  pc_message(verbosity,2,"Hash table size        : %d\n",hash_size);
  pc_message(verbosity,2,"Temp directory         : %s\n",tempfiles_directory);
  pc_message(verbosity,2,"Max open files         : %d\n",max_files);
  pc_message(verbosity,2,"FOF size               : %d\n",fof_size);  
  pc_message(verbosity,2,"n                      : %d\n",n);

  buffer_size *= (1000000/(sizeof(unsigned short)*n));

  /* Allocate memory for hash table */

  fprintf(stderr,"Initialising hash table...\n");

  M = nearest_prime(hash_size);

  new_hashtable(&vocabulary,M);

  /* Read in the vocabulary */

  vocab_size = 0;

  vocab_file = rr_iopen(vocab_filename);

  pc_message(verbosity,2,"Reading vocabulary...\n");

  while (fgets (temp_word, sizeof(temp_word),vocab_file)) {
    if (strncmp(temp_word,"##",2)==0) continue;
    sscanf (temp_word, "%s ",temp_word2);

    /* Check for repeated words in the vocabulary */

    if (index2(&vocabulary,temp_word2) != 0) {
      fprintf(stderr,"======================================================\n");
      fprintf(stderr,"WARNING: word %s is repeated in the vocabulary.\n",temp_word);
      fprintf(stderr,"=======================================================\n");
    }
    if (strncmp(temp_word,"#",1)==0) {
      fprintf(stderr,"\n\n===========================================================\n");
      fprintf(stderr,":\nWARNING: line assumed NOT a comment:\n");
      fprintf(stderr,     ">>> %s <<<\n",temp_word);
      fprintf(stderr,     "         '%s' will be included in the vocabulary.\n",temp_word2);
      fprintf(stderr,     "         (comments must start with '##')\n");
      fprintf(stderr,"===========================================================\n\n");
    }
    vocab_size++;
    add_to_hashtable(&vocabulary,hash(temp_word2,M),temp_word2,vocab_size);
  }

  if (vocab_size > MAX_VOCAB_SIZE) {
    quit(-1,"text2idngram : Error : Vocabulary size exceeds maximum.\n");
  }   
  
  pc_message(verbosity,2,"Allocating memory for the n-gram buffer...\n");

  buffer=(unsigned short*) rr_malloc(n*(buffer_size+1)*sizeof(unsigned short));

  number_of_tempfiles = 0;

  /* Read text into buffer */

  /* Read in the first ngram */

  position_in_buffer = 0;

  for (i=0;i<=n-1;i++) {
    get_word(stdin,temp_word);
    add_to_buffer(index2(&vocabulary,temp_word),0,i,buffer);
  }

  while (!rr_feof(stdin)) {

    /* Fill up the buffer */

    pc_message(verbosity,2,"Reading text into the n-gram buffer...\n");
    pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");
    while ((position_in_buffer<buffer_size) && (!rr_feof(stdin))) {
      position_in_buffer++;
      if (position_in_buffer % 20000 == 0) {
	if (position_in_buffer % 1000000 == 0) {
	  pc_message(verbosity,2,".\n");
	}
	else {
	  pc_message(verbosity,2,".");
	}
      }
      for (i=1;i<=n-1;i++) {
	add_to_buffer(buffer_contents(position_in_buffer-1,i,buffer),
		      position_in_buffer,i-1,buffer);
      }
      if (get_word(stdin,temp_word) == 1) {
	add_to_buffer(index2(&vocabulary,temp_word),position_in_buffer,
		      n-1,buffer);
      }
    }

    for (i=0;i<=n-1;i++) {
      placeholder[i] = buffer_contents(position_in_buffer,i,buffer);
    }

    /* Sort buffer */
    
    pc_message(verbosity,2,"\nSorting n-grams...\n");
    
    qsort((void*) buffer,(size_t) position_in_buffer,
	  n*sizeof(unsigned short),compare_ngrams);

    /* Output the buffer to temporary BINARY file */
    
    number_of_tempfiles++;

    sprintf(temp_word,"%s%s%hu%s",tempfiles_directory,temp_file_root,
	    number_of_tempfiles,temp_file_ext);

    pc_message(verbosity,2,"Writing sorted n-grams to temporary file %s\n",
	       temp_word);

    tempfile = rr_oopen(temp_word);

    for (i=0;i<=n-1;i++) {
      temp_ngram[i] = buffer_contents(0,i,buffer);
      if (temp_ngram[i] > MAX_VOCAB_SIZE) {
	quit(-1,"Invalid trigram in buffer.\nAborting");

      }
    }
    temp_count = 1;

    for (i=1;i<=position_in_buffer;i++) {
 
      if (!compare_ngrams(temp_ngram,&buffer[i*n])) {
	temp_count++;
      }
      else {
	for (j=0;j<=n-1;j++) {
	  rr_fwrite(&temp_ngram[j],sizeof(unsigned short),1,
		    tempfile,"temporary n-gram ids");
	  temp_ngram[j] = buffer_contents(i,j,buffer);
	}
	rr_fwrite(&temp_count,sizeof(int),1,tempfile,
		  "temporary n-gram counts");
	temp_count = 1;
      }
    }
    
    rr_oclose(tempfile);

    for (i=0;i<=n-1;i++) {
      add_to_buffer(placeholder[i],0,i,buffer);
    }

    position_in_buffer = 0;

  }

  /* Merge the temporary files, and output the result to standard output */

  pc_message(verbosity,2,"Merging temporary files...\n");
  
  merge_tempfiles(1,
		  number_of_tempfiles,
		  temp_file_root,
		  temp_file_ext,
		  max_files,
		  tempfiles_directory,
		  stdout,
		  write_ascii,
		  fof_size); 

  pc_message(verbosity,0,"text2idngram : Done.\n");

  exit(0);
  
}
Пример #2
0
/*
 * Compute and print the perplexity of a language model on a text.
 *
 * Words are read one at a time from the text file, looked up in the
 * model's vocabulary and scored with calc_prob_of() given the ids of up
 * to n-1 preceding words.  The log10 probabilities are summed, and the
 * perplexity, entropy, number of n-grams hit at each order and the number
 * of excluded OOVs / context cues are printed on standard output.
 *
 * ng, arpa_ng            the model: arpa_ng is used when arpa_lm is
 *                        non-zero, ng otherwise.
 * text_stream_filename   text to evaluate; "" is rejected, and "-" is
 *                        exempt from the existence check.
 * probs_stream_filename  if not "", every probability is written here.
 * annotation_filename    if not "", a line per scored word is written here.
 * oov_filename           if not "", words whose id is 0 are written here.
 * fb_list_filename and
 * backoff_from_*         handed to gen_fb_list() to build the forced
 *                        back-off list.
 * include_unks           if zero, words with id 0 are not scored and are
 *                        counted as excluded.
 * log_base               base of the "logprob" shown in warnings and
 *                        annotations.
 *
 * Problems with the file names, or an error while reading the text, are
 * reported on standard output and the function returns early.  Every exit
 * goes through the clean-up code at the end so that the memory and files
 * acquired here are released.
 */
void compute_perplexity(ng_t *ng,
                        arpa_lm_t *arpa_ng,
                        char *text_stream_filename,
                        char *probs_stream_filename,
                        char *annotation_filename,
                        char *oov_filename,
                        char *fb_list_filename,
                        flag backoff_from_unk_inc,
                        flag backoff_from_unk_exc,
                        flag backoff_from_ccs_inc,
                        flag backoff_from_ccs_exc,
                        flag arpa_lm,
                        flag include_unks,
                        double log_base) {

  fb_info *fb_list;
  FILE *temp_fp;
  FILE *text_stream_fp;
  FILE *probs_stream_fp;
  FILE *annotation_fp;
  FILE *oov_fp;
  flag out_probs;
  flag annotate;
  flag out_oovs;
  flag found_unk_wrongly;
  double prob;
  double sum_log_prob;
  int total_words;
  int excluded_unks;
  int excluded_ccs;
  char current_word[1000];  /* Hope that's big enough */
  char **prev_words;
  vocab_sz_t current_id;
  id__t short_current_id;
  id__t *context;
  int context_length;
  int i;
  int bo_case;
  int actual_context_length;
  int *ngrams_hit;
  int n;

  /* Start with everything the clean-up code looks at set to NULL, so it
     can tell what has actually been opened or allocated. */

  text_stream_fp = NULL;
  probs_stream_fp = NULL;
  annotation_fp = NULL;
  oov_fp = NULL;
  context = NULL;

  short_current_id = 0;

  found_unk_wrongly = 0;

  annotate = 0;

  bo_case = 0;

  if (arpa_lm) {
    n = arpa_ng->n;
    fb_list = gen_fb_list(arpa_ng->vocab_ht,
                          (int) arpa_ng->vocab_size,
                          arpa_ng->vocab,
                          arpa_ng->context_cue,
                          backoff_from_unk_inc,
                          backoff_from_unk_exc,
                          backoff_from_ccs_inc,
                          backoff_from_ccs_exc,
                          fb_list_filename);
  }else {
    n = ng->n;
    fb_list = gen_fb_list(ng->vocab_ht,
                          (int) ng->vocab_size,
                          ng->vocab,
                          ng->context_cue,
                          backoff_from_unk_inc,
                          backoff_from_unk_exc,
                          backoff_from_ccs_inc,
                          backoff_from_ccs_exc,
                          fb_list_filename);
  }

  ngrams_hit = (int *) rr_calloc(n,sizeof(int));
  prev_words = (char **) rr_malloc(sizeof(char *)*n);
  for (i=0;i<=n-1;i++)
    prev_words[i] = (char *) rr_malloc(sizeof(char)*1000);

  /* Check that text_stream_filename and probs_stream_filename (if
     specified) are valid. Note that the checks employed by the
     standard rr_fopen tools are not suitable here, since we don't
     want the program to terminate if the paths are not found. */

  if (!strcmp(text_stream_filename,"")) {
    printf("Error : Must specify a text file. Use the -text switch.\n");
    goto cleanup;
  }

  if (!rr_fexists(text_stream_filename) && strcmp(text_stream_filename,"-")) {
    printf("Error : Can't open file %s for reading.\n",text_stream_filename);
    goto cleanup;
  }

  /* Store a plain 0/1 in the flags rather than the raw strcmp result,
     whose value could be altered by conversion to the flag type. */

  out_probs = (strcmp(probs_stream_filename,"") != 0);
  annotate = (strcmp(annotation_filename,"") != 0);
  out_oovs = (strcmp(oov_filename,"") != 0);

  printf("Computing perplexity of the language model with respect\n");
  printf("   to the text %s\n",text_stream_filename);
  if (out_probs)
    printf("Probability stream will be written to file %s\n",
            probs_stream_filename);

  if (annotate)
    printf("Annotation will be written to file %s\n",
            annotation_filename);

  if (out_oovs)
    printf("Out of vocabulary words will be written to file %s\n",
            oov_filename);

  if (backoff_from_unk_inc)
    printf("Will force inclusive back-off from OOVs.\n");

  if (backoff_from_unk_exc)
    printf("Will force exclusive back-off from OOVs.\n");

  if (backoff_from_ccs_inc)
    printf("Will force inclusive back-off from context cues.\n");

  if (backoff_from_ccs_exc)
    printf("Will force exclusive back-off from context cues.\n");

  if (strcmp(fb_list_filename,""))
    printf("Will force back-off according to the contents of %s\n",
            fb_list_filename);

  if (include_unks)
    printf("Perplexity calculation will include OOVs.\n");

  /* Check for existance of files, as rr functions will quit, which isn't
     what we want */

  if (out_probs && strcmp(probs_stream_filename,"-")) {
    if ((temp_fp = fopen(probs_stream_filename,"w")) == NULL) {
      printf("Error : Can't open file %s for writing.\n",probs_stream_filename);
      goto cleanup;
    }
    fclose(temp_fp);
  }

  if (annotate && strcmp(annotation_filename,"-")) {
    if ((temp_fp = fopen(annotation_filename,"w")) == NULL) {
      printf("Error : Can't open file %s for writing.\n",annotation_filename);
      goto cleanup;
    }
    fclose(temp_fp);
  }

  if (out_oovs && strcmp(oov_filename,"-")) {
    if ((temp_fp = fopen(oov_filename,"w")) == NULL) {
      printf("Error : Can't open file %s for writing.\n",oov_filename);
      goto cleanup;
    }
    fclose(temp_fp);
  }

  text_stream_fp = rr_iopen(text_stream_filename);
  if (out_probs)
    probs_stream_fp = rr_oopen(probs_stream_filename);

  if (annotate)
    annotation_fp = rr_oopen(annotation_filename);

  if (out_oovs)
    oov_fp = rr_oopen(oov_filename);

  context = (id__t *) rr_malloc(sizeof(id__t)*(n-1));

  sum_log_prob = 0.0;
  total_words = 0;
  excluded_unks = 0;
  excluded_ccs = 0;

  while (!rr_feof(text_stream_fp)) {

    /* Record the word read on the previous pass in prev_words, which
       holds the text of the most recent n-1 words, oldest first. */

    if (total_words > 0) {
      if (total_words < n)
        strcpy(prev_words[total_words-1],current_word);
      else {
        for (i=0;i<=n-3;i++)
          strcpy(prev_words[i],prev_words[i+1]);

        if (n>1)
          strcpy(prev_words[n-2],current_word);
      }
    }

    if (total_words < (n-1))
      context_length = total_words;
    else
      context_length = n-1;

    /* Fill context with right stuff */

    if (total_words > (n-1)) {
      for (i=0;i<=context_length-2;i++)
        context[i] = context[i+1];
    }

    if (context_length != 0)
      context[context_length-1] = short_current_id;

    /* The field width keeps fscanf inside current_word (1000 bytes);
       a longer token is read in pieces of at most 999 characters. */

    if (fscanf(text_stream_fp,"%999s",current_word) != 1) {
      if (!rr_feof(text_stream_fp)) {
        printf("Error reading text file.\n");
        goto cleanup;
      }
    }

    if (!rr_feof(text_stream_fp)) {

      if (arpa_lm) {
        sih_lookup(arpa_ng->vocab_ht,current_word,&current_id);
        if (arpa_ng->vocab_type == CLOSED_VOCAB && current_id == 0) {
          found_unk_wrongly = 1;
          printf("Error : %s is not in the vocabulary, and this is a closed \nvocabulary model.\n",current_word);
        }
        if (current_id > arpa_ng->vocab_size)
          quit(-1,"Error : returned value from sih_lookup (%d) is too high.\n",(int) current_id);
        else
          short_current_id = current_id;

      }else {
        sih_lookup(ng->vocab_ht,current_word,&current_id);
        if (ng->vocab_type == CLOSED_VOCAB && current_id == 0) {
          found_unk_wrongly = 1;
          printf("Error : %s is not in the vocabulary, and this is a closed \nvocabulary model.\n",current_word);
        }
        if (current_id > ng->vocab_size)
          quit(-1,"Error : returned value from sih_lookup (%d) is too high.\n",(int) current_id);
        else
          short_current_id = current_id;

      }

      if (!found_unk_wrongly) {

        if (current_id == 0 && out_oovs)
          fprintf(oov_fp,"%s\n",current_word);

        /* Context cues are never scored. */

        if ((arpa_lm && (!(arpa_ng->context_cue[current_id])))
            || ((!arpa_lm) && (!(ng->context_cue[current_id])))) {

          if (include_unks || current_id != 0) {

            prob = calc_prob_of(short_current_id,
                                context,
                                context_length,
                                ng,
                                arpa_ng,
                                fb_list,
                                &bo_case,
                                &actual_context_length,
                                arpa_lm);


            if (prob<= 0.0 || prob > 1.0) {
              fprintf(stderr,"Warning : ");
              if (short_current_id == 0)
                fprintf(stderr,"P( <UNK> | ");
              else
                fprintf(stderr,"P( %s | ",current_word);

              /* prev_words[k] is the text of the word whose id is in
                 context[k], so both are indexed with the same offset
                 (the annotation code below uses that offset too). */

              for (i=0;i<=actual_context_length-1;i++) {
                if (context[i+context_length-actual_context_length] == 0)
                  fprintf(stderr,"<UNK> ");
                else
                  fprintf(stderr,"%s ",prev_words[i+context_length-actual_context_length]);
              }
              fprintf(stderr,") = %g logprob = %g \n ",prob,log(prob)/log(log_base));
              fprintf(stderr,"bo_case == 0x%dx, actual_context_length == %d\n",
                      bo_case, actual_context_length);
            }

            if (annotate) {
              if (short_current_id == 0)
                fprintf(annotation_fp,"P( <UNK> | ");
              else
                fprintf(annotation_fp,"P( %s | ",current_word);

              for (i=0;i<=actual_context_length-1;i++) {
                if (context[i+context_length-actual_context_length] == 0)
                  fprintf(annotation_fp,"<UNK> ");
                else {
                  if (arpa_lm)
                    fprintf(annotation_fp,"%s ",arpa_ng->vocab[context[i+context_length-actual_context_length]]);
                  else
                    fprintf(annotation_fp,"%s ",ng->vocab[context[i+context_length-actual_context_length]]);
                }
              }
              fprintf(annotation_fp,") = %g logprob = %f bo_case = ",prob,log(prob)/log(log_base));
              decode_bo_case(bo_case,actual_context_length,annotation_fp);
            }

            /* Calculate level to which we backed off */

            for (i=actual_context_length-1;i>=0;i--) {
              int four_raise_i = 1<<(2*i);  /* PWP */

              /*
               * PWP: This was "if ((bo_case / (int) pow(3,i)) == 0)"
               * but was getting a divide-by-zero error on an Alpha
               * (it isn't clear to me why it should ever have done so)
               * Anyway, it is much faster to do in base-4.
               */

              if ((bo_case == 0) || ((bo_case / four_raise_i) == 0)) {
                ngrams_hit[i+1]++;
                /* Leave the loop: the decrement makes i equal to -3,
                   which the test after the loop recognises. */
                i = -2;
              }else
                bo_case -= ((bo_case / four_raise_i) * four_raise_i);
            }

            if (i != -3)
              ngrams_hit[0]++;

            if (out_probs)
              fprintf(probs_stream_fp,"%g\n",prob);

            sum_log_prob += log10(prob);

          }

          if (current_id == 0 && !include_unks)
            excluded_unks++;
        }
        else {
          if (((!arpa_lm) && ng->context_cue[current_id]) ||
              (arpa_lm && arpa_ng->context_cue[current_id]))
            excluded_ccs++;
        }
        total_words++;
      }
    }
  }

  if (!found_unk_wrongly) {      /*  pow(x,y) = e**(y  ln(x)) */
    printf("Perplexity = %.2f, Entropy = %.2f bits\n",
            exp(-sum_log_prob/(total_words-excluded_ccs-excluded_unks) *
                log(10.0)),
           (-sum_log_prob/(total_words-excluded_ccs-excluded_unks) *
            log(10.0) / log(2.0)));
    printf("Computation based on %d words.\n",
           total_words-excluded_ccs-excluded_unks);
    for(i=n;i>=1;i--) {
      printf("Number of %d-grams hit = %d  (%.2f%%)\n",i,ngrams_hit[i-1],
             (float) 100*ngrams_hit[i-1]/(total_words-excluded_ccs-excluded_unks) );
    }
    printf("%d OOVs (%.2f%%) and %d context cues were removed from the calculation.\n",
           excluded_unks,
           (float) 100*excluded_unks/(total_words-excluded_ccs),excluded_ccs);

  }

 cleanup:

  /* Close whatever was opened and release everything allocated above. */

  if (text_stream_fp != NULL)
    rr_iclose(text_stream_fp);

  if (probs_stream_fp != NULL)
    rr_oclose(probs_stream_fp);
  if (annotation_fp != NULL)
    rr_oclose(annotation_fp);
  if (oov_fp != NULL)
    rr_oclose(oov_fp);

  for (i=0;i<=n-1;i++)
    free (prev_words[i]);
  free (prev_words);

  free (fb_list);
  free (context);
  free (ngrams_hit);
}
Пример #3
0
int main (int argc, char **argv) {

  int n;
  int verbosity;
  int max_files;
  int max_words;
  int max_chars;

  int current_word;
  int current_char;
  int start_char;		/* start boundary (possibly > than 0) */

  int no_of_spaces;
  int pos_in_string;

  int i;
  char *current_string;
  char current_temp_filename[500];
  int current_file_number;
  FILE *temp_file;

  flag text_buffer_full;

  char *text_buffer;
  char **pointers;

  char current_ngram[500];
  int current_count;

  int counter;

  char temp_directory[1000];
  char *temp_file_ext;

  flag words_set;
  flag chars_set;

  /* Process command line */

  verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY);
  pc_message(verbosity,2,"text2wngram\n");

  report_version(&argc,argv);

  if (pc_flagarg( &argc, argv,"-help")) {
    help_message();
    exit(1);
  }

  n = pc_intarg(&argc, argv,"-n",DEFAULT_N);

  /*  max_words = pc_intarg(&argc, argv,"-words",STD_MEM*1000000/11);
  max_chars = pc_intarg(&argc, argv,"-chars",STD_MEM*7000000/11); */

  max_words = pc_intarg(&argc, argv,"-words",-1);
  max_chars = pc_intarg(&argc, argv,"-chars",-1);

  if (max_words == -1) {
    words_set = 0;
    max_words = STD_MEM*1000000/11;
  }else
    words_set = 1;

  if (max_chars == -1) {
    chars_set = 0;
    max_chars = STD_MEM*7000000/11; 
  }else
    chars_set = 1;
  
  max_files = pc_intarg(&argc, argv,"-files",DEFAULT_MAX_FILES);

  if (pc_flagarg(&argc,argv,"-compress"))
    temp_file_ext = salloc(".Z");
  else {
    if (pc_flagarg(&argc,argv,"-gzip"))
      temp_file_ext = salloc(".gz");
    else
      temp_file_ext = salloc("");
  }

  strcpy(temp_directory, "cmuclmtk-XXXXXX");
  if (mkdtemp(temp_directory) == NULL) {
     quit(-1, "Failed to create temporary folder: %s\n", strerror(errno));
  }

  pc_report_unk_args(&argc,argv,verbosity);
 
  if (words_set && !chars_set)
    max_chars = max_words * 7;

  if (!words_set && chars_set)
    max_words = max_chars / 7;

  /* If the last charactor in the directory name isn't a / then add one. */
  
  pc_message(verbosity,2,"n = %d\n",n);
  pc_message(verbosity,2,"Number of words in buffer = %d\n",max_words);
  pc_message(verbosity,2,"Number of chars in buffer = %d\n",max_chars);
  pc_message(verbosity,2,"Max number of files open at once = %d\n",max_files);
  pc_message(verbosity,2,"Temporary directory = %s\n",temp_directory);

  /* Allocate memory for the buffers */

  text_buffer = (char *) rr_malloc(sizeof(char)*max_chars);
  pc_message(verbosity,2,"Allocated %d bytes to text buffer.\n",
	     sizeof(char)*max_chars);

  pointers = (char **) rr_malloc(sizeof(char *)*max_words);
  pc_message(verbosity,2,"Allocated %d bytes to pointer array.\n",
	     sizeof(char *)*max_words);

  current_file_number = 0;

  current_word = 1;
  start_char = 0;
  current_char = 0;
  counter = 0;
  pointers[0] = text_buffer;
      
  while (!feof(stdin)) {

    current_file_number++;

    /* Read text into buffer */
    
    pc_message(verbosity,2,"Reading text into buffer...\n");

    pc_message(verbosity,2,"Reading text into the n-gram buffer...\n");
    pc_message(verbosity,2,"20,000 words processed for each \".\", 1,000,000 for each line.\n");
    
    pointers[0] = text_buffer;
    
    while ((!rr_feof(stdin)) && 
	   (current_word < max_words) && 
	   (current_char < max_chars)) {

      text_buffer[current_char] = getchar();
      if (text_buffer[current_char] == '\n' || 
	  text_buffer[current_char] == '\t' ) {
	text_buffer[current_char] = ' ';
      }
      if (text_buffer[current_char] == ' ') {
	if (current_char > start_char) {
	  if (text_buffer[current_char-1] == ' ') {
	    current_word--;
	    current_char--;
	  }
	  pointers[current_word] = &(text_buffer[current_char+1]);
	  current_word++; 
	  counter++;
	  if (counter % 20000 == 0) {
	    if (counter % 1000000 == 0)
	      pc_message(verbosity,2,"\n");
	    else
	      pc_message(verbosity,2,".");
	  }
	}
      }
      
      if (text_buffer[current_char] != ' ' || current_char > start_char) 
	current_char++;
    }

    text_buffer[current_char]='\0';


    if (current_word == max_words || rr_feof(stdin)) {
      for (i=current_char+1;i<=max_chars-1;i++)
	text_buffer[i] = ' ';

      text_buffer_full = 0;
    }else
      text_buffer_full = 1;
    
    /* Sort buffer */

    pc_message(verbosity,2,"\nSorting pointer array...\n"); 

    qsort((void *) pointers,(size_t) current_word-n,sizeof(char *),cmp_strings);
   
    /* Write out temporary file */

    sprintf(current_temp_filename,"%s/%hu%s",temp_directory, current_file_number, temp_file_ext);

    pc_message(verbosity,2,"Writing out temporary file %s...\n",current_temp_filename);
        
    temp_file = rr_oopen(current_temp_filename);
    text_buffer[current_char] = ' ';
    
    current_count = 0;
    strcpy(current_ngram,"");
    
    for (i = 0; i <= current_word-n; i++) {
      current_string = pointers[i];
      
      /* Find the nth space */

      no_of_spaces = 0;
      pos_in_string = 0;
      while (no_of_spaces < n) {	
	if (current_string[pos_in_string] == ' ')
	  no_of_spaces++;

	pos_in_string++;
      }
      
      if (!strncmp(current_string,current_ngram,pos_in_string))
	current_count++;
      else {
	if (strcmp(current_ngram,""))
	  if (fprintf(temp_file,"%s %d\n",current_ngram,current_count) < 0) 
	    quit(-1,"Error writing to temporary file %s\n",current_temp_filename);

	current_count = 1;
	strncpy(current_ngram,current_string,pos_in_string);
	current_ngram[pos_in_string] = '\0';
      }
    }
    
    rr_oclose(temp_file);

    /* Move the last n-1 words to the beginning of the buffer, and set
       correct current_word and current_char things */

    strcpy(text_buffer,pointers[current_word-n]);
    pointers[0]=text_buffer;
   
    /* Find the (n-1)th space */

    no_of_spaces=0;
    pos_in_string=0;

    if (!text_buffer_full){ 
      while (no_of_spaces<(n-1)) {
	if (pointers[0][pos_in_string]==' ') {
	  no_of_spaces++;
	  pointers[no_of_spaces] = &pointers[0][pos_in_string+1];
	}
	pos_in_string++;
      }
    }else {
      while (no_of_spaces<n) {
	if (pointers[0][pos_in_string]==' ') {
	  no_of_spaces++;
	  pointers[no_of_spaces] = &pointers[0][pos_in_string+1];
	}
	pos_in_string++;
      }
      pos_in_string--;
    }

    current_char = pos_in_string;
    current_word = n;
    /* mark boundary beyond which counting pass cannot backup */
    start_char = current_char;

  }
  /* Merge temporary files */

  pc_message(verbosity,2,"Merging temporary files...\n");

  merge_tempfiles(1,
		  current_file_number,
		  temp_directory,
		  temp_file_ext,
		  max_files,
		  stdout,
		  n,
		  verbosity); 

  rmdir(temp_directory);
  pc_message(verbosity,0,"text2wngram : Done.\n");
  
  return 0;
}
Пример #4
0
int main(int argc, char *argv[]) {

  int verbosity;
  int vocab_size;
  FILE *vocab_file;
  int buffer_size;
  flag write_ascii;
  int max_files;
  int number_of_tempfiles;
  char *vocab_filename;
  char *idngram_filename;
  char temp_word[MAX_WORD_LENGTH];
  char temp_word2[MAX_WORD_LENGTH];
  char temp_word3[MAX_WORD_LENGTH];
  flag contains_unks;
  int position_in_buffer;
  FILE *outfile;
  FILE *tempfile;
  FILE *non_unk_fp;
  ngram_rec *buffer;
  flag same_ngram;
  int i;
  int j;
  int fof_size;
  int size_of_rec;

  char temp_directory[1000];
  char *temp_file_ext;

  /* Vocab hash table things */

  struct idngram_hash_table vocabulary;
  unsigned long hash_size;
  unsigned long M;

  wordid_t *current_ngram;
  int current_count;
  wordid_t *sort_ngram;
  int sort_count;
  
  /* Process command line */

  report_version(&argc,argv);
  
  if (argc == 1 || pc_flagarg(&argc, argv,"-help")) {    
    /* Display help message */    
    help_message();
    exit(1);
  }


  n = pc_intarg( &argc, argv, "-n",DEFAULT_N);
  hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE);
  buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM);
  write_ascii = pc_flagarg(&argc,argv,"-write_ascii");
  verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);
  max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES);
  fof_size = pc_intarg(&argc,argv,"-fof_size",10);
  vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" ));
  idngram_filename = salloc(pc_stringarg( &argc, argv, "-idngram", "" ));
  
  if (!strcmp("",vocab_filename)) 
    quit(-1,"Error : Must specify a vocabulary file.\n");

  if (!strcmp("",idngram_filename)) 
    quit(-1,"text2idngram : Error : Must specify idngram file.\n");
    
  if (pc_flagarg(&argc,argv,"-compress")) 
    temp_file_ext = salloc(".Z");
  else {
    if (pc_flagarg(&argc,argv,"-gzip")) 
      temp_file_ext = salloc(".gz");
    else 
      temp_file_ext = salloc("");
  }

  strcpy(temp_directory, "cmuclmtk-XXXXXX");
  if (mkdtemp(temp_directory) == NULL) {
     quit(-1, "Failed to create temporary folder: %s\n", strerror(errno));
  }

  pc_report_unk_args(&argc,argv,verbosity);

  outfile = rr_fopen(idngram_filename,"wb");
  
  pc_message(verbosity,2,"Vocab           : %s\n",vocab_filename);
  pc_message(verbosity,2,"Output idngram  : %s\n",idngram_filename);
  pc_message(verbosity,2,"Buffer size     : %d\n",buffer_size);
  pc_message(verbosity,2,"Hash table size : %d\n",hash_size);
  pc_message(verbosity,2,"Max open files  : %d\n",max_files);
  pc_message(verbosity,2,"n               : %d\n",n);
  pc_message(verbosity,2,"FOF size               : %d\n",fof_size);  

  size_of_rec = (sizeof(wordid_t) * n) + 16 - (( n* sizeof(wordid_t)) % 16);
  buffer_size *= (1000000/((sizeof(ngram_rec) + size_of_rec)));
  fprintf(stderr,"buffer size = %d\n",buffer_size);

  /* Allocate memory for hash table */

  fprintf(stderr,"Initialising hash table...\n");

  M = nearest_prime(hash_size);

  new_idngram_hashtable(&vocabulary,M);

  /* Read in the vocabulary */

  vocab_size = 0;

  vocab_file = rr_iopen(vocab_filename);

  pc_message(verbosity,2,"Reading vocabulary...\n");

  while (fgets (temp_word, sizeof(temp_word),vocab_file)) {
    if (strncmp(temp_word,"##",2)==0) continue;
    sscanf (temp_word, "%s ",temp_word2);

    /* Check for vocabulary order */
    if (vocab_size > 0 && strcmp(temp_word2,temp_word3)<0) 
      quit(-1,"wngram2idngram : Error : Vocabulary is not alphabetically ordered.\n");

    /* Check for repeated words in the vocabulary */

    if (index2(&vocabulary,temp_word2) != 0) 
      warn_on_repeated_words(temp_word);

    warn_on_wrong_vocab_comments(temp_word);

    vocab_size++;
    
    add_to_idngram_hashtable(&vocabulary,idngram_hash(temp_word2,M),temp_word2,vocab_size);
    strcpy(temp_word3,temp_word2);
  }

  if (vocab_size > MAX_VOCAB_SIZE) 
    quit(-1,"Error : Vocabulary size exceeds maximum.\n");
  
  pc_message(verbosity,2,"Allocating memory for the buffer...\n");

  buffer=(ngram_rec *) rr_malloc((buffer_size+1)*sizeof(ngram_rec));
  
  for (i=0;i<=buffer_size;i++) 
    buffer[i].word = (wordid_t *) rr_malloc(n*sizeof(wordid_t));

  /* Open the "non-OOV" tempfile */

  sprintf(temp_word, "%s/1%s", temp_directory, temp_file_ext);
  
  non_unk_fp = rr_fopen(temp_word,"w");

  pc_message(verbosity,2,"Writing non-OOV counts to temporary file %s\n",
	     temp_word);
  number_of_tempfiles = 1;

  current_ngram = (wordid_t *) rr_malloc(n*sizeof(wordid_t));
  sort_ngram = (wordid_t *) rr_malloc(n*sizeof(wordid_t));

  /* Read text into buffer */
  position_in_buffer = 0;

  while (!rr_feof(stdin)) {
    
    for (i=0;i<=n-1;i++) {
      get_word(stdin,temp_word);
      current_ngram[i]=index2(&vocabulary,temp_word);
    }
    if (scanf("%d",&current_count) != 1) 
      if (!rr_feof(stdin)) 
	quit(-1,"Error reading n-gram count from stdin.\n");

    if (!rr_feof(stdin)) {

      contains_unks = 0;
      for (i=0;i<=n-1;i++) {
	if (!current_ngram[i]) 
	  contains_unks = 1;
      }

      if (contains_unks) {
	/* Write to buffer */

	position_in_buffer++;

	if (position_in_buffer >= buffer_size) {

	  /* Sort buffer */
	  pc_message(verbosity,2,
		     "Sorting n-grams which include an OOV word...\n");

	  qsort((void*) buffer,(size_t) position_in_buffer,
		sizeof(ngram_rec),compare_ngrams2);

	  pc_message(verbosity,2,"Done.\n");

	  /* Write buffer to temporary file */

	  number_of_tempfiles++;
	  
	  sprintf(temp_word,"%s/%hu%s", temp_directory,
		  number_of_tempfiles,temp_file_ext);
	  
	  pc_message(verbosity,2,
		     "Writing sorted OOV-counts buffer to temporary file %s\n",
		     temp_word);

	  tempfile = rr_fopen(temp_word,"w");
	  
	  for (i=0;i<=n-1;i++) 
	    sort_ngram[i] = buffer[0].word[i];

	  sort_count = buffer[0].count;

	  for (i=0;i<=position_in_buffer-2;i++) {
	    
	    same_ngram = 1;
	    for (j=n-1;j>=0;j--) {
	      if (buffer[i].word[j] != sort_ngram[j]) {
		same_ngram = 0;
		j = -1;
	      }
	    }

	    if (same_ngram) 
	      sort_count += buffer[i].count;
	    else {
	      for (j=0;j<=n-1;j++) {
		rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1,
			  tempfile,"temporary n-gram ids");
		sort_ngram[j] = buffer[i].word[j];
	      }
	      rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile,
			"temporary n-gram counts");
	      sort_count = buffer[i].count;
	    }
	  }	    
	  for (j=0;j<=n-1;j++) 
	    rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1,
		      tempfile,"temporary n-gram ids");

	  rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile,
		    "temporary n-gram counts");
	  rr_oclose(tempfile);
	  position_in_buffer = 1;

	}
	
	for (i=0;i<=n-1;i++) 
	  buffer[position_in_buffer-1].word[i] = current_ngram[i];

	buffer[position_in_buffer-1].count = current_count;

      }else {
	/* Write to temporary file */
	for (i=0;i<=n-1;i++) 
	  rr_fwrite((char*)&current_ngram[i],sizeof(wordid_t),1,
		    non_unk_fp,"temporary n-gram ids");

	rr_fwrite((char*)&current_count,sizeof(int),1,non_unk_fp,
		  "temporary n-gram counts");
      }
    }
  }

  if (position_in_buffer > 0) {

    /* Only do this bit if we have actually seen some OOVs */
    /* Sort final buffer */    
    pc_message(verbosity,2,"Sorting final buffer...\n");

    qsort((void*) buffer,(size_t) position_in_buffer,
	  sizeof(ngram_rec),compare_ngrams2);
    
    /* Write final buffer */
    
    number_of_tempfiles++;
  
    sprintf(temp_word,"%s/%hu%s", temp_directory,
	    number_of_tempfiles,temp_file_ext);
    
    pc_message(verbosity,2,"Writing sorted buffer to temporary file %s\n", temp_word);

    tempfile = rr_fopen(temp_word,"w");
    
    for (i=0;i<=n-1;i++) 
      sort_ngram[i] = buffer[0].word[i];

    sort_count = buffer[0].count;
    
    for (i=1;i<=position_in_buffer-1;i++) {
      
      same_ngram = 1;
      for (j=n-1;j>=0;j--) {
	if (buffer[i].word[j] != sort_ngram[j]) {
	  same_ngram = 0;
	  j = -1;
	}
      }
      
      if (same_ngram) 
	sort_count += buffer[i].count;
      else {
	for (j=0;j<=n-1;j++) {
	  rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1,
		    tempfile,"temporary n-gram ids");
	  sort_ngram[j] = buffer[i].word[j];
	}
	rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile,
		  "temporary n-gram counts");
	sort_count = buffer[i].count;
      }
    }	    
    for (j=0;j<=n-1;j++) 
      rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1,
		tempfile,"temporary n-gram ids");

    rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile,
	      "temporary n-gram counts");
    fclose(tempfile);
    

  }
  

  /* Merge the temporary files, and output the result */
  fclose(non_unk_fp);
  pc_message(verbosity,2,"Merging temporary files...\n");
  merge_idngramfiles(1,
		     number_of_tempfiles,
		     temp_directory,
		     temp_file_ext,
		     max_files,
		     outfile,
		     write_ascii,
		     fof_size,
		     n);

  fclose(outfile);

  rmdir(temp_directory);
  pc_message(verbosity,0,"wngram2idngram : Done.\n");

  return 0;
}
Пример #5
0
/*
  Read words from infp, turn them into id n-grams, and spill them to
  numbered temporary files as sorted (n-gram, count) records.

  The caller-supplied buffer is treated as rows of n word ids.  It is
  filled with up to buffer_size overlapping n-grams, sorted with
  compare_ngrams, and written to "<temp_file_root>/<k><temp_file_ext>"
  with adjacent identical n-grams collapsed into a single record
  (n word ids followed by an int count).  The most recent n-gram is
  kept back and becomes row 0 of the next pass.

  NOTE(review): buffer must have room for buffer_size+1 rows of n ids,
  since row position_in_buffer is written before the bound is rechecked
  - confirm against the caller's allocation.

  @param infp            input text stream
  @param vocabulary      table handed to index2() to map a word to its id
  @param verbosity       level passed to pc_message / show_idngram_nlines
  @param buffer          n-gram work buffer (see note above)
  @param buffer_size     number of n-grams collected per temporary file
  @param n               n-gram order
  @param temp_file_root  directory part of the temporary file names
  @param temp_file_ext   suffix of the temporary file names
  @param temp_file       ignored on entry; used only as a local handle
  @return number_of_tempfiles
 */
int  read_txt2ngram_buffer(FILE* infp, 
			   struct idngram_hash_table *vocabulary, 
			   int32 verbosity,
			   wordid_t *buffer,
			   int buffer_size,
			   unsigned int n,
			   char* temp_file_root,
			   char* temp_file_ext,
			   FILE* temp_file
			   )
{
  /* Read text into buffer */
  char temp_word[MAX_WORD_LENGTH];
  int position_in_buffer;
  int number_of_tempfiles;
  int path_len;
  unsigned int i,j;
  wordid_t *placeholder;
  wordid_t *temp_ngram;
  int temp_count;

  temp_ngram  = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);
  placeholder = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);

  /* ng is defined outside this function; presumably compare_ngrams
     reads it to know the n-gram order - confirm. */
  ng=n;

  position_in_buffer = 0;
  number_of_tempfiles = 0;

  //tk: looks like things may croak if the corpus has less than n words
  //not that such a corpus would be useful anyway
  /* Prime row 0 with the first n words of the stream. */
  for (i=0;i<n;i++) {
    get_word(infp,temp_word);
    add_to_buffer(index2(vocabulary,temp_word),0,i,buffer);
  }

  while (!rr_feof(infp)) {
    /* Fill up the buffer */
    pc_message(verbosity,2,"Reading text into the n-gram buffer...\n");
    pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");

    while ((position_in_buffer<buffer_size) && (!rr_feof(infp))) {
      position_in_buffer++;
      show_idngram_nlines(position_in_buffer,verbosity);

      /* New row = previous row shifted left by one word ... */
      for (i=1;i<n;i++) 
	add_to_buffer(buffer_contents(position_in_buffer-1,i,buffer),
		      position_in_buffer,i-1,buffer);
      
      /* ... plus the next word read from the stream, if there is one. */
      if (get_word(infp,temp_word) == 1) {
	add_to_buffer(index2(vocabulary,temp_word),position_in_buffer,
		      n-1,buffer);
      }
    }

    /* Row position_in_buffer is not part of this pass: remember it so
       it can seed row 0 of the next one. */
    for (i=0;i<n;i++) 
      placeholder[i] = buffer_contents(position_in_buffer,i,buffer);

    /* Sort buffer */
    
    pc_message(verbosity,2,"\nSorting n-grams...\n");    
    
    qsort((void*) buffer,(size_t) position_in_buffer,n*sizeof(wordid_t),compare_ngrams);

    /* Output the buffer to temporary BINARY file */    
    number_of_tempfiles++;

    /* temp_word doubles as the path buffer, so bound the write and
       refuse to continue with a truncated file name. */
    path_len = snprintf(temp_word,sizeof(temp_word),"%s/%hu%s",temp_file_root,
			number_of_tempfiles,temp_file_ext);
    if (path_len < 0 || (size_t) path_len >= sizeof(temp_word))
      quit(-1,"Temporary file name is too long.\nAborting");

    pc_message(verbosity,2,"Writing sorted n-grams to temporary file %s\n",
	       temp_word);

    temp_file = rr_oopen(temp_word);

    for (i=0;i<n;i++) {
      temp_ngram[i] = buffer_contents(0,i,buffer);
#if MAX_VOCAB_SIZE < 65535
      /* This check is well-meaning but completely useless since
	 buffer_contents() can never return something greater than
	 MAX_VOCAB_SIZE (dhuggins@cs, 2006-03) */
      if (temp_ngram[i] > MAX_VOCAB_SIZE)
	quit(-1,"Invalid trigram in buffer.\nAborting");
#endif
    }
    temp_count = 1;

    /* Collapse runs of identical n-grams among the sorted rows
       0 .. position_in_buffer-1.  The unsorted carry-over row at
       position_in_buffer is deliberately not inspected: comparing
       against it used to decide whether the final run was written at
       all, so the last n-gram was lost whenever the two matched. */
    for (i=1;i<(unsigned int) position_in_buffer;i++) {

      if (!compare_ngrams(temp_ngram,&buffer[i*n])) 
	temp_count++;
      else {
	for (j=0;j<n;j++) {
	  rr_fwrite((char*) &temp_ngram[j],sizeof(wordid_t),1,
		    temp_file,"temporary n-gram ids");
	  temp_ngram[j] = buffer_contents(i,j,buffer);
	}
	rr_fwrite((char*)&temp_count,sizeof(int),1,temp_file,
		  "temporary n-gram counts");

	temp_count = 1;
      }
    }

    /* Flush the final run unconditionally. */
    if (position_in_buffer > 0) {
      for (j=0;j<n;j++)
	rr_fwrite((char*) &temp_ngram[j],sizeof(wordid_t),1,
		  temp_file,"temporary n-gram ids");
      rr_fwrite((char*)&temp_count,sizeof(int),1,temp_file,
		"temporary n-gram counts");
    }
    
    rr_oclose(temp_file);

    /* Seed the next pass with the carried-over n-gram. */
    for (i=0;i<n;i++) 
      add_to_buffer(placeholder[i],0,i,buffer);

    position_in_buffer = 0;

  }

  free(temp_ngram);
  free(placeholder);

  return number_of_tempfiles;
}
Пример #6
0
/*
  Serialise the language model held in ng to ng->bin_fp.

  Records are written in this fixed order: scalar header fields, the
  vocabulary hash table, marginal counts, the alpha array, count tables
  (only when four_byte_counts is off), pointer tables, unigram
  statistics, cutoffs, the parameters of the selected discounting
  method, and finally the n-gram tree arrays.  swap_struct(ng) is
  called before the higher-order tree arrays are written and again
  after the file is closed, so ng is left as it was found.

  ng->bin_fp is closed with rr_oclose() before returning.

  @param ng         model to write; bin_fp must already be open
  @param verbosity  level passed to pc_message
 */
void write_bin_lm(ng_t *ng,int verbosity) {
    
    /* Upper bound on records per rr_fwrite call for the tree arrays. */
    const int max_chunk = 100000;
    int l_chunk;
    int from_rec;
    int i;
    
    pc_message(verbosity,1,"Binary %d-gram language model will be written to %s\n",ng->n,ng->bin_filename);
    
    ng->version = BBO_FILE_VERSION;
    
    /* Scalar parameters */
    
    rr_fwrite((char*)&ng->version,sizeof(int),1,ng->bin_fp,"version");
    rr_fwrite((char*)&ng->n,sizeof(unsigned short),1,ng->bin_fp,"n");
    
    rr_fwrite((char*)&ng->vocab_size,sizeof(wordid_t),1,ng->bin_fp,"vocab_size");
    rr_fwrite((char*)&ng->no_of_ccs,sizeof(unsigned short),1,ng->bin_fp,"no_of_ccs");
    rr_fwrite((char*)&ng->vocab_type,sizeof(unsigned short),1,ng->bin_fp,"vocab_type");
    
    rr_fwrite((char*)&ng->count_table_size,sizeof(count_ind_t),1,ng->bin_fp,"count_table_size");
    rr_fwrite((char*)&ng->discounting_method,sizeof(unsigned short),1,ng->bin_fp,"discounting_method");
    
    rr_fwrite((char*)&ng->min_alpha,sizeof(double),1,ng->bin_fp,"min_alpha");
    rr_fwrite((char*)&ng->max_alpha,sizeof(double),1,ng->bin_fp,"max_alpha");
    rr_fwrite((char*)&ng->out_of_range_alphas,sizeof(unsigned short),1,ng->bin_fp,"out_of_range_alphas");
    rr_fwrite((char*)&ng->size_of_alpha_array,sizeof(unsigned short),1,ng->bin_fp,"size_of_alpha_array");  
    
    rr_fwrite((char*)&ng->n_unigrams,sizeof(ngram_sz_t),1,ng->bin_fp,"n_unigrams");
    rr_fwrite((char*)&ng->zeroton_fraction,sizeof(double),1,ng->bin_fp,"zeroton_fraction");
    rr_fwrite((char*)&ng->oov_fraction,sizeof(double),1,ng->bin_fp,"oov_fraction");
    rr_fwrite((char*)&ng->four_byte_counts,sizeof(flag),1,ng->bin_fp,"four_byte_counts");
    rr_fwrite((char*)&ng->four_byte_alphas,sizeof(flag),1,ng->bin_fp,"four_byte_alphas");
    
    rr_fwrite((char*)&ng->first_id,sizeof(unsigned short),1,
              ng->bin_fp,"first_id");
    
    /* Short and shortish arrays */
    
    sih_val_write_to_file(ng->vocab_ht,ng->bin_fp,ng->bin_filename,0);
    
    /* (ng->vocab is not stored in file - will be derived from ng->vocab_ht) */
    
    if (ng->four_byte_counts==1) {
        assert(ng->marg_counts4);
        rr_fwrite((char*)ng->marg_counts4,sizeof(count_t),
                  ng->vocab_size+1,ng->bin_fp,"marg_counts");
    }else {
        assert(ng->marg_counts);
        rr_fwrite((char*)ng->marg_counts,sizeof(count_ind_t),
                  ng->vocab_size+1,ng->bin_fp,"marg_counts");
    }
    
    rr_fwrite((char*)ng->alpha_array,sizeof(double),
              ng->size_of_alpha_array,ng->bin_fp,"alpha_array");
    
    /* Count tables are only written when counts are stored as table
       indices rather than directly as four-byte values. */
    if (!ng->four_byte_counts) {
        for (i=0;i<=ng->n-1;i++)
            rr_fwrite((char*)ng->count_table[i],sizeof(count_t),
                      ng->count_table_size+1,ng->bin_fp,"count_table");
    }
    
    /* Could write count_table as one block, but better to be safe and
     do it in chunks. For motivation, see comments about writing tree
     info. */
    
    rr_fwrite((char*)ng->ptr_table_size,sizeof(ptr_tab_sz_t),ng->n,ng->bin_fp,"ptr_table_size");
    
    for (i=0;i<=ng->n-1;i++)
        rr_fwrite((char*)ng->ptr_table[i],sizeof(ptr_tab_t),ng->ptr_table_size[i],ng->bin_fp,"ptr_table");
    
    /* Unigram statistics */
    
    rr_fwrite((char*)ng->uni_probs,sizeof(uni_probs_t), ng->vocab_size+1,
              ng->bin_fp,"uni_probs");
    rr_fwrite((char*)ng->uni_log_probs,sizeof(uni_probs_t),ng->vocab_size+1,
              ng->bin_fp,"uni_log_probs");
    rr_fwrite((char*)ng->context_cue,sizeof(flag),ng->vocab_size+1,
              ng->bin_fp,"context_cue");
    
    rr_fwrite((char*)ng->cutoffs,sizeof(cutoff_t),ng->n,ng->bin_fp,"cutoffs");
    
    /* Discounting-method specific parameters.  Witten-Bell (and any
       method not listed) stores nothing extra. */
    switch (ng->discounting_method) {
        case GOOD_TURING:
            rr_fwrite((char*)ng->fof_size,sizeof(fof_sz_t),ng->n,ng->bin_fp,"fof_size");
            rr_fwrite((char*)ng->disc_range,sizeof(unsigned short),ng->n,
                      ng->bin_fp,"disc_range");
            for (i=0;i<=ng->n-1;i++) {
                rr_fwrite((char*)ng->freq_of_freq[i],sizeof(fof_t),
                          ng->fof_size[i]+1,ng->bin_fp,"freq_of_freq");
            }    
            for (i=0;i<=ng->n-1;i++) {
                rr_fwrite((char*)ng->gt_disc_ratio[i],sizeof(disc_val_t),
                          ng->disc_range[i]+1,ng->bin_fp,"gt_disc_ratio");
            }    
            break;
        case WITTEN_BELL:
            break;
        case LINEAR:
            rr_fwrite((char*)ng->lin_disc_ratio,sizeof(disc_val_t),
                      ng->n,ng->bin_fp,"lin_disc_ratio");
            break;
        case ABSOLUTE:
            rr_fwrite((char*)ng->abs_disc_const,sizeof(double),
                      ng->n,ng->bin_fp,"abs_disc_const");
            break;
        default:
            break;
    }
    
    /* Tree information */
    
    /* Unigram stuff first, since can be dumped all in one go */
    
    rr_fwrite((char*)ng->num_kgrams,sizeof(ngram_sz_t),ng->n,ng->bin_fp,"num_kgrams");
    
    if (ng->four_byte_counts)
        rr_fwrite((char*)ng->count4[0],sizeof(count_t),ng->vocab_size+1,
                  ng->bin_fp,"unigram counts");
    else 
        rr_fwrite((char*)ng->count[0],sizeof(count_ind_t),ng->vocab_size+1,
                  ng->bin_fp,"unigram counts");
    
    if (ng->four_byte_alphas)
        rr_fwrite((char*)ng->bo_weight4[0],sizeof(four_byte_t),ng->vocab_size+1,
                  ng->bin_fp,"unigram backoff weights");
    else
        rr_fwrite((char*)ng->bo_weight[0],sizeof(bo_weight_t),ng->vocab_size+1,
                  ng->bin_fp,"unigram backoff weights");
    
    if (ng->n > 1) 
        rr_fwrite((char*)ng->ind[0],sizeof(index__t),ng->vocab_size+1,
                  ng->bin_fp,"unigram -> bigram pointers");
    
    /* Write the rest of the tree structure in chunks, otherwise the
     kernel buffers are too big. */
    
    /* Need to do byte swapping */
    swap_struct(ng);
    
    /* Word ids for every level above unigrams. */
    for (i=1;i<=ng->n-1;i++) {
        from_rec = 0;
        l_chunk = max_chunk;
        while(from_rec < ng->num_kgrams[i]) {
            /* Shrink the final chunk so it ends exactly at num_kgrams[i]. */
            if (from_rec+l_chunk > ng->num_kgrams[i]) 
                l_chunk = ng->num_kgrams[i] - from_rec;
            
            rr_fwrite((char*)&ng->word_id[i][from_rec],1,sizeof(id__t)*l_chunk,ng->bin_fp,"word ids");
            
            from_rec += l_chunk;
        }   
    }
    
    /* Counts for every level above unigrams. */
    for (i=1;i<=ng->n-1;i++) {
        
        from_rec = 0;
        l_chunk = max_chunk;
        while(from_rec < ng->num_kgrams[i]) {
            if (from_rec+l_chunk > ng->num_kgrams[i])
                l_chunk = ng->num_kgrams[i] - from_rec;
            
            if (ng->four_byte_counts)
                rr_fwrite((char*)&ng->count4[i][from_rec],1,sizeof(count_t)*l_chunk,ng->bin_fp,"counts");
            else
                rr_fwrite((char*)&ng->count[i][from_rec],1,sizeof(count_ind_t)*l_chunk,ng->bin_fp,"counts");
            
            from_rec += l_chunk;
        }    
    }
    
    /* Back-off weights: written for every level except the highest. */
    for (i=1;i<=ng->n-2;i++) {
        from_rec = 0;
        l_chunk = max_chunk;
        while(from_rec < ng->num_kgrams[i]) {
            if (from_rec+l_chunk > ng->num_kgrams[i]) 
                l_chunk = ng->num_kgrams[i] - from_rec;
            
            if (ng->four_byte_alphas)
                rr_fwrite((char*)&ng->bo_weight4[i][from_rec],1,sizeof(four_byte_t)*l_chunk,
                          ng->bin_fp,"backoff weights");
            else
                rr_fwrite((char*)&ng->bo_weight[i][from_rec],1,sizeof(bo_weight_t)*l_chunk,
                          ng->bin_fp,"backoff weights");
            from_rec += l_chunk;
        }
    }
    
    /* Child indices: likewise skipped for the highest level. */
    for (i=1;i<=ng->n-2;i++) {
        from_rec = 0;
        l_chunk = max_chunk;
        while(from_rec < ng->num_kgrams[i]) {
            if (from_rec+l_chunk > ng->num_kgrams[i])
                l_chunk = ng->num_kgrams[i] - from_rec;
            
            rr_fwrite((char*)&ng->ind[i][from_rec],1,sizeof(index__t)*l_chunk,ng->bin_fp,
                      "indices");
            from_rec += l_chunk;
        }
    }
    
    rr_oclose(ng->bin_fp);
    
    /* Swap back */
    swap_struct(ng); 
}
Пример #7
0
/*
  Write the language model held in ng to ng->arpa_fp as ARPA-style text.

  Output order: header (copyright, vocabulary type, discounting method,
  format description, n-gram counts), the 1-gram section, then one
  section per higher order up to ng->n, and the closing "\end\" line.

  For orders above 1 the n-gram tree is walked depth-first using two
  arrays of ng->n ints: current_pos[k] is the cursor at tree level k and
  end_pos[k] the last index at level k that belongs to the current
  parent.  current_table is the level being advanced.

  An entry is not printed when its first word contains "</s>", or when
  it is above bigram order and its last word contains "<s>"; such
  entries bump skipped_bigrams / skipped_trigrams instead.

  ng->arpa_fp is closed with rr_oclose(); afterwards
  final_ngram_count_replacement() is called and the file-level
  *_count / skipped_* counters are reset to zero.

  @param ng         model to write; arpa_fp must already be open
  @param verbosity  level passed to pc_message
 */
void write_arpa_lm(ng_t *ng,int verbosity) {
    
    int *current_pos;
    int *end_pos;
    ngram_sz_t i;
    double log_10_of_e = 1.0 / log(10.0);
    
    /* HEADER */
    
    pc_message(verbosity,1,"ARPA-style %d-gram will be written to %s\n",ng->n,ng->arpa_filename);
    
    write_arpa_copyright(ng->arpa_fp,ng->n,ng->vocab_size, ng->vocab[1],ng->vocab[2],ng->vocab[3]);
    
    display_vocabtype(ng->vocab_type,ng->oov_fraction, ng->arpa_fp);  
    display_discounting_method(ng,ng->arpa_fp);
    write_arpa_format(ng->arpa_fp,ng->n);
    write_arpa_num_grams(ng->arpa_fp,ng,NULL,0);
    write_arpa_k_gram_header(ng->arpa_fp,1);
    
    /* 1-grams: log10 probability, word, and (if n > 1) log10 back-off weight. */
    for (i=ng->first_id; i<= (int) ng->vocab_size;i++) {
        
        double log10_uniprob;
        double log10_alpha;
        double alpha;
        
        /* uni_log_probs is rescaled to base 10 by the 1/ln(10) factor. */
        log10_uniprob = ng->uni_log_probs[i]*log_10_of_e;
        
        if (ng->uni_probs[i]<=0.0)
            log10_uniprob = BAD_LOG_PROB;
        
        alpha=ng_double_alpha(ng,0,i);
        
        if(alpha > 0.0)
            log10_alpha = log10(alpha);
        else
            log10_alpha = BAD_LOG_PROB;
        
        fprintf(ng->arpa_fp,"%.4f %s",log10_uniprob,ng->vocab[i]);
        if (ng->n>1)
            fprintf(ng->arpa_fp,"\t%.4f\n",log10_alpha);
        else
            fprintf(ng->arpa_fp,"\n");
    }
    
    current_pos = (int *) rr_malloc(ng->n*sizeof(int));
    end_pos = (int *) rr_malloc(ng->n*sizeof(int)); 
    
    /* Print 2-gram, ... (n-1)-gram info. */
    
    for (i=1;i<=ng->n-1;i++) {
        
        /* Print out the (i+1)-gram */
        
        int current_table, j;
        count_t ngcount, marg_count;
        double discounted_ngcount;    
        double ngprob, log_10_ngprob, ngalpha, log_10_ngalpha;
        
        /* Initialise variables for the sake of warning-free compilation */
        discounted_ngcount = 0.0;
        log_10_ngalpha = 0.0;

        write_arpa_k_gram_header(ng->arpa_fp,i+1);
        
        /* Go through the n-gram list in order */
        
        for (j=0;j<=ng->n-1;j++) {
            current_pos[j] = 0;
            end_pos[j] = 0;
        }
        
        for (current_pos[0]=ng->first_id;
             current_pos[0]<=(int) ng->vocab_size;
             current_pos[0]++) {
            
            /* Only descend below unigrams with a non-zero marginal count. */
            if (return_count(ng->four_byte_counts,
                             ng->count_table[0], 
                             ng->marg_counts,
                             ng->marg_counts4,
                             current_pos[0]) > 0) {
                
                current_table = 1;
                
                /* Children of this unigram end where the next unigram's
                   children begin (or at the end of the bigram array). */
                if (current_pos[0] == (int) ng->vocab_size)
                    end_pos[1] = (int ) ng->num_kgrams[1]-1;
                else {
                    end_pos[1] = get_full_index(ng->ind[0][current_pos[0]+1],
                                                ng->ptr_table[0],
                                                ng->ptr_table_size[0],
                                                current_pos[0]+1)-1;
                }
                
                while (current_table > 0) {
                    
                    if (current_table == i) {
                        
                        /* At the level being printed: emit one entry per
                           position, then climb back up when exhausted. */
                        if (current_pos[i] <= end_pos[i]) {
                            
                            ngcount = return_count(ng->four_byte_counts,
                                                   ng->count_table[i],
                                                   ng->count[i],
                                                   ng->count4[i],
                                                   current_pos[i]);
                            
                            
                            /* Count of the context (the parent node). */
                            if (i==1) {
                                marg_count = return_count(ng->four_byte_counts,
                                                          ng->count_table[0], 
                                                          ng->marg_counts,
                                                          ng->marg_counts4,
                                                          current_pos[0]);
                            }else {
                                marg_count = return_count(ng->four_byte_counts,
                                                          ng->count_table[i-1],
                                                          ng->count[i-1],
                                                          ng->count4[i-1],
                                                          current_pos[i-1]);
                            }
                            
                            /* Discounting method object is created on first use. */
                            if(ng->disc_meth==NULL)
                                ng->disc_meth=(disc_meth_t*) disc_meth_init(ng->discounting_method);
                            
                            assert(ng->disc_meth);
                            discounted_ngcount = 
                            NG_DISC_METH(ng)->dump_discounted_ngram_count(ng,i,ngcount,marg_count,current_pos);
                            
                            ngprob = (double) discounted_ngcount / marg_count;
                            
                            if (ngprob > 1.0) {
                                /* current_pos holds only ng->n entries, so
                                   index 2 exists only for n >= 3; report -1
                                   instead of reading past the allocation. */
                                fprintf(stderr,
                                        "discounted_ngcount = %f marg_count = %d %d %d %d\n",
                                        discounted_ngcount,marg_count,current_pos[0],
                                        current_pos[1],
                                        (ng->n > 2) ? current_pos[2] : -1);
                                quit(-1,"Error : probablity of ngram is greater than one.\n");
                            }
                            
                            if (ngprob > 0.0) 
                                log_10_ngprob = log10(ngprob);
                            else 
                                log_10_ngprob = BAD_LOG_PROB;
                            
                            /* Back-off weights exist for all but the highest order. */
                            if (i <= ng->n-2) {
                                ngalpha = ng_double_alpha(ng, i, current_pos[i]);
                                
                                if (ngalpha > 0.0)
                                    log_10_ngalpha = log10(ngalpha);
                                else
                                    log_10_ngalpha = BAD_LOG_PROB;
                            }
                            // BEGIN HLW VERSION
                            // Skip entries whose first word contains "</s>", and entries
                            // above bigram order whose last word contains "<s>" -- HLW
                            if ((strstr(ng->vocab[current_pos[0]],"</s>") == NULL) &&
                                ((i <= 1) ||
                                 (strstr(ng->vocab[(unsigned int) ng->word_id[i][current_pos[i]]],"<s>") == NULL))) {
                                
                                fprintf(ng->arpa_fp,"%.4f ",log_10_ngprob);
                                fprintf(ng->arpa_fp,"%s ",ng->vocab[current_pos[0]]);
                                for (j=1;j<=i;j++){
                                    
                                    fprintf(ng->arpa_fp,"%s ",ng->vocab[(unsigned int) ng->word_id[j][current_pos[j]]]);
                                }
                                
                                if (i <= ng->n-2){
                                    fprintf(ng->arpa_fp,"%.4f\n",log_10_ngalpha);
                                } else{
                                    fprintf(ng->arpa_fp,"\n");
                                }
                            } else {
                                // something is being skipped  -- HLW
                                // (i starts at 1 in this loop, so unigrams are never skipped here)
                                if(i==1) {
                                    skipped_bigrams++;
                                } else if (i==2) {
                                    skipped_trigrams++;
                                }
                            }
                            // END HLW VERSION
                            
                            current_pos[i]++;        
                        }else {
                            current_table--;
                            if (current_table > 0)
                                current_pos[current_table]++;
                        }
                    }else {
                        
                        /* Above the printed level: step down into the
                           children of the current node, or climb up when
                           this level's range is exhausted. */
                        if (current_pos[current_table] <= end_pos[current_table]) {
                            current_table++;
                            if (current_pos[current_table-1] == (int) ng->num_kgrams[current_table-1]-1)
                                end_pos[current_table] = (int) ng->num_kgrams[current_table]-1;
                            else {
                                end_pos[current_table] = get_full_index(ng->ind[current_table-1][current_pos[current_table-1]+1],
                                                                        ng->ptr_table[current_table-1],
                                                                        ng->ptr_table_size[current_table-1],
                                                                        current_pos[current_table-1]+1) - 1;
                            }
                        }else {
                            current_table--;
                            if (current_table > 0)
                                current_pos[current_table]++;
                        }
                    }
                }
            }
        }
    } 
    
    free(current_pos);
    free(end_pos);
    
    fprintf(ng->arpa_fp,"\n\\end\\\n");
    
    rr_oclose(ng->arpa_fp);
    
    // BEGIN HLW ADDITION
    
    // Now that the file is complete, let's go back and replace the placeholder ngram counts with the real final counts  -- HLW
    
    final_ngram_count_replacement(ng->n,ng);
    
    unigram_count = 0;
    bigram_count = 0;
    trigram_count = 0;
    skipped_unigrams = 0;
    skipped_bigrams = 0;
    skipped_trigrams = 0;
    
    // END HLW ADDITION
}