Exemple #1
0
/*
 * calc_mem_req: make one pass over the id n-gram stream attached to `ng`
 * and work out how many entries each higher-order table needs.
 *
 * For every position j in 1..n-1 the counts of consecutive n-grams that
 * share the same first j+1 ids are accumulated; each time that prefix
 * changes (and once more at end of stream) ng->table_sizes[j] is
 * incremented if the accumulated count exceeds ng->cutoffs[j-1].
 * Finally 10 is added to every ng->table_sizes[1..n-1] as slack.
 *
 *   ng        reads n, id_gram_fp, id_gram_filename and cutoffs;
 *             updates table_sizes and id_gram_fp.
 *   is_ascii  forwarded unchanged to get_ngram().
 *
 * The stream is rewound before the pass and closed/reopened from
 * ng->id_gram_filename afterwards.
 *
 * NOTE(review): the prefix bookkeeping only yields per-prefix totals if
 * equal prefixes are adjacent in the stream (i.e. the file is sorted) --
 * confirm against the producer of the id n-gram file.
 */
void calc_mem_req(ng_t *ng,flag is_ascii) {

  ngram current_ngram;
  ngram previous_ngram;
  count_t *ng_count;
  int i,j;

  /* Zero-filled: the first pass of the loop copies current -> previous
     before anything has been read, so the arrays must not hold
     indeterminate values. */
  current_ngram.id_array = (id__t *) rr_calloc(ng->n,sizeof(id__t));
  previous_ngram.id_array = (id__t *) rr_calloc(ng->n,sizeof(id__t));

  ng_count = (count_t *) rr_calloc(ng->n,sizeof(count_t));

  current_ngram.n = ng->n;

  rewind(ng->id_gram_fp);

  while (!rr_feof(ng->id_gram_fp)) {

    /* Remember the previous n-gram so the first differing position can
       be located. */
    for (i=0;i<ng->n;i++) {
      previous_ngram.id_array[i]=current_ngram.id_array[i];
    }

    get_ngram(ng->id_gram_fp,&current_ngram,is_ascii);

    for (i=0;i<ng->n;i++) {
      if (current_ngram.id_array[i] != previous_ngram.id_array[i]) {
        /* Every prefix from this position on is new: account for the
           finished prefixes and restart their counters. */
        for (j=i;j<ng->n;j++) {
          if (j>0 && ng_count[j] > ng->cutoffs[j-1]) {
            ng->table_sizes[j]++;
          }
          ng_count[j] = current_ngram.count;
        }
        break;
      }
      /* Prefix unchanged so far: keep accumulating. */
      ng_count[i] += current_ngram.count;
    }
  }

  /* Account for the prefixes that were still open at end of stream. */
  for (i=1;i<ng->n;i++) {
    if (ng_count[i] > ng->cutoffs[i-1]) {
      ng->table_sizes[i]++;
    }
  }

  /* Add a fudge factor, as problems can crop up with having to
     cut-off last few n-grams. */

  for (i=1;i<ng->n;i++) {
    ng->table_sizes[i]+=10;
  }

  rr_iclose(ng->id_gram_fp);
  ng->id_gram_fp = rr_iopen(ng->id_gram_filename);

  /* Release the scratch buffers; they were previously leaked. */
  free(current_ngram.id_array);
  free(previous_ngram.id_array);
  free(ng_count);

}
Exemple #2
0
/*
 * compute_perplexity: score a text stream against a language model and
 * print the resulting perplexity, entropy and n-gram hit statistics to
 * stdout.
 *
 *   ng, arpa_ng            the model; arpa_ng is used when arpa_lm is
 *                          non-zero, ng otherwise.
 *   text_stream_filename   text to score; must not be "".  "-" skips the
 *                          existence check.
 *   probs_stream_filename  if not "", each word probability is written
 *                          there, one per line.
 *   annotation_filename    if not "", a per-word annotation is written
 *                          there.
 *   oov_filename           if not "", words whose id is 0 are written
 *                          there.
 *   fb_list_filename       forwarded to gen_fb_list().
 *   backoff_from_*         forwarded to gen_fb_list().
 *   arpa_lm                selects arpa_ng over ng.
 *   include_unks           if zero, words with id 0 are excluded from the
 *                          calculation and counted separately.
 *   log_base               base used for the "logprob" figures in the
 *                          warnings and annotations.
 *
 * Words flagged in the model's context_cue table are never scored.
 * Errors on the input/output paths are reported on stdout and make the
 * function return without scoring; every exit path releases the buffers
 * and streams acquired here.
 */
void compute_perplexity(ng_t *ng,
                        arpa_lm_t *arpa_ng,
                        char *text_stream_filename,
                        char *probs_stream_filename,
                        char *annotation_filename,
                        char *oov_filename,
                        char *fb_list_filename,
                        flag backoff_from_unk_inc,
                        flag backoff_from_unk_exc,
                        flag backoff_from_ccs_inc,
                        flag backoff_from_ccs_exc,
                        flag arpa_lm,
                        flag include_unks,
                        double log_base) {

  fb_info *fb_list;
  FILE *temp_fp;
  FILE *text_stream_fp;
  FILE *probs_stream_fp;
  FILE *annotation_fp;
  FILE *oov_fp;
  flag out_probs;
  flag annotate;
  flag out_oovs;
  flag found_unk_wrongly;
  double prob;
  double sum_log_prob;
  int total_words;
  int excluded_unks;
  int excluded_ccs;
  char current_word[1000];  /* fscanf below is bounded to this size */
  char **prev_words;
  vocab_sz_t current_id;
  id__t short_current_id;
  id__t *context;
  int context_length;
  int i;
  int bo_case;
  int actual_context_length;
  int *ngrams_hit;
  int n;

  /* Everything released in the cleanup section starts out empty so that
     an early exit can run the same cleanup code. */

  text_stream_fp = NULL;
  probs_stream_fp = NULL;
  annotation_fp = NULL;
  oov_fp = NULL;
  context = NULL;

  short_current_id = 0;

  found_unk_wrongly = 0;

  annotate = 0;

  bo_case = 0;

  if (arpa_lm) {
    n = arpa_ng->n;
    fb_list = gen_fb_list(arpa_ng->vocab_ht,
                          (int) arpa_ng->vocab_size,
                          arpa_ng->vocab,
                          arpa_ng->context_cue,
                          backoff_from_unk_inc,
                          backoff_from_unk_exc,
                          backoff_from_ccs_inc,
                          backoff_from_ccs_exc,
                          fb_list_filename);
  }else {
    n = ng->n;
    fb_list = gen_fb_list(ng->vocab_ht,
                          (int) ng->vocab_size,
                          ng->vocab,
                          ng->context_cue,
                          backoff_from_unk_inc,
                          backoff_from_unk_exc,
                          backoff_from_ccs_inc,
                          backoff_from_ccs_exc,
                          fb_list_filename);
  }

  ngrams_hit = (int *) rr_calloc(n,sizeof(int));
  prev_words = (char **) rr_malloc(sizeof(char *)*n);
  for (i=0;i<=n-1;i++)
    prev_words[i] = (char *) rr_malloc(sizeof(char)*1000);

  /* Check that text_stream_filename and probs_stream_filename (if
     specified) are valid. Note that the checks employed by the
     standard rr_fopen tools are not suitable here, since we don't
     want the program to terminate if the paths are not found. */

  if (!strcmp(text_stream_filename,"")) {
    printf("Error : Must specify a text file. Use the -text switch.\n");
    goto cleanup;
  }

  if (!rr_fexists(text_stream_filename) && strcmp(text_stream_filename,"-")) {
    printf("Error : Can't open file %s for reading.\n",text_stream_filename);
    goto cleanup;
  }

  /* Normalise to 0/1 so the result cannot be mangled when stored in a
     flag, whatever its underlying integer type. */
  out_probs = (strcmp(probs_stream_filename,"") != 0);
  annotate = (strcmp(annotation_filename,"") != 0);
  out_oovs = (strcmp(oov_filename,"") != 0);

  printf("Computing perplexity of the language model with respect\n");
  printf("   to the text %s\n",text_stream_filename);
  if (out_probs)
    printf("Probability stream will be written to file %s\n",
           probs_stream_filename);

  if (annotate)
    printf("Annotation will be written to file %s\n",
           annotation_filename);

  if (out_oovs)
    printf("Out of vocabulary words will be written to file %s\n",
           oov_filename);

  if (backoff_from_unk_inc)
    printf("Will force inclusive back-off from OOVs.\n");

  if (backoff_from_unk_exc)
    printf("Will force exclusive back-off from OOVs.\n");

  if (backoff_from_ccs_inc)
    printf("Will force inclusive back-off from context cues.\n");

  if (backoff_from_ccs_exc)
    printf("Will force exclusive back-off from context cues.\n");

  if (strcmp(fb_list_filename,""))
    printf("Will force back-off according to the contents of %s\n",
           fb_list_filename);

  if (include_unks)
    printf("Perplexity calculation will include OOVs.\n");

  /* Check that the output files can be created, as the rr functions
     will quit on failure, which isn't what we want. */

  if (out_probs && strcmp(probs_stream_filename,"-")) {
    if ((temp_fp = fopen(probs_stream_filename,"w")) == NULL) {
      printf("Error : Can't open file %s for writing.\n",probs_stream_filename);
      goto cleanup;
    }
    fclose(temp_fp);
  }

  if (annotate && strcmp(annotation_filename,"-")) {
    if ((temp_fp = fopen(annotation_filename,"w")) == NULL) {
      printf("Error : Can't open file %s for writing.\n",annotation_filename);
      goto cleanup;
    }
    fclose(temp_fp);
  }

  if (out_oovs && strcmp(oov_filename,"-")) {
    if ((temp_fp = fopen(oov_filename,"w")) == NULL) {
      printf("Error : Can't open file %s for writing.\n",oov_filename);
      goto cleanup;
    }
    fclose(temp_fp);
  }

  text_stream_fp = rr_iopen(text_stream_filename);
  if (out_probs)
    probs_stream_fp = rr_oopen(probs_stream_filename);

  if (annotate)
    annotation_fp = rr_oopen(annotation_filename);

  if (out_oovs)
    oov_fp = rr_oopen(oov_filename);

  context = (id__t *) rr_malloc(sizeof(id__t)*(n-1));

  sum_log_prob = 0.0;
  total_words = 0;
  excluded_unks = 0;
  excluded_ccs = 0;

  while (!rr_feof(text_stream_fp)) {

    /* Slide the previous word into the history of spelled-out words. */

    if (total_words > 0) {
      if (total_words < n)
        strcpy(prev_words[total_words-1],current_word);
      else {
        for (i=0;i<=n-3;i++)
          strcpy(prev_words[i],prev_words[i+1]);

        if (n>1)
          strcpy(prev_words[n-2],current_word);
      }
    }

    if (total_words < (n-1))
      context_length = total_words;
    else
      context_length = n-1;

    /* Fill context with right stuff */

    if (total_words > (n-1)) {
      for (i=0;i<=context_length-2;i++)
        context[i] = context[i+1];
    }

    if (context_length != 0)
      context[context_length-1] = short_current_id;

    /* Field width keeps an over-long token from overrunning
       current_word; the remainder is read as the next token. */
    if (fscanf(text_stream_fp,"%999s",current_word) != 1) {
      if (!rr_feof(text_stream_fp)) {
        printf("Error reading text file.\n");
        goto cleanup;
      }
    }

    if (!rr_feof(text_stream_fp)) {

      if (arpa_lm) {
        sih_lookup(arpa_ng->vocab_ht,current_word,&current_id);
        if (arpa_ng->vocab_type == CLOSED_VOCAB && current_id == 0) {
          found_unk_wrongly = 1;
          printf("Error : %s is not in the vocabulary, and this is a closed \nvocabulary model.\n",current_word);
        }
        /* Report the id that was actually returned (the message used to
           print an unrelated context slot). */
        if (current_id > arpa_ng->vocab_size)
          quit(-1,"Error : returned value from sih_lookup (%d) is too high.\n",(int) current_id);
        else
          short_current_id = current_id;

      }else {
        sih_lookup(ng->vocab_ht,current_word,&current_id);
        if (ng->vocab_type == CLOSED_VOCAB && current_id == 0) {
          found_unk_wrongly = 1;
          printf("Error : %s is not in the vocabulary, and this is a closed \nvocabulary model.\n",current_word);
        }
        if (current_id > ng->vocab_size)
          quit(-1,"Error : returned value from sih_lookup (%d) is too high.\n",(int) current_id);
        else
          short_current_id = current_id;

      }

      if (!found_unk_wrongly) {

        if (current_id == 0 && out_oovs)
          fprintf(oov_fp,"%s\n",current_word);

        if ((arpa_lm && (!(arpa_ng->context_cue[current_id])))
            || ((!arpa_lm) && (!(ng->context_cue[current_id])))) {

          if (include_unks || current_id != 0) {

            prob = calc_prob_of(short_current_id,
                                context,
                                context_length,
                                ng,
                                arpa_ng,
                                fb_list,
                                &bo_case,
                                &actual_context_length,
                                arpa_lm);


            if (prob<= 0.0 || prob > 1.0) {
              fprintf(stderr,"Warning : ");
              if (short_current_id == 0)
                fprintf(stderr,"P( <UNK> | ");
              else
                fprintf(stderr,"P( %s | ",current_word);

              for (i=0;i<=actual_context_length-1;i++) {
                if (context[i+context_length-actual_context_length] == 0)
                  fprintf(stderr,"<UNK> ");
                else
                  fprintf(stderr,"%s ",prev_words[i]);
              }
              fprintf(stderr,") = %g logprob = %g \n ",prob,log(prob)/log(log_base));
              fprintf(stderr,"bo_case == 0x%dx, actual_context_length == %d\n",
                      bo_case, actual_context_length);
            }

            if (annotate) {
              if (short_current_id == 0)
                fprintf(annotation_fp,"P( <UNK> | ");
              else
                fprintf(annotation_fp,"P( %s | ",current_word);

              for (i=0;i<=actual_context_length-1;i++) {
                if (context[i+context_length-actual_context_length] == 0)
                  fprintf(annotation_fp,"<UNK> ");
                else {
                  if (arpa_lm)
                    fprintf(annotation_fp,"%s ",arpa_ng->vocab[context[i+context_length-actual_context_length]]);
                  else
                    fprintf(annotation_fp,"%s ",ng->vocab[context[i+context_length-actual_context_length]]);
                }
              }
              fprintf(annotation_fp,") = %g logprob = %f bo_case = ",prob,log(prob)/log(log_base));
              decode_bo_case(bo_case,actual_context_length,annotation_fp);
            }

            /* Calculate level to which we backed off */

            for (i=actual_context_length-1;i>=0;i--) {
              int four_raise_i = 1<<(2*i);  /* PWP */

              /*
               * PWP: This was "if ((bo_case / (int) pow(3,i)) == 0)"
               * but was getting a divide-by-zero error on an Alpha
               * (it isn't clear to me why it should ever have done so)
               * Anyway, it is much faster to do in base-4.
               */

              if ((bo_case == 0) || ((bo_case / four_raise_i) == 0)) {
                ngrams_hit[i+1]++;
                /* Leaves the loop with i == -3 after the decrement,
                   which the test below uses as the "hit found" mark. */
                i = -2;
              }else
                bo_case -= ((bo_case / four_raise_i) * four_raise_i);
            }

            /* No higher-order hit was recorded: count a 1-gram hit. */
            if (i != -3)
              ngrams_hit[0]++;

            if (out_probs)
              fprintf(probs_stream_fp,"%g\n",prob);

            sum_log_prob += log10(prob);

          }

          if (current_id == 0 && !include_unks)
            excluded_unks++;
        }
        else {
          if (((!arpa_lm) && ng->context_cue[current_id]) ||
              (arpa_lm && arpa_ng->context_cue[current_id]))
            excluded_ccs++;
        }
        total_words++;
      }
    }
  }

  if (!found_unk_wrongly) {      /*  pow(x,y) = e**(y  ln(x)) */
    printf("Perplexity = %.2f, Entropy = %.2f bits\n",
           exp(-sum_log_prob/(total_words-excluded_ccs-excluded_unks) *
               log(10.0)),
           (-sum_log_prob/(total_words-excluded_ccs-excluded_unks) *
            log(10.0) / log(2.0)));
    printf("Computation based on %d words.\n",
           total_words-excluded_ccs-excluded_unks);
    for(i=n;i>=1;i--) {
      printf("Number of %d-grams hit = %d  (%.2f%%)\n",i,ngrams_hit[i-1],
             (float) 100*ngrams_hit[i-1]/(total_words-excluded_ccs-excluded_unks) );
    }
    printf("%d OOVs (%.2f%%) and %d context cues were removed from the calculation.\n",
           excluded_unks,
           (float) 100*excluded_unks/(total_words-excluded_ccs),excluded_ccs);

  }

 cleanup:

  /* Shared exit path: close whatever was opened, free whatever was
     allocated. */

  if (text_stream_fp != NULL)
    rr_iclose(text_stream_fp);

  if (probs_stream_fp != NULL)
    rr_oclose(probs_stream_fp);
  if (annotation_fp != NULL)
    rr_oclose(annotation_fp);
  if (oov_fp != NULL)
    rr_oclose(oov_fp);

  for (i=0;i<=n-1;i++)
    free (prev_words[i]);
  free (prev_words);

  free (fb_list);
  free (context);
  free (ngrams_hit);
}
Exemple #3
0
/*
 * wfreq2vocab: read "<word> <count>" records from standard input and
 * write a vocabulary (one word per line, alphabetically sorted, preceded
 * by "##" header lines) to standard output.
 *
 * Command line:
 *   -top N       keep the N most frequent words (20000 if neither -top
 *                nor -gt is given)
 *   -gt N        keep every word whose count is greater than N
 *   -records N   capacity of the record table (DEFAULT_MAX_RECORDS)
 *   -verbosity N message level (DEFAULT_VERBOSITY)
 *   -help        print usage and exit with status 1
 *
 * -top and -gt are mutually exclusive.  Input holding more records than
 * the table can store is reported as an error instead of overrunning
 * the table.
 *
 * NOTE(review): `void main` is non-standard C; the signature is left
 * untouched here and the program always terminates through exit().
 */
void main(int argc, char *argv[]) {

  int verbosity;
  int vocab_size;
  int cutoff;
  int num_recs;
  int current_rec;
  int num_above_threshold;
  int num_to_output;
  int temp_count;
  int i;
  word_rec *records;
  char temp_word[750];
  flag gt_set;
  flag top_set;

  /* Process command line */

  report_version(&argc,argv);

  if (pc_flagarg( &argc, argv,"-help")) {
    fprintf(stderr,"wfreq2vocab : Generate a vocabulary file from a word frequency file.\n");
    fprintf(stderr,"Usage : wfreq2vocab [ -top 20000 | -gt 10]\n");
    fprintf(stderr,"                    [ -records %d ]\n",DEFAULT_MAX_RECORDS);
    fprintf(stderr,"                    [ -verbosity %d]\n",DEFAULT_VERBOSITY);
    fprintf(stderr,"                    < .wfreq > .vocab\n");
    exit(1);
  }

  cutoff = pc_intarg( &argc, argv, "-gt",-1);
  vocab_size = pc_intarg(&argc, argv, "-top",-1);
  num_recs = pc_intarg(&argc, argv, "-records",DEFAULT_MAX_RECORDS);
  verbosity = pc_intarg(&argc, argv, "-verbosity",DEFAULT_VERBOSITY);

  pc_report_unk_args(&argc,argv,verbosity);

  /* -1 is the "not given" marker for both selection switches. */

  if (cutoff != -1) {
    gt_set = 1;
  }
  else {
    gt_set = 0;
    cutoff = 0;
  }

  if (vocab_size != -1) {
    top_set = 1;
  }
  else {
    top_set = 0;
    vocab_size = 0;
  }

  if (gt_set && top_set) {
    quit(-1,"wfreq2vocab : Error : Can't use both the -top and the -gt options.\n");
  }

  if (!gt_set && !top_set) {
    vocab_size = 20000;
  }

  if (gt_set) {
    pc_message(verbosity,2,"wfreq2vocab : Will generate a vocabulary containing all words which\n              occurred more that %d times. Reading wfreq stream from stdin...\n",cutoff);
  }
  else {
    pc_message(verbosity,2,"wfreq2vocab : Will generate a vocabulary containing the most\n              frequent %d words. Reading wfreq stream from stdin...\n",vocab_size);
  }

  records = (word_rec *) rr_malloc(sizeof(word_rec)*num_recs);

  current_rec = 0;
  num_above_threshold = 0;

  while (!rr_feof(stdin)) {

    /* The field width keeps an over-long word inside temp_word, and the
       count is parsed into a local so nothing is written to the table
       before its capacity has been checked. */
    if (scanf("%749s %d",temp_word,&temp_count) != 2) {
      if (!rr_feof(stdin)) {
        quit(-1,"Error reading unigram counts from standard input.\n");
      }
    }
    else {
      if (current_rec >= num_recs) {
        quit(-1,"wfreq2vocab : Error : More than %d records in the input. Use the -records switch to raise the limit.\n",num_recs);
      }
      records[current_rec].word = salloc(temp_word);
      records[current_rec].count = temp_count;
      if (gt_set && records[current_rec].count > cutoff) {
        num_above_threshold++;
      }
      current_rec++;
    }
  }

  /* Sort records in descending order of count */

  qsort((void*) records,(size_t) current_rec, sizeof(word_rec),sort_by_count);

  if (gt_set) {
    num_to_output = num_above_threshold;
  }
  else {
    num_to_output = vocab_size;
  }

  /* Never try to output more records than were read. */

  if (current_rec<num_to_output) {
    num_to_output = current_rec;
  }

  /* Now sort the relevant records alphabetically */

  qsort((void*) records,(size_t) num_to_output, sizeof(word_rec),sort_alpha);

  if (gt_set) {
    pc_message(verbosity,2,"Size of vocabulary = %d\n",num_to_output);
  }

  if (num_to_output>65535) {
    pc_message(verbosity,1,"Warning : Vocab size exceeds 65535. This will cause problems with \nother tools, since word id's are stored in 2 bytes.\n");
  }

  /* Print the vocab to stdout */

  printf("## Vocab generated by v2 of the CMU-Cambridge Statistcal\n");
  printf("## Language Modeling toolkit.\n");
  printf("##\n");
  printf("## Includes %d words ",num_to_output);
  printf("##\n");

  for (i=0;i<=num_to_output-1;i++) {
    printf("%s\n",records[i].word);
  }

  pc_message(verbosity,0,"wfreq2vocab : Done.\n");

  exit(0);

}  
Exemple #4
0
void main(int argc, char *argv[]) {

  int i,j;

  char *vocab_filename;
  FILE *tempfile;
  char tempfiles_directory[1000];
  int vocab_size;
  FILE *vocab_file;

  int verbosity;

  int buffer_size;
  int position_in_buffer;
  int number_of_tempfiles;
  int max_files;
  int fof_size;

  unsigned short *buffer;
  unsigned short *placeholder;
  unsigned short *temp_ngram;
  int temp_count;
  
  char temp_word[500];
  char temp_word2[500];

  char *temp_file_root;
  char *temp_file_ext;
  char *host_name;
  int proc_id;
  struct utsname uname_info;

  flag write_ascii;

  /* Vocab hash table things */

  struct hash_table vocabulary;
  unsigned long hash_size;
  unsigned long M;

  tempfile = NULL; /* Just to prevent compilation warnings. */

  report_version(&argc,argv);

  verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);

  /* Process command line */
  
  if (pc_flagarg( &argc, argv,"-help") || argc==1) {
    fprintf(stderr,"text2idngram - Convert a text stream to an id n-gram stream.\n");
    fprintf(stderr,"Usage : text2idngram  -vocab .vocab \n");
    fprintf(stderr,"                    [ -buffer 100 ]\n");
    fprintf(stderr,"                    [ -hash %d ]\n",DEFAULT_HASH_SIZE);
    fprintf(stderr,"                    [ -temp %s ]\n",DEFAULT_TEMP);
    fprintf(stderr,"                    [ -files %d ]\n",DEFAULT_MAX_FILES);
    fprintf(stderr,"                    [ -gzip | -compress ]\n");
    fprintf(stderr,"                    [ -verbosity %d ]\n",
	    DEFAULT_VERBOSITY);
    fprintf(stderr,"                    [ -n 3 ]\n");
    fprintf(stderr,"                    [ -write_ascii ]\n");
    fprintf(stderr,"                    [ -fof_size 10 ]\n");
    exit(1);
  }

  pc_message(verbosity,2,"text2idngram\n");

  n = pc_intarg( &argc, argv, "-n",DEFAULT_N);

  placeholder = (unsigned short *) rr_malloc(sizeof(unsigned short)*n);
  temp_ngram = (unsigned short *) rr_malloc(sizeof(unsigned short)*n);
  hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE);
  buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM);

  write_ascii = pc_flagarg(&argc,argv,"-write_ascii");

  fof_size = pc_intarg(&argc,argv,"-fof_size",10);

  max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES);

  vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" ));
  
  if (!strcmp("",vocab_filename)) {
    quit(-1,"text2idngram : Error : Must specify a vocabulary file.\n");
  }
    
  strcpy(tempfiles_directory,pc_stringarg( &argc, argv, "-temp", 
					   DEFAULT_TEMP));

  if (pc_flagarg(&argc,argv,"-compress")) {
    temp_file_ext = salloc(".Z");
  }
  else {
    if (pc_flagarg(&argc,argv,"-gzip")) {
      temp_file_ext = salloc(".gz");
    }
    else {
      temp_file_ext = salloc("");
    }
  }

  uname(&uname_info);

  host_name = salloc(uname_info.nodename);

  proc_id = getpid();

  sprintf(temp_word,"%s%s.%d.",TEMP_FILE_ROOT,host_name,proc_id);

  temp_file_root = salloc(temp_word);

  pc_report_unk_args(&argc,argv,verbosity);
  
  /* If the last charactor in the directory name isn't a / then add one. */
  
  if (tempfiles_directory[strlen(tempfiles_directory)-1] != '/') {
    strcat(tempfiles_directory,"/");
  }
  
  pc_message(verbosity,2,"Vocab                  : %s\n",vocab_filename);
  pc_message(verbosity,2,"N-gram buffer size     : %d\n",buffer_size);
  pc_message(verbosity,2,"Hash table size        : %d\n",hash_size);
  pc_message(verbosity,2,"Temp directory         : %s\n",tempfiles_directory);
  pc_message(verbosity,2,"Max open files         : %d\n",max_files);
  pc_message(verbosity,2,"FOF size               : %d\n",fof_size);  
  pc_message(verbosity,2,"n                      : %d\n",n);

  buffer_size *= (1000000/(sizeof(unsigned short)*n));

  /* Allocate memory for hash table */

  fprintf(stderr,"Initialising hash table...\n");

  M = nearest_prime(hash_size);

  new_hashtable(&vocabulary,M);

  /* Read in the vocabulary */

  vocab_size = 0;

  vocab_file = rr_iopen(vocab_filename);

  pc_message(verbosity,2,"Reading vocabulary...\n");

  while (fgets (temp_word, sizeof(temp_word),vocab_file)) {
    if (strncmp(temp_word,"##",2)==0) continue;
    sscanf (temp_word, "%s ",temp_word2);

    /* Check for repeated words in the vocabulary */

    if (index2(&vocabulary,temp_word2) != 0) {
      fprintf(stderr,"======================================================\n");
      fprintf(stderr,"WARNING: word %s is repeated in the vocabulary.\n",temp_word);
      fprintf(stderr,"=======================================================\n");
    }
    if (strncmp(temp_word,"#",1)==0) {
      fprintf(stderr,"\n\n===========================================================\n");
      fprintf(stderr,":\nWARNING: line assumed NOT a comment:\n");
      fprintf(stderr,     ">>> %s <<<\n",temp_word);
      fprintf(stderr,     "         '%s' will be included in the vocabulary.\n",temp_word2);
      fprintf(stderr,     "         (comments must start with '##')\n");
      fprintf(stderr,"===========================================================\n\n");
    }
    vocab_size++;
    add_to_hashtable(&vocabulary,hash(temp_word2,M),temp_word2,vocab_size);
  }

  if (vocab_size > MAX_VOCAB_SIZE) {
    quit(-1,"text2idngram : Error : Vocabulary size exceeds maximum.\n");
  }   
  
  pc_message(verbosity,2,"Allocating memory for the n-gram buffer...\n");

  buffer=(unsigned short*) rr_malloc(n*(buffer_size+1)*sizeof(unsigned short));

  number_of_tempfiles = 0;

  /* Read text into buffer */

  /* Read in the first ngram */

  position_in_buffer = 0;

  for (i=0;i<=n-1;i++) {
    get_word(stdin,temp_word);
    add_to_buffer(index2(&vocabulary,temp_word),0,i,buffer);
  }

  while (!rr_feof(stdin)) {

    /* Fill up the buffer */

    pc_message(verbosity,2,"Reading text into the n-gram buffer...\n");
    pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");
    while ((position_in_buffer<buffer_size) && (!rr_feof(stdin))) {
      position_in_buffer++;
      if (position_in_buffer % 20000 == 0) {
	if (position_in_buffer % 1000000 == 0) {
	  pc_message(verbosity,2,".\n");
	}
	else {
	  pc_message(verbosity,2,".");
	}
      }
      for (i=1;i<=n-1;i++) {
	add_to_buffer(buffer_contents(position_in_buffer-1,i,buffer),
		      position_in_buffer,i-1,buffer);
      }
      if (get_word(stdin,temp_word) == 1) {
	add_to_buffer(index2(&vocabulary,temp_word),position_in_buffer,
		      n-1,buffer);
      }
    }

    for (i=0;i<=n-1;i++) {
      placeholder[i] = buffer_contents(position_in_buffer,i,buffer);
    }

    /* Sort buffer */
    
    pc_message(verbosity,2,"\nSorting n-grams...\n");
    
    qsort((void*) buffer,(size_t) position_in_buffer,
	  n*sizeof(unsigned short),compare_ngrams);

    /* Output the buffer to temporary BINARY file */
    
    number_of_tempfiles++;

    sprintf(temp_word,"%s%s%hu%s",tempfiles_directory,temp_file_root,
	    number_of_tempfiles,temp_file_ext);

    pc_message(verbosity,2,"Writing sorted n-grams to temporary file %s\n",
	       temp_word);

    tempfile = rr_oopen(temp_word);

    for (i=0;i<=n-1;i++) {
      temp_ngram[i] = buffer_contents(0,i,buffer);
      if (temp_ngram[i] > MAX_VOCAB_SIZE) {
	quit(-1,"Invalid trigram in buffer.\nAborting");

      }
    }
    temp_count = 1;

    for (i=1;i<=position_in_buffer;i++) {
 
      if (!compare_ngrams(temp_ngram,&buffer[i*n])) {
	temp_count++;
      }
      else {
	for (j=0;j<=n-1;j++) {
	  rr_fwrite(&temp_ngram[j],sizeof(unsigned short),1,
		    tempfile,"temporary n-gram ids");
	  temp_ngram[j] = buffer_contents(i,j,buffer);
	}
	rr_fwrite(&temp_count,sizeof(int),1,tempfile,
		  "temporary n-gram counts");
	temp_count = 1;
      }
    }
    
    rr_oclose(tempfile);

    for (i=0;i<=n-1;i++) {
      add_to_buffer(placeholder[i],0,i,buffer);
    }

    position_in_buffer = 0;

  }

  /* Merge the temporary files, and output the result to standard output */

  pc_message(verbosity,2,"Merging temporary files...\n");
  
  merge_tempfiles(1,
		  number_of_tempfiles,
		  temp_file_root,
		  temp_file_ext,
		  max_files,
		  tempfiles_directory,
		  stdout,
		  write_ascii,
		  fof_size); 

  pc_message(verbosity,0,"text2idngram : Done.\n");

  exit(0);
  
}
Exemple #5
0
/*
 * oe_03_main: read an id n-gram stream from standard input and build,
 * for each order handled by the tables below, the number of entries seen
 * (num_kgrams) and a frequency-of-frequency table (fof_array) covering
 * counts up to fof_size.  The tables are passed to display_fof_array()
 * with stderr as the output stream.
 *
 * Command line: -ascii_input, -n (default 3), -fof_size (default 50),
 * -verbosity, -help.
 *
 * n must be at least 2: the tables have n-1 rows and row n-2 is updated
 * for every n-gram read.
 *
 * The function does not return; it terminates the process through
 * exit().  When __clang_analyzer__ is defined and
 * STATICANALYZEDEPENDENCIES is not, the final accumulation loop and the
 * display_fof_array() call are compiled out by the conditional block
 * near the end.
 */
int oe_03_main (int argc, char **argv) {

  flag first_ngram;
  int n;
  fof_sz_t fof_size;
  flag is_ascii;
  int verbosity;
  fof_t **fof_array;
  ngram_sz_t *num_kgrams;
  ngram current_ngram;
  ngram previous_ngram;
  count_t *ng_count;
  int pos_of_novelty;
  int nlines;
  int i;

  report_version(&argc,argv);

  if (argc == 1 || pc_flagarg(&argc, argv,"-help")) {
    oe_04_help_message();
    exit(1);
  }

  is_ascii = pc_flagarg(&argc, argv,"-ascii_input");
  n = pc_intarg(&argc, argv,"-n",3);
  fof_size = pc_intarg(&argc, argv,"-fof_size",50);
  verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY);

  pc_report_unk_args(&argc,argv,verbosity);

  /* With n < 2 the tables below would have no rows, yet row n-2 is
     written for every n-gram; refuse such values up front. */
  if (n < 2) {
    fprintf(stderr,"idngram2stats : Error : n must be at least 2.\n");
    exit(1);
  }

  pc_message(verbosity,2,"n        = %d\n",n);
  pc_message(verbosity,2,"fof_size = %d\n",fof_size);

  current_ngram.n = n;
  previous_ngram.n = n;
  pos_of_novelty = n;

  fof_array = (fof_t **) rr_malloc(sizeof(fof_t *) * (n-1));
  for (i=0;i<=n-2;i++) 
    fof_array[i] = (fof_t *) rr_calloc(fof_size+1,sizeof(fof_t));

  num_kgrams = (ngram_sz_t *) rr_calloc(n-1,sizeof(ngram_sz_t));
  ng_count = (count_t *) rr_calloc(n-1,sizeof(count_t));

  current_ngram.id_array = (id__t *) rr_calloc(n,sizeof(id__t));
  previous_ngram.id_array = (id__t *) rr_calloc(n,sizeof(id__t));

  pc_message(verbosity,2,"Processing id n-gram file.\n");
  pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");

  nlines = 0;
  first_ngram = 1;

  while (!rr_feof(stdin)) {

    if (!first_ngram)
      ngram_copy(&previous_ngram,&current_ngram,n);

    if (get_ngram(stdin,&current_ngram,is_ascii)) {

      nlines++;
      show_idngram_nlines(nlines, verbosity);

      /* Test for where this ngram differs from last - do we have an
         out-of-order ngram? */

      if (!first_ngram)
        pos_of_novelty = ngram_find_pos_of_novelty(&current_ngram,&previous_ngram,n,nlines);
      else
        pos_of_novelty = 0;

      /* Add new N-gram */

      num_kgrams[n-2]++;
      if (current_ngram.count <= fof_size) 
        fof_array[n-2][current_ngram.count]++;

      /* For every shorter prefix that changed, record the finished
         prefix (except on the first n-gram, when there is none yet) and
         restart its counter. */

      if (!first_ngram) {
        for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
          num_kgrams[i-1]++;
          if (ng_count[i-1] <= fof_size) 
            fof_array[i-1][ng_count[i-1]]++;

          ng_count[i-1] = current_ngram.count;
        }
      } else {
        for (i=n-2;i>=MAX(1,pos_of_novelty);i--) 
          ng_count[i-1] = current_ngram.count;
      }

      /* Prefixes that did not change keep accumulating. */

      for (i=0;i<=pos_of_novelty-2;i++) 
        ng_count[i] += current_ngram.count;

      if (first_ngram)
        first_ngram = 0;
    }
  }

  /* Process last ngram */

  for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
    num_kgrams[i-1]++;
    if (ng_count[i-1] <= fof_size) {
      fof_array[i-1][ng_count[i-1]]++;
    }
    ng_count[i-1] = current_ngram.count;
  }
 #import "OpenEarsStaticAnalysisToggle.h"
#ifdef STATICANALYZEDEPENDENCIES
#define __clang_analyzer__ 1
#endif
#if !defined(__clang_analyzer__) || defined(STATICANALYZEDEPENDENCIES)
#undef __clang_analyzer__ 
  for (i=0;i<=pos_of_novelty-2;i++)
    ng_count[i] += current_ngram.count;

  display_fof_array(num_kgrams,fof_array,fof_size,stderr, n);
#endif
  pc_message(verbosity,0,"idngram2stats : Done.\n");

  exit(0);

}
Exemple #6
0
/*
 * text2wngram: read text from standard input and write word n-grams with
 * their counts to standard output.
 *
 * The text is read into a character buffer one load at a time, with a
 * pointer recorded at the start of every word.  For each load the
 * pointers are sorted with cmp_strings(), identical n-grams are counted
 * and the result is written to a numbered file inside a freshly created
 * temporary directory.  The last words of a load are moved to the front
 * of the buffer so that n-grams spanning two loads are not lost.
 * merge_tempfiles() finally combines the temporary files.
 *
 * Command line: -n, -words, -chars, -files, -gzip | -compress,
 * -verbosity, -help.  If only one of -words / -chars is given the other
 * is derived from it using a factor of 7.
 *
 * Returns 0.
 *
 * NOTE(review): a load holding fewer than n words makes current_word-n
 * negative, which is then used as a qsort element count and as an array
 * index below.  That case is not handled here -- confirm whether callers
 * can hit it (e.g. empty input).
 */
int main (int argc, char **argv) {

  int n;
  int verbosity;
  int max_files;
  int max_words;
  int max_chars;

  int current_word;
  int current_char;
  int start_char;		/* start boundary (possibly > than 0) */

  int no_of_spaces;
  int pos_in_string;

  int i;
  char *current_string;
  char current_temp_filename[500];
  int current_file_number;
  FILE *temp_file;

  flag text_buffer_full;

  char *text_buffer;
  char **pointers;
  size_t tail_len;

  char current_ngram[500];
  int current_count;

  int counter;

  char temp_directory[1000];
  char *temp_file_ext;

  flag words_set;
  flag chars_set;

  /* Process command line */

  verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY);
  pc_message(verbosity,2,"text2wngram\n");

  report_version(&argc,argv);

  if (pc_flagarg( &argc, argv,"-help")) {
    help_message();
    exit(1);
  }

  n = pc_intarg(&argc, argv,"-n",DEFAULT_N);

  /* -1 marks "not given"; the defaults are derived from STD_MEM. */

  max_words = pc_intarg(&argc, argv,"-words",-1);
  max_chars = pc_intarg(&argc, argv,"-chars",-1);

  if (max_words == -1) {
    words_set = 0;
    max_words = STD_MEM*1000000/11;
  }else
    words_set = 1;

  if (max_chars == -1) {
    chars_set = 0;
    max_chars = STD_MEM*7000000/11; 
  }else
    chars_set = 1;

  max_files = pc_intarg(&argc, argv,"-files",DEFAULT_MAX_FILES);

  if (pc_flagarg(&argc,argv,"-compress"))
    temp_file_ext = salloc(".Z");
  else {
    if (pc_flagarg(&argc,argv,"-gzip"))
      temp_file_ext = salloc(".gz");
    else
      temp_file_ext = salloc("");
  }

  strcpy(temp_directory, "cmuclmtk-XXXXXX");
  if (mkdtemp(temp_directory) == NULL) {
     quit(-1, "Failed to create temporary folder: %s\n", strerror(errno));
  }

  pc_report_unk_args(&argc,argv,verbosity);

  if (words_set && !chars_set)
    max_chars = max_words * 7;

  if (!words_set && chars_set)
    max_words = max_chars / 7;

  pc_message(verbosity,2,"n = %d\n",n);
  pc_message(verbosity,2,"Number of words in buffer = %d\n",max_words);
  pc_message(verbosity,2,"Number of chars in buffer = %d\n",max_chars);
  pc_message(verbosity,2,"Max number of files open at once = %d\n",max_files);
  pc_message(verbosity,2,"Temporary directory = %s\n",temp_directory);

  /* Allocate memory for the buffers.  The text buffer gets one byte
     more than max_chars because, when a load fills the buffer
     completely, the terminator/space written after the loop lands at
     index max_chars.  The size arguments are cast so that they match
     the %d conversions. */

  text_buffer = (char *) rr_malloc(sizeof(char)*((size_t) max_chars+1));
  pc_message(verbosity,2,"Allocated %d bytes to text buffer.\n",
             (int) (sizeof(char)*max_chars));

  pointers = (char **) rr_malloc(sizeof(char *)*max_words);
  pc_message(verbosity,2,"Allocated %d bytes to pointer array.\n",
             (int) (sizeof(char *)*max_words));

  current_file_number = 0;

  current_word = 1;
  start_char = 0;
  current_char = 0;
  counter = 0;
  pointers[0] = text_buffer;

  while (!feof(stdin)) {

    current_file_number++;

    /* Read text into buffer */

    pc_message(verbosity,2,"Reading text into buffer...\n");

    pc_message(verbosity,2,"Reading text into the n-gram buffer...\n");
    pc_message(verbosity,2,"20,000 words processed for each \".\", 1,000,000 for each line.\n");

    pointers[0] = text_buffer;

    while ((!rr_feof(stdin)) && 
           (current_word < max_words) && 
           (current_char < max_chars)) {

      /* Newlines and tabs are treated as spaces. */
      text_buffer[current_char] = getchar();
      if (text_buffer[current_char] == '\n' || 
          text_buffer[current_char] == '\t' ) {
        text_buffer[current_char] = ' ';
      }
      if (text_buffer[current_char] == ' ') {
        if (current_char > start_char) {
          /* Collapse a run of spaces into one by backing up over the
             space that was just stored. */
          if (text_buffer[current_char-1] == ' ') {
            current_word--;
            current_char--;
          }
          pointers[current_word] = &(text_buffer[current_char+1]);
          current_word++; 
          counter++;
          if (counter % 20000 == 0) {
            if (counter % 1000000 == 0)
              pc_message(verbosity,2,"\n");
            else
              pc_message(verbosity,2,".");
          }
        }
      }

      /* Leading spaces at the start boundary are dropped. */
      if (text_buffer[current_char] != ' ' || current_char > start_char) 
        current_char++;
    }

    text_buffer[current_char]='\0';

    /* When the load ended because the word limit or end of input was
       reached, pad the unused part of the buffer with spaces. */

    if (current_word == max_words || rr_feof(stdin)) {
      for (i=current_char+1;i<=max_chars-1;i++)
        text_buffer[i] = ' ';

      text_buffer_full = 0;
    }else
      text_buffer_full = 1;

    /* Sort buffer */

    pc_message(verbosity,2,"\nSorting pointer array...\n"); 

    qsort((void *) pointers,(size_t) current_word-n,sizeof(char *),cmp_strings);

    /* Write out temporary file.  The cast makes the argument match the
       %hu conversion, leaving the format itself untouched. */

    sprintf(current_temp_filename,"%s/%hu%s",temp_directory,
            (unsigned short) current_file_number, temp_file_ext);

    pc_message(verbosity,2,"Writing out temporary file %s...\n",current_temp_filename);

    temp_file = rr_oopen(current_temp_filename);
    text_buffer[current_char] = ' ';

    current_count = 0;
    strcpy(current_ngram,"");

    for (i = 0; i <= current_word-n; i++) {
      current_string = pointers[i];

      /* Find the nth space */

      no_of_spaces = 0;
      pos_in_string = 0;
      while (no_of_spaces < n) {	
        if (current_string[pos_in_string] == ' ')
          no_of_spaces++;

        pos_in_string++;
      }

      if (!strncmp(current_string,current_ngram,pos_in_string))
        current_count++;
      else {
        if (strcmp(current_ngram,"")) {
          if (fprintf(temp_file,"%s %d\n",current_ngram,current_count) < 0) 
            quit(-1,"Error writing to temporary file %s\n",current_temp_filename);
        }

        /* The n-gram plus its terminator must fit in current_ngram. */
        if (pos_in_string >= (int) sizeof(current_ngram))
          quit(-1,"Error : n-gram is too long for the %d character n-gram buffer.\n",
               (int) sizeof(current_ngram));

        current_count = 1;
        strncpy(current_ngram,current_string,pos_in_string);
        current_ngram[pos_in_string] = '\0';
      }
    }

    rr_oclose(temp_file);

    /* Move the last n-1 words to the beginning of the buffer, and set
       correct current_word and current_char things.  The tail runs from
       the first carried-over word up to and including the space stored
       at current_char.  Source and destination may overlap and the tail
       is no longer NUL-terminated, so it is moved by explicit length. */

    tail_len = (size_t) ((text_buffer + current_char + 1) - pointers[current_word-n]);
    memmove(text_buffer,pointers[current_word-n],tail_len);
    pointers[0]=text_buffer;

    /* Find the (n-1)th space */

    no_of_spaces=0;
    pos_in_string=0;

    if (!text_buffer_full){ 
      while (no_of_spaces<(n-1)) {
        if (pointers[0][pos_in_string]==' ') {
          no_of_spaces++;
          pointers[no_of_spaces] = &pointers[0][pos_in_string+1];
        }
        pos_in_string++;
      }
    }else {
      /* The load stopped at the character limit: look for one more
         space, then step back onto it so the next read overwrites it. */
      while (no_of_spaces<n) {
        if (pointers[0][pos_in_string]==' ') {
          no_of_spaces++;
          pointers[no_of_spaces] = &pointers[0][pos_in_string+1];
        }
        pos_in_string++;
      }
      pos_in_string--;
    }

    current_char = pos_in_string;
    current_word = n;
    /* mark boundary beyond which counting pass cannot backup */
    start_char = current_char;

  }
  /* Merge temporary files */

  pc_message(verbosity,2,"Merging temporary files...\n");

  merge_tempfiles(1,
                  current_file_number,
                  temp_directory,
                  temp_file_ext,
                  max_files,
                  stdout,
                  n,
                  verbosity); 

  rmdir(temp_directory);
  pc_message(verbosity,0,"text2wngram : Done.\n");

  return 0;
}
Exemple #7
0
/*
 * Write `count` already-sorted OOV records from `buffer` to a new file at
 * `path`, collapsing each run of identical n-grams into a single record.
 *
 * Every record is written as `n_words` word ids followed by one int count,
 * the same layout main() uses for the non-OOV temporary file.
 *
 * buffer    - records to write; buffer[0 .. count-1] must be valid and sorted
 * count     - number of valid records (must be >= 1)
 * n_words   - number of word ids per n-gram
 * run_ngram - caller-supplied scratch space for n_words ids
 * path      - name of the file to create
 */
static void wngram2idngram_write_oov_records(ngram_rec *buffer,
                                             int count,
                                             int n_words,
                                             wordid_t *run_ngram,
                                             char *path) {
  FILE *fp;
  int run_count;
  int i;
  int j;
  flag same_ngram;

  fp = rr_fopen(path,"w");

  /* The first record opens the first run ... */
  for (j=0;j<n_words;j++)
    run_ngram[j] = buffer[0].word[j];
  run_count = buffer[0].count;

  /* ... so accumulation starts at record 1; starting at 0 would add
     buffer[0].count to the run a second time. */
  for (i=1;i<count;i++) {

    same_ngram = 1;
    for (j=n_words-1;j>=0;j--) {
      if (buffer[i].word[j] != run_ngram[j]) {
        same_ngram = 0;
        break;
      }
    }

    if (same_ngram)
      run_count += buffer[i].count;
    else {
      for (j=0;j<n_words;j++) {
        rr_fwrite((char*)&run_ngram[j],sizeof(wordid_t),1,
                  fp,"temporary n-gram ids");
        run_ngram[j] = buffer[i].word[j];
      }
      rr_fwrite((char*)&run_count,sizeof(int),1,fp,
                "temporary n-gram counts");
      run_count = buffer[i].count;
    }
  }

  /* Flush the run that was still open when the records ran out. */
  for (j=0;j<n_words;j++)
    rr_fwrite((char*)&run_ngram[j],sizeof(wordid_t),1,
              fp,"temporary n-gram ids");
  rr_fwrite((char*)&run_count,sizeof(int),1,fp,
            "temporary n-gram counts");

  fclose(fp);
}

/*
 * wngram2idngram entry point.
 *
 * Reads a vocabulary file, then reads word n-grams with counts from stdin
 * and converts every word to its vocabulary id (0 when index2() does not
 * find it).  N-grams without any 0 id are streamed directly to temporary
 * file 1; n-grams containing a 0 id are collected in an in-memory buffer
 * that is sorted and spilled to further numbered temporary files.  All
 * temporary files are finally merged into the -idngram output file by
 * merge_idngramfiles().
 *
 * Returns 0 on completion; errors are reported through quit().
 */
int main(int argc, char *argv[]) {

  int verbosity;
  int vocab_size;
  FILE *vocab_file;
  int buffer_size;
  flag write_ascii;
  int max_files;
  int number_of_tempfiles;
  char *vocab_filename;
  char *idngram_filename;
  char temp_word[MAX_WORD_LENGTH];
  char temp_word2[MAX_WORD_LENGTH];
  char temp_word3[MAX_WORD_LENGTH];
  flag contains_unks;
  int position_in_buffer;   /* number of valid records held in buffer */
  FILE *outfile;
  FILE *non_unk_fp;
  ngram_rec *buffer;
  int i;
  int fof_size;
  int size_of_rec;

  char temp_directory[1000];
  char *temp_file_ext;

  /* Vocab hash table things */

  struct idngram_hash_table vocabulary;
  unsigned long hash_size;
  unsigned long M;

  wordid_t *current_ngram;
  int current_count;
  wordid_t *sort_ngram;     /* scratch n-gram used while writing runs */

  /* Process command line */

  report_version(&argc,argv);

  if (argc == 1 || pc_flagarg(&argc, argv,"-help")) {
    /* Display help message */
    help_message();
    exit(1);
  }


  n = pc_intarg( &argc, argv, "-n",DEFAULT_N);
  hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE);
  buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM);
  write_ascii = pc_flagarg(&argc,argv,"-write_ascii");
  verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);
  max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES);
  fof_size = pc_intarg(&argc,argv,"-fof_size",10);
  vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" ));
  idngram_filename = salloc(pc_stringarg( &argc, argv, "-idngram", "" ));

  if (!strcmp("",vocab_filename))
    quit(-1,"Error : Must specify a vocabulary file.\n");

  if (!strcmp("",idngram_filename))
    quit(-1,"text2idngram : Error : Must specify idngram file.\n");

  if (pc_flagarg(&argc,argv,"-compress"))
    temp_file_ext = salloc(".Z");
  else {
    if (pc_flagarg(&argc,argv,"-gzip"))
      temp_file_ext = salloc(".gz");
    else
      temp_file_ext = salloc("");
  }

  strcpy(temp_directory, "cmuclmtk-XXXXXX");
  if (mkdtemp(temp_directory) == NULL) {
     quit(-1, "Failed to create temporary folder: %s\n", strerror(errno));
  }

  pc_report_unk_args(&argc,argv,verbosity);

  outfile = rr_fopen(idngram_filename,"wb");

  pc_message(verbosity,2,"Vocab           : %s\n",vocab_filename);
  pc_message(verbosity,2,"Output idngram  : %s\n",idngram_filename);
  pc_message(verbosity,2,"Buffer size     : %d\n",buffer_size);
  /* hash_size is an unsigned long, so it needs %lu rather than %d. */
  pc_message(verbosity,2,"Hash table size : %lu\n",hash_size);
  pc_message(verbosity,2,"Max open files  : %d\n",max_files);
  pc_message(verbosity,2,"n               : %d\n",n);
  pc_message(verbosity,2,"FOF size               : %d\n",fof_size);

  /* Turn the -buffer value into a record count, using the id array size
     rounded up to a multiple of 16 bytes plus the record header. */
  size_of_rec = (sizeof(wordid_t) * n) + 16 - (( n* sizeof(wordid_t)) % 16);
  buffer_size *= (1000000/((sizeof(ngram_rec) + size_of_rec)));
  fprintf(stderr,"buffer size = %d\n",buffer_size);

  /* Allocate memory for hash table */

  fprintf(stderr,"Initialising hash table...\n");

  M = nearest_prime(hash_size);

  new_idngram_hashtable(&vocabulary,M);

  /* Read in the vocabulary */

  vocab_size = 0;

  vocab_file = rr_iopen(vocab_filename);

  pc_message(verbosity,2,"Reading vocabulary...\n");

  while (fgets (temp_word, sizeof(temp_word),vocab_file)) {
    if (strncmp(temp_word,"##",2)==0) continue;

    /* A line without any word leaves temp_word2 untouched; skip it rather
       than registering the previous (or an uninitialised) word again. */
    if (sscanf (temp_word, "%s ",temp_word2) != 1) continue;

    /* Check for vocabulary order */
    if (vocab_size > 0 && strcmp(temp_word2,temp_word3)<0)
      quit(-1,"wngram2idngram : Error : Vocabulary is not alphabetically ordered.\n");

    /* Check for repeated words in the vocabulary */

    if (index2(&vocabulary,temp_word2) != 0)
      warn_on_repeated_words(temp_word);

    warn_on_wrong_vocab_comments(temp_word);

    vocab_size++;

    add_to_idngram_hashtable(&vocabulary,idngram_hash(temp_word2,M),temp_word2,vocab_size);
    strcpy(temp_word3,temp_word2);
  }

  rr_iclose(vocab_file);

  if (vocab_size > MAX_VOCAB_SIZE)
    quit(-1,"Error : Vocabulary size exceeds maximum.\n");

  pc_message(verbosity,2,"Allocating memory for the buffer...\n");

  buffer=(ngram_rec *) rr_malloc((buffer_size+1)*sizeof(ngram_rec));

  for (i=0;i<=buffer_size;i++)
    buffer[i].word = (wordid_t *) rr_malloc(n*sizeof(wordid_t));

  /* Open the "non-OOV" tempfile */

  sprintf(temp_word, "%s/1%s", temp_directory, temp_file_ext);

  non_unk_fp = rr_fopen(temp_word,"w");

  pc_message(verbosity,2,"Writing non-OOV counts to temporary file %s\n",
             temp_word);
  number_of_tempfiles = 1;

  current_ngram = (wordid_t *) rr_malloc(n*sizeof(wordid_t));
  sort_ngram = (wordid_t *) rr_malloc(n*sizeof(wordid_t));

  /* Read text into buffer */
  position_in_buffer = 0;

  while (!rr_feof(stdin)) {

    for (i=0;i<=n-1;i++) {
      get_word(stdin,temp_word);
      current_ngram[i]=index2(&vocabulary,temp_word);
    }
    if (scanf("%d",&current_count) != 1)
      if (!rr_feof(stdin))
        quit(-1,"Error reading n-gram count from stdin.\n");

    if (!rr_feof(stdin)) {

      contains_unks = 0;
      for (i=0;i<=n-1;i++) {
        if (!current_ngram[i])
          contains_unks = 1;
      }

      if (contains_unks) {

        /* Spill the buffer once it holds buffer_size-1 records (the same
           point at which the buffer has always been flushed).  Only the
           records actually stored take part in the sort. */
        if (position_in_buffer > 0 &&
            position_in_buffer + 1 >= buffer_size) {

          /* Sort buffer */
          pc_message(verbosity,2,
                     "Sorting n-grams which include an OOV word...\n");

          qsort((void*) buffer,(size_t) position_in_buffer,
                sizeof(ngram_rec),compare_ngrams2);

          pc_message(verbosity,2,"Done.\n");

          /* Write buffer to temporary file */

          number_of_tempfiles++;

          sprintf(temp_word,"%s/%hu%s", temp_directory,
                  number_of_tempfiles,temp_file_ext);

          pc_message(verbosity,2,
                     "Writing sorted OOV-counts buffer to temporary file %s\n",
                     temp_word);

          wngram2idngram_write_oov_records(buffer,position_in_buffer,n,
                                           sort_ngram,temp_word);

          position_in_buffer = 0;
        }

        /* Write to buffer */
        for (i=0;i<=n-1;i++)
          buffer[position_in_buffer].word[i] = current_ngram[i];

        buffer[position_in_buffer].count = current_count;
        position_in_buffer++;

      }else {
        /* Write to temporary file */
        for (i=0;i<=n-1;i++)
          rr_fwrite((char*)&current_ngram[i],sizeof(wordid_t),1,
                    non_unk_fp,"temporary n-gram ids");

        rr_fwrite((char*)&current_count,sizeof(int),1,non_unk_fp,
                  "temporary n-gram counts");
      }
    }
  }

  if (position_in_buffer > 0) {

    /* Only do this bit if we have actually seen some OOVs */
    /* Sort final buffer */
    pc_message(verbosity,2,"Sorting final buffer...\n");

    qsort((void*) buffer,(size_t) position_in_buffer,
          sizeof(ngram_rec),compare_ngrams2);

    /* Write final buffer */

    number_of_tempfiles++;

    sprintf(temp_word,"%s/%hu%s", temp_directory,
            number_of_tempfiles,temp_file_ext);

    pc_message(verbosity,2,"Writing sorted buffer to temporary file %s\n", temp_word);

    wngram2idngram_write_oov_records(buffer,position_in_buffer,n,
                                     sort_ngram,temp_word);
  }


  /* Merge the temporary files, and output the result */
  fclose(non_unk_fp);
  pc_message(verbosity,2,"Merging temporary files...\n");
  merge_idngramfiles(1,
                     number_of_tempfiles,
                     temp_directory,
                     temp_file_ext,
                     max_files,
                     outfile,
                     write_ascii,
                     fof_size,
                     n);

  fclose(outfile);

  rmdir(temp_directory);
  pc_message(verbosity,0,"wngram2idngram : Done.\n");

  return 0;
}
Exemple #8
0
/*
 * ngram2mgram entry point.
 *
 * Reads a sorted stream of n-grams with counts from stdin (binary ids,
 * ASCII ids, or words, selected by -binary / -ascii / -words) and writes to
 * stdout the corresponding m-grams (the first m elements of each n-gram,
 * m < n), summing the counts of consecutive n-grams that share the same
 * m-gram prefix.  The output uses the same representation as the input.
 *
 * Returns 0 on completion; errors are reported through quit().
 */
int main(int argc, char *argv[]) {

  int verbosity;
  int n;
  int m;
  int i;
  int input_type;
  int storage_type;
  unsigned short *current_ngram_int;
  unsigned short *previous_ngram_int;
  char **current_ngram_text;
  char **previous_ngram_text;
  int current_count;
  int running_total;
  flag same;
  flag first_one;
  flag got_to_eof;

  running_total = 0;

  report_version(&argc,argv);

  if (pc_flagarg( &argc, argv,"-help") || argc==1) {
    fprintf(stderr,"ngram2mgram - Convert an n-gram file to an m-gram file, where m<n\n");
    fprintf(stderr,"Usage : ngram2mgram   -n N -m M\n");
    fprintf(stderr,"                    [ -binary | -ascii | -words ]\n");
    fprintf(stderr,"                    < .ngram > .mgram\n");
    exit(1);
  }

  n = pc_intarg( &argc, argv,"-n",0);
  m = pc_intarg( &argc, argv,"-m",0);
  verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);


  input_type = 0;

  if (pc_flagarg( &argc, argv,"-binary")) {
    input_type = BINARY;
  }

  if (pc_flagarg( &argc, argv,"-ascii")) {
    if (input_type != 0) {
      quit(-1,"Error : more than one file format specified.\n");
    }
    input_type = ASCII;
  }

  if (pc_flagarg( &argc, argv,"-words")) {
    if (input_type != 0) {
      quit(-1,"Error : more than one file format specified.\n");
    }
    input_type = WORDS;
  }

  if (input_type == 0) {
    pc_message(verbosity,2,"Warning : no input type specified. Defaulting to binary.\n");
    input_type = BINARY;
  }

  if (n == 0) {
    quit(-1,"Must specify a value for n. Use the -n switch.\n");
  }

  if (m == 0) {
    quit(-1,"Must specify a value for m. Use the -m switch.\n");
  }

  if (n<=m) {
    quit(-1,"n must be greater than m.\n");
  }

  pc_report_unk_args(&argc,argv,verbosity);

  if (input_type == BINARY || input_type == ASCII) {
    storage_type = NUMERIC;
  }
  else {
    storage_type = ALPHA;
  }

  if (storage_type == NUMERIC) {
    /* NOTE(review): these arrays hold unsigned short elements but the
       BINARY path below transfers sizeof(id__t) bytes per element;
       confirm that id__t is an unsigned short in this build. */
    current_ngram_int = (unsigned short *)
      rr_malloc(n*sizeof(unsigned short));
    previous_ngram_int = (unsigned short *)
      rr_malloc(n*sizeof(unsigned short));

    /* And to prevent compiler warnings ... */

    current_ngram_text = NULL;
    previous_ngram_text = NULL;
  }
  else {
    current_ngram_text = (char **) rr_malloc(n*sizeof(char *));
    previous_ngram_text = (char **) rr_malloc(n*sizeof(char *));
    for (i=0;i<=n-1;i++) {
      current_ngram_text[i] = (char *) rr_malloc(MAX_WORD_LENGTH*sizeof(char));
      previous_ngram_text[i] = (char *) rr_malloc(MAX_WORD_LENGTH*sizeof(char));
    }

    /* And to prevent compiler warnings ... */

    current_ngram_int = NULL;
    previous_ngram_int = NULL;

  }

  got_to_eof = 0;
  first_one = 1;

  while (!rr_feof(stdin)) {

    /* Store previous n-gram */

    if (!first_one) {

      if (storage_type == NUMERIC) {
        for (i=0;i<=n-1;i++) {
          previous_ngram_int[i] = current_ngram_int[i];
        }
      }
      else {
        for (i=0;i<=n-1;i++) {
          strcpy(previous_ngram_text[i],current_ngram_text[i]);
        }
      }

    }

    /* Read new n-gram */

    switch(input_type) {
    case BINARY:
      for (i=0;i<=n-1;i++) {
        rr_fread(&current_ngram_int[i],sizeof(id__t),1,stdin,
                 "from id_ngrams at stdin",0);
      }
      rr_fread(&current_count,sizeof(count_t),1,stdin,
               "from id_ngrams file at stdin",0);
      break;
    case ASCII:
      for (i=0;i<=n-1;i++) {
        if (fscanf(stdin,"%hu",&current_ngram_int[i]) != 1) {
          if (!rr_feof(stdin)) {
            quit(-1,"Error reading id_ngram.\n");
          }
          else {
            got_to_eof = 1;
          }
        }
      }
      if (fscanf(stdin,"%d",&current_count) != 1) {
        if (!rr_feof(stdin)) {
          quit(-1,"Error reading id_ngram.\n");
        }
        else {
          got_to_eof = 1;
        }
      }
      break;
    case WORDS:
      for (i=0;i<=n-1;i++) {
        if (fscanf(stdin,"%s",current_ngram_text[i]) != 1) {
          if (!rr_feof(stdin)) {
            quit(-1,"Error reading id_ngram.\n");
          }
          else {
            got_to_eof = 1;
          }
        }
      }
      if (fscanf(stdin,"%d",&current_count) != 1) {
        if (!rr_feof(stdin)) {
          quit(-1,"Error reading id_ngram.\n");
        }
        else {
          got_to_eof = 1;
        }
      }
      break;
    }

    if (!got_to_eof) {

      /* Check for correct sorting */

      if (!first_one) {

        switch(storage_type) {
        case NUMERIC:
          for (i=0;i<=n-1;i++) {
            if (current_ngram_int[i]<previous_ngram_int[i]) {
              quit(-1,"Error : ngrams not correctly sorted.\n");
            }
            else {
              if (current_ngram_int[i]>previous_ngram_int[i]) {
                i=n;   /* strictly greater at this position: stop comparing */
              }
            }
          }
          break;
        case ALPHA:
          for (i=0;i<=n-1;i++) {
            if (strcmp(current_ngram_text[i],previous_ngram_text[i])<0) {
              quit(-1,"Error : ngrams not correctly sorted.\n");
            }
            else {
              if (strcmp(current_ngram_text[i],previous_ngram_text[i])>0) {
                i=n;   /* strictly greater at this position: stop comparing */
              }
            }
          }
          break;
        }
      }

      /* Compare this m-gram with previous m-gram */

      if (!first_one) {

        switch(storage_type) {
        case NUMERIC:
          same = 1;
          for (i=0;i<=m-1;i++) {
            if (current_ngram_int[i] != previous_ngram_int[i]) {
              same = 0;
            }
          }
          if (same) {
            running_total += current_count;
          }
          else {
            if (input_type == ASCII) {
              for (i=0;i<=m-1;i++) {
                printf("%d ",previous_ngram_int[i]);
              }
              printf("%d\n",running_total);
            }
            else {
              for (i=0;i<=m-1;i++) {
                rr_fwrite(&previous_ngram_int[i],sizeof(id__t),1,stdout,
                          "to id_ngrams at stdout");
              }
              rr_fwrite(&running_total,sizeof(count_t),1,stdout,
                        "to id n-grams at stdout");
            }
            running_total = current_count;
          }
          break;
        case ALPHA:
          same = 1;
          for (i=0;i<=m-1;i++) {
            if (strcmp(current_ngram_text[i],previous_ngram_text[i])) {
              same = 0;
            }
          }
          if (same) {
            running_total += current_count;
          }
          else {
            for (i=0;i<=m-1;i++) {
              printf("%s ",previous_ngram_text[i]);
            }
            printf("%d\n",running_total);
            running_total = current_count;

          }
          break;
        }

      }
      else {
        running_total = current_count;
      }

      first_one = 0;

    }
  }

  /* Write out final m-gram.

     Nothing is written when no n-gram was ever read (first_one still set).
     Otherwise the last complete n-gram is:
       - previous_ngram_* when the last loop pass ran into end of input
         (it copied the last good n-gram there before the failed read may
         have partly overwritten current_ngram_*);
       - current_ngram_* when the loop stopped at its rr_feof() test right
         after a successful read, which is always the case for BINARY. */

  if (!first_one) {

    if (storage_type == NUMERIC) {
      unsigned short *last_ngram_int;

      last_ngram_int = got_to_eof ? previous_ngram_int : current_ngram_int;

      if (input_type == ASCII) {
        for (i=0;i<=m-1;i++) {
          printf("%d ",last_ngram_int[i]);
        }
        printf("%d\n",running_total);
      }
      else {
        for (i=0;i<=m-1;i++) {
          rr_fwrite(&last_ngram_int[i],sizeof(id__t),1,stdout,
                    "to id_ngrams at stdout");
        }
        rr_fwrite(&running_total,sizeof(count_t),1,stdout,
                  "to id n-grams at stdout");
      }
    }
    else {
      char **last_ngram_text;

      last_ngram_text = got_to_eof ? previous_ngram_text : current_ngram_text;

      for (i=0;i<=m-1;i++) {
        printf("%s ",last_ngram_text[i]);
      }
      printf("%d\n",running_total);
    }
  }

  pc_message(verbosity,0,"ngram2mgram : Done.\n");

  return 0;

}
Exemple #9
0
/*
 * Merge the binary id n-gram temporary files numbered start_file..end_file
 * (named "<temp_file_root>/<number><temp_file_ext>") into outfile, summing
 * the counts of n-grams that occur in more than one file.  Input files are
 * removed once merged.
 *
 * start_file, end_file - inclusive range of temporary file numbers
 * temp_file_root       - directory holding the temporary files
 * temp_file_ext        - extension appended to every temporary file name
 * max_files            - maximum number of files merged in one pass
 * outfile              - destination stream
 * write_ascii          - non-zero: write text ("id ... count" per line);
 *                        zero: write binary ids followed by the count
 * fof_size             - when > 0 (and n > 1) frequency-of-frequency
 *                        statistics are collected and passed to
 *                        display_fof_array() on stderr
 * n_order              - n-gram order; also stored in the file-scope n
 *
 * When the range holds more than max_files files, the first max_files are
 * merged into a new temporary file numbered end_file+1 and the remaining
 * files together with that new file are merged recursively.
 */
void merge_idngramfiles (int start_file,
                         int end_file,
                         char *temp_file_root,
                         char *temp_file_ext,
                         int max_files,
                         FILE *outfile,
                         flag write_ascii,
                         int fof_size,
                         int n_order) {
  FILE *new_temp_file;
  char temp_string[1000];

  FILE **temp_file;
  char **temp_filename;
  wordid_t **current_ngram;
  wordid_t *smallest_ngram;
  wordid_t *previous_ngram;

  int *current_ngram_count;
  flag *finished;
  flag all_finished;
  int temp_count;
  int n_files;
  int i,j;
  flag first_ngram;
  fof_t **fof_array;
  ngram_sz_t *num_kgrams;
  int *ng_count;
  int pos_of_novelty;

  n = n_order;
  n_files = end_file-start_file+1;

  if (n_files > max_files) {

    /* With fewer than two files per pass the file count never shrinks. */
    if (max_files < 2)
      quit(-1,"Error : Cannot merge temporary files with fewer than two open files.\n");

    sprintf(temp_string,"%s/%hu%s",temp_file_root,
            end_file+1,temp_file_ext);
    new_temp_file = rr_oopen(temp_string);

    /* The intermediate file is read back by the second call, so it must be
       binary and needs no statistics.
       NOTE(review): binary counts are written with sizeof(count_t) but read
       back with sizeof(int); confirm the two widths agree. */
    merge_idngramfiles(start_file,start_file+max_files-1,
                       temp_file_root,temp_file_ext,max_files,
                       new_temp_file,0,0,n_order);

    /* Close (and thereby flush) it before it is opened for reading. */
    rr_oclose(new_temp_file);

    /* This call sees every n-gram in order, so it produces the requested
       output format and the frequency-of-frequency statistics. */
    merge_idngramfiles(start_file+max_files,end_file+1,
                       temp_file_root,temp_file_ext,max_files,
                       outfile,write_ascii,fof_size,n_order);
    return;
  }

  pos_of_novelty = n; /* Simply for warning-free compilation */
  temp_count = 0;     /* stays 0 when every input file is empty */
  num_kgrams = (ngram_sz_t *) rr_calloc(n-1,sizeof(ngram_sz_t));
  ng_count = (int *) rr_calloc(n-1,sizeof(int));
  first_ngram = 1;

  previous_ngram = (wordid_t *) rr_calloc(n,sizeof(wordid_t));
  temp_file = (FILE **) rr_malloc(sizeof(FILE *) * n_files);
  temp_filename = (char **) rr_malloc(sizeof(char *) * n_files);

  /* should change to 2d array*/
  current_ngram = (wordid_t **) rr_malloc(sizeof(wordid_t *) * n_files);
  for (i=0;i<n_files;i++)
    current_ngram[i] = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);

  current_ngram_count = (int *) rr_malloc(sizeof(int)*n_files);

  finished = (flag *) rr_malloc(sizeof(flag)*n_files);
  smallest_ngram = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);

  /* should change to 2d array*/
  fof_array = (fof_t **) rr_malloc(sizeof(fof_t *)*(n-1));
  for (i=0;i<=n-2;i++)
    fof_array[i] = (fof_t *) rr_calloc(fof_size+1,sizeof(fof_t));

  /* Open all the temp files for reading */
  for (i=0;i<n_files;i++) {
    sprintf(temp_string,"%s/%hu%s",temp_file_root,
            i+start_file,temp_file_ext);
    temp_filename[i] = salloc(temp_string);
    temp_file[i] = rr_iopen(temp_filename[i]);
  }

  /* Now go through the files simultaneously, and write out the appropriate
     ngram counts to the output file. */

  /* Prime every stream with its first n-gram.  A file that is empty from
     the start has nothing to contribute and is marked finished so that its
     uninitialised n-gram slot is never compared. */
  all_finished = 1;
  for (i=n_files-1;i>=0;i--) {
    if (!rr_feof(temp_file[i])) {
      finished[i] = 0;
      all_finished = 0;
      for (j=0;j<=n-1;j++) {
        rr_fread((char*) &current_ngram[i][j], sizeof(wordid_t),1,
                 temp_file[i],"temporary n-gram ids",0);
      }
      rr_fread((char*) &current_ngram_count[i], sizeof(int),1,
               temp_file[i],"temporary n-gram counts",0);
    }else {
      finished[i] = 1;
    }
  }

  while (!all_finished) {

    /* Find the smallest current ngram */
    for (i=0;i<=n-1;i++)
      smallest_ngram[i] = MAX_WORDID;

    for (i=0;i<n_files;i++) {
      if (!finished[i]) {
        if (compare_ngrams3(smallest_ngram,current_ngram[i]) < 0) {
          for (j=0;j<n;j++)
            smallest_ngram[j] = current_ngram[i][j];
        }
      }
    }

#if MAX_VOCAB_SIZE < 65535
    /* This check is well-meaning but completely useless since
       smallest_ngram[i] by definition cannot contain any value
       greater than MAX_VOCAB_SIZE (dhuggins@cs, 2006-03) */
    for (i=0;i<=n-1;i++) {
      if (smallest_ngram[i] > MAX_VOCAB_SIZE) {
        quit(-1,"Error : Temporary files corrupted, invalid n-gram found.\n");
      }
    }
#endif

    /* For each of the files that are currently holding this ngram,
       add its count to the temporary count, and read in a new ngram
       from the files. */

    temp_count = 0;

    for (i=0;i<n_files;i++) {
      if (!finished[i]) {
        if (compare_ngrams3(smallest_ngram,current_ngram[i]) == 0) {
          temp_count = temp_count + current_ngram_count[i];
          if (!rr_feof(temp_file[i])) {
            for (j=0;j<=n-1;j++) {
              rr_fread((char*) &current_ngram[i][j],sizeof(wordid_t),1,
                       temp_file[i],"temporary n-gram ids",0);
            }
            rr_fread((char*)&current_ngram_count[i],sizeof(int),1,
                     temp_file[i],"temporary n-gram count",0);
          }else {
            finished[i] = 1;
            all_finished = 1;
            for (j=0;j<n_files;j++) {
              if (!finished[j])
                all_finished = 0;
            }
          }
        }
      }
    }

    if (write_ascii) {
      for (i=0;i<=n-1;i++) {

        if (fprintf(outfile,"%d ",smallest_ngram[i]) < 0)
          {
            quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");
          }
      }
      if (fprintf(outfile,"%d\n",temp_count) < 0)
        quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");

    }else {
      for (i=0;i<=n-1;i++) {
        rr_fwrite((char*)&smallest_ngram[i],sizeof(wordid_t),1,
                  outfile,"n-gram ids");
      }
      rr_fwrite((char*)&temp_count,sizeof(count_t),1,outfile,"n-gram counts");
    }

    if (fof_size > 0 && n>1) { /* Add stuff to fof arrays */

      /* Code from idngram2stats */
      /* First position at which this n-gram exceeds the previous one. */
      pos_of_novelty = n;
      for (i=0;i<=n-1;i++) {
        if (smallest_ngram[i] > previous_ngram[i]) {
          pos_of_novelty = i;
          i=n;
        }
      }

      /* Add new N-gram */

      num_kgrams[n-2]++;
      if (temp_count <= fof_size)
        fof_array[n-2][temp_count]++;

      if (!first_ngram) {
        for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
          num_kgrams[i-1]++;
          if (ng_count[i-1] <= fof_size) {
            fof_array[i-1][ng_count[i-1]]++;
          }
          ng_count[i-1] = temp_count;
        }
      }else {
        for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
          ng_count[i-1] = temp_count;
        }
        first_ngram = 0;
      }

      for (i=0;i<=pos_of_novelty-2;i++)
        ng_count[i] += temp_count;

      for (i=0;i<=n-1;i++)
        previous_ngram[i]=smallest_ngram[i];

    }
  }

  /* The streams came from rr_iopen(), so they are closed with its
     counterpart rr_iclose() rather than a bare fclose(). */
  for (i=0;i<n_files;i++) {
    rr_iclose(temp_file[i]);
    remove(temp_filename[i]);
  }

  if (fof_size > 0 && n>1) { /* Display fof arrays */

    /* Process last ngram */
    for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
      num_kgrams[i-1]++;
      if (ng_count[i-1] <= fof_size)
        fof_array[i-1][ng_count[i-1]]++;

      ng_count[i-1] = temp_count;
    }

    for (i=0;i<=pos_of_novelty-2;i++)
      ng_count[i] += temp_count;

    display_fof_array(num_kgrams,fof_array,fof_size,stderr, n);

  }

  /* Release the working storage obtained from rr_malloc()/rr_calloc().
     The salloc()'d file names are left alone, as before, because how
     salloc() allocates is not visible from here. */
  for (i=0;i<=n-2;i++)
    free(fof_array[i]);
  free(fof_array);
  for (i=0;i<n_files;i++)
    free(current_ngram[i]);
  free(current_ngram);
  free(current_ngram_count);
  free(finished);
  free(smallest_ngram);
  free(previous_ngram);
  free(temp_file);
  free(temp_filename);
  free(num_kgrams);
  free(ng_count);

}
Exemple #10
0
void merge_tempfiles (int start_file, 
		      int end_file, 
		      char *temp_file_root,
		      char *temp_file_ext,
		      int max_files,
		      FILE *outfile,
		      int n,
		      int verbosity) {

  FILE *new_temp_file;
  char *new_temp_filename;
  
  FILE **temp_file;
  char **temp_filename;
  char **current_ngram;
  char smallest_ngram[1000];
  int *current_ngram_count;
  flag *finished;
  flag all_finished;
  int temp_count;
  char temp_word[500];
  int i,j;
  
  pc_message(verbosity,2,"Merging temp files %d through %d...\n", start_file,
 	  end_file);
   /*
    * If we try to do more than max_files, then merge into groups,
    * then merge groups recursively.
    */
    if (end_file-start_file+1 > max_files) {
       int new_start_file, new_end_file;
       int n_file_groups = 1 + (end_file-start_file)/max_files;
 
       fprintf(stderr, "%d files to do, in %d groups\n", end_file-start_file,
 	      n_file_groups);
 
       new_temp_filename = (char *) rr_malloc(300*sizeof(char));
 
       /*
        * These n_file_groups sets of files will be done in groups of
        * max_files batches each, as temp files numbered
        * end_file+1 ... end_file+n_file_groups,
        * and then these will be merged into the final result.
        */
 
       for (i = 0; i < n_file_groups; i++) {
 	  /* do files i*max_files through min((i+1)*max_files-1,end_file); */
 	  new_start_file = start_file + (i*max_files);
 	  new_end_file = start_file + ((i+1)*max_files) - 1;
 	  if (new_end_file > end_file) new_end_file = end_file;
 	  
 	  sprintf(new_temp_filename,
 		  "%s/%hu%s",
 		  temp_file_root,
 		  end_file+i+1,
 		  temp_file_ext);
 
 	  new_temp_file = rr_oopen(new_temp_filename);
 
 	  merge_tempfiles(new_start_file,
 			  new_end_file,
 			  temp_file_root,
			  temp_file_ext,
 			  max_files,
 			  new_temp_file,
 			  n,
			  verbosity);
 
 	  rr_iclose(new_temp_file);
 
       }
 
       merge_tempfiles(end_file+1,
		       end_file+n_file_groups,
		       temp_file_root,
		       temp_file_ext,
		       max_files,
		       outfile,
		       n,
		       verbosity);
 
       return;
    }
    
   /*
    * We know we are now doing <= max_files.
    */
 
   temp_file = (FILE **) rr_malloc((end_file+1)*sizeof(FILE *));
   temp_filename = (char **) rr_malloc((end_file+1)*sizeof(char *));
   for (i=start_file;i<=end_file;i++) {
     temp_filename[i] = (char *) rr_malloc(300*sizeof(char));
   }
   current_ngram = (char **) rr_malloc((end_file+1)*sizeof(char *));
   for (i=start_file;i<=end_file;i++) {
     current_ngram[i] = (char *) rr_malloc(1000*sizeof(char));
   }
   current_ngram_count = (int *) rr_malloc((end_file+1)*sizeof(int));
   finished = (flag *) rr_malloc(sizeof(flag)*(end_file+1));
  
   /* Open all the temp files for reading */
   for (i=start_file;i<=end_file;i++) {
     sprintf(temp_filename[i],"%s/%hu%s",
	     temp_file_root,i,temp_file_ext);
     temp_file[i] = rr_iopen(temp_filename[i]);
   }
 
   /* Now go through the files simultaneously, and write out the appropriate
      ngram counts to the output file. */
 
   for (i=start_file;i<=end_file;i++) {
     finished[i] = 0;
     if (!rr_feof(temp_file[i])) {
       for (j=0;j<=n-1;j++) {
 	if (fscanf(temp_file[i],"%s",temp_word) != 1) {
 	  if (!rr_feof(temp_file[i]))
 	    quit(-1,"Error reading temp file %s\n",temp_filename[i]);
 	}else {
 	  if (j==0)
 	    strcpy(current_ngram[i],temp_word);
  	  else {
 	    strcat(current_ngram[i]," ");
 	    strcat(current_ngram[i],temp_word);
  	  }
  	}
       }
       if (fscanf(temp_file[i],"%d",&current_ngram_count[i]) != 1) {
	 if (!rr_feof(temp_file[i]))
	   quit(-1,"Error reading temp file %s\n",temp_filename[i]);
       }
     }
   }
   
   all_finished = 0;
   
   while (!all_finished) {
  
     /* Find the smallest current ngram */
 
     strcpy(smallest_ngram,"");
 
     for (i=start_file;i<=end_file;i++) {
       if (!finished[i]) {
	 if (strcmp(smallest_ngram,current_ngram[i]) > 0 ||
	     (smallest_ngram[0] == '\0'))
	   strcpy(smallest_ngram,current_ngram[i]);
       }
     }
     
     /* For each of the files that are currently holding this ngram,
        add its count to the temporary count, and read in a new ngram
        from the files. */
  
     temp_count = 0;
 
     for (i=start_file;i<=end_file;i++) {
       if (!finished[i]) {
 	if (!strcmp(smallest_ngram,current_ngram[i])) {
 	  temp_count += current_ngram_count[i];
 	  if (!rr_feof(temp_file[i])) {
 	    for (j=0;j<=n-1;j++) {
 	      if (fscanf(temp_file[i],"%s",temp_word) != 1) {
 		if (!rr_feof(temp_file[i])) {
 		  quit(-1,"Error reading temp file %s\n",temp_filename[i]);
 		}
 	      }else {
 		if (j==0)
 		  strcpy(current_ngram[i],temp_word);
  		else {
 		  strcat(current_ngram[i]," ");
 		  strcat(current_ngram[i],temp_word);
  		}
  	      }
 	    }
 	    if (fscanf(temp_file[i],"%d",&current_ngram_count[i]) != 1) {
 	      if (!rr_feof(temp_file[i])) {
 		quit(-1,"Error reading temp file count %s\n",
 		     temp_filename[i]);
  	      }
  	    }
 	  }
 
 	  /*
 	   * PWP: Note that the fscanf may have changed the state of
 	   * temp_file[i], so we re-ask the question rather than just
 	   * doing an "else".
 	   */
 	  if (rr_feof(temp_file[i])) {
 	    finished[i] = 1;
 	    all_finished = 1;
 	    for (j=start_file;j<=end_file;j++) {
 	      if (!finished[j]) {
 		all_finished = 0;
  	      }
  	    }
  	  }
  	}
        }
      }
 
     /*
      * PWP: We cannot conditionalize this on (!all_finished) because
      * if we do we may have lost the very last count.  (Consider the
      * case when several files have ran out of data, but the last
      * couple have the last count in them.)
      */
     if (fprintf(outfile,"%s %d\n",smallest_ngram,temp_count) < 0) {
       quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");
     }
   }
 
   for (i=start_file;i<=end_file;i++) {
     rr_iclose(temp_file[i]);
     remove(temp_filename[i]);
   }
    
   free(temp_file);
   for (i=start_file;i<=end_file;i++) {
      free(temp_filename[i]);
    }
   free(temp_filename);  

   for (i=start_file;i<=end_file;i++) {
      free(current_ngram[i]);
   }
   free(current_ngram);

  free(current_ngram_count);
  free(finished);
}
Exemple #11
0
/*
 * Write one run of identical n-grams to a temporary file: the n word ids
 * of ngram followed by the int occurrence count.
 */
static void txt2ngram_write_run(FILE *fp,
                                wordid_t *ngram,
                                unsigned int n,
                                int count) {
  unsigned int j;

  for (j=0;j<n;j++)
    rr_fwrite((char*) &ngram[j],sizeof(wordid_t),1,
              fp,"temporary n-gram ids");

  rr_fwrite((char*)&count,sizeof(int),1,fp,
            "temporary n-gram counts");
}

/*
  Read words from infp, convert them to vocabulary ids and build id n-grams
  in buffer.  Whenever the buffer is full (or the input ends) the buffered
  n-grams are sorted and written, with runs of identical n-grams collapsed
  into "ids + count" records, to a new binary temporary file named
  "<temp_file_root>/<number><temp_file_ext>".

  infp           - text stream to read words from
  vocabulary     - hash table used by index2() to map words to ids
  verbosity      - verbosity level passed to the progress messages
  buffer         - caller-supplied storage for the n-grams
  buffer_size    - number of n-grams sorted per temporary file; one further
                   slot is used to carry an n-gram over to the next pass
  n              - number of words per n-gram (also stored in the
                   file-scope ng)
  temp_file_root - directory for the temporary files
  temp_file_ext  - extension appended to every temporary file name
  temp_file      - not read; used as the local handle of the file being
                   written

  @return number_of_tempfiles
 */
int  read_txt2ngram_buffer(FILE* infp,
                           struct idngram_hash_table *vocabulary,
                           int32 verbosity,
                           wordid_t *buffer,
                           int buffer_size,
                           unsigned int n,
                           char* temp_file_root,
                           char* temp_file_ext,
                           FILE* temp_file
                           )
{
  /* Read text into buffer */
  char temp_word[MAX_WORD_LENGTH];
  int position_in_buffer;
  int number_of_tempfiles;
  unsigned int i,j;
  wordid_t *placeholder;
  wordid_t *temp_ngram;
  int temp_count;

  temp_ngram  = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);
  placeholder = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);

  ng=n;

  position_in_buffer = 0;
  number_of_tempfiles = 0;

  //tk: looks like things may croak if the corpus has less than n words
  //not that such a corpus would be useful anyway
  for (i=0;i<=n-1;i++) {
    get_word(infp,temp_word);
    add_to_buffer(index2(vocabulary,temp_word),0,i,buffer);
  }

  while (!rr_feof(infp)) {
    /* Fill up the buffer */
    pc_message(verbosity,2,"Reading text into the n-gram buffer...\n");
    pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");

    while ((position_in_buffer<buffer_size) && (!rr_feof(infp))) {
      position_in_buffer++;
      show_idngram_nlines(position_in_buffer,verbosity);

      /* New slot = previous slot shifted left by one word ... */
      for (i=1;i<=n-1;i++)
        add_to_buffer(buffer_contents(position_in_buffer-1,i,buffer),
                      position_in_buffer,i-1,buffer);

      /* ... plus the next word from the input. */
      if (get_word(infp,temp_word) == 1) {
        add_to_buffer(index2(vocabulary,temp_word),position_in_buffer,
                      n-1,buffer);
      }
    }

    /* The slot just past the sorted range is not part of this pass; it is
       saved here and becomes slot 0 of the next pass.
       NOTE(review): if the input ends right after a successful get_word()
       this slot holds a complete n-gram that no later pass will count -
       confirm how rr_feof() behaves at that point. */
    for (i=0;i<=n-1;i++)
      placeholder[i] = buffer_contents(position_in_buffer,i,buffer);

    /* Sort buffer */

    pc_message(verbosity,2,"\nSorting n-grams...\n");

    qsort((void*) buffer,(size_t) position_in_buffer,n*sizeof(wordid_t),compare_ngrams);

    /* Output the buffer to temporary BINARY file */
    number_of_tempfiles++;

    sprintf(temp_word,"%s/%hu%s",temp_file_root,
            number_of_tempfiles,temp_file_ext);

    pc_message(verbosity,2,"Writing sorted n-grams to temporary file %s\n",
               temp_word);

    temp_file = rr_oopen(temp_word);

    if (position_in_buffer > 0) {

      for (i=0;i<=n-1;i++) {
        temp_ngram[i] = buffer_contents(0,i,buffer);
#if MAX_VOCAB_SIZE < 65535
        /* This check is well-meaning but completely useless since
           buffer_contents() can never return something greater than
           MAX_VOCAB_SIZE (dhuggins@cs, 2006-03) */
        if (temp_ngram[i] > MAX_VOCAB_SIZE)
          quit(-1,"Invalid trigram in buffer.\nAborting");
#endif
      }
      temp_count = 1;

      /* Walk only the sorted slots 1 .. position_in_buffer-1.  The
         carry-over slot must not serve as an end marker: when it equals
         the last sorted n-gram it would be counted here as well and the
         final run would never be written. */
      for (i=1;i<(unsigned int)position_in_buffer;i++) {

        if (!compare_ngrams(temp_ngram,&buffer[i*n]))
          temp_count++;
        else {
          txt2ngram_write_run(temp_file,temp_ngram,n,temp_count);

          for (j=0;j<=n-1;j++)
            temp_ngram[j] = buffer_contents(i,j,buffer);

          temp_count = 1;
        }
      }

      /* Flush the run that was still open at the end of the sorted range. */
      txt2ngram_write_run(temp_file,temp_ngram,n,temp_count);
    }

    rr_oclose(temp_file);

    for (i=0;i<=n-1;i++)
      add_to_buffer(placeholder[i],0,i,buffer);

    position_in_buffer = 0;

  }

  free(temp_ngram);
  free(placeholder);

  return number_of_tempfiles;
}
Exemple #12
0
/*
 * Build a vocabulary from a word-frequency stream.
 *
 * Reads whitespace-separated "<word> <count>" records from ifp, selects
 * either every word whose count is strictly greater than `cutoff`
 * (the -gt mode) or the `vocab_size` most frequent words (the -top mode),
 * and writes the selected words, one per line, to ofp.
 *
 *   ifp         input stream of "<word> <count>" records
 *   ofp         output stream receiving the vocabulary words
 *   cutoff      count threshold, or -1 when the -gt option is not in use
 *   vocab_size  number of words to keep, or -1 when -top is not in use;
 *               when both are -1 the 20000 most frequent words are kept
 *   num_recs    capacity of the record table; input with more records
 *               than this is rejected via quit2()
 *   verbosity   verbosity level forwarded to pc_message()
 *
 * Returns 0.  Errors are reported through quit()/quit2().
 *
 * To make this function less dependent on the input stream, just pull
 * records out and create an interface for it.
 */
int wfreq2vocab_impl(FILE* ifp, FILE* ofp, int cutoff, int vocab_size, int num_recs, int verbosity)
{
  flag gt_set;
  flag top_set;
  int current_rec;
  int num_above_threshold;
  int num_to_output;
  int count;
  int i;
  word_rec *records;
  char temp_word[750];

  gt_set = (cutoff != -1);
  top_set = (vocab_size != -1);
  if (cutoff == -1)
    cutoff = 0;
  if (vocab_size == -1)
    vocab_size = 0;

  if (gt_set && top_set)
    quit(-1,"wfreq2vocab : Error : Can't use both the -top and the -gt options.\n");

  if (!gt_set && !top_set)
    vocab_size = 20000;

  if (gt_set)
    pc_message(verbosity,2,"wfreq2vocab : Will generate a vocabulary containing all words which\n              occurred more that %d times. Reading wfreq stream from stdin...\n",cutoff);
  else
    pc_message(verbosity,2,"wfreq2vocab : Will generate a vocabulary containing the most\n              frequent %d words. Reading wfreq stream from stdin...\n",vocab_size);

  current_rec = 0;
  num_above_threshold = 0;

  records = (word_rec *) rr_malloc(sizeof(word_rec)*num_recs);

  while (!rr_feof(ifp)) {
    /* The field width keeps an over-long token from overrunning
       temp_word (750 bytes including the terminator).  The count is
       parsed into a local so nothing is written into the table until
       the slot is known to exist. */
    if (fscanf(ifp, "%749s %d",temp_word,&count) != 2) {
      if (!rr_feof(ifp))
        quit(-1,"Error reading unigram counts from standard input.\n");

    }else {
      /* Refuse the record *before* storing it: records[] has exactly
         num_recs slots, so index num_recs would be out of bounds. */
      if (current_rec >= num_recs) {
        quit2(-1,"The number of records %d reach the user-defined limit %d, consider to increase the number of records by -records\n",current_rec+1,num_recs);
      }

      records[current_rec].count = count;
      records[current_rec].word = salloc(temp_word);
      if (gt_set && records[current_rec].count > cutoff)
        num_above_threshold++;

      current_rec++;
    }
  }

  /* Sort records in descending order of count */

  qsort((void*) records,(size_t) current_rec, sizeof(word_rec),sort_by_count);

  if (gt_set)
    num_to_output = num_above_threshold;
  else
    num_to_output = vocab_size;

  if (current_rec<num_to_output)
    num_to_output = current_rec;

  /* A negative -top value other than -1 would otherwise be converted to
     a huge size_t in the qsort() call below. */
  if (num_to_output < 0)
    num_to_output = 0;

  /* Now sort the relevant records alphabetically */

  qsort((void*) records,(size_t) num_to_output, sizeof(word_rec),sort_alpha);

  if (gt_set)
    pc_message(verbosity,2,"Size of vocabulary = %d\n",num_to_output);

  if (num_to_output>MAX_UNIGRAM) {
    pc_message(verbosity,1,"Warning : Vocab size exceeds %d. This might cause problems with \n",MAX_UNIGRAM);
    pc_message(verbosity,1,"other tools, since word id's are stored in 2 bytes.\n");
  }

  if (num_to_output == 0)
    pc_message(verbosity,1,"Warning : Vocab size = 0.\n");

  /* Print the vocab header.
     NOTE(review): the header goes to stdout while the words go to ofp;
     when ofp is not stdout the two end up in different places.  Left
     unchanged here - confirm the intended destination with callers. */

  printf("## Vocab generated by v2 of the CMU-Cambridge Statistcal\n");
  printf("## Language Modeling toolkit.\n");
  printf("##\n");
  printf("## Includes %d words ",num_to_output);
  printf("##\n");

  for (i=0;i<num_to_output;i++)
    fprintf(ofp,"%s\n",records[i].word);

  /* NOTE(review): records and the salloc()'d words are not released
     here; the matching deallocator is not visible from this function. */

  pc_message(verbosity,0,"wfreq2vocab : Done.\n");

  return 0;
}
Exemple #13
0
/*
 * idngram2lm entry point.
 *
 * Parses the command line, reads the vocabulary, allocates the n-gram
 * tables, streams the id n-gram file into them (accumulating counts and
 * frequency-of-frequency statistics), computes discounts, unigram
 * probabilities and back-off weights, and finally writes the language
 * model in ARPA and/or binary form depending on ng->write_arpa and
 * ng->write_bin.
 *
 * Returns 0 on success; exits with status 1 after printing the help
 * message.  Fatal conditions are reported through quit().
 */
int main(int argc, char **argv) {

  int i,j;
  ng_t* ng;
  int verbosity;
  int mem_alloc_method; /* Method used to decide how much memory to
                           allocate for count tables */
  int buffer_size;
  flag is_ascii;
  ngram current_ngram;
  ngram previous_ngram;
  count_t *ng_count; /* Array indicating the number of occurrances of
                        the current 1-gram, 2-gram, ... ,n-gram
                        Size depends on #define in general.h
                     */
  int nlines;
  int pos_of_novelty;
  int prev_id1;
  flag contains_unks;
  int mem_alloced;

  flag displayed_oov_warning; /** Display OOV warning
                               */

  /*  ------------------  Process command line --------------------- */

  report_version(&argc,argv);

  if (argc == 1 || pc_flagarg(&argc, argv,"-help")) {
    /* Display help message */
    help_message();
    exit(1);
  }

  verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY);

  /* Initialization */
  {
    ng=init_ng(
            &argc,
            argv,
            verbosity
            );

    mem_alloc_method = init_alloc_method(ng, &argc, argv, &buffer_size);

    /* A two-pass memory calculation needs to re-read the input, which
       is rejected when the id n-gram file name is "-". */
    if (!strcmp(ng->id_gram_filename,"-") && mem_alloc_method == TWO_PASSES)
      quit(-1,"Error: If idngram is read from stdin, then cannot use -calc_mem option.\n");

    is_ascii = set_lmformat(pc_flagarg(&argc,argv,"-ascii_input"),
                            pc_flagarg(&argc,argv,"-bin_input"),
                            ng);

    /* Report parameters */
    report_param(verbosity,ng,
                 is_ascii, mem_alloc_method, buffer_size);

    pc_report_unk_args(&argc,argv,verbosity);

  }

  /* --------------- Read in the vocabulary -------------- */
  read_vocab(ng,verbosity);

  /* --------------- Allocate space for the table_size array --------- */
  init_ng_table_size(ng,
                     mem_alloc_method,
                     is_ascii,
                     verbosity,
                     buffer_size
                     );

  /* ----------- Allocate memory for tree structure -------------- */

  ng->count = NULL;
  ng->count4 = NULL;
  ng->marg_counts = NULL;
  ng->marg_counts4 = NULL;
  ng->count_table = NULL;

  /* Per-order pointer arrays; the individual tables are filled in
     below depending on the four_byte_counts mode. */
  ng->count = (count_ind_t **) rr_malloc(sizeof(count_ind_t *)*ng->n);
  ng->count4 = (count_t **) rr_malloc(sizeof(count_t *)*ng->n);
  ng->count_table = (count_t **) rr_malloc(sizeof(count_t *)*ng->n);

  if (ng->four_byte_counts) {
    ng->marg_counts4 = (count_t *) rr_calloc(sizeof(count_t), ng->table_sizes[0]);

  }else {
    for (i=0;i<=ng->n-1;i++)
      ng->count_table[i] = (count_t *) rr_calloc(ng->count_table_size+1,
                                                 sizeof(count_t));

    ng->marg_counts = (count_ind_t *) rr_calloc(sizeof(count_ind_t),ng->table_sizes[0]);
    fprintf(stderr, "table_size %d\n",ng->table_sizes[0]);
    fflush(stderr);
  }

  ng->word_id = (id__t **) rr_malloc(sizeof(id__t *)*ng->n);

  if (ng->four_byte_alphas) {
    ng->bo_weight4 = (four_byte_t **) rr_malloc(sizeof(four_byte_t *)*ng->n);
    ng->bo_weight4[0] = (four_byte_t *) rr_malloc(sizeof(four_byte_t)*
                                                  ng->table_sizes[0]);
  }else {
    ng->bo_weight = (bo_weight_t **) rr_malloc(sizeof(bo_weight_t *)*ng->n);
    ng->bo_weight[0] = (bo_weight_t *) rr_malloc(sizeof(bo_weight_t)*
                                                 ng->table_sizes[0]);
  }

  ng->ind = (index__t **)  rr_malloc(sizeof(index__t *)*ng->n);

  /* First table */
  if (ng->four_byte_counts)
    ng->count4[0] = (count_t *) rr_calloc(ng->table_sizes[0],sizeof(count_t));
  else
    ng->count[0] = (count_ind_t *) rr_calloc(ng->table_sizes[0],sizeof(count_ind_t));

  ng->uni_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t)*
                                            ng->table_sizes[0]);
  ng->uni_log_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t)*
                                                ng->table_sizes[0]);

  if (ng->n >=2)
    ng->ind[0] = (index__t *) rr_calloc(ng->table_sizes[0],sizeof(index__t));

  /* Tables for the intermediate orders (2-grams .. (n-1)-grams) */
  for (i=1;i<=ng->n-2;i++) {
    ng->word_id[i] = (id__t *) rr_malloc(sizeof(id__t)*ng->table_sizes[i]);

    if (ng->four_byte_counts)
      ng->count4[i] = (count_t *) rr_malloc(sizeof(count_t)*ng->table_sizes[i]);
    else
      ng->count[i] = (count_ind_t *) rr_malloc(sizeof(count_ind_t)*ng->table_sizes[i]);

    if (ng->four_byte_alphas)
      ng->bo_weight4[i] = (four_byte_t *) rr_malloc(sizeof(four_byte_t)*ng->table_sizes[i]);
    else
      ng->bo_weight[i] = (bo_weight_t *) rr_malloc(sizeof(bo_weight_t)*ng->table_sizes[i]);

    ng->ind[i] = (index__t *) rr_malloc(sizeof(index__t)*ng->table_sizes[i]);

    /* Size estimate used only for the progress message below */
    mem_alloced = sizeof(count_ind_t) + sizeof(bo_weight_t) +
                  sizeof(index__t) + sizeof(id__t);

    if (ng->four_byte_alphas)
      mem_alloced += 4;

    mem_alloced *= ng->table_sizes[i];

    pc_message(verbosity,2,"Allocated %d bytes to table for %d-grams.\n",
               mem_alloced,i+1);

  }

  /* Table for the highest order: word ids and counts only */
  ng->word_id[ng->n-1] = (id__t *)
    rr_malloc(sizeof(id__t)*ng->table_sizes[ng->n-1]);

  if (ng->four_byte_counts)
    ng->count4[ng->n-1] = (count_t *) rr_malloc(sizeof(count_t)*ng->table_sizes[ng->n-1]);
  else
    ng->count[ng->n-1] = (count_ind_t *) rr_malloc(sizeof(count_ind_t)*ng->table_sizes[ng->n-1]);

  /* sizeof yields size_t; cast so the arguments match the %d
     conversions in the format string. */
  pc_message(verbosity,2,"Allocated (%d+%d) bytes to table for %d-grams.\n",
             (int) (ng->four_byte_counts?sizeof(count_t):sizeof(count_ind_t)),
             (int) (sizeof(id__t)*ng->table_sizes[ng->n-1]),ng->n);

  /* Allocate memory for table for first-byte of indices */

  ng_allocate_ptr_table(ng,NULL,0);

  /* Allocate memory for alpha array */

  ng->alpha_array = (double *) rr_malloc(sizeof(double)*ng->out_of_range_alphas);
  ng->size_of_alpha_array = 0;

  /* Allocate memory for frequency of frequency information */

  ng->freq_of_freq = (fof_t **) rr_malloc(sizeof(fof_t *)*ng->n);

  NG_DISC_METH(ng)->allocate_freq_of_freq(ng);

  /* Read n-grams into the tree */
  pc_message(verbosity,2,"Processing id n-gram file.\n");
  pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");

  /* Allocate space for ngrams id arrays */

  current_ngram.id_array = (id__t *) rr_calloc(ng->n,sizeof(id__t));
  previous_ngram.id_array = (id__t *) rr_calloc(ng->n,sizeof(id__t));
  current_ngram.n = ng->n;
  previous_ngram.n = ng->n;

  ng->num_kgrams = (ngram_sz_t *) rr_calloc(ng->n,sizeof(ngram_sz_t));
  ng_count = (count_t *) rr_calloc(ng->n,sizeof(count_t));
  nlines = 1;
  ng->n_unigrams = 0;

  /* Process first n-gram */
  get_ngram(ng->id_gram_fp,&current_ngram,is_ascii);
  contains_unks = ngram_chk_contains_unks(&current_ngram,ng->n);

  /* Skip over any unknown words.  They will come first, because <UNK>
     always has a word ID of zero. */
  while (ng->vocab_type == CLOSED_VOCAB && contains_unks){
    /* Stop looking if there are no more N-Grams.  Of course, this
       means training will fail, since there are no unigrams. */
    if (get_ngram(ng->id_gram_fp,&current_ngram,is_ascii) == 0)
      break;
    contains_unks = ngram_chk_contains_unks(&current_ngram,ng->n);
  }

  /* Seed entry 0 of every table from the first accepted n-gram */
  for (i=0;i<=ng->n-2;i++) {
    ng->ind[i][0] = new_index(0,ng->ptr_table[i],&(ng->ptr_table_size[i]),0);
    ng->word_id[i+1][0] = current_ngram.id_array[i+1];
    ng->num_kgrams[i+1]++;
    ng_count[i] = current_ngram.count;
  }

  ng_count[0] = current_ngram.count;

  NG_DISC_METH(ng)->update_freq_of_freq(ng,ng->n-1,current_ngram.count);

  store_normal_count(ng,0,current_ngram.count,ng->n-1);

  /* An n-gram at or below the cutoff gives its slot back.
     NOTE(review): when ng->n == 1 this indexes cutoffs[-1] - confirm
     whether unigram-only models can reach this point. */
  if (current_ngram.count <= ng->cutoffs[ng->n-2])
    ng->num_kgrams[ng->n-1]--;

  ngram_copy(&previous_ngram,&current_ngram,ng->n);

  prev_id1 = current_ngram.id_array[0];

  displayed_oov_warning = 0;

  while (!rr_feof(ng->id_gram_fp)) {

    if (get_ngram(ng->id_gram_fp,&current_ngram,is_ascii)) {

      if (ng->vocab_type == CLOSED_VOCAB)
        contains_unks=ngram_chk_contains_unks(&current_ngram,ng->n);

      if (!contains_unks || ng->vocab_type != CLOSED_VOCAB) {

        /* Test for where this ngram differs from last - do we have an
           out-of-order ngram? */
        pos_of_novelty = ngram_find_pos_of_novelty(&current_ngram,&previous_ngram,ng->n,nlines);

        nlines++;
        show_idngram_nlines(nlines, verbosity);

        /* Add new n-gram as soon as it is encountered */
        /* If all of the positions 2,3,...,n of the n-gram are context
           cues then ignore the n-gram. */

        if (ng->n > 1) {
          NG_DISC_METH(ng)->update_freq_of_freq(ng,ng->n-1,current_ngram.count);

          store_normal_count(ng,ng->num_kgrams[ng->n-1],current_ngram.count,ng->n-1);

          ng->word_id[ng->n-1][ng->num_kgrams[ng->n-1]] = current_ngram.id_array[ng->n-1];
          ng->num_kgrams[ng->n-1]++;

          if (ng->num_kgrams[ng->n-1] >= ng->table_sizes[ng->n-1])
            quit(-1,"\nMore than %d %d-grams needed to be stored. Rerun with a higher table size.\n",ng->table_sizes[ng->n-1],ng->n);
        }
        /* Deal with new 2,3,...,(n-1)-grams */

        for (i=ng->n-2;i>=MAX(1,pos_of_novelty);i--) {

          /* The k-gram that just ended is finalised with its
             accumulated count before the new one takes its place. */
          NG_DISC_METH(ng)->update_freq_of_freq(ng,i,ng_count[i]);

          if (ng_count[i] <= ng->cutoffs[i-1])
            ng->num_kgrams[i]--;
          else
            store_normal_count(ng,ng->num_kgrams[i]-1,ng_count[i],i);

          ng_count[i] = current_ngram.count;
          ng->word_id[i][ng->num_kgrams[i]] = current_ngram.id_array[i];
          ng->ind[i][ng->num_kgrams[i]] = new_index(ng->num_kgrams[i+1]-1,
                                                    ng->ptr_table[i],
                                                    &(ng->ptr_table_size[i]),
                                                    ng->num_kgrams[i]);
          ng->num_kgrams[i]++;

          if (ng->num_kgrams[i] >= ng->table_sizes[i])
            quit(-1,"More than %d %d-grams needed to be stored. Rerun with a higher table size.\n",ng->table_sizes[i],i+1);
        }

        /* Orders whose prefix is unchanged keep accumulating */
        for (i=0;i<=pos_of_novelty-1;i++)
          ng_count[i] += current_ngram.count;

        /* Deal with new 1-grams */

        if (pos_of_novelty == 0) {
          if (ng->n>1) {
            /* Fill the index entries for every word id from the one
               after the previous first word up to the new first word. */
            for (i = prev_id1 + 1; i <= current_ngram.id_array[0]; i++) {
              ng->ind[0][i] = new_index(ng->num_kgrams[1]-1,
                                        ng->ptr_table[0],
                                        &(ng->ptr_table_size[0]),
                                        i);
            }
            prev_id1 = current_ngram.id_array[0];
          }

          NG_DISC_METH(ng)->update_freq_of_freq(ng,0,ng_count[0]);

          if (!ng->context_cue[previous_ngram.id_array[0]]) {
            ng->n_unigrams += ng_count[0];
            store_normal_count(ng,previous_ngram.id_array[0],ng_count[0],0);
          }

          store_marginal_count(ng,previous_ngram.id_array[0],ng_count[0],0);

          ng_count[0] = current_ngram.count;
        }

        if (current_ngram.count <= ng->cutoffs[ng->n-2])
          ng->num_kgrams[ng->n-1]--;

        ngram_copy(&previous_ngram,&current_ngram,ng->n);

      }else {
        if (!displayed_oov_warning){
          pc_message(verbosity,2,"Warning : id n-gram stream contains OOV's (n-grams will be ignored).\n");
          displayed_oov_warning = 1;
        }
      }
    }
  }

  rr_iclose(ng->id_gram_fp);

  /* Finalise the k-grams that were still open when the input ended */
  for (i=ng->n-2;i>=1;i--) {

    NG_DISC_METH(ng)->update_freq_of_freq(ng,i,ng_count[i]);

    if (ng_count[i] <= ng->cutoffs[i-1])
      ng->num_kgrams[i]--;
    else
      store_normal_count(ng,ng->num_kgrams[i]-1,ng_count[i],i);

  }

  NG_DISC_METH(ng)->update_freq_of_freq(ng,0,ng_count[0]);

  if (!ng->context_cue[current_ngram.id_array[0]]) {
    ng->n_unigrams += ng_count[0];
    store_normal_count(ng,current_ngram.id_array[0],ng_count[0],0);
  }

  store_marginal_count(ng,current_ngram.id_array[0],ng_count[0],0);

  if (ng->n>1) {
    for (i=current_ngram.id_array[0]+1;i<=ng->vocab_size;i++)
      ng->ind[0][i] = new_index(ng->num_kgrams[1],
                                ng->ptr_table[0],
                                &(ng->ptr_table_size[0]),
                                current_ngram.id_array[0]);
  }

  /* The idngram reading is completed at this point */
  pc_message(verbosity,2,"\n");

  /* Impose a minimum unigram count, if required */

  if (ng->min_unicount > 0) {

    int nchanged= 0;

    for (i=ng->first_id;i<=ng->vocab_size;i++) {
      if ((return_count(ng->four_byte_counts,
                        ng->count_table[0],
                        ng->count[0],
                        ng->count4[0],
                        i) < ng->min_unicount) && !ng->context_cue[i]) {

        /* There was a bug in V2's switch.  Look at segment for ABSOLUTE */
        NG_DISC_METH(ng)->reduce_ug_freq_of_freq(ng,i);

        /* Read the current count through return_count() rather than
           ng->count[0][i]: ng->count[0] is only allocated when
           four_byte_counts is off, so the direct access dereferenced an
           uninitialised pointer in four-byte mode. */
        ng->n_unigrams += (ng->min_unicount -
                           return_count(ng->four_byte_counts,
                                        ng->count_table[0],
                                        ng->count[0],
                                        ng->count4[0],
                                        i));
        store_normal_count(ng,i,ng->min_unicount,0);
        nchanged++;
      }
    }

    if (nchanged > 0)
      pc_message(verbosity,2,
                 "Unigram counts of %d words were bumped up to %d.\n",
                 nchanged,ng->min_unicount);
  }

  /* Count zeroton information for unigrams */

  ng->freq_of_freq[0][0] = 0;

  for (i=ng->first_id;i<=ng->vocab_size;i++) {
    if (return_count(ng->four_byte_counts,
                     ng->count_table[0],
                     ng->count[0],
                     ng->count4[0],
                     i) == 0) {
      ng->freq_of_freq[0][0]++;
    }
  }

  if (ng->discounting_method == GOOD_TURING) {
    for (i=0;i<=ng->n-1;i++)
      for (j=1;j<=ng->fof_size[i];j++)
        pc_message(verbosity,3,"fof[%d][%d] = %d\n",i,j,ng->freq_of_freq[i][j]);
  }

  pc_message(verbosity,2,"Calculating discounted counts.\n");

  NG_DISC_METH(ng)->compute_discount_aux(ng, verbosity);

  /* Smooth unigram distribution, to give some mass to zerotons */
  compute_unigram(ng,verbosity);

  /* Increment Contexts if using Good-Turing discounting-> No need otherwise,
     since all values are discounted anyway. */

  if (ng->discounting_method == GOOD_TURING) {
    pc_message(verbosity,2,"Incrementing contexts...\n");

    for (i=ng->n-1;i>=1;i--)
      increment_context(ng,i,verbosity);
  }

  /* Calculate back-off weights */

  pc_message(verbosity,2,"Calculating back-off weights...\n");

  for (i=1;i<=ng->n-1;i++)
    compute_back_off(ng,i,verbosity);

  if (!ng->four_byte_alphas)
    pc_message(verbosity,3,"Number of out of range alphas = %d\n",
               ng->size_of_alpha_array);

  /* Write out LM */

  pc_message(verbosity,2,"Writing out language model...\n");

  if (ng->write_arpa)
    write_arpa_lm(ng,verbosity);

  if (ng->write_bin)
    write_bin_lm(ng,verbosity);

  pc_message(verbosity,0,"idngram2lm : Done.\n");

  return 0;
}