C++ (Cpp) pc_report_unk_args Exemples

Exemple #1

0

Afficher le fichier

Fichier : wfreq2vocab.c Projet : AIDman/Kaldi

void main(int argc, char *argv[]) {

  int verbosity;
  int vocab_size;
  int cutoff;
  int num_recs;
  int current_rec;
  int num_above_threshold;
  int num_to_output;
  int i;
  word_rec *records;
  char temp_word[750];
  flag gt_set;
  flag top_set;

  /* Process command line */

  report_version(&argc,argv);

  if (pc_flagarg( &argc, argv,"-help")) {
    fprintf(stderr,"wfreq2vocab : Generate a vocabulary file from a word frequency file.\n");
    fprintf(stderr,"Usage : wfreq2vocab [ -top 20000 | -gt 10]\n");
    fprintf(stderr,"                    [ -records %d ]\n",DEFAULT_MAX_RECORDS);
    fprintf(stderr,"                    [ -verbosity %d]\n",DEFAULT_VERBOSITY);
    fprintf(stderr,"                    < .wfreq > .vocab\n");
    exit(1);
  }

  cutoff = pc_intarg( &argc, argv, "-gt",-1);
  vocab_size = pc_intarg(&argc, argv, "-top",-1);
  num_recs = pc_intarg(&argc, argv, "-records",DEFAULT_MAX_RECORDS);
  verbosity = pc_intarg(&argc, argv, "-verbosity",DEFAULT_VERBOSITY);
  
  pc_report_unk_args(&argc,argv,verbosity);

  if (cutoff != -1) {
    gt_set = 1;
  }
  else {
    gt_set = 0;
    cutoff = 0;
  }

  if (vocab_size != -1) {
    top_set = 1;
  }
  else {
    top_set = 0;
    vocab_size = 0;
  }
  
  if (gt_set && top_set) {
    quit(-1,"wfreq2vocab : Error : Can't use both the -top and the -gt options.\n");
  }



  if (!gt_set && !top_set) {
    vocab_size = 20000;
  }

  if (gt_set) {
    pc_message(verbosity,2,"wfreq2vocab : Will generate a vocabulary containing all words which\n              occurred more that %d times. Reading wfreq stream from stdin...\n",cutoff);
  }
  else {
    pc_message(verbosity,2,"wfreq2vocab : Will generate a vocabulary containing the most\n              frequent %d words. Reading wfreq stream from stdin...\n",vocab_size);
  }


  records = (word_rec *) rr_malloc(sizeof(word_rec)*num_recs);

  current_rec = 0;
  num_above_threshold = 0;
  
  while (!rr_feof(stdin)) {

    if (scanf("%s %d",temp_word,&(records[current_rec].count)) != 2) {
      if (!rr_feof(stdin)) {
	quit(-1,"Error reading unigram counts from standard input.\n");
      }
    }
    else {
      records[current_rec].word = salloc(temp_word);
      if (gt_set && records[current_rec].count > cutoff) {
	num_above_threshold++;
      }
      current_rec++;
    }
  }

  /* Sort records in descending order of count */

  qsort((void*) records,(size_t) current_rec, sizeof(word_rec),sort_by_count);

  if (gt_set) {
    num_to_output = num_above_threshold;
  }
  else {
    num_to_output = vocab_size;
  }

  if (current_rec<num_to_output) {
    num_to_output = current_rec;
  }

  /* Now sort the relevant records alphabetically */

  qsort((void*) records,(size_t) num_to_output, sizeof(word_rec),sort_alpha);

  if (gt_set) {
    pc_message(verbosity,2,"Size of vocabulary = %d\n",num_to_output);
  }
  
  if (num_to_output>65535) {
    pc_message(verbosity,1,"Warning : Vocab size exceeds 65535. This will cause problems with \nother tools, since word id's are stored in 2 bytes.\n");
  }

  /* Print the vocab to stdout */
  
  printf("## Vocab generated by v2 of the CMU-Cambridge Statistcal\n");
  printf("## Language Modeling toolkit.\n");
  printf("##\n");
  printf("## Includes %d words ",num_to_output);
  printf("##\n");

  for (i=0;i<=num_to_output-1;i++) {
    printf("%s\n",records[i].word);
  }

  pc_message(verbosity,0,"wfreq2vocab : Done.\n");

  exit(0);

}

Exemple #2

0

Afficher le fichier

Fichier : evallm.c Projet : JonGBowen/GoodVibes

int oe_02_main (int argc, char **argv) {

  ng_t ng;
  arpa_lm_t arpa_ng;
  char input_string[500];
  int num_of_args;
  char *args[MAX_ARGS];
  char *lm_filename_arpa;
  char *lm_filename_binary;
  flag told_to_quit;
  flag inconsistant_parameters;
  flag backoff_from_unk_inc;
  flag backoff_from_unk_exc;
  flag backoff_from_ccs_inc;
  flag backoff_from_ccs_exc;
  flag arpa_lm;
  flag binary_lm;
  flag include_unks;
  char *fb_list_filename;
  char *probs_stream_filename;
  char *annotation_filename;
  char *text_stream_filename;
  char *oov_filename;
  char *ccs_filename;
  int generate_size;
  int random_seed;
  double log_base;
  char wlist_entry[1024];
  char current_cc[200];
  vocab_sz_t current_cc_id;
  FILE *context_cues_fp;
  int n;

  /* Process command line */

  report_version(&argc,argv);

  if (pc_flagarg(&argc, argv,"-help") || 
      argc == 1 || 
      (strcmp(argv[1],"-binary") && strcmp(argv[1],"-arpa"))) {
    oe_02_help_message();
    exit(1);
  }

  lm_filename_arpa = rr_salloc(pc_stringarg(&argc, argv,"-arpa",""));

  if (strcmp(lm_filename_arpa,""))
    arpa_lm = 1;
  else
    arpa_lm = 0;

  lm_filename_binary = rr_salloc(pc_stringarg(&argc, argv,"-binary",""));

  if (strcmp(lm_filename_binary,""))
    binary_lm = 1;
  else
    binary_lm = 0;

  if (arpa_lm && binary_lm)
    quit(-1,"Error : Can't use both -arpa and -binary flags.\n");
  
  if (!arpa_lm && !binary_lm)
    quit(-1,"Error : Must specify either a binary or an arpa format language model.\n");

  ccs_filename = rr_salloc(pc_stringarg(&argc, argv,"-context",""));

  if (binary_lm && strcmp(ccs_filename,""))
    fprintf(stderr,"Warning - context cues file not needed with binary language model file.\nWill ignore it.\n");

  pc_report_unk_args(&argc,argv,2);
 
  /* Load language model */

  if (arpa_lm) {
    fprintf(stderr,"Reading in language model from file %s\n",
	    lm_filename_arpa);
    load_arpa_lm(&arpa_ng,lm_filename_arpa);
  }else {
    fprintf(stderr,"Reading in language model from file %s\n",
	    lm_filename_binary);
    load_lm(&ng,lm_filename_binary); 
  }

  fprintf(stderr,"\nDone.\n");

  n=arpa_lm?
    arpa_ng.n:
    ng.n;

  if (arpa_lm) {
    arpa_ng.context_cue = 
      (flag *) rr_calloc(arpa_ng.table_sizes[0],sizeof(flag));    
    arpa_ng.no_of_ccs = 0;
    if (strcmp(ccs_filename,"")) {
      context_cues_fp = rr_iopen(ccs_filename);
      while (fgets (wlist_entry, sizeof (wlist_entry),context_cues_fp)) {
	if (strncmp(wlist_entry,"##",2)==0) continue;
	sscanf (wlist_entry, "%s ",current_cc);

	warn_on_wrong_vocab_comments(wlist_entry);
	
	if (sih_lookup(arpa_ng.vocab_ht,current_cc,&current_cc_id) == 0)
	  quit(-1,"Error : %s in the context cues file does not appear in the vocabulary.\n",current_cc);
	
	arpa_ng.context_cue[(unsigned short) current_cc_id] = 1;
	arpa_ng.no_of_ccs++;
	fprintf(stderr,"Context cue word : %s id = %lld\n",current_cc,current_cc_id);
      }
      rr_iclose(context_cues_fp);
    }
  }

  /* Process commands */
  
  told_to_quit = 0;
  num_of_args = 0;

  while (!feof(stdin) && !told_to_quit) {
    printf("evallm : \n");
    fgets(input_string, sizeof(input_string), stdin);
    if(strlen(input_string) < sizeof(input_string)-1)
      input_string[strlen(input_string)-1] = '\0'; //chop new-line
    else 
      quit(1, "evallm input exceeds size of input buffer");

    if (!feof(stdin)) {
      parse_comline(input_string,&num_of_args,args);

      log_base = pc_doublearg(&num_of_args,args,"-log_base",10.0);

      backoff_from_unk_inc = pc_flagarg(&num_of_args,args,"-backoff_from_unk_inc");
      backoff_from_ccs_inc = pc_flagarg(&num_of_args,args,"-backoff_from_ccs_inc");
      backoff_from_unk_exc = pc_flagarg(&num_of_args,args,"-backoff_from_unk_exc");
      backoff_from_ccs_exc = pc_flagarg(&num_of_args,args,"-backoff_from_ccs_exc");
      include_unks = pc_flagarg(&num_of_args,args,"-include_unks");
      fb_list_filename = rr_salloc(pc_stringarg(&num_of_args,args,"-backoff_from_list",""));
    
      text_stream_filename = 
	rr_salloc(pc_stringarg(&num_of_args,args,"-text",""));
      probs_stream_filename = 
	rr_salloc(pc_stringarg(&num_of_args,args,"-probs",""));
      annotation_filename = 
	rr_salloc(pc_stringarg(&num_of_args,args,"-annotate",""));
      oov_filename = rr_salloc(pc_stringarg(&num_of_args,args,"-oovs",""));

      generate_size = pc_intarg(&num_of_args,args,"-size",10000);
      random_seed = pc_intarg(&num_of_args,args,"-seed",-1);

      inconsistant_parameters = 0;
    
      if (backoff_from_unk_inc && backoff_from_unk_exc) {
	fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
	fprintf(stderr,"Use only one of -backoff_from_unk_exc and -backoff_from_unk_inc\n");
	inconsistant_parameters = 1;
      }

      if (backoff_from_ccs_inc && backoff_from_ccs_exc) {
	fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
	fprintf(stderr,"Use only one of -backoff_from_ccs_exc and -backoff_from_ccs_inc\n");
	inconsistant_parameters = 1;
      }

      if (num_of_args > 0) {      
	if (!inconsistant_parameters) {
	  if (!strcmp(args[0],"perplexity")) {
	    compute_perplexity(&ng,
			       &arpa_ng,
			       text_stream_filename,
			       probs_stream_filename,
			       annotation_filename,
			       oov_filename,
			       fb_list_filename,
			       backoff_from_unk_inc,
			       backoff_from_unk_exc,
			       backoff_from_ccs_inc,
			       backoff_from_ccs_exc,
			       arpa_lm,
			       include_unks,
			       log_base);
	  }else
	    /* do perplexity sentence by sentence [20090612] (air) */
	    if (!strcmp(args[0],"uttperp")) {
	      FILE *uttfh,*tempfh;
	      char utt[4096]; /* live dangerously... */
	      char tmpfil[128];
	      if ((uttfh = fopen(text_stream_filename,"r")) == NULL) {
		printf("Error: can't open %s\n",text_stream_filename);
		exit(1);
	      }
            char *template = "uttperp_XXXXXX";// CHANGED HLW
            mkstemp(template);// CHANGED HLW

Exemple #3

0

Afficher le fichier

Fichier : idngram2stats.c Projet : QuinnEbert/zedom8or

int oe_03_main (int argc, char **argv) {

  flag first_ngram;
  int n;
  fof_sz_t fof_size;
  flag is_ascii;
  int verbosity;
  fof_t **fof_array;
  ngram_sz_t *num_kgrams;
  ngram current_ngram;
  ngram previous_ngram;
  count_t *ng_count;
  int pos_of_novelty;
  int nlines;
  int i;

  report_version(&argc,argv);

  if (argc == 1 || pc_flagarg(&argc, argv,"-help")) {
    oe_04_help_message();
    exit(1);
  }

  is_ascii = pc_flagarg(&argc, argv,"-ascii_input");
  n = pc_intarg(&argc, argv,"-n",3);
  fof_size = pc_intarg(&argc, argv,"-fof_size",50);
  verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY);

  pc_report_unk_args(&argc,argv,verbosity);

  pc_message(verbosity,2,"n        = %d\n",n);
  pc_message(verbosity,2,"fof_size = %d\n",fof_size);

  current_ngram.n = n;
  previous_ngram.n = n;
  pos_of_novelty = n;
  
  fof_array = (fof_t **) rr_malloc(sizeof(fof_t *) * (n-1));
  for (i=0;i<=n-2;i++) 
    fof_array[i] = (fof_t *) rr_calloc(fof_size+1,sizeof(fof_t));

  num_kgrams = (ngram_sz_t *) rr_calloc(n-1,sizeof(ngram_sz_t));
  ng_count = (count_t *) rr_calloc(n-1,sizeof(count_t));

  current_ngram.id_array = (id__t *) rr_calloc(n,sizeof(id__t));
  previous_ngram.id_array = (id__t *) rr_calloc(n,sizeof(id__t));

  pc_message(verbosity,2,"Processing id n-gram file.\n");
  pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");

  nlines = 0;
  first_ngram = 1;
  
  while (!rr_feof(stdin)) {
    
    if (!first_ngram)
      ngram_copy(&previous_ngram,&current_ngram,n);

    if (get_ngram(stdin,&current_ngram,is_ascii)) {

      nlines++;
      show_idngram_nlines(nlines, verbosity);
    
      /* Test for where this ngram differs from last - do we have an
	 out-of-order ngram? */
    
      if (!first_ngram)
        pos_of_novelty = ngram_find_pos_of_novelty(&current_ngram,&previous_ngram,n,nlines);
      else
        pos_of_novelty = 0;

      /* Add new N-gram */
     
      num_kgrams[n-2]++;
      if (current_ngram.count <= fof_size) 
	fof_array[n-2][current_ngram.count]++;

      if (!first_ngram) {
	for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
	  num_kgrams[i-1]++;
	  if (ng_count[i-1] <= fof_size) 
	    fof_array[i-1][ng_count[i-1]]++;
	  
	  ng_count[i-1] = current_ngram.count;
	}
      } else {
	for (i=n-2;i>=MAX(1,pos_of_novelty);i--) 
	  ng_count[i-1] = current_ngram.count;
      }
	
      for (i=0;i<=pos_of_novelty-2;i++) 
	ng_count[i] += current_ngram.count;
	
      if (first_ngram)
        first_ngram = 0;
    }
  }

  /* Process last ngram */

  for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
    num_kgrams[i-1]++;
    if (ng_count[i-1] <= fof_size) {
      fof_array[i-1][ng_count[i-1]]++;
    }
    ng_count[i-1] = current_ngram.count;
  }
 #import "OpenEarsStaticAnalysisToggle.h"
#ifdef STATICANALYZEDEPENDENCIES
#define __clang_analyzer__ 1
#endif
#if !defined(__clang_analyzer__) || defined(STATICANALYZEDEPENDENCIES)
#undef __clang_analyzer__ 
  for (i=0;i<=pos_of_novelty-2;i++)
    ng_count[i] += current_ngram.count;

  display_fof_array(num_kgrams,fof_array,fof_size,stderr, n);
#endif
  pc_message(verbosity,0,"idngram2stats : Done.\n");

  exit(0);
  
}

Exemple #4

0

Afficher le fichier

Fichier : text2idngram.c Projet : AIDman/Kaldi

void main(int argc, char *argv[]) {

  int i,j;

  char *vocab_filename;
  FILE *tempfile;
  char tempfiles_directory[1000];
  int vocab_size;
  FILE *vocab_file;

  int verbosity;

  int buffer_size;
  int position_in_buffer;
  int number_of_tempfiles;
  int max_files;
  int fof_size;

  unsigned short *buffer;
  unsigned short *placeholder;
  unsigned short *temp_ngram;
  int temp_count;
  
  char temp_word[500];
  char temp_word2[500];

  char *temp_file_root;
  char *temp_file_ext;
  char *host_name;
  int proc_id;
  struct utsname uname_info;

  flag write_ascii;

  /* Vocab hash table things */

  struct hash_table vocabulary;
  unsigned long hash_size;
  unsigned long M;

  tempfile = NULL; /* Just to prevent compilation warnings. */

  report_version(&argc,argv);

  verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);

  /* Process command line */
  
  if (pc_flagarg( &argc, argv,"-help") || argc==1) {
    fprintf(stderr,"text2idngram - Convert a text stream to an id n-gram stream.\n");
    fprintf(stderr,"Usage : text2idngram  -vocab .vocab \n");
    fprintf(stderr,"                    [ -buffer 100 ]\n");
    fprintf(stderr,"                    [ -hash %d ]\n",DEFAULT_HASH_SIZE);
    fprintf(stderr,"                    [ -temp %s ]\n",DEFAULT_TEMP);
    fprintf(stderr,"                    [ -files %d ]\n",DEFAULT_MAX_FILES);
    fprintf(stderr,"                    [ -gzip | -compress ]\n");
    fprintf(stderr,"                    [ -verbosity %d ]\n",
	    DEFAULT_VERBOSITY);
    fprintf(stderr,"                    [ -n 3 ]\n");
    fprintf(stderr,"                    [ -write_ascii ]\n");
    fprintf(stderr,"                    [ -fof_size 10 ]\n");
    exit(1);
  }

  pc_message(verbosity,2,"text2idngram\n");

  n = pc_intarg( &argc, argv, "-n",DEFAULT_N);

  placeholder = (unsigned short *) rr_malloc(sizeof(unsigned short)*n);
  temp_ngram = (unsigned short *) rr_malloc(sizeof(unsigned short)*n);
  hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE);
  buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM);

  write_ascii = pc_flagarg(&argc,argv,"-write_ascii");

  fof_size = pc_intarg(&argc,argv,"-fof_size",10);

  max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES);

  vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" ));
  
  if (!strcmp("",vocab_filename)) {
    quit(-1,"text2idngram : Error : Must specify a vocabulary file.\n");
  }
    
  strcpy(tempfiles_directory,pc_stringarg( &argc, argv, "-temp", 
					   DEFAULT_TEMP));

  if (pc_flagarg(&argc,argv,"-compress")) {
    temp_file_ext = salloc(".Z");
  }
  else {
    if (pc_flagarg(&argc,argv,"-gzip")) {
      temp_file_ext = salloc(".gz");
    }
    else {
      temp_file_ext = salloc("");
    }
  }

  uname(&uname_info);

  host_name = salloc(uname_info.nodename);

  proc_id = getpid();

  sprintf(temp_word,"%s%s.%d.",TEMP_FILE_ROOT,host_name,proc_id);

  temp_file_root = salloc(temp_word);

  pc_report_unk_args(&argc,argv,verbosity);
  
  /* If the last charactor in the directory name isn't a / then add one. */
  
  if (tempfiles_directory[strlen(tempfiles_directory)-1] != '/') {
    strcat(tempfiles_directory,"/");
  }
  
  pc_message(verbosity,2,"Vocab                  : %s\n",vocab_filename);
  pc_message(verbosity,2,"N-gram buffer size     : %d\n",buffer_size);
  pc_message(verbosity,2,"Hash table size        : %d\n",hash_size);
  pc_message(verbosity,2,"Temp directory         : %s\n",tempfiles_directory);
  pc_message(verbosity,2,"Max open files         : %d\n",max_files);
  pc_message(verbosity,2,"FOF size               : %d\n",fof_size);  
  pc_message(verbosity,2,"n                      : %d\n",n);

  buffer_size *= (1000000/(sizeof(unsigned short)*n));

  /* Allocate memory for hash table */

  fprintf(stderr,"Initialising hash table...\n");

  M = nearest_prime(hash_size);

  new_hashtable(&vocabulary,M);

  /* Read in the vocabulary */

  vocab_size = 0;

  vocab_file = rr_iopen(vocab_filename);

  pc_message(verbosity,2,"Reading vocabulary...\n");

  while (fgets (temp_word, sizeof(temp_word),vocab_file)) {
    if (strncmp(temp_word,"##",2)==0) continue;
    sscanf (temp_word, "%s ",temp_word2);

    /* Check for repeated words in the vocabulary */

    if (index2(&vocabulary,temp_word2) != 0) {
      fprintf(stderr,"======================================================\n");
      fprintf(stderr,"WARNING: word %s is repeated in the vocabulary.\n",temp_word);
      fprintf(stderr,"=======================================================\n");
    }
    if (strncmp(temp_word,"#",1)==0) {
      fprintf(stderr,"\n\n===========================================================\n");
      fprintf(stderr,":\nWARNING: line assumed NOT a comment:\n");
      fprintf(stderr,     ">>> %s <<<\n",temp_word);
      fprintf(stderr,     "         '%s' will be included in the vocabulary.\n",temp_word2);
      fprintf(stderr,     "         (comments must start with '##')\n");
      fprintf(stderr,"===========================================================\n\n");
    }
    vocab_size++;
    add_to_hashtable(&vocabulary,hash(temp_word2,M),temp_word2,vocab_size);
  }

  if (vocab_size > MAX_VOCAB_SIZE) {
    quit(-1,"text2idngram : Error : Vocabulary size exceeds maximum.\n");
  }   
  
  pc_message(verbosity,2,"Allocating memory for the n-gram buffer...\n");

  buffer=(unsigned short*) rr_malloc(n*(buffer_size+1)*sizeof(unsigned short));

  number_of_tempfiles = 0;

  /* Read text into buffer */

  /* Read in the first ngram */

  position_in_buffer = 0;

  for (i=0;i<=n-1;i++) {
    get_word(stdin,temp_word);
    add_to_buffer(index2(&vocabulary,temp_word),0,i,buffer);
  }

  while (!rr_feof(stdin)) {

    /* Fill up the buffer */

    pc_message(verbosity,2,"Reading text into the n-gram buffer...\n");
    pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");
    while ((position_in_buffer<buffer_size) && (!rr_feof(stdin))) {
      position_in_buffer++;
      if (position_in_buffer % 20000 == 0) {
	if (position_in_buffer % 1000000 == 0) {
	  pc_message(verbosity,2,".\n");
	}
	else {
	  pc_message(verbosity,2,".");
	}
      }
      for (i=1;i<=n-1;i++) {
	add_to_buffer(buffer_contents(position_in_buffer-1,i,buffer),
		      position_in_buffer,i-1,buffer);
      }
      if (get_word(stdin,temp_word) == 1) {
	add_to_buffer(index2(&vocabulary,temp_word),position_in_buffer,
		      n-1,buffer);
      }
    }

    for (i=0;i<=n-1;i++) {
      placeholder[i] = buffer_contents(position_in_buffer,i,buffer);
    }

    /* Sort buffer */
    
    pc_message(verbosity,2,"\nSorting n-grams...\n");
    
    qsort((void*) buffer,(size_t) position_in_buffer,
	  n*sizeof(unsigned short),compare_ngrams);

    /* Output the buffer to temporary BINARY file */
    
    number_of_tempfiles++;

    sprintf(temp_word,"%s%s%hu%s",tempfiles_directory,temp_file_root,
	    number_of_tempfiles,temp_file_ext);

    pc_message(verbosity,2,"Writing sorted n-grams to temporary file %s\n",
	       temp_word);

    tempfile = rr_oopen(temp_word);

    for (i=0;i<=n-1;i++) {
      temp_ngram[i] = buffer_contents(0,i,buffer);
      if (temp_ngram[i] > MAX_VOCAB_SIZE) {
	quit(-1,"Invalid trigram in buffer.\nAborting");

      }
    }
    temp_count = 1;

    for (i=1;i<=position_in_buffer;i++) {
 
      if (!compare_ngrams(temp_ngram,&buffer[i*n])) {
	temp_count++;
      }
      else {
	for (j=0;j<=n-1;j++) {
	  rr_fwrite(&temp_ngram[j],sizeof(unsigned short),1,
		    tempfile,"temporary n-gram ids");
	  temp_ngram[j] = buffer_contents(i,j,buffer);
	}
	rr_fwrite(&temp_count,sizeof(int),1,tempfile,
		  "temporary n-gram counts");
	temp_count = 1;
      }
    }
    
    rr_oclose(tempfile);

    for (i=0;i<=n-1;i++) {
      add_to_buffer(placeholder[i],0,i,buffer);
    }

    position_in_buffer = 0;

  }

  /* Merge the temporary files, and output the result to standard output */

  pc_message(verbosity,2,"Merging temporary files...\n");
  
  merge_tempfiles(1,
		  number_of_tempfiles,
		  temp_file_root,
		  temp_file_ext,
		  max_files,
		  tempfiles_directory,
		  stdout,
		  write_ascii,
		  fof_size); 

  pc_message(verbosity,0,"text2idngram : Done.\n");

  exit(0);
  
}

Exemple #5

0

Afficher le fichier

Fichier : text2wngram.c Projet : Ankit77/cmusphinx

int main (int argc, char **argv) {

  int n;
  int verbosity;
  int max_files;
  int max_words;
  int max_chars;

  int current_word;
  int current_char;
  int start_char;		/* start boundary (possibly > than 0) */

  int no_of_spaces;
  int pos_in_string;

  int i;
  char *current_string;
  char current_temp_filename[500];
  int current_file_number;
  FILE *temp_file;

  flag text_buffer_full;

  char *text_buffer;
  char **pointers;

  char current_ngram[500];
  int current_count;

  int counter;

  char temp_directory[1000];
  char *temp_file_ext;

  flag words_set;
  flag chars_set;

  /* Process command line */

  verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY);
  pc_message(verbosity,2,"text2wngram\n");

  report_version(&argc,argv);

  if (pc_flagarg( &argc, argv,"-help")) {
    help_message();
    exit(1);
  }

  n = pc_intarg(&argc, argv,"-n",DEFAULT_N);

  /*  max_words = pc_intarg(&argc, argv,"-words",STD_MEM*1000000/11);
  max_chars = pc_intarg(&argc, argv,"-chars",STD_MEM*7000000/11); */

  max_words = pc_intarg(&argc, argv,"-words",-1);
  max_chars = pc_intarg(&argc, argv,"-chars",-1);

  if (max_words == -1) {
    words_set = 0;
    max_words = STD_MEM*1000000/11;
  }else
    words_set = 1;

  if (max_chars == -1) {
    chars_set = 0;
    max_chars = STD_MEM*7000000/11; 
  }else
    chars_set = 1;
  
  max_files = pc_intarg(&argc, argv,"-files",DEFAULT_MAX_FILES);

  if (pc_flagarg(&argc,argv,"-compress"))
    temp_file_ext = salloc(".Z");
  else {
    if (pc_flagarg(&argc,argv,"-gzip"))
      temp_file_ext = salloc(".gz");
    else
      temp_file_ext = salloc("");
  }

  strcpy(temp_directory, "cmuclmtk-XXXXXX");
  if (mkdtemp(temp_directory) == NULL) {
     quit(-1, "Failed to create temporary folder: %s\n", strerror(errno));
  }

  pc_report_unk_args(&argc,argv,verbosity);
 
  if (words_set && !chars_set)
    max_chars = max_words * 7;

  if (!words_set && chars_set)
    max_words = max_chars / 7;

  /* If the last charactor in the directory name isn't a / then add one. */
  
  pc_message(verbosity,2,"n = %d\n",n);
  pc_message(verbosity,2,"Number of words in buffer = %d\n",max_words);
  pc_message(verbosity,2,"Number of chars in buffer = %d\n",max_chars);
  pc_message(verbosity,2,"Max number of files open at once = %d\n",max_files);
  pc_message(verbosity,2,"Temporary directory = %s\n",temp_directory);

  /* Allocate memory for the buffers */

  text_buffer = (char *) rr_malloc(sizeof(char)*max_chars);
  pc_message(verbosity,2,"Allocated %d bytes to text buffer.\n",
	     sizeof(char)*max_chars);

  pointers = (char **) rr_malloc(sizeof(char *)*max_words);
  pc_message(verbosity,2,"Allocated %d bytes to pointer array.\n",
	     sizeof(char *)*max_words);

  current_file_number = 0;

  current_word = 1;
  start_char = 0;
  current_char = 0;
  counter = 0;
  pointers[0] = text_buffer;
      
  while (!feof(stdin)) {

    current_file_number++;

    /* Read text into buffer */
    
    pc_message(verbosity,2,"Reading text into buffer...\n");

    pc_message(verbosity,2,"Reading text into the n-gram buffer...\n");
    pc_message(verbosity,2,"20,000 words processed for each \".\", 1,000,000 for each line.\n");
    
    pointers[0] = text_buffer;
    
    while ((!rr_feof(stdin)) && 
	   (current_word < max_words) && 
	   (current_char < max_chars)) {

      text_buffer[current_char] = getchar();
      if (text_buffer[current_char] == '\n' || 
	  text_buffer[current_char] == '\t' ) {
	text_buffer[current_char] = ' ';
      }
      if (text_buffer[current_char] == ' ') {
	if (current_char > start_char) {
	  if (text_buffer[current_char-1] == ' ') {
	    current_word--;
	    current_char--;
	  }
	  pointers[current_word] = &(text_buffer[current_char+1]);
	  current_word++; 
	  counter++;
	  if (counter % 20000 == 0) {
	    if (counter % 1000000 == 0)
	      pc_message(verbosity,2,"\n");
	    else
	      pc_message(verbosity,2,".");
	  }
	}
      }
      
      if (text_buffer[current_char] != ' ' || current_char > start_char) 
	current_char++;
    }

    text_buffer[current_char]='\0';


    if (current_word == max_words || rr_feof(stdin)) {
      for (i=current_char+1;i<=max_chars-1;i++)
	text_buffer[i] = ' ';

      text_buffer_full = 0;
    }else
      text_buffer_full = 1;
    
    /* Sort buffer */

    pc_message(verbosity,2,"\nSorting pointer array...\n"); 

    qsort((void *) pointers,(size_t) current_word-n,sizeof(char *),cmp_strings);
   
    /* Write out temporary file */

    sprintf(current_temp_filename,"%s/%hu%s",temp_directory, current_file_number, temp_file_ext);

    pc_message(verbosity,2,"Writing out temporary file %s...\n",current_temp_filename);
        
    temp_file = rr_oopen(current_temp_filename);
    text_buffer[current_char] = ' ';
    
    current_count = 0;
    strcpy(current_ngram,"");
    
    for (i = 0; i <= current_word-n; i++) {
      current_string = pointers[i];
      
      /* Find the nth space */

      no_of_spaces = 0;
      pos_in_string = 0;
      while (no_of_spaces < n) {	
	if (current_string[pos_in_string] == ' ')
	  no_of_spaces++;

	pos_in_string++;
      }
      
      if (!strncmp(current_string,current_ngram,pos_in_string))
	current_count++;
      else {
	if (strcmp(current_ngram,""))
	  if (fprintf(temp_file,"%s %d\n",current_ngram,current_count) < 0) 
	    quit(-1,"Error writing to temporary file %s\n",current_temp_filename);

	current_count = 1;
	strncpy(current_ngram,current_string,pos_in_string);
	current_ngram[pos_in_string] = '\0';
      }
    }
    
    rr_oclose(temp_file);

    /* Move the last n-1 words to the beginning of the buffer, and set
       correct current_word and current_char things */

    strcpy(text_buffer,pointers[current_word-n]);
    pointers[0]=text_buffer;
   
    /* Find the (n-1)th space */

    no_of_spaces=0;
    pos_in_string=0;

    if (!text_buffer_full){ 
      while (no_of_spaces<(n-1)) {
	if (pointers[0][pos_in_string]==' ') {
	  no_of_spaces++;
	  pointers[no_of_spaces] = &pointers[0][pos_in_string+1];
	}
	pos_in_string++;
      }
    }else {
      while (no_of_spaces<n) {
	if (pointers[0][pos_in_string]==' ') {
	  no_of_spaces++;
	  pointers[no_of_spaces] = &pointers[0][pos_in_string+1];
	}
	pos_in_string++;
      }
      pos_in_string--;
    }

    current_char = pos_in_string;
    current_word = n;
    /* mark boundary beyond which counting pass cannot backup */
    start_char = current_char;

  }
  /* Merge temporary files */

  pc_message(verbosity,2,"Merging temporary files...\n");

  merge_tempfiles(1,
		  current_file_number,
		  temp_directory,
		  temp_file_ext,
		  max_files,
		  stdout,
		  n,
		  verbosity); 

  rmdir(temp_directory);
  pc_message(verbosity,0,"text2wngram : Done.\n");
  
  return 0;
}

Exemple #6

0

Afficher le fichier

Fichier : wngram2idngram.c Projet : Ankit77/cmusphinx

int main(int argc, char *argv[]) {

  int verbosity;
  int vocab_size;
  FILE *vocab_file;
  int buffer_size;
  flag write_ascii;
  int max_files;
  int number_of_tempfiles;
  char *vocab_filename;
  char *idngram_filename;
  char temp_word[MAX_WORD_LENGTH];
  char temp_word2[MAX_WORD_LENGTH];
  char temp_word3[MAX_WORD_LENGTH];
  flag contains_unks;
  int position_in_buffer;
  FILE *outfile;
  FILE *tempfile;
  FILE *non_unk_fp;
  ngram_rec *buffer;
  flag same_ngram;
  int i;
  int j;
  int fof_size;
  int size_of_rec;

  char temp_directory[1000];
  char *temp_file_ext;

  /* Vocab hash table things */

  struct idngram_hash_table vocabulary;
  unsigned long hash_size;
  unsigned long M;

  wordid_t *current_ngram;
  int current_count;
  wordid_t *sort_ngram;
  int sort_count;
  
  /* Process command line */

  report_version(&argc,argv);
  
  if (argc == 1 || pc_flagarg(&argc, argv,"-help")) {    
    /* Display help message */    
    help_message();
    exit(1);
  }


  n = pc_intarg( &argc, argv, "-n",DEFAULT_N);
  hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE);
  buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM);
  write_ascii = pc_flagarg(&argc,argv,"-write_ascii");
  verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);
  max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES);
  fof_size = pc_intarg(&argc,argv,"-fof_size",10);
  vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" ));
  idngram_filename = salloc(pc_stringarg( &argc, argv, "-idngram", "" ));
  
  if (!strcmp("",vocab_filename)) 
    quit(-1,"Error : Must specify a vocabulary file.\n");

  if (!strcmp("",idngram_filename)) 
    quit(-1,"text2idngram : Error : Must specify idngram file.\n");
    
  if (pc_flagarg(&argc,argv,"-compress")) 
    temp_file_ext = salloc(".Z");
  else {
    if (pc_flagarg(&argc,argv,"-gzip")) 
      temp_file_ext = salloc(".gz");
    else 
      temp_file_ext = salloc("");
  }

  strcpy(temp_directory, "cmuclmtk-XXXXXX");
  if (mkdtemp(temp_directory) == NULL) {
     quit(-1, "Failed to create temporary folder: %s\n", strerror(errno));
  }

  pc_report_unk_args(&argc,argv,verbosity);

  outfile = rr_fopen(idngram_filename,"wb");
  
  pc_message(verbosity,2,"Vocab           : %s\n",vocab_filename);
  pc_message(verbosity,2,"Output idngram  : %s\n",idngram_filename);
  pc_message(verbosity,2,"Buffer size     : %d\n",buffer_size);
  pc_message(verbosity,2,"Hash table size : %d\n",hash_size);
  pc_message(verbosity,2,"Max open files  : %d\n",max_files);
  pc_message(verbosity,2,"n               : %d\n",n);
  pc_message(verbosity,2,"FOF size               : %d\n",fof_size);  

  size_of_rec = (sizeof(wordid_t) * n) + 16 - (( n* sizeof(wordid_t)) % 16);
  buffer_size *= (1000000/((sizeof(ngram_rec) + size_of_rec)));
  fprintf(stderr,"buffer size = %d\n",buffer_size);

  /* Allocate memory for hash table */

  fprintf(stderr,"Initialising hash table...\n");

  M = nearest_prime(hash_size);

  new_idngram_hashtable(&vocabulary,M);

  /* Read in the vocabulary */

  vocab_size = 0;

  vocab_file = rr_iopen(vocab_filename);

  pc_message(verbosity,2,"Reading vocabulary...\n");

  while (fgets (temp_word, sizeof(temp_word),vocab_file)) {
    if (strncmp(temp_word,"##",2)==0) continue;
    sscanf (temp_word, "%s ",temp_word2);

    /* Check for vocabulary order */
    if (vocab_size > 0 && strcmp(temp_word2,temp_word3)<0) 
      quit(-1,"wngram2idngram : Error : Vocabulary is not alphabetically ordered.\n");

    /* Check for repeated words in the vocabulary */

    if (index2(&vocabulary,temp_word2) != 0) 
      warn_on_repeated_words(temp_word);

    warn_on_wrong_vocab_comments(temp_word);

    vocab_size++;
    
    add_to_idngram_hashtable(&vocabulary,idngram_hash(temp_word2,M),temp_word2,vocab_size);
    strcpy(temp_word3,temp_word2);
  }

  if (vocab_size > MAX_VOCAB_SIZE) 
    quit(-1,"Error : Vocabulary size exceeds maximum.\n");
  
  pc_message(verbosity,2,"Allocating memory for the buffer...\n");

  buffer=(ngram_rec *) rr_malloc((buffer_size+1)*sizeof(ngram_rec));
  
  for (i=0;i<=buffer_size;i++) 
    buffer[i].word = (wordid_t *) rr_malloc(n*sizeof(wordid_t));

  /* Open the "non-OOV" tempfile */

  sprintf(temp_word, "%s/1%s", temp_directory, temp_file_ext);
  
  non_unk_fp = rr_fopen(temp_word,"w");

  pc_message(verbosity,2,"Writing non-OOV counts to temporary file %s\n",
	     temp_word);
  number_of_tempfiles = 1;

  current_ngram = (wordid_t *) rr_malloc(n*sizeof(wordid_t));
  sort_ngram = (wordid_t *) rr_malloc(n*sizeof(wordid_t));

  /* Read text into buffer */
  position_in_buffer = 0;

  while (!rr_feof(stdin)) {
    
    for (i=0;i<=n-1;i++) {
      get_word(stdin,temp_word);
      current_ngram[i]=index2(&vocabulary,temp_word);
    }
    if (scanf("%d",&current_count) != 1) 
      if (!rr_feof(stdin)) 
	quit(-1,"Error reading n-gram count from stdin.\n");

    if (!rr_feof(stdin)) {

      contains_unks = 0;
      for (i=0;i<=n-1;i++) {
	if (!current_ngram[i]) 
	  contains_unks = 1;
      }

      if (contains_unks) {
	/* Write to buffer */

	position_in_buffer++;

	if (position_in_buffer >= buffer_size) {

	  /* Sort buffer */
	  pc_message(verbosity,2,
		     "Sorting n-grams which include an OOV word...\n");

	  qsort((void*) buffer,(size_t) position_in_buffer,
		sizeof(ngram_rec),compare_ngrams2);

	  pc_message(verbosity,2,"Done.\n");

	  /* Write buffer to temporary file */

	  number_of_tempfiles++;
	  
	  sprintf(temp_word,"%s/%hu%s", temp_directory,
		  number_of_tempfiles,temp_file_ext);
	  
	  pc_message(verbosity,2,
		     "Writing sorted OOV-counts buffer to temporary file %s\n",
		     temp_word);

	  tempfile = rr_fopen(temp_word,"w");
	  
	  for (i=0;i<=n-1;i++) 
	    sort_ngram[i] = buffer[0].word[i];

	  sort_count = buffer[0].count;

	  for (i=0;i<=position_in_buffer-2;i++) {
	    
	    same_ngram = 1;
	    for (j=n-1;j>=0;j--) {
	      if (buffer[i].word[j] != sort_ngram[j]) {
		same_ngram = 0;
		j = -1;
	      }
	    }

	    if (same_ngram) 
	      sort_count += buffer[i].count;
	    else {
	      for (j=0;j<=n-1;j++) {
		rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1,
			  tempfile,"temporary n-gram ids");
		sort_ngram[j] = buffer[i].word[j];
	      }
	      rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile,
			"temporary n-gram counts");
	      sort_count = buffer[i].count;
	    }
	  }	    
	  for (j=0;j<=n-1;j++) 
	    rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1,
		      tempfile,"temporary n-gram ids");

	  rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile,
		    "temporary n-gram counts");
	  rr_oclose(tempfile);
	  position_in_buffer = 1;

	}
	
	for (i=0;i<=n-1;i++) 
	  buffer[position_in_buffer-1].word[i] = current_ngram[i];

	buffer[position_in_buffer-1].count = current_count;

      }else {
	/* Write to temporary file */
	for (i=0;i<=n-1;i++) 
	  rr_fwrite((char*)&current_ngram[i],sizeof(wordid_t),1,
		    non_unk_fp,"temporary n-gram ids");

	rr_fwrite((char*)&current_count,sizeof(int),1,non_unk_fp,
		  "temporary n-gram counts");
      }
    }
  }

  if (position_in_buffer > 0) {

    /* Only do this bit if we have actually seen some OOVs */
    /* Sort final buffer */    
    pc_message(verbosity,2,"Sorting final buffer...\n");

    qsort((void*) buffer,(size_t) position_in_buffer,
	  sizeof(ngram_rec),compare_ngrams2);
    
    /* Write final buffer */
    
    number_of_tempfiles++;
  
    sprintf(temp_word,"%s/%hu%s", temp_directory,
	    number_of_tempfiles,temp_file_ext);
    
    pc_message(verbosity,2,"Writing sorted buffer to temporary file %s\n", temp_word);

    tempfile = rr_fopen(temp_word,"w");
    
    for (i=0;i<=n-1;i++) 
      sort_ngram[i] = buffer[0].word[i];

    sort_count = buffer[0].count;
    
    for (i=1;i<=position_in_buffer-1;i++) {
      
      same_ngram = 1;
      for (j=n-1;j>=0;j--) {
	if (buffer[i].word[j] != sort_ngram[j]) {
	  same_ngram = 0;
	  j = -1;
	}
      }
      
      if (same_ngram) 
	sort_count += buffer[i].count;
      else {
	for (j=0;j<=n-1;j++) {
	  rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1,
		    tempfile,"temporary n-gram ids");
	  sort_ngram[j] = buffer[i].word[j];
	}
	rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile,
		  "temporary n-gram counts");
	sort_count = buffer[i].count;
      }
    }	    
    for (j=0;j<=n-1;j++) 
      rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1,
		tempfile,"temporary n-gram ids");

    rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile,
	      "temporary n-gram counts");
    fclose(tempfile);
    

  }
  

  /* Merge the temporary files, and output the result */
  fclose(non_unk_fp);
  pc_message(verbosity,2,"Merging temporary files...\n");
  merge_idngramfiles(1,
		     number_of_tempfiles,
		     temp_directory,
		     temp_file_ext,
		     max_files,
		     outfile,
		     write_ascii,
		     fof_size,
		     n);

  fclose(outfile);

  rmdir(temp_directory);
  pc_message(verbosity,0,"wngram2idngram : Done.\n");

  return 0;
}

Exemple #7

0

Afficher le fichier

Fichier : ngram2mgram.c Projet : AIDman/Kaldi

void main(int argc, char *argv[]) {

  int verbosity;
  int n;
  int m;
  int i;
  int input_type;
  int storage_type;
  unsigned short *current_ngram_int;
  unsigned short *previous_ngram_int;
  char **current_ngram_text;
  char **previous_ngram_text;
  int current_count;
  int running_total;
  flag same;
  flag first_one;
  flag got_to_eof;
   
  running_total = 0;

  report_version(&argc,argv);

  if (pc_flagarg( &argc, argv,"-help") || argc==1) {
    fprintf(stderr,"ngram2mgram - Convert an n-gram file to an m-gram file, where m<n\n");
    fprintf(stderr,"Usage : ngram2mgram   -n N -m M\n");
    fprintf(stderr,"                    [ -binary | -ascii | -words ]\n");
    fprintf(stderr,"                    < .ngram > .mgram\n");
    exit(1);
  }
 
  n = pc_intarg( &argc, argv,"-n",0);
  m = pc_intarg( &argc, argv,"-m",0);
  verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);
  

  input_type = 0;
  
  if (pc_flagarg( &argc, argv,"-binary")) {
    input_type = BINARY;
  }

  if (pc_flagarg( &argc, argv,"-ascii")) {
    if (input_type != 0) {
      quit(-1,"Error : more than one file format specified.\n");
    }
    input_type = ASCII;
  }

  if (pc_flagarg( &argc, argv,"-words")) {  
    if (input_type != 0) {
      quit(-1,"Error : more than one file format specified.\n");
    }
    input_type = WORDS;
  }    

  if (input_type == 0) {
    pc_message(verbosity,2,"Warning : no input type specified. Defaulting to binary.\n");
    input_type = BINARY;
  }

  if (n == 0) {
    quit(-1,"Must specify a value for n. Use the -n switch.\n");
  }

  if (m == 0) {
    quit(-1,"Must specify a value for m. Use the -m switch.\n");
  }
  
  if (n<=m) {
    quit(-1,"n must be greater than m.\n");
  }

  pc_report_unk_args(&argc,argv,verbosity);

  if (input_type == BINARY || input_type == ASCII) {
    storage_type = NUMERIC;
  }
  else {
    storage_type = ALPHA;
  }

  if (storage_type == NUMERIC) {
    current_ngram_int = (unsigned short *) 
      rr_malloc(n*sizeof(unsigned short));
    previous_ngram_int = (unsigned short *) 
      rr_malloc(n*sizeof(unsigned short));

    /* And to prevent compiler warnings ... */

    current_ngram_text = NULL;
    previous_ngram_text = NULL;
  }
  else {
    current_ngram_text = (char **) rr_malloc(n*sizeof(char *));
    previous_ngram_text = (char **) rr_malloc(n*sizeof(char *));
    for (i=0;i<=n-1;i++) {
      current_ngram_text[i] = (char *) rr_malloc(MAX_WORD_LENGTH*sizeof(char));
      previous_ngram_text[i] = (char *) rr_malloc(MAX_WORD_LENGTH*sizeof(char));
    }

    /* And to prevent compiler warnings ... */

    current_ngram_int = NULL;
    previous_ngram_int = NULL;

  }

  got_to_eof = 0;
  first_one = 1;

  while (!rr_feof(stdin)) {

    /* Store previous n-gram */

    if (!first_one) {

      if (storage_type == NUMERIC) {
	for (i=0;i<=n-1;i++) {
	  previous_ngram_int[i] = current_ngram_int[i];
	}
      }
      else {
	for (i=0;i<=n-1;i++) {
	  strcpy(previous_ngram_text[i],current_ngram_text[i]);
	}
      }

    }

    /* Read new n-gram */

    switch(input_type) {
    case BINARY:
      for (i=0;i<=n-1;i++) {
	rr_fread(&current_ngram_int[i],sizeof(id__t),1,stdin,
		 "from id_ngrams at stdin",0);
      }
      rr_fread(&current_count,sizeof(count_t),1,stdin,
	       "from id_ngrams file at stdin",0);
      break;
    case ASCII:
      for (i=0;i<=n-1;i++) {
	if (fscanf(stdin,"%hu",&current_ngram_int[i]) != 1) {
	  if (!rr_feof(stdin)) {
	    quit(-1,"Error reading id_ngram.\n");
	  }
	  else {
	    got_to_eof = 1;
	  }
	}
      }
      if (fscanf(stdin,"%d",&current_count) != 1) {
	if (!rr_feof(stdin)) {
	  quit(-1,"Error reading id_ngram.\n");
	}
	else {
	  got_to_eof = 1;
	}
      }
      break;
    case WORDS:
      for (i=0;i<=n-1;i++) {
	if (fscanf(stdin,"%s",current_ngram_text[i]) != 1) {
	  if (!rr_feof(stdin)) {
	    quit(-1,"Error reading id_ngram.\n");
	  }
	  else {
	    got_to_eof = 1;
	  }
	}
      }
      if (fscanf(stdin,"%d",&current_count) != 1) {
	if (!rr_feof(stdin)) {
	  quit(-1,"Error reading id_ngram.\n");
	}
	else {
	  got_to_eof = 1;
	}
      }
      break;
    }

    if (!got_to_eof) {

      /* Check for correct sorting */

      if (!first_one) {

	switch(storage_type) {
	case NUMERIC:
	  for (i=0;i<=n-1;i++) {
	    if (current_ngram_int[i]<previous_ngram_int[i]) {
	      quit(-1,"Error : ngrams not correctly sorted.\n");
	    }
	    else {
	      if (current_ngram_int[i]>previous_ngram_int[i]) {
		i=n;
	      }
	    }
	  }
	  break;
	case ALPHA:
	  for (i=0;i<=n-1;i++) {
	    if (strcmp(current_ngram_text[i],previous_ngram_text[i])<0) {
	      quit(-1,"Error : ngrams not correctly sorted.\n");
	    }
	    else {
	      if (strcmp(current_ngram_text[i],previous_ngram_text[i])>0) {
		i=n;
	      }
	    }
	  }
	  break;
	}
      }

      /* Compare this m-gram with previous m-gram */

      if (!first_one) {

	switch(storage_type) {
	case NUMERIC:
	  same = 1;
	  for (i=0;i<=m-1;i++) {
	    if (current_ngram_int[i] != previous_ngram_int[i]) {
	      same = 0;
	    }
	  }
	  if (same) {
	    running_total += current_count;
	  }
	  else {
	    if (input_type == ASCII) {
	      for (i=0;i<=m-1;i++) {
		printf("%d ",previous_ngram_int[i]);
	      }
	      printf("%d\n",running_total);
	    }
	    else {
	      for (i=0;i<=m-1;i++) {
		rr_fwrite(&previous_ngram_int[i],sizeof(id__t),1,stdout,
			  "to id_ngrams at stdout");
	      }
	      rr_fwrite(&running_total,sizeof(count_t),1,stdout,
			"to id n-grams at stdout");
	    }
	    running_total = current_count;
	  }
	  break;
	case ALPHA:
	  same = 1;
	  for (i=0;i<=m-1;i++) {
	    if (strcmp(current_ngram_text[i],previous_ngram_text[i])) {
	      same = 0;
	    }
	  }
	  if (same) {
	    running_total += current_count;
	  }
	  else {
	    for (i=0;i<=m-1;i++) {
	      printf("%s ",previous_ngram_text[i]);
	    }
	    printf("%d\n",running_total);
	    running_total = current_count;
	  
	  }
	  break;
	}
      
      }
      else {
	running_total = current_count;
      } 
    
      first_one = 0;
    
    }
  }

  /* Write out final m-gram */

  switch(input_type) {
  case BINARY:
    break;
  case ASCII:
    for (i=0;i<=m-1;i++) {
      printf("%d ",previous_ngram_int[i]);
    }
    printf("%d\n",running_total);
    break;
  case WORDS:
    for (i=0;i<=m-1;i++) {
      printf("%s ",previous_ngram_text[i]);
    }
    printf("%d\n",running_total);
    break;
  } 

  pc_message(verbosity,0,"ngram2mgram : Done.\n");

  exit(0);

}

Exemple #8

0

Afficher le fichier

Fichier : evallm.c Projet : hunterhector/LangStats12Spring

void main (int argc, char **argv) {

  ng_t ng;
  arpa_lm_t arpa_ng;
  char input_string[500];
  int num_of_args;
  char *args[MAX_ARGS];
  char *lm_filename_arpa;
  char *lm_filename_binary;
  flag told_to_quit;
  flag inconsistant_parameters;
  flag backoff_from_unk_inc;
  flag backoff_from_unk_exc;
  flag backoff_from_ccs_inc;
  flag backoff_from_ccs_exc;
  flag arpa_lm;
  flag binary_lm;
  flag include_unks;
  char *fb_list_filename;
  char *probs_stream_filename;
  char *annotation_filename;
  char *text_stream_filename;
  char *oov_filename;
  char *ccs_filename;
  double log_base;
  char wlist_entry[1024];
  char current_cc[200];
  int current_cc_id;
  FILE *context_cues_fp;
  int n;
  int generate_size = 10000;
  int random_seed;

  /* Process command line */

  report_version(&argc,argv);

  if (pc_flagarg(&argc, argv,"-help") || 
      argc == 1 || 
      (strcmp(argv[1],"-binary") && strcmp(argv[1],"-arpa"))) {
   fprintf(stderr,"evallm : Evaluate a language model.\n");
   fprintf(stderr,"Usage : evallm [ -binary .binlm | \n");
   fprintf(stderr,"                 -arpa .arpa [ -context .ccs ] ]\n");
   exit(1);
  }

  lm_filename_arpa = salloc(pc_stringarg(&argc, argv,"-arpa",""));

  if (strcmp(lm_filename_arpa,"")) {
    arpa_lm = 1;
  }
  else {
    arpa_lm = 0;
  }

  lm_filename_binary = salloc(pc_stringarg(&argc, argv,"-binary",""));

  if (strcmp(lm_filename_binary,"")) {
    binary_lm = 1;
  }
  else {
    binary_lm = 0;
  }

  if (arpa_lm && binary_lm) {
    quit(-1,"Error : Can't use both -arpa and -binary flags.\n");
  }
  
  if (!arpa_lm && !binary_lm) {
    quit(-1,"Error : Must specify either a binary or an arpa format language model.\n");
  }

  ccs_filename = salloc(pc_stringarg(&argc, argv,"-context",""));

  if (binary_lm && strcmp(ccs_filename,"")) {
    fprintf(stderr,"Warning - context cues file not needed with binary language model file.\nWill ignore it.\n");
  }

  pc_report_unk_args(&argc,argv,2);
 
  /* Load language model */

  if (arpa_lm) {
    fprintf(stderr,"Reading in language model from file %s\n",
	    lm_filename_arpa);
    load_arpa_lm(&arpa_ng,lm_filename_arpa);
  }
  else {
    fprintf(stderr,"Reading in language model from file %s\n",
	    lm_filename_binary);
    load_lm(&ng,lm_filename_binary); 
  }

  fprintf(stderr,"\nDone.\n");

  if (!arpa_lm) {
    n=ng.n;
  }
  else {
    n=arpa_ng.n;
  }

  if (arpa_lm) {
    arpa_ng.context_cue = 
      (flag *) rr_calloc(arpa_ng.table_sizes[0],sizeof(flag));    
    arpa_ng.no_of_ccs = 0;
    if (strcmp(ccs_filename,"")) {
      context_cues_fp = rr_iopen(ccs_filename);
      while (fgets (wlist_entry, sizeof (wlist_entry),context_cues_fp)) {
	if (strncmp(wlist_entry,"##",2)==0) continue;
	sscanf (wlist_entry, "%s ",current_cc);
	if (strncmp(wlist_entry,"#",1)==0) {
	  fprintf(stderr,"\n\n===========================================================\n");
	  fprintf(stderr,":\nWARNING: line assumed NOT a comment:\n");
	  fprintf(stderr,     ">>> %s <<<\n",wlist_entry);
	  fprintf(stderr,     "         '%s' will be included in the context cues list\n",current_cc);
	  fprintf(stderr,     "         (comments must start with '##')\n");
	  fprintf(stderr,"===========================================================\n\n");
	}
	
	
	if (sih_lookup(arpa_ng.vocab_ht,current_cc,&current_cc_id) == 0) {
	  quit(-1,"Error : %s in the context cues file does not appear in the vocabulary.\n",current_cc);
	}
	
	arpa_ng.context_cue[(unsigned short) current_cc_id] = 1;
	arpa_ng.no_of_ccs++;
	fprintf(stderr,"Context cue word : %s id = %d\n",current_cc,current_cc_id);
      }
      rr_iclose(context_cues_fp);
    }
  }

  /* Process commands */
  
  told_to_quit = 0;
  num_of_args = 0;

  while (!feof(stdin) && !told_to_quit) {
    printf("evallm : ");
    gets(input_string);

    if (!feof(stdin)) {
      parse_comline(input_string,&num_of_args,args);

  
      random_seed = pc_intarg(&num_of_args,args,"-seed",-1);

      generate_size = pc_intarg(&num_of_args,args,"-size",10000);

      log_base = pc_doublearg(&num_of_args,args,"-log_base",10.0);

      backoff_from_unk_inc = pc_flagarg(&num_of_args,args,
					"-backoff_from_unk_inc");
      backoff_from_ccs_inc = pc_flagarg(&num_of_args,args,
					"-backoff_from_ccs_inc");
      backoff_from_unk_exc = pc_flagarg(&num_of_args,args,
					"-backoff_from_unk_exc");
      backoff_from_ccs_exc = pc_flagarg(&num_of_args,args,
					"-backoff_from_ccs_exc");
      include_unks = pc_flagarg(&num_of_args,args,"-include_unks");
      fb_list_filename = salloc(pc_stringarg(&num_of_args,args,
					     "-backoff_from_list",""));
    
      text_stream_filename = 
	salloc(pc_stringarg(&num_of_args,args,"-text",""));
      probs_stream_filename = 
	salloc(pc_stringarg(&num_of_args,args,"-probs",""));
      annotation_filename = 
	salloc(pc_stringarg(&num_of_args,args,"-annotate",""));
      oov_filename = salloc(pc_stringarg(&num_of_args,args,"-oovs",""));


      inconsistant_parameters = 0;
    
      if (backoff_from_unk_inc && backoff_from_unk_exc) {
	fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
	fprintf(stderr,"Use only one of -backoff_from_unk_exc and -backoff_from_unk_inc\n");
	inconsistant_parameters = 1;
      }

      if (backoff_from_ccs_inc && backoff_from_ccs_exc) {
	fprintf(stderr,"Error : Cannot specify both exclusive and inclusive forced backoff.\n");
	fprintf(stderr,"Use only one of -backoff_from_ccs_exc and -backoff_from_ccs_inc\n");
	inconsistant_parameters = 1;
      }

      if (num_of_args > 0) {
      
	if (!inconsistant_parameters) {
	  if (!strcmp(args[0],"perplexity")) {
	    compute_perplexity(&ng,
			       &arpa_ng,
			       text_stream_filename,
			       probs_stream_filename,
			       annotation_filename,
			       oov_filename,
			       fb_list_filename,
			       backoff_from_unk_inc,
			       backoff_from_unk_exc,
			       backoff_from_ccs_inc,
			       backoff_from_ccs_exc,
			       arpa_lm,
			       include_unks,
			       log_base);
	  }
	  else {
	    if (!strcmp(args[0],"validate")) {

	      if (num_of_args != n) {
		fprintf(stderr,"Error : must specify %d words of context.\n",
			n-1);
	      }
	      else {
	      
		/* Assume last n-1 parameters form context */
	      
		validate(&ng,
			 &arpa_ng,
			 &(args[num_of_args-n+1]),
			 backoff_from_unk_inc,
			 backoff_from_unk_exc,
			 backoff_from_ccs_inc,
			 backoff_from_ccs_exc,
			 arpa_lm,
			 fb_list_filename);
	      }
	    }
	    else {
	      if (!strcmp(args[0],"stats")) {
		if (arpa_lm) {
		  display_arpa_stats(&arpa_ng);
		}
		else {
		  display_stats(&ng);
		}
	      }
	      else {
		if (!strcmp(args[0],"quit")) {
		  told_to_quit=1;
		}
        else if (!strcmp(args[0],"generate")) {

        if(arpa_lm)
          generate_words(NULL,&arpa_ng,generate_size,random_seed,text_stream_filename);
        else
          generate_words(&ng,NULL,generate_size,random_seed,text_stream_filename);

      }
		else {
		  if (!strcmp(args[0],"help")) {

		    printf("The user may specify one of the following commands: \n");
		    printf("\n");
		    printf(" - perplexity\n");
		    printf("\n");
		    printf("Computes the perplexity of a given text. May optionally specify words\n");
		    printf("from which to force back-off.\n");
		    printf("\n");
		    printf("Syntax: \n");
		    printf("\n");
		    printf("perplexity -text .text\n");
		    printf("         [ -probs .fprobs ]\n");
		    printf("         [ -oovs .oov_file ]\n");
		    printf("         [ -annotate .annotation_file ]         \n");
		    printf("         [ -backoff_from_unk_inc | -backoff_from_unk_exc ]\n");
		    printf("         [ -backoff_from_ccs_inc | -backoff_from_ccs_exc ] \n");
		    printf("         [ -backoff_from_list .fblist ]\n");
		    printf("         [ -include_unks ]\n");
		    printf("\n");
		    printf(" - validate\n");
		    printf("       \n");
		    printf("Calculate the sum of the probabilities of all the words in the\n");
		    printf("vocabulary given the context specified by the user.\n");
		    printf("\n");
		    printf("Syntax: \n");
		    printf("\n");
		    printf("validate [ -backoff_from_unk -backoff_from_ccs |\n");
		    printf("           -backoff_from_list .fblist ]\n");
		    printf("         [ -forced_backoff_inc | -forced_back_off_exc ]      \n");
		    printf("           word1 word2 ... word_(n-1)\n");
		    printf("\n");
		    printf("Where n is the n in n-gram. \n");
		    printf("\n");
		    printf(" - help\n");
		    printf("\n");
		    printf("Displays this help message.\n");
		    printf("\n");
		    printf("Syntax: \n");
		    printf("\n");
		    printf("help\n");
		    printf("\n");
		    printf(" - quit\n");
		    printf("\n");
		    printf("Exits the program.\n");
		    printf("\n");
		    printf("Syntax: \n");
		    printf("\n");
		    printf("quit\n");	

		  } 
	      
		  else {
		    fprintf(stderr,"Unknown command : %s\nType \'help\'\n",
			    args[0]);
		  }
		}
	      }
	    }
	  }
	}
      }
    }    
  }

  fprintf(stderr,"evallm : Done.\n");

  exit(0);
  
}

Exemple #9

0

Afficher le fichier

Fichier : idngram2lm.c Projet : 10v/cmusphinx

int main(int argc, char **argv) {

  int i,j;
  ng_t* ng;
  int verbosity;
  int mem_alloc_method; /* Method used to decide how much memory to 
			   allocate for count tables */
  int buffer_size;
  flag is_ascii;
  ngram current_ngram;
  ngram previous_ngram;
  count_t *ng_count; /* Array indicating the number of occurrances of 
			   the current 1-gram, 2-gram, ... ,n-gram 
			   Size depends on #define in general.h
			*/  
  int nlines;
  int pos_of_novelty;
  int prev_id1;
  flag contains_unks;
  int mem_alloced;

  flag displayed_oov_warning; /** Display OOV warning 
			       */

  /*  ------------------  Process command line --------------------- */

  report_version(&argc,argv);

  if (argc == 1 || pc_flagarg(&argc, argv,"-help")) {    
    /* Display help message */    
    help_message();
    exit(1);
  }

  verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY);

  /* Initialization */
  {
    ng=init_ng(
	    &argc,
	    argv,
	    verbosity
	    );
    
    mem_alloc_method = init_alloc_method(ng, &argc, argv, &buffer_size);
    
    if (!strcmp(ng->id_gram_filename,"-") && mem_alloc_method == TWO_PASSES)
      quit(-1,"Error: If idngram is read from stdin, then cannot use -calc_mem option.\n");
    
    is_ascii = set_lmformat(pc_flagarg(&argc,argv,"-ascii_input"),
			    pc_flagarg(&argc,argv,"-bin_input"),
			    ng);  

    /* Report parameters */
    report_param(verbosity,ng,
		 is_ascii, mem_alloc_method, buffer_size);

    pc_report_unk_args(&argc,argv,verbosity);

  }

  /* --------------- Read in the vocabulary -------------- */
  read_vocab(ng,verbosity);
       		     
  /* --------------- Allocate space for the table_size array --------- */
  init_ng_table_size(ng, 
		     mem_alloc_method,
		     is_ascii,
		     verbosity,
		     buffer_size
		     );

  /* ----------- Allocate memory for tree structure -------------- */

  ng->count = NULL;
  ng->count4 = NULL;
  ng->marg_counts = NULL;
  ng->marg_counts4 = NULL;
  ng->count_table = NULL;

  ng->count = (count_ind_t **) rr_malloc(sizeof(count_ind_t *)*ng->n);
  ng->count4 = (count_t **) rr_malloc(sizeof(count_t *)*ng->n);    
  ng->count_table = (count_t **) rr_malloc(sizeof(count_t *)*ng->n);

  if (ng->four_byte_counts) {
    ng->marg_counts4 = (count_t *) rr_calloc(sizeof(count_t), ng->table_sizes[0]);

  }else {
    for (i=0;i<=ng->n-1;i++) 
      ng->count_table[i] = (count_t *) rr_calloc(ng->count_table_size+1,
						sizeof(count_t));

    ng->marg_counts = (count_ind_t *) rr_calloc(sizeof(count_ind_t),ng->table_sizes[0]);
    fprintf(stderr, "table_size %d\n",ng->table_sizes[0]);
    fflush(stderr);
  }

  ng->word_id = (id__t **) rr_malloc(sizeof(id__t *)*ng->n);

  if (ng->four_byte_alphas) {
    ng->bo_weight4 = (four_byte_t **) rr_malloc(sizeof(four_byte_t *)*ng->n);
    ng->bo_weight4[0] = (four_byte_t *) rr_malloc(sizeof(four_byte_t)*
						ng->table_sizes[0]);
  }else {
    ng->bo_weight = (bo_weight_t **) rr_malloc(sizeof(bo_weight_t *)*ng->n);
    ng->bo_weight[0] = (bo_weight_t *) rr_malloc(sizeof(bo_weight_t)*
						ng->table_sizes[0]);
  }

  ng->ind = (index__t **)  rr_malloc(sizeof(index__t *)*ng->n);

  /* First table */
  if (ng->four_byte_counts) 
    ng->count4[0] = (count_t *) rr_calloc(ng->table_sizes[0],sizeof(count_t));
  else 
    ng->count[0] = (count_ind_t *) rr_calloc(ng->table_sizes[0],sizeof(count_ind_t));

  ng->uni_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t)*
					   ng->table_sizes[0]);
  ng->uni_log_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t)*
					       ng->table_sizes[0]);

  if (ng->n >=2) 
    ng->ind[0] = (index__t *) rr_calloc(ng->table_sizes[0],sizeof(index__t));

  for (i=1;i<=ng->n-2;i++) {    
    ng->word_id[i] = (id__t *) rr_malloc(sizeof(id__t)*ng->table_sizes[i]);

    if (ng->four_byte_counts) 
      ng->count4[i] = (count_t *) rr_malloc(sizeof(count_t)*ng->table_sizes[i]);
    else 
      ng->count[i] = (count_ind_t *) rr_malloc(sizeof(count_ind_t)*ng->table_sizes[i]);

    if (ng->four_byte_alphas) 
      ng->bo_weight4[i] = (four_byte_t *) rr_malloc(sizeof(four_byte_t)*ng->table_sizes[i]);
    else 
      ng->bo_weight[i] = (bo_weight_t *) rr_malloc(sizeof(bo_weight_t)*ng->table_sizes[i]);
    
    ng->ind[i] = (index__t *) rr_malloc(sizeof(index__t)*ng->table_sizes[i]);

    mem_alloced = sizeof(count_ind_t) + sizeof(bo_weight_t) + 
		sizeof(index__t) + sizeof(id__t);
    
    if (ng->four_byte_alphas) 
      mem_alloced += 4;
   
    mem_alloced *= ng->table_sizes[i];
    
    pc_message(verbosity,2,"Allocated %d bytes to table for %d-grams.\n",
	       mem_alloced,i+1);
    
  }

  ng->word_id[ng->n-1] = (id__t *) 
    rr_malloc(sizeof(id__t)*ng->table_sizes[ng->n-1]);

  if (ng->four_byte_counts) 
    ng->count4[ng->n-1] = (count_t *) rr_malloc(sizeof(count_t)*ng->table_sizes[ng->n-1]);    
  else 
    ng->count[ng->n-1] = (count_ind_t *) rr_malloc(sizeof(count_ind_t)*ng->table_sizes[ng->n-1]);

  pc_message(verbosity,2,"Allocated (%d+%d) bytes to table for %d-grams.\n",
	     ng->four_byte_counts?sizeof(count_t):sizeof(count_ind_t),
	     sizeof(id__t)*ng->table_sizes[ng->n-1],ng->n);
  
  /* Allocate memory for table for first-byte of indices */

  ng_allocate_ptr_table(ng,NULL,0);

  /* Allocate memory for alpha array */

  ng->alpha_array = (double *) rr_malloc(sizeof(double)*ng->out_of_range_alphas);
  ng->size_of_alpha_array = 0;

  /* Allocate memory for frequency of frequency information */

  ng->freq_of_freq = (fof_t **) rr_malloc(sizeof(fof_t *)*ng->n);

  NG_DISC_METH(ng)->allocate_freq_of_freq(ng);

  /* Read n-grams into the tree */
  pc_message(verbosity,2,"Processing id n-gram file.\n");
  pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");

  /* Allocate space for ngrams id arrays */

  current_ngram.id_array = (id__t *) rr_calloc(ng->n,sizeof(id__t));
  previous_ngram.id_array = (id__t *) rr_calloc(ng->n,sizeof(id__t));
  current_ngram.n = ng->n;
  previous_ngram.n = ng->n;
  
  ng->num_kgrams = (ngram_sz_t *) rr_calloc(ng->n,sizeof(ngram_sz_t));
  ng_count = (count_t *) rr_calloc(ng->n,sizeof(count_t));
  nlines = 1;
  ng->n_unigrams = 0;

  /* Process first n-gram */  
  get_ngram(ng->id_gram_fp,&current_ngram,is_ascii);
  contains_unks = ngram_chk_contains_unks(&current_ngram,ng->n);

  /* Skip over any unknown words.  They will come first, because <UNK>
     always has a word ID of zero. */
  while (ng->vocab_type == CLOSED_VOCAB && contains_unks){
    /* Stop looking if there are no more N-Grams.  Of course, this
       means training will fail, since there are no unigrams. */
    if (get_ngram(ng->id_gram_fp,&current_ngram,is_ascii) == 0)
      break;
    contains_unks = ngram_chk_contains_unks(&current_ngram,ng->n);
  }

  for (i=0;i<=ng->n-2;i++) {
    ng->ind[i][0] = new_index(0,ng->ptr_table[i],&(ng->ptr_table_size[i]),0);
    ng->word_id[i+1][0] = current_ngram.id_array[i+1];
    ng->num_kgrams[i+1]++;
    ng_count[i] = current_ngram.count;
  }

  ng_count[0] = current_ngram.count;

  NG_DISC_METH(ng)->update_freq_of_freq(ng,ng->n-1,current_ngram.count);

  store_normal_count(ng,0,current_ngram.count,ng->n-1);

  if (current_ngram.count <= ng->cutoffs[ng->n-2]) 
    ng->num_kgrams[ng->n-1]--;

  ngram_copy(&previous_ngram,&current_ngram,ng->n);

  prev_id1 = current_ngram.id_array[0];
    
  displayed_oov_warning = 0;

  while (!rr_feof(ng->id_gram_fp)) {

    if (get_ngram(ng->id_gram_fp,&current_ngram,is_ascii)) {

      if (ng->vocab_type == CLOSED_VOCAB)
	contains_unks=ngram_chk_contains_unks(&current_ngram,ng->n);
    
      if (!contains_unks || ng->vocab_type != CLOSED_VOCAB) {

	/* Test for where this ngram differs from last - do we have an
	   out-of-order ngram? */
	pos_of_novelty = ngram_find_pos_of_novelty(&current_ngram,&previous_ngram,ng->n,nlines);
    
	nlines++; 
	show_idngram_nlines(nlines, verbosity);
    
	/* Add new n-gram as soon as it is encountered */
	/* If all of the positions 2,3,...,n of the n-gram are context
	   cues then ignore the n-gram. */
    
	if (ng->n > 1) {
	  NG_DISC_METH(ng)->update_freq_of_freq(ng,ng->n-1,current_ngram.count);
	        
	  store_normal_count(ng,ng->num_kgrams[ng->n-1],current_ngram.count,ng->n-1);
	  
	  ng->word_id[ng->n-1][ng->num_kgrams[ng->n-1]] = current_ngram.id_array[ng->n-1];
	  ng->num_kgrams[ng->n-1]++;	  
	  
	  if (ng->num_kgrams[ng->n-1] >= ng->table_sizes[ng->n-1])
	    quit(-1,"\nMore than %d %d-grams needed to be stored. Rerun with a higher table size.\n",ng->table_sizes[ng->n-1],ng->n);
	}
	/* Deal with new 2,3,...,(n-1)-grams */
      
	for (i=ng->n-2;i>=MAX(1,pos_of_novelty);i--) {

	  NG_DISC_METH(ng)->update_freq_of_freq(ng,i,ng_count[i]);
	  
	  if (ng_count[i] <= ng->cutoffs[i-1]) 
	    ng->num_kgrams[i]--;
	  else
	    store_normal_count(ng,ng->num_kgrams[i]-1,ng_count[i],i);

	  ng_count[i] = current_ngram.count;
	  ng->word_id[i][ng->num_kgrams[i]] = current_ngram.id_array[i];
	  ng->ind[i][ng->num_kgrams[i]] = new_index(ng->num_kgrams[i+1]-1,
						    ng->ptr_table[i],
						    &(ng->ptr_table_size[i]),
						    ng->num_kgrams[i]);
	  ng->num_kgrams[i]++;
	
	  if (ng->num_kgrams[i] >= ng->table_sizes[i])
	    quit(-1,"More than %d %d-grams needed to be stored. Rerun with a higher table size.\n",ng->table_sizes[i],i+1);	  
	}
      
	for (i=0;i<=pos_of_novelty-1;i++) 
	  ng_count[i] += current_ngram.count;
      
	/* Deal with new 1-grams */
      
	if (pos_of_novelty == 0) {
	  if (ng->n>1) {
	    for (i = prev_id1 + 1; i <= current_ngram.id_array[0]; i++) {
	      ng->ind[0][i] = new_index(ng->num_kgrams[1]-1,
				       ng->ptr_table[0],
				       &(ng->ptr_table_size[0]),
				       i);
	    }
	    prev_id1 = current_ngram.id_array[0];
	  }

	  NG_DISC_METH(ng)->update_freq_of_freq(ng,0,ng_count[0]);

	  if (!ng->context_cue[previous_ngram.id_array[0]]) {
	    ng->n_unigrams += ng_count[0];
	    store_normal_count(ng,previous_ngram.id_array[0],ng_count[0],0);
	  }

	  store_marginal_count(ng,previous_ngram.id_array[0],ng_count[0],0);
		      
	  ng_count[0] = current_ngram.count;
	}

	if (current_ngram.count <= ng->cutoffs[ng->n-2]) 
	  ng->num_kgrams[ng->n-1]--;

	ngram_copy(&previous_ngram,&current_ngram,ng->n);

      }else {
	if (!displayed_oov_warning){
	  pc_message(verbosity,2,"Warning : id n-gram stream contains OOV's (n-grams will be ignored).\n");
	  displayed_oov_warning = 1;
	}
      }
    }
  }

  rr_iclose(ng->id_gram_fp);

  for (i=ng->n-2;i>=1;i--) {

    NG_DISC_METH(ng)->update_freq_of_freq(ng,i,ng_count[i]);

    if (ng_count[i] <= ng->cutoffs[i-1]) 
      ng->num_kgrams[i]--;
    else 
      store_normal_count(ng,ng->num_kgrams[i]-1,ng_count[i],i);
      
  }
  
  NG_DISC_METH(ng)->update_freq_of_freq(ng,0,ng_count[0]);

  if (!ng->context_cue[current_ngram.id_array[0]]) {
    ng->n_unigrams += ng_count[0];
    store_normal_count(ng,current_ngram.id_array[0],ng_count[0],0);
  }

  store_marginal_count(ng,current_ngram.id_array[0],ng_count[0],0);

  if (ng->n>1) {
    for (i=current_ngram.id_array[0]+1;i<=ng->vocab_size;i++)
      ng->ind[0][i] = new_index(ng->num_kgrams[1],
				ng->ptr_table[0],
				&(ng->ptr_table_size[0]),
				current_ngram.id_array[0]);
  }

  /* The idngram reading is completed at this point */
  pc_message(verbosity,2,"\n");

  /* Impose a minimum unigram count, if required */

  if (ng->min_unicount > 0) {

    int nchanged= 0;

    for (i=ng->first_id;i<=ng->vocab_size;i++) {
      if ((return_count(ng->four_byte_counts,
			ng->count_table[0],
			ng->count[0],
			ng->count4[0],
			i) < ng->min_unicount) && !ng->context_cue[i]) {

	/* There was a bug in V2's switch.  Look at segment for ABSOLUTE */
	NG_DISC_METH(ng)->reduce_ug_freq_of_freq(ng,i);
	ng->n_unigrams += (ng->min_unicount - ng->count[0][i]);
	store_normal_count(ng,i,ng->min_unicount,0);
	nchanged++;
      }
    }

    if (nchanged > 0) 
      pc_message(verbosity,2,
		 "Unigram counts of %d words were bumped up to %d.\n",
		 nchanged,ng->min_unicount);
  }

  /* Count zeroton information for unigrams */

  ng->freq_of_freq[0][0] = 0;
  
  for (i=ng->first_id;i<=ng->vocab_size;i++) {
    if (return_count(ng->four_byte_counts,
		     ng->count_table[0],
		     ng->count[0],
		     ng->count4[0],
		     i) == 0) {
      ng->freq_of_freq[0][0]++;
    }
  }  

  if (ng->discounting_method == GOOD_TURING) {
    for (i=0;i<=ng->n-1;i++) 
      for (j=1;j<=ng->fof_size[i];j++) 
	pc_message(verbosity,3,"fof[%d][%d] = %d\n",i,j,ng->freq_of_freq[i][j]);
  }

  pc_message(verbosity,2,"Calculating discounted counts.\n");

  NG_DISC_METH(ng)->compute_discount_aux(ng, verbosity);
     
  /* Smooth unigram distribution, to give some mass to zerotons */     
  compute_unigram(ng,verbosity);

  /* Increment Contexts if using Good-Turing discounting-> No need otherwise,
     since all values are discounted anyway. */

  if (ng->discounting_method == GOOD_TURING) {
    pc_message(verbosity,2,"Incrementing contexts...\n");  

    for (i=ng->n-1;i>=1;i--) 
      increment_context(ng,i,verbosity);      
  }

  /* Calculate back-off weights */

  pc_message(verbosity,2,"Calculating back-off weights...\n");

  for (i=1;i<=ng->n-1;i++) 
    compute_back_off(ng,i,verbosity);

  if (!ng->four_byte_alphas) 
    pc_message(verbosity,3,"Number of out of range alphas = %d\n",
	       ng->size_of_alpha_array);

  /* Write out LM */

  pc_message(verbosity,2,"Writing out language model...\n");

  if (ng->write_arpa)
    write_arpa_lm(ng,verbosity);

  if (ng->write_bin) 
    write_bin_lm(ng,verbosity);

  pc_message(verbosity,0,"idngram2lm : Done.\n");

  return 0;    
}