Пример #1
0
/*
 * Progress indicator for id n-gram reading.
 *
 * Emits a "." (at verbosity level 2) each time nlines is a multiple of
 * 20000; when nlines is also a multiple of 1000000 the dot is followed by
 * a newline.  Any other value of nlines produces no output.
 */
void show_idngram_nlines(int nlines, int verbosity)
{
  /* Nothing to report between 20000-line marks. */
  if (nlines % 20000 != 0)
    return;

  if (nlines % 1000000 != 0) {
    pc_message(verbosity,2,".");
    return;
  }

  /* Full million reached: finish the current row of dots. */
  pc_message(verbosity,2,".\n");
}
Пример #2
0
/*
 * Read the vocabulary and mark which entries are context cues.
 *
 * Creates ng->vocab_ht, loads the vocabulary through read_voc(), and
 * allocates ng->context_cue as vocab_size+1 zeroed flags.  When
 * ng->context_set is non-zero, ng->context_cues_fp is read line by line:
 * lines starting with "##" are skipped, the first whitespace-delimited
 * token of every other line is looked up in the vocabulary and, if found,
 * flagged as a context cue and counted in ng->no_of_ccs.  The cue stream is
 * closed afterwards.  Finally a warning is printed to stderr for each of
 * <s>, <p> and <art> that is in the vocabulary but not flagged as a cue.
 */
void read_vocab(ng_t* ng, int verbosity)
{
  static char *usual_cues[] = { "<s>", "<p>", "<art>" };
  unsigned int k;
  int got_word;
  vocab_sz_t test_cc_id;
  vocab_sz_t current_cc_id;
  char current_cc[200];
  char wlist_entry[1024];

  pc_message(verbosity,2,"Reading vocabulary.\n");

  /* Don't change the parameter of sih_create, because it will change
     the binary layout of the .binlm file */

  ng->vocab_ht =
    sih_create(1000,0.5,2.0,1);

  read_voc(ng->vocab_filename,verbosity,ng->vocab_ht,&ng->vocab,&(ng->vocab_size));

  /* Determine which of the vocabulary words are context cues */

  ng->no_of_ccs = 0;
  ng->context_cue = (flag *) rr_calloc(ng->vocab_size+1,sizeof(flag));

  if (ng->context_set) {
    /* This should be tied to l889 to l894 in lm_combine.c
     */
    while (fgets (wlist_entry, sizeof (wlist_entry),ng->context_cues_fp)) {
      if (strncmp(wlist_entry,"##",2)==0) continue;

      /* The field width keeps an over-long token from overrunning
         current_cc (a line may hold up to 1023 characters). */
      got_word = (sscanf (wlist_entry, "%199s",current_cc) == 1);
      warn_on_wrong_vocab_comments(wlist_entry);

      /* A line without any token must not re-use the previous cue. */
      if (!got_word) continue;

      if (sih_lookup(ng->vocab_ht,current_cc,&current_cc_id) == 0)
        pc_message(verbosity,1,"Warning : %s in the context cues file does not appear in the vocabulary.\n",current_cc);
      else {
        /* NOTE(review): the cast truncates ids above 65535 -- confirm this
           matches the id width used by the rest of the toolkit. */
        ng->context_cue[(unsigned short) current_cc_id] = 1;
        pc_message(verbosity,2,"Context cue word : %s id = %d\n",current_cc,current_cc_id);
        ng->no_of_ccs++;
      }
    }
    rr_iclose(ng->context_cues_fp);
  }

  /* Words that normally act as context cues: warn if present but unflagged. */
  for (k = 0; k < sizeof(usual_cues)/sizeof(usual_cues[0]); k++) {
    if (sih_lookup(ng->vocab_ht,usual_cues[k],&test_cc_id) != 0) {
      if (ng->context_cue[(unsigned short) test_cc_id] == 0)
        fprintf(stderr,"WARNING: %s appears as a vocabulary item, but is not labelled as a\ncontext cue.\n",usual_cues[k]);
    }
  }

}
Пример #3
0
/*
 * Update the hash table contents for one key.
 *
 * Hashes `key` against the table size and passes the key to update_chain()
 * for the selected chain.  If the hash value falls outside
 * [0, table->size) two warnings are issued at verbosity level 1 and the
 * key is ignored.
 */
void update( struct hash_table *table, char *key, int verbosity )
{
  int slot = hash( key, table->size );

  if ( slot >= 0 && slot < table->size ) {
    update_chain( table->chain[ slot ], key );
    return;
  }

  /* Hash value does not address a valid chain. */
  pc_message(verbosity,1,"WARNING : invalid hash address.\n");
  pc_message(verbosity,1,"%s ignored\n", key );
}
Пример #4
0
/**
   Write an interpolated language model to a file in ARPA format.

   Very similar to write_lms except that it uses arpa_lm_t rather than
   lm_t.  The headers of the two source files (header1, header2) are
   also written.

   The output consists of the copyright/format/header preamble, the
   n-gram counts, then for k = 1..n every k-gram with its probability
   and, for all but the highest order, its back-off weight, and finally
   the "\end\" marker.

   On allocation or file-open failure Error() is called and the
   function returns without writing anything.
 */
void write_interpolated_lm(arpa_lm_t *ng, const char* arpa_filename, const char* header1, const char* header2, int verbosity) 
{
  int i;
  int j;
  FILE* fp;
  TBROWSE br;
  id__t id[MAX_K];
  char** words;

  /* NOTE(review): `words` is allocated and released but never read in
     this function -- confirm whether it can be dropped entirely. */
  words=(char**)NewArray(MAX_K,MAX_WORD,sizeof(char));
  if (words==NULL) {
    Error ("Cannot allocate memory");
    return;
  }

  if ((fp=fopen(arpa_filename,"w"))==NULL) {
    /* Release the scratch array before bailing out so it is not leaked. */
    DeleteArray(words);
    Error ("Cannot open file to write arpa lm file.");
    return ;
  }

  /* HEADER */

  pc_message(verbosity,1,"ARPA-style %d-gram will be written to %s\n",ng->n,arpa_filename);

  write_arpa_copyright(fp,ng->n,(int) ng->vocab_size,ng->vocab[1],ng->vocab[2],ng->vocab[3]);
  write_arpa_format(fp,ng->n);
  write_arpa_headers(fp, header1, header2);
  write_arpa_num_grams(fp,NULL,ng,1);

  /* Print 1-gram, ... n-gram info. */

  for (i=0;i<=ng->n-1;i++) {
    /* Print out the (i+1)-gram */
    write_arpa_k_gram_header(fp,i+1);

    begin_browse(ng,i+1,&br);

    /* Go through the n-gram list in order */

    while (get_next_ngram(id,&br)) {
      fprintf(fp,"%.4f ",ng->probs[i][br.pos[i]-1]);
      for (j=0;j<=i;j++)
        fprintf(fp,"%s ",ng->vocab[id[j]]);

      /* Only orders below n carry a back-off weight. */
      if (i <= ng->n-2)
        fprintf(fp,"\t%.4f\n",ng->bo_weight[i][br.pos[i]-1]);
      else
        fprintf(fp,"\n");
    }
  }

  fprintf(fp,"\n\\end\\\n");

  fclose(fp);

  DeleteArray(words);
} 
Пример #5
0
/*
 * Set up ng->table_sizes according to the chosen memory allocation method.
 *
 * For n > 1:
 *   TWO_PASSES - allocate a zeroed array of n entries and let
 *                calc_mem_req() fill it in;
 *   BUFFER     - allocate n entries and let guess_mem() size the tables
 *                from buffer_size and the per-entry byte costs;
 *   SPECIFIED  - nothing is allocated here.
 * For n <= 1 a single zeroed entry is allocated.
 *
 * In every case entry 0 is finally set to vocab_size + 1.
 */
void init_ng_table_size(ng_t *ng,
                        flag mem_alloc_method,
                        flag is_ascii,
                        int verbosity,
                        int buffer_size
                        )
{
  int mid_entry_bytes;   /* cost of one entry in a non-final table */
  int last_entry_bytes;  /* cost of one entry in the final table   */

  if (ng->n <= 1) {
    ng->table_sizes = (table_size_t *) rr_calloc(1,sizeof(table_size_t));
  }
  else if (mem_alloc_method == TWO_PASSES) {
    ng->table_sizes = (table_size_t *) rr_calloc(ng->n,sizeof(table_size_t));
    pc_message(verbosity,2,"Calculating memory requirement.\n");
    calc_mem_req(ng,is_ascii);
  }
  else if (mem_alloc_method == BUFFER) {
    ng->table_sizes = (table_size_t *) rr_malloc(ng->n*sizeof(table_size_t));

    mid_entry_bytes = sizeof(count_ind_t) + sizeof(bo_weight_t) +
      sizeof(index__t) + sizeof(id__t);
    last_entry_bytes = sizeof(count_ind_t) + sizeof(id__t);

    /* Wider alphas / counts add two bytes per entry. */
    mid_entry_bytes += ng->four_byte_alphas ? 2 : 0;
    if (ng->four_byte_counts) {
      mid_entry_bytes += 2;
      last_entry_bytes += 2;
    }

    guess_mem(buffer_size,
              mid_entry_bytes,
              last_entry_bytes,
              ng->n,
              ng->table_sizes,
              verbosity);
  }
  /* SPECIFIED (and any other value): no allocation is done here. */

  ng->table_sizes[0] = ng->vocab_size+1;

}
Пример #6
0
/*
 * Load a vocabulary file into an id n-gram hash table.
 *
 * Each line of vocab_filename that does not start with "##" contributes
 * its first whitespace-delimited token as a vocabulary word; words are
 * numbered 1, 2, ... in file order and inserted into `vocabulary` at the
 * slot given by idngram_hash(word, M).  Repeated words and suspicious
 * comment lines are reported through the warn_on_* helpers.
 *
 * Quits if the number of entries reaches M.  Prints a message to stderr
 * if the final size exceeds MAX_VOCAB_SIZE.  Always returns 0.
 */
int read_vocab(char* vocab_filename, 
	       int verbosity,
	       struct idngram_hash_table* vocabulary,
	       int M
	       )
{
  FILE *vocab_file;
  int vocab_size;
  char temp_word[MAX_WORD_LENGTH];
  char temp_word2[MAX_WORD_LENGTH];

  vocab_size = 0;

  /* Make sure the token buffer is a valid string even if the very first
     data line contains no token (sscanf would leave it untouched). */
  temp_word2[0] = '\0';

  vocab_file = rr_iopen(vocab_filename);

  pc_message(verbosity,2,"Reading vocabulary... \n");

  while (fgets (temp_word, sizeof(temp_word),vocab_file)) {
    if (strncmp(temp_word,"##",2)==0) continue;

    /* temp_word2 is as large as temp_word, so the token always fits.
       NOTE(review): a line without any token leaves temp_word2 holding the
       previous word, which is then added again with a new id -- confirm
       whether other vocabulary readers rely on this numbering. */
    sscanf (temp_word, "%s ",temp_word2);

    /* Check for repeated words in the vocabulary */    
    if (index2(vocabulary,temp_word2) != 0)
      warn_on_repeated_words(temp_word2);

    warn_on_wrong_vocab_comments(temp_word);
    vocab_size++;

    add_to_idngram_hashtable(vocabulary,idngram_hash(temp_word2,M),temp_word2,vocab_size);
    if(vocab_size == M){
      quit(-1, "Number of entries reached the size of the hash.  Run the program again with a larger hash size -hash \n");
    }
  }

  if (vocab_size > MAX_VOCAB_SIZE)    
    fprintf(stderr,"text2idngram : vocab_size %d\n is larger than %d\n",vocab_size,MAX_VOCAB_SIZE);

  return 0;
}
Пример #7
0
/*
 * Count word frequencies in a text stream.
 *
 * Reads whitespace-delimited words from infp, records each one in a hash
 * table sized to the nearest prime of init_nwords, and finally prints the
 * table to outfp.  Words longer than the read limit are split into several
 * pieces; a warning is issued (verbosity level 1) whenever a piece fills
 * the limit.  Quits if reading stops for a reason other than end of file.
 * Returns 0.
 */
int text2wfreq_impl(FILE* infp, FILE* outfp, int init_nwords, int verbosity)
{
  int hash_size, scanrc;
  int max_chars;
  struct hash_table vocab;
  char word[MAX_STRING_LENGTH];
  char scan_fmt[32];

  /* Read at most 500 characters per word, as before, but never more than
     the buffer can hold (one byte is needed for the terminator). */
  max_chars = MAX_STRING_LENGTH - 1;
  if (max_chars > 500)
    max_chars = 500;
  snprintf(scan_fmt, sizeof(scan_fmt), "%%%ds", max_chars);

  hash_size = nearest_prime( init_nwords );
  new_hashtable( &vocab, hash_size );
  while( (scanrc = fscanf(infp, scan_fmt, word )) == 1 ) {
    /* A piece that fills the whole field width was (or may have been)
       cut off; the old test against MAX_STRING_LENGTH could never be true
       for a correctly filled buffer. */
    if ( strlen( word ) >= (size_t) max_chars ) {
      pc_message(verbosity,1,"text2wfreq : WARNING: word too long, will be split: %s...\n",word);
    }
    /* A successful %s conversion always yields a non-empty word. */
    update( &vocab, word ,verbosity);
  }
  if ( scanrc != EOF ) {
    quit(-1,"Error reading input\n");
  }

  print( outfp, &vocab );
  return 0;
}
Пример #8
0
/*
 * wfreq2vocab entry point: build a vocabulary file from a word frequency
 * stream.
 *
 * Reads "word count" pairs from stdin, then writes to stdout either the
 * -top N most frequent words or all words whose count exceeds -gt N
 * (default: the 20000 most frequent), sorted alphabetically and preceded by
 * a "##" comment header.  At most -records entries can be read; exceeding
 * that limit is reported as an error instead of overrunning the record
 * array.
 */
void main(int argc, char *argv[]) {

  int verbosity;
  int vocab_size;
  int cutoff;
  int num_recs;
  int current_rec;
  int num_above_threshold;
  int num_to_output;
  int scanned_count;
  int i;
  word_rec *records;
  char temp_word[750];
  flag gt_set;
  flag top_set;

  /* Process command line */

  report_version(&argc,argv);

  if (pc_flagarg( &argc, argv,"-help")) {
    fprintf(stderr,"wfreq2vocab : Generate a vocabulary file from a word frequency file.\n");
    fprintf(stderr,"Usage : wfreq2vocab [ -top 20000 | -gt 10]\n");
    fprintf(stderr,"                    [ -records %d ]\n",DEFAULT_MAX_RECORDS);
    fprintf(stderr,"                    [ -verbosity %d]\n",DEFAULT_VERBOSITY);
    fprintf(stderr,"                    < .wfreq > .vocab\n");
    exit(1);
  }

  cutoff = pc_intarg( &argc, argv, "-gt",-1);
  vocab_size = pc_intarg(&argc, argv, "-top",-1);
  num_recs = pc_intarg(&argc, argv, "-records",DEFAULT_MAX_RECORDS);
  verbosity = pc_intarg(&argc, argv, "-verbosity",DEFAULT_VERBOSITY);

  pc_report_unk_args(&argc,argv,verbosity);

  /* -1 is the "option not given" sentinel for both -gt and -top. */
  if (cutoff != -1) {
    gt_set = 1;
  }
  else {
    gt_set = 0;
    cutoff = 0;
  }

  if (vocab_size != -1) {
    top_set = 1;
  }
  else {
    top_set = 0;
    vocab_size = 0;
  }

  if (gt_set && top_set) {
    quit(-1,"wfreq2vocab : Error : Can't use both the -top and the -gt options.\n");
  }

  if (!gt_set && !top_set) {
    vocab_size = 20000;
  }

  if (gt_set) {
    pc_message(verbosity,2,"wfreq2vocab : Will generate a vocabulary containing all words which\n              occurred more that %d times. Reading wfreq stream from stdin...\n",cutoff);
  }
  else {
    pc_message(verbosity,2,"wfreq2vocab : Will generate a vocabulary containing the most\n              frequent %d words. Reading wfreq stream from stdin...\n",vocab_size);
  }

  records = (word_rec *) rr_malloc(sizeof(word_rec)*num_recs);

  current_rec = 0;
  num_above_threshold = 0;

  while (!rr_feof(stdin)) {

    /* The field width keeps an over-long word from overrunning temp_word;
       the count is read into a local so that nothing is written past the
       end of `records` before the capacity check below. */
    if (scanf("%749s %d",temp_word,&scanned_count) != 2) {
      if (!rr_feof(stdin)) {
        quit(-1,"Error reading unigram counts from standard input.\n");
      }
    }
    else {
      if (current_rec >= num_recs) {
        quit(-1,"wfreq2vocab : Error : More than %d records in the input. Rerun with a higher -records value.\n",num_recs);
      }
      records[current_rec].count = scanned_count;
      records[current_rec].word = salloc(temp_word);
      if (gt_set && records[current_rec].count > cutoff) {
        num_above_threshold++;
      }
      current_rec++;
    }
  }

  /* Sort records in descending order of count */

  qsort((void*) records,(size_t) current_rec, sizeof(word_rec),sort_by_count);

  if (gt_set) {
    num_to_output = num_above_threshold;
  }
  else {
    num_to_output = vocab_size;
  }

  /* Never output more words than were actually read. */
  if (current_rec<num_to_output) {
    num_to_output = current_rec;
  }

  /* Now sort the relevant records alphabetically */

  qsort((void*) records,(size_t) num_to_output, sizeof(word_rec),sort_alpha);

  if (gt_set) {
    pc_message(verbosity,2,"Size of vocabulary = %d\n",num_to_output);
  }

  if (num_to_output>65535) {
    pc_message(verbosity,1,"Warning : Vocab size exceeds 65535. This will cause problems with \nother tools, since word id's are stored in 2 bytes.\n");
  }

  /* Print the vocab to stdout */

  printf("## Vocab generated by v2 of the CMU-Cambridge Statistcal\n");
  printf("## Language Modeling toolkit.\n");
  printf("##\n");
  printf("## Includes %d words ",num_to_output);
  printf("##\n");

  for (i=0;i<=num_to_output-1;i++) {
    printf("%s\n",records[i].word);
  }

  pc_message(verbosity,0,"wfreq2vocab : Done.\n");

  exit(0);

}  
Пример #9
0
void main(int argc, char *argv[]) {

  int i,j;

  char *vocab_filename;
  FILE *tempfile;
  char tempfiles_directory[1000];
  int vocab_size;
  FILE *vocab_file;

  int verbosity;

  int buffer_size;
  int position_in_buffer;
  int number_of_tempfiles;
  int max_files;
  int fof_size;

  unsigned short *buffer;
  unsigned short *placeholder;
  unsigned short *temp_ngram;
  int temp_count;
  
  char temp_word[500];
  char temp_word2[500];

  char *temp_file_root;
  char *temp_file_ext;
  char *host_name;
  int proc_id;
  struct utsname uname_info;

  flag write_ascii;

  /* Vocab hash table things */

  struct hash_table vocabulary;
  unsigned long hash_size;
  unsigned long M;

  tempfile = NULL; /* Just to prevent compilation warnings. */

  report_version(&argc,argv);

  verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);

  /* Process command line */
  
  if (pc_flagarg( &argc, argv,"-help") || argc==1) {
    fprintf(stderr,"text2idngram - Convert a text stream to an id n-gram stream.\n");
    fprintf(stderr,"Usage : text2idngram  -vocab .vocab \n");
    fprintf(stderr,"                    [ -buffer 100 ]\n");
    fprintf(stderr,"                    [ -hash %d ]\n",DEFAULT_HASH_SIZE);
    fprintf(stderr,"                    [ -temp %s ]\n",DEFAULT_TEMP);
    fprintf(stderr,"                    [ -files %d ]\n",DEFAULT_MAX_FILES);
    fprintf(stderr,"                    [ -gzip | -compress ]\n");
    fprintf(stderr,"                    [ -verbosity %d ]\n",
	    DEFAULT_VERBOSITY);
    fprintf(stderr,"                    [ -n 3 ]\n");
    fprintf(stderr,"                    [ -write_ascii ]\n");
    fprintf(stderr,"                    [ -fof_size 10 ]\n");
    exit(1);
  }

  pc_message(verbosity,2,"text2idngram\n");

  n = pc_intarg( &argc, argv, "-n",DEFAULT_N);

  placeholder = (unsigned short *) rr_malloc(sizeof(unsigned short)*n);
  temp_ngram = (unsigned short *) rr_malloc(sizeof(unsigned short)*n);
  hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE);
  buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM);

  write_ascii = pc_flagarg(&argc,argv,"-write_ascii");

  fof_size = pc_intarg(&argc,argv,"-fof_size",10);

  max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES);

  vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" ));
  
  if (!strcmp("",vocab_filename)) {
    quit(-1,"text2idngram : Error : Must specify a vocabulary file.\n");
  }
    
  strcpy(tempfiles_directory,pc_stringarg( &argc, argv, "-temp", 
					   DEFAULT_TEMP));

  if (pc_flagarg(&argc,argv,"-compress")) {
    temp_file_ext = salloc(".Z");
  }
  else {
    if (pc_flagarg(&argc,argv,"-gzip")) {
      temp_file_ext = salloc(".gz");
    }
    else {
      temp_file_ext = salloc("");
    }
  }

  uname(&uname_info);

  host_name = salloc(uname_info.nodename);

  proc_id = getpid();

  sprintf(temp_word,"%s%s.%d.",TEMP_FILE_ROOT,host_name,proc_id);

  temp_file_root = salloc(temp_word);

  pc_report_unk_args(&argc,argv,verbosity);
  
  /* If the last charactor in the directory name isn't a / then add one. */
  
  if (tempfiles_directory[strlen(tempfiles_directory)-1] != '/') {
    strcat(tempfiles_directory,"/");
  }
  
  pc_message(verbosity,2,"Vocab                  : %s\n",vocab_filename);
  pc_message(verbosity,2,"N-gram buffer size     : %d\n",buffer_size);
  pc_message(verbosity,2,"Hash table size        : %d\n",hash_size);
  pc_message(verbosity,2,"Temp directory         : %s\n",tempfiles_directory);
  pc_message(verbosity,2,"Max open files         : %d\n",max_files);
  pc_message(verbosity,2,"FOF size               : %d\n",fof_size);  
  pc_message(verbosity,2,"n                      : %d\n",n);

  buffer_size *= (1000000/(sizeof(unsigned short)*n));

  /* Allocate memory for hash table */

  fprintf(stderr,"Initialising hash table...\n");

  M = nearest_prime(hash_size);

  new_hashtable(&vocabulary,M);

  /* Read in the vocabulary */

  vocab_size = 0;

  vocab_file = rr_iopen(vocab_filename);

  pc_message(verbosity,2,"Reading vocabulary...\n");

  while (fgets (temp_word, sizeof(temp_word),vocab_file)) {
    if (strncmp(temp_word,"##",2)==0) continue;
    sscanf (temp_word, "%s ",temp_word2);

    /* Check for repeated words in the vocabulary */

    if (index2(&vocabulary,temp_word2) != 0) {
      fprintf(stderr,"======================================================\n");
      fprintf(stderr,"WARNING: word %s is repeated in the vocabulary.\n",temp_word);
      fprintf(stderr,"=======================================================\n");
    }
    if (strncmp(temp_word,"#",1)==0) {
      fprintf(stderr,"\n\n===========================================================\n");
      fprintf(stderr,":\nWARNING: line assumed NOT a comment:\n");
      fprintf(stderr,     ">>> %s <<<\n",temp_word);
      fprintf(stderr,     "         '%s' will be included in the vocabulary.\n",temp_word2);
      fprintf(stderr,     "         (comments must start with '##')\n");
      fprintf(stderr,"===========================================================\n\n");
    }
    vocab_size++;
    add_to_hashtable(&vocabulary,hash(temp_word2,M),temp_word2,vocab_size);
  }

  if (vocab_size > MAX_VOCAB_SIZE) {
    quit(-1,"text2idngram : Error : Vocabulary size exceeds maximum.\n");
  }   
  
  pc_message(verbosity,2,"Allocating memory for the n-gram buffer...\n");

  buffer=(unsigned short*) rr_malloc(n*(buffer_size+1)*sizeof(unsigned short));

  number_of_tempfiles = 0;

  /* Read text into buffer */

  /* Read in the first ngram */

  position_in_buffer = 0;

  for (i=0;i<=n-1;i++) {
    get_word(stdin,temp_word);
    add_to_buffer(index2(&vocabulary,temp_word),0,i,buffer);
  }

  while (!rr_feof(stdin)) {

    /* Fill up the buffer */

    pc_message(verbosity,2,"Reading text into the n-gram buffer...\n");
    pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");
    while ((position_in_buffer<buffer_size) && (!rr_feof(stdin))) {
      position_in_buffer++;
      if (position_in_buffer % 20000 == 0) {
	if (position_in_buffer % 1000000 == 0) {
	  pc_message(verbosity,2,".\n");
	}
	else {
	  pc_message(verbosity,2,".");
	}
      }
      for (i=1;i<=n-1;i++) {
	add_to_buffer(buffer_contents(position_in_buffer-1,i,buffer),
		      position_in_buffer,i-1,buffer);
      }
      if (get_word(stdin,temp_word) == 1) {
	add_to_buffer(index2(&vocabulary,temp_word),position_in_buffer,
		      n-1,buffer);
      }
    }

    for (i=0;i<=n-1;i++) {
      placeholder[i] = buffer_contents(position_in_buffer,i,buffer);
    }

    /* Sort buffer */
    
    pc_message(verbosity,2,"\nSorting n-grams...\n");
    
    qsort((void*) buffer,(size_t) position_in_buffer,
	  n*sizeof(unsigned short),compare_ngrams);

    /* Output the buffer to temporary BINARY file */
    
    number_of_tempfiles++;

    sprintf(temp_word,"%s%s%hu%s",tempfiles_directory,temp_file_root,
	    number_of_tempfiles,temp_file_ext);

    pc_message(verbosity,2,"Writing sorted n-grams to temporary file %s\n",
	       temp_word);

    tempfile = rr_oopen(temp_word);

    for (i=0;i<=n-1;i++) {
      temp_ngram[i] = buffer_contents(0,i,buffer);
      if (temp_ngram[i] > MAX_VOCAB_SIZE) {
	quit(-1,"Invalid trigram in buffer.\nAborting");

      }
    }
    temp_count = 1;

    for (i=1;i<=position_in_buffer;i++) {
 
      if (!compare_ngrams(temp_ngram,&buffer[i*n])) {
	temp_count++;
      }
      else {
	for (j=0;j<=n-1;j++) {
	  rr_fwrite(&temp_ngram[j],sizeof(unsigned short),1,
		    tempfile,"temporary n-gram ids");
	  temp_ngram[j] = buffer_contents(i,j,buffer);
	}
	rr_fwrite(&temp_count,sizeof(int),1,tempfile,
		  "temporary n-gram counts");
	temp_count = 1;
      }
    }
    
    rr_oclose(tempfile);

    for (i=0;i<=n-1;i++) {
      add_to_buffer(placeholder[i],0,i,buffer);
    }

    position_in_buffer = 0;

  }

  /* Merge the temporary files, and output the result to standard output */

  pc_message(verbosity,2,"Merging temporary files...\n");
  
  merge_tempfiles(1,
		  number_of_tempfiles,
		  temp_file_root,
		  temp_file_ext,
		  max_files,
		  tempfiles_directory,
		  stdout,
		  write_ascii,
		  fof_size); 

  pc_message(verbosity,0,"text2idngram : Done.\n");

  exit(0);
  
}
Пример #10
0
/*
 * idngram2lm entry point: build a language model from an id n-gram stream.
 *
 * Outline of the steps performed below:
 *   1. parse the command line and initialise the ng_t structure;
 *   2. read the vocabulary and size the count tables;
 *   3. allocate the count, word-id, back-off weight and index tables;
 *   4. read the id n-grams one by one, storing counts for every order and
 *      updating the frequency-of-frequency information;
 *   5. optionally enforce a minimum unigram count;
 *   6. compute discounts, the unigram distribution and back-off weights;
 *   7. write the model in ARPA and/or binary format.
 *
 * Returns 0 on completion; exits with status 1 after printing help.
 */
int main(int argc, char **argv) {

  int i,j;
  ng_t* ng;
  int verbosity;
  int mem_alloc_method; /* Method used to decide how much memory to 
			   allocate for count tables */
  int buffer_size;
  flag is_ascii;
  ngram current_ngram;
  ngram previous_ngram;
  count_t *ng_count; /* Array indicating the number of occurrances of 
			   the current 1-gram, 2-gram, ... ,n-gram 
			   Size depends on #define in general.h
			*/  
  int nlines;
  int pos_of_novelty;
  int prev_id1;
  flag contains_unks;
  int mem_alloced;

  flag displayed_oov_warning; /** Display OOV warning 
			       */

  /*  ------------------  Process command line --------------------- */

  report_version(&argc,argv);

  if (argc == 1 || pc_flagarg(&argc, argv,"-help")) {    
    /* Display help message */    
    help_message();
    exit(1);
  }

  verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY);

  /* Initialization */
  {
    ng=init_ng(
	    &argc,
	    argv,
	    verbosity
	    );
    
    mem_alloc_method = init_alloc_method(ng, &argc, argv, &buffer_size);
    
    /* The two-pass method cannot be combined with input from stdin. */
    if (!strcmp(ng->id_gram_filename,"-") && mem_alloc_method == TWO_PASSES)
      quit(-1,"Error: If idngram is read from stdin, then cannot use -calc_mem option.\n");
    
    is_ascii = set_lmformat(pc_flagarg(&argc,argv,"-ascii_input"),
			    pc_flagarg(&argc,argv,"-bin_input"),
			    ng);  

    /* Report parameters */
    report_param(verbosity,ng,
		 is_ascii, mem_alloc_method, buffer_size);

    pc_report_unk_args(&argc,argv,verbosity);

  }

  /* --------------- Read in the vocabulary -------------- */
  read_vocab(ng,verbosity);
       		     
  /* --------------- Allocate space for the table_size array --------- */
  init_ng_table_size(ng, 
		     mem_alloc_method,
		     is_ascii,
		     verbosity,
		     buffer_size
		     );

  /* ----------- Allocate memory for tree structure -------------- */

  ng->count = NULL;
  ng->count4 = NULL;
  ng->marg_counts = NULL;
  ng->marg_counts4 = NULL;
  ng->count_table = NULL;

  ng->count = (count_ind_t **) rr_malloc(sizeof(count_ind_t *)*ng->n);
  ng->count4 = (count_t **) rr_malloc(sizeof(count_t *)*ng->n);    
  ng->count_table = (count_t **) rr_malloc(sizeof(count_t *)*ng->n);

  /* Four-byte counts use marg_counts4; otherwise per-order count tables
     and marg_counts are allocated. */
  if (ng->four_byte_counts) {
    ng->marg_counts4 = (count_t *) rr_calloc(sizeof(count_t), ng->table_sizes[0]);

  }else {
    for (i=0;i<=ng->n-1;i++) 
      ng->count_table[i] = (count_t *) rr_calloc(ng->count_table_size+1,
						sizeof(count_t));

    ng->marg_counts = (count_ind_t *) rr_calloc(sizeof(count_ind_t),ng->table_sizes[0]);
    fprintf(stderr, "table_size %d\n",ng->table_sizes[0]);
    fflush(stderr);
  }

  ng->word_id = (id__t **) rr_malloc(sizeof(id__t *)*ng->n);

  /* Back-off weights are kept in bo_weight4 or bo_weight depending on
     four_byte_alphas; only one of the two is allocated. */
  if (ng->four_byte_alphas) {
    ng->bo_weight4 = (four_byte_t **) rr_malloc(sizeof(four_byte_t *)*ng->n);
    ng->bo_weight4[0] = (four_byte_t *) rr_malloc(sizeof(four_byte_t)*
						ng->table_sizes[0]);
  }else {
    ng->bo_weight = (bo_weight_t **) rr_malloc(sizeof(bo_weight_t *)*ng->n);
    ng->bo_weight[0] = (bo_weight_t *) rr_malloc(sizeof(bo_weight_t)*
						ng->table_sizes[0]);
  }

  ng->ind = (index__t **)  rr_malloc(sizeof(index__t *)*ng->n);

  /* First table */
  if (ng->four_byte_counts) 
    ng->count4[0] = (count_t *) rr_calloc(ng->table_sizes[0],sizeof(count_t));
  else 
    ng->count[0] = (count_ind_t *) rr_calloc(ng->table_sizes[0],sizeof(count_ind_t));

  ng->uni_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t)*
					   ng->table_sizes[0]);
  ng->uni_log_probs = (uni_probs_t *) rr_malloc(sizeof(uni_probs_t)*
					       ng->table_sizes[0]);

  if (ng->n >=2) 
    ng->ind[0] = (index__t *) rr_calloc(ng->table_sizes[0],sizeof(index__t));

  /* Middle tables (2-grams up to (n-1)-grams): ids, counts, back-off
     weights and indices. */
  for (i=1;i<=ng->n-2;i++) {    
    ng->word_id[i] = (id__t *) rr_malloc(sizeof(id__t)*ng->table_sizes[i]);

    if (ng->four_byte_counts) 
      ng->count4[i] = (count_t *) rr_malloc(sizeof(count_t)*ng->table_sizes[i]);
    else 
      ng->count[i] = (count_ind_t *) rr_malloc(sizeof(count_ind_t)*ng->table_sizes[i]);

    if (ng->four_byte_alphas) 
      ng->bo_weight4[i] = (four_byte_t *) rr_malloc(sizeof(four_byte_t)*ng->table_sizes[i]);
    else 
      ng->bo_weight[i] = (bo_weight_t *) rr_malloc(sizeof(bo_weight_t)*ng->table_sizes[i]);
    
    ng->ind[i] = (index__t *) rr_malloc(sizeof(index__t)*ng->table_sizes[i]);

    /* mem_alloced is only used for the progress message below. */
    mem_alloced = sizeof(count_ind_t) + sizeof(bo_weight_t) + 
		sizeof(index__t) + sizeof(id__t);
    
    if (ng->four_byte_alphas) 
      mem_alloced += 4;
   
    mem_alloced *= ng->table_sizes[i];
    
    pc_message(verbosity,2,"Allocated %d bytes to table for %d-grams.\n",
	       mem_alloced,i+1);
    
  }

  /* Final table (n-grams): ids and counts only. */
  ng->word_id[ng->n-1] = (id__t *) 
    rr_malloc(sizeof(id__t)*ng->table_sizes[ng->n-1]);

  if (ng->four_byte_counts) 
    ng->count4[ng->n-1] = (count_t *) rr_malloc(sizeof(count_t)*ng->table_sizes[ng->n-1]);    
  else 
    ng->count[ng->n-1] = (count_ind_t *) rr_malloc(sizeof(count_ind_t)*ng->table_sizes[ng->n-1]);

  /* NOTE(review): the first two arguments involve sizeof (size_t) but are
     printed with %d -- confirm the format matches on the target platform. */
  pc_message(verbosity,2,"Allocated (%d+%d) bytes to table for %d-grams.\n",
	     ng->four_byte_counts?sizeof(count_t):sizeof(count_ind_t),
	     sizeof(id__t)*ng->table_sizes[ng->n-1],ng->n);
  
  /* Allocate memory for table for first-byte of indices */

  ng_allocate_ptr_table(ng,NULL,0);

  /* Allocate memory for alpha array */

  ng->alpha_array = (double *) rr_malloc(sizeof(double)*ng->out_of_range_alphas);
  ng->size_of_alpha_array = 0;

  /* Allocate memory for frequency of frequency information */

  ng->freq_of_freq = (fof_t **) rr_malloc(sizeof(fof_t *)*ng->n);

  NG_DISC_METH(ng)->allocate_freq_of_freq(ng);

  /* Read n-grams into the tree */
  pc_message(verbosity,2,"Processing id n-gram file.\n");
  pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");

  /* Allocate space for ngrams id arrays */

  current_ngram.id_array = (id__t *) rr_calloc(ng->n,sizeof(id__t));
  previous_ngram.id_array = (id__t *) rr_calloc(ng->n,sizeof(id__t));
  current_ngram.n = ng->n;
  previous_ngram.n = ng->n;
  
  ng->num_kgrams = (ngram_sz_t *) rr_calloc(ng->n,sizeof(ngram_sz_t));
  ng_count = (count_t *) rr_calloc(ng->n,sizeof(count_t));
  nlines = 1;
  ng->n_unigrams = 0;

  /* Process first n-gram */  
  get_ngram(ng->id_gram_fp,&current_ngram,is_ascii);
  contains_unks = ngram_chk_contains_unks(&current_ngram,ng->n);

  /* Skip over any unknown words.  They will come first, because <UNK>
     always has a word ID of zero. */
  while (ng->vocab_type == CLOSED_VOCAB && contains_unks){
    /* Stop looking if there are no more N-Grams.  Of course, this
       means training will fail, since there are no unigrams. */
    if (get_ngram(ng->id_gram_fp,&current_ngram,is_ascii) == 0)
      break;
    contains_unks = ngram_chk_contains_unks(&current_ngram,ng->n);
  }

  /* Seed every table with the first n-gram: entry 0 of each order. */
  for (i=0;i<=ng->n-2;i++) {
    ng->ind[i][0] = new_index(0,ng->ptr_table[i],&(ng->ptr_table_size[i]),0);
    ng->word_id[i+1][0] = current_ngram.id_array[i+1];
    ng->num_kgrams[i+1]++;
    ng_count[i] = current_ngram.count;
  }

  ng_count[0] = current_ngram.count;

  NG_DISC_METH(ng)->update_freq_of_freq(ng,ng->n-1,current_ngram.count);

  store_normal_count(ng,0,current_ngram.count,ng->n-1);

  /* NOTE(review): when ng->n == 1 the index ng->n-2 is -1 here (and in the
     matching test inside the loop below) -- confirm unigram models never
     reach this code or that cutoffs is laid out to allow it. */
  if (current_ngram.count <= ng->cutoffs[ng->n-2]) 
    ng->num_kgrams[ng->n-1]--;

  ngram_copy(&previous_ngram,&current_ngram,ng->n);

  prev_id1 = current_ngram.id_array[0];
    
  displayed_oov_warning = 0;

  /* Main loop: one iteration per id n-gram read from the stream. */
  while (!rr_feof(ng->id_gram_fp)) {

    if (get_ngram(ng->id_gram_fp,&current_ngram,is_ascii)) {

      if (ng->vocab_type == CLOSED_VOCAB)
	contains_unks=ngram_chk_contains_unks(&current_ngram,ng->n);
    
      if (!contains_unks || ng->vocab_type != CLOSED_VOCAB) {

	/* Test for where this ngram differs from last - do we have an
	   out-of-order ngram? */
	pos_of_novelty = ngram_find_pos_of_novelty(&current_ngram,&previous_ngram,ng->n,nlines);
    
	nlines++; 
	show_idngram_nlines(nlines, verbosity);
    
	/* Add new n-gram as soon as it is encountered */
	/* If all of the positions 2,3,...,n of the n-gram are context
	   cues then ignore the n-gram. */
    
	if (ng->n > 1) {
	  NG_DISC_METH(ng)->update_freq_of_freq(ng,ng->n-1,current_ngram.count);
	        
	  store_normal_count(ng,ng->num_kgrams[ng->n-1],current_ngram.count,ng->n-1);
	  
	  ng->word_id[ng->n-1][ng->num_kgrams[ng->n-1]] = current_ngram.id_array[ng->n-1];
	  ng->num_kgrams[ng->n-1]++;	  
	  
	  if (ng->num_kgrams[ng->n-1] >= ng->table_sizes[ng->n-1])
	    quit(-1,"\nMore than %d %d-grams needed to be stored. Rerun with a higher table size.\n",ng->table_sizes[ng->n-1],ng->n);
	}
	/* Deal with new 2,3,...,(n-1)-grams */
      
	/* For each order at or above the point of novelty the previous
	   k-gram is complete: finalise its count, then start the new one. */
	for (i=ng->n-2;i>=MAX(1,pos_of_novelty);i--) {

	  NG_DISC_METH(ng)->update_freq_of_freq(ng,i,ng_count[i]);
	  
	  /* A count at or below the cutoff drops the entry again. */
	  if (ng_count[i] <= ng->cutoffs[i-1]) 
	    ng->num_kgrams[i]--;
	  else
	    store_normal_count(ng,ng->num_kgrams[i]-1,ng_count[i],i);

	  ng_count[i] = current_ngram.count;
	  ng->word_id[i][ng->num_kgrams[i]] = current_ngram.id_array[i];
	  ng->ind[i][ng->num_kgrams[i]] = new_index(ng->num_kgrams[i+1]-1,
						    ng->ptr_table[i],
						    &(ng->ptr_table_size[i]),
						    ng->num_kgrams[i]);
	  ng->num_kgrams[i]++;
	
	  if (ng->num_kgrams[i] >= ng->table_sizes[i])
	    quit(-1,"More than %d %d-grams needed to be stored. Rerun with a higher table size.\n",ng->table_sizes[i],i+1);	  
	}
      
	/* Orders below the point of novelty are unchanged prefixes:
	   accumulate the new count into them. */
	for (i=0;i<=pos_of_novelty-1;i++) 
	  ng_count[i] += current_ngram.count;
      
	/* Deal with new 1-grams */
      
	if (pos_of_novelty == 0) {
	  if (ng->n>1) {
	    /* Set the index for every word id from the one after the
	       previous first word up to the new first word. */
	    for (i = prev_id1 + 1; i <= current_ngram.id_array[0]; i++) {
	      ng->ind[0][i] = new_index(ng->num_kgrams[1]-1,
				       ng->ptr_table[0],
				       &(ng->ptr_table_size[0]),
				       i);
	    }
	    prev_id1 = current_ngram.id_array[0];
	  }

	  NG_DISC_METH(ng)->update_freq_of_freq(ng,0,ng_count[0]);

	  /* Context cues do not contribute to the unigram total. */
	  if (!ng->context_cue[previous_ngram.id_array[0]]) {
	    ng->n_unigrams += ng_count[0];
	    store_normal_count(ng,previous_ngram.id_array[0],ng_count[0],0);
	  }

	  store_marginal_count(ng,previous_ngram.id_array[0],ng_count[0],0);
		      
	  ng_count[0] = current_ngram.count;
	}

	if (current_ngram.count <= ng->cutoffs[ng->n-2]) 
	  ng->num_kgrams[ng->n-1]--;

	ngram_copy(&previous_ngram,&current_ngram,ng->n);

      }else {
	/* Closed vocabulary and the n-gram contains an unknown word:
	   skip it, warning only once. */
	if (!displayed_oov_warning){
	  pc_message(verbosity,2,"Warning : id n-gram stream contains OOV's (n-grams will be ignored).\n");
	  displayed_oov_warning = 1;
	}
      }
    }
  }

  rr_iclose(ng->id_gram_fp);

  /* Finalise the counts of the k-grams that were still open when the
     input ended. */
  for (i=ng->n-2;i>=1;i--) {

    NG_DISC_METH(ng)->update_freq_of_freq(ng,i,ng_count[i]);

    if (ng_count[i] <= ng->cutoffs[i-1]) 
      ng->num_kgrams[i]--;
    else 
      store_normal_count(ng,ng->num_kgrams[i]-1,ng_count[i],i);
      
  }
  
  NG_DISC_METH(ng)->update_freq_of_freq(ng,0,ng_count[0]);

  if (!ng->context_cue[current_ngram.id_array[0]]) {
    ng->n_unigrams += ng_count[0];
    store_normal_count(ng,current_ngram.id_array[0],ng_count[0],0);
  }

  store_marginal_count(ng,current_ngram.id_array[0],ng_count[0],0);

  /* Set the index entries for all word ids after the last first word. */
  if (ng->n>1) {
    for (i=current_ngram.id_array[0]+1;i<=ng->vocab_size;i++)
      ng->ind[0][i] = new_index(ng->num_kgrams[1],
				ng->ptr_table[0],
				&(ng->ptr_table_size[0]),
				current_ngram.id_array[0]);
  }

  /* The idngram reading is completed at this point */
  pc_message(verbosity,2,"\n");

  /* Impose a minimum unigram count, if required */

  if (ng->min_unicount > 0) {

    int nchanged= 0;

    for (i=ng->first_id;i<=ng->vocab_size;i++) {
      if ((return_count(ng->four_byte_counts,
			ng->count_table[0],
			ng->count[0],
			ng->count4[0],
			i) < ng->min_unicount) && !ng->context_cue[i]) {

	/* There was a bug in V2's switch.  Look at segment for ABSOLUTE */
	NG_DISC_METH(ng)->reduce_ug_freq_of_freq(ng,i);
	/* NOTE(review): this reads ng->count[0][i] directly, whereas the
	   test above goes through return_count() -- confirm this is right
	   when four_byte_counts is set (count[0] is not allocated then). */
	ng->n_unigrams += (ng->min_unicount - ng->count[0][i]);
	store_normal_count(ng,i,ng->min_unicount,0);
	nchanged++;
      }
    }

    if (nchanged > 0) 
      pc_message(verbosity,2,
		 "Unigram counts of %d words were bumped up to %d.\n",
		 nchanged,ng->min_unicount);
  }

  /* Count zeroton information for unigrams */

  ng->freq_of_freq[0][0] = 0;
  
  for (i=ng->first_id;i<=ng->vocab_size;i++) {
    if (return_count(ng->four_byte_counts,
		     ng->count_table[0],
		     ng->count[0],
		     ng->count4[0],
		     i) == 0) {
      ng->freq_of_freq[0][0]++;
    }
  }  

  /* Dump the frequency-of-frequency tables at verbosity level 3. */
  if (ng->discounting_method == GOOD_TURING) {
    for (i=0;i<=ng->n-1;i++) 
      for (j=1;j<=ng->fof_size[i];j++) 
	pc_message(verbosity,3,"fof[%d][%d] = %d\n",i,j,ng->freq_of_freq[i][j]);
  }

  pc_message(verbosity,2,"Calculating discounted counts.\n");

  NG_DISC_METH(ng)->compute_discount_aux(ng, verbosity);
     
  /* Smooth unigram distribution, to give some mass to zerotons */     
  compute_unigram(ng,verbosity);

  /* Increment Contexts if using Good-Turing discounting-> No need otherwise,
     since all values are discounted anyway. */

  if (ng->discounting_method == GOOD_TURING) {
    pc_message(verbosity,2,"Incrementing contexts...\n");  

    for (i=ng->n-1;i>=1;i--) 
      increment_context(ng,i,verbosity);      
  }

  /* Calculate back-off weights */

  pc_message(verbosity,2,"Calculating back-off weights...\n");

  for (i=1;i<=ng->n-1;i++) 
    compute_back_off(ng,i,verbosity);

  if (!ng->four_byte_alphas) 
    pc_message(verbosity,3,"Number of out of range alphas = %d\n",
	       ng->size_of_alpha_array);

  /* Write out LM */

  pc_message(verbosity,2,"Writing out language model...\n");

  if (ng->write_arpa)
    write_arpa_lm(ng,verbosity);

  if (ng->write_bin) 
    write_bin_lm(ng,verbosity);

  pc_message(verbosity,0,"idngram2lm : Done.\n");

  return 0;    
}
Пример #11
0
/*
 * text2wngram entry point.
 *
 * Reads text from stdin one buffer-load at a time.  Newlines and tabs are
 * turned into spaces and runs of spaces are collapsed, so the buffer holds
 * single-space-separated words; pointers[] records where each word starts.
 * For every buffer-load the word pointers are sorted with cmp_strings
 * (defined elsewhere), identical adjacent n-grams are counted, and
 * "<n-gram> <count>" lines are written to a numbered temporary file inside a
 * freshly created "cmuclmtk-XXXXXX" directory.  The temporary files are then
 * handed to merge_tempfiles(), which is given stdout as its output stream.
 *
 * Command-line switches read here: -verbosity, -help, -n, -words, -chars,
 * -files, -compress, -gzip.
 *
 * Returns 0 on completion; the -help path calls exit(1).
 *
 * NOTE(review): input containing fewer than n words makes current_word-n
 * negative below; that case is not handled here (nor was it originally).
 */
int main (int argc, char **argv) {

  int n;
  int verbosity;
  int max_files;
  int max_words;
  int max_chars;

  int current_word;
  int current_char;
  int start_char;		/* start boundary (possibly > than 0) */

  int no_of_spaces;
  int pos_in_string;

  int i;
  char *current_string;
  char *tail;                   /* first of the words carried over to the next buffer-load */
  char current_temp_filename[500];
  int current_file_number;
  FILE *temp_file;

  flag text_buffer_full;

  char *text_buffer;
  char **pointers;

  char current_ngram[500];
  int current_count;

  int counter;

  char temp_directory[1000];
  char *temp_file_ext;

  flag words_set;
  flag chars_set;

  /* Process command line */

  verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY);
  pc_message(verbosity,2,"text2wngram\n");

  report_version(&argc,argv);

  if (pc_flagarg( &argc, argv,"-help")) {
    help_message();
    exit(1);
  }

  n = pc_intarg(&argc, argv,"-n",DEFAULT_N);

  /* -1 is used as the "switch not given" marker for -words and -chars. */

  max_words = pc_intarg(&argc, argv,"-words",-1);
  max_chars = pc_intarg(&argc, argv,"-chars",-1);

  if (max_words == -1) {
    words_set = 0;
    max_words = STD_MEM*1000000/11;
  }else
    words_set = 1;

  if (max_chars == -1) {
    chars_set = 0;
    max_chars = STD_MEM*7000000/11; 
  }else
    chars_set = 1;
  
  max_files = pc_intarg(&argc, argv,"-files",DEFAULT_MAX_FILES);

  if (pc_flagarg(&argc,argv,"-compress"))
    temp_file_ext = salloc(".Z");
  else {
    if (pc_flagarg(&argc,argv,"-gzip"))
      temp_file_ext = salloc(".gz");
    else
      temp_file_ext = salloc("");
  }

  strcpy(temp_directory, "cmuclmtk-XXXXXX");
  if (mkdtemp(temp_directory) == NULL) {
     quit(-1, "Failed to create temporary folder: %s\n", strerror(errno));
  }

  pc_report_unk_args(&argc,argv,verbosity);
 
  /* When only one of the two limits was given, derive the other using a
     fixed ratio of 7 characters per word. */

  if (words_set && !chars_set)
    max_chars = max_words * 7;

  if (!words_set && chars_set)
    max_words = max_chars / 7;

  pc_message(verbosity,2,"n = %d\n",n);
  pc_message(verbosity,2,"Number of words in buffer = %d\n",max_words);
  pc_message(verbosity,2,"Number of chars in buffer = %d\n",max_chars);
  pc_message(verbosity,2,"Max number of files open at once = %d\n",max_files);
  pc_message(verbosity,2,"Temporary directory = %s\n",temp_directory);

  /* Allocate memory for the buffers.

     The text buffer gets two bytes more than max_chars:
       - index max_chars is written when the read loop stops because the
         character limit was reached (current_char == max_chars);
       - index max_chars+1 holds a permanent '\0' so that the tail copy at
         the end of each pass always finds a terminator inside the buffer. */

  text_buffer = (char *) rr_malloc(sizeof(char)*((size_t) max_chars+2));
  text_buffer[max_chars] = ' ';
  text_buffer[max_chars+1] = '\0';
  pc_message(verbosity,2,"Allocated %lu bytes to text buffer.\n",
	     (unsigned long) (sizeof(char)*((size_t) max_chars+2)));

  pointers = (char **) rr_malloc(sizeof(char *)*max_words);
  pc_message(verbosity,2,"Allocated %lu bytes to pointer array.\n",
	     (unsigned long) (sizeof(char *)*max_words));

  current_file_number = 0;

  current_word = 1;
  start_char = 0;
  current_char = 0;
  counter = 0;
  pointers[0] = text_buffer;
      
  while (!feof(stdin)) {

    current_file_number++;

    /* Read text into buffer */
    
    pc_message(verbosity,2,"Reading text into buffer...\n");

    pc_message(verbosity,2,"Reading text into the n-gram buffer...\n");
    pc_message(verbosity,2,"20,000 words processed for each \".\", 1,000,000 for each line.\n");
    
    pointers[0] = text_buffer;
    
    while ((!rr_feof(stdin)) && 
	   (current_word < max_words) && 
	   (current_char < max_chars)) {

      text_buffer[current_char] = getchar();
      if (text_buffer[current_char] == '\n' || 
	  text_buffer[current_char] == '\t' ) {
	text_buffer[current_char] = ' ';
      }
      if (text_buffer[current_char] == ' ') {
	if (current_char > start_char) {
	  /* A second space in a row: step back so the run collapses to one. */
	  if (text_buffer[current_char-1] == ' ') {
	    current_word--;
	    current_char--;
	  }
	  pointers[current_word] = &(text_buffer[current_char+1]);
	  current_word++; 
	  counter++;
	  if (counter % 20000 == 0) {
	    if (counter % 1000000 == 0)
	      pc_message(verbosity,2,"\n");
	    else
	      pc_message(verbosity,2,".");
	  }
	}
      }
      
      /* Spaces seen before anything has been stored past start_char are
         dropped (current_char is not advanced). */
      if (text_buffer[current_char] != ' ' || current_char > start_char) 
	current_char++;
    }

    text_buffer[current_char]='\0';


    /* text_buffer_full is 1 only when the loop stopped because of the
       character limit; otherwise the unused remainder is blanked. */
    if (current_word == max_words || rr_feof(stdin)) {
      for (i=current_char+1;i<=max_chars-1;i++)
	text_buffer[i] = ' ';

      text_buffer_full = 0;
    }else
      text_buffer_full = 1;
    
    /* Sort buffer */

    pc_message(verbosity,2,"\nSorting pointer array...\n"); 

    qsort((void *) pointers,(size_t) current_word-n,sizeof(char *),cmp_strings);
   
    /* Write out temporary file */

    sprintf(current_temp_filename,"%s/%hu%s",temp_directory, current_file_number, temp_file_ext);

    pc_message(verbosity,2,"Writing out temporary file %s...\n",current_temp_filename);
        
    temp_file = rr_oopen(current_temp_filename);
    /* Replace the terminator by a space so the last stored word is
       followed by a space like every other word. */
    text_buffer[current_char] = ' ';
    
    current_count = 0;
    strcpy(current_ngram,"");
    
    for (i = 0; i <= current_word-n; i++) {
      current_string = pointers[i];
      
      /* Find the nth space */

      no_of_spaces = 0;
      pos_in_string = 0;
      while (no_of_spaces < n) {	
	if (current_string[pos_in_string] == ' ')
	  no_of_spaces++;

	pos_in_string++;
      }
      
      if (!strncmp(current_string,current_ngram,pos_in_string))
	current_count++;
      else {
	if (strcmp(current_ngram,""))
	  if (fprintf(temp_file,"%s %d\n",current_ngram,current_count) < 0) 
	    quit(-1,"Error writing to temporary file %s\n",current_temp_filename);

	current_count = 1;
	strncpy(current_ngram,current_string,pos_in_string);
	current_ngram[pos_in_string] = '\0';
      }
    }
    
    rr_oclose(temp_file);

    /* Move the last n-1 words to the beginning of the buffer, and set
       correct current_word and current_char things.

       Source and destination lie in the same buffer and may overlap, so
       memmove is used rather than strcpy.  The copy is bounded by the
       permanent terminator at text_buffer[max_chars+1]. */

    tail = pointers[current_word-n];
    memmove(text_buffer,tail,strlen(tail)+1);
    pointers[0]=text_buffer;
   
    /* Find the (n-1)th space */

    no_of_spaces=0;
    pos_in_string=0;

    if (!text_buffer_full){ 
      while (no_of_spaces<(n-1)) {
	if (pointers[0][pos_in_string]==' ') {
	  no_of_spaces++;
	  pointers[no_of_spaces] = &pointers[0][pos_in_string+1];
	}
	pos_in_string++;
      }
    }else {
      /* The buffer filled up part-way through a word: skip one more space
         (the one that replaced the terminator) and then back up onto it,
         so the next character read continues that word. */
      while (no_of_spaces<n) {
	if (pointers[0][pos_in_string]==' ') {
	  no_of_spaces++;
	  pointers[no_of_spaces] = &pointers[0][pos_in_string+1];
	}
	pos_in_string++;
      }
      pos_in_string--;
    }

    current_char = pos_in_string;
    current_word = n;
    /* mark boundary beyond which counting pass cannot backup */
    start_char = current_char;

  }
  /* Merge temporary files */

  pc_message(verbosity,2,"Merging temporary files...\n");

  merge_tempfiles(1,
		  current_file_number,
		  temp_directory,
		  temp_file_ext,
		  max_files,
		  stdout,
		  n,
		  verbosity); 

  rmdir(temp_directory);
  pc_message(verbosity,0,"text2wngram : Done.\n");
  
  return 0;
}
Пример #12
0
/*
 * idngram2stats entry point (named oe_03_main in this build).
 *
 * Reads id n-grams from stdin with get_ngram() and accumulates, for each
 * order from 2 up to n, the number of distinct k-grams (num_kgrams) and a
 * frequency-of-frequency table (fof_array) covering counts 0..fof_size.
 * Slot k-2 of each array belongs to the k-grams, so the arrays have n-1
 * slots and n must be at least 2.
 *
 * Command-line switches read here: -help, -ascii_input, -n (default 3),
 * -fof_size (default 50), -verbosity.
 *
 * The function leaves through exit(): status 0 on completion, status 1
 * after the help message or when n is too small.
 */
int oe_03_main (int argc, char **argv) {

  flag first_ngram;
  int n;
  fof_sz_t fof_size;
  flag is_ascii;
  int verbosity;
  fof_t **fof_array;
  ngram_sz_t *num_kgrams;
  ngram current_ngram;
  ngram previous_ngram;
  count_t *ng_count;
  int pos_of_novelty;
  int nlines;
  int i;

  report_version(&argc,argv);

  if (argc == 1 || pc_flagarg(&argc, argv,"-help")) {
    oe_04_help_message();
    exit(1);
  }

  is_ascii = pc_flagarg(&argc, argv,"-ascii_input");
  n = pc_intarg(&argc, argv,"-n",3);
  fof_size = pc_intarg(&argc, argv,"-fof_size",50);
  verbosity = pc_intarg(&argc, argv,"-verbosity",DEFAULT_VERBOSITY);

  pc_report_unk_args(&argc,argv,verbosity);

  /* Every table below has n-1 slots and is indexed at n-2, so anything
     below n = 2 would write before the start of the arrays. */
  if (n < 2) {
    fprintf(stderr,"idngram2stats : Error : Value of n must be at least 2.\n");
    exit(1);
  }

  pc_message(verbosity,2,"n        = %d\n",n);
  pc_message(verbosity,2,"fof_size = %d\n",fof_size);

  current_ngram.n = n;
  previous_ngram.n = n;
  pos_of_novelty = n;
  
  fof_array = (fof_t **) rr_malloc(sizeof(fof_t *) * (n-1));
  for (i=0;i<=n-2;i++) 
    fof_array[i] = (fof_t *) rr_calloc(fof_size+1,sizeof(fof_t));

  num_kgrams = (ngram_sz_t *) rr_calloc(n-1,sizeof(ngram_sz_t));
  ng_count = (count_t *) rr_calloc(n-1,sizeof(count_t));

  current_ngram.id_array = (id__t *) rr_calloc(n,sizeof(id__t));
  previous_ngram.id_array = (id__t *) rr_calloc(n,sizeof(id__t));

  pc_message(verbosity,2,"Processing id n-gram file.\n");
  pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");

  nlines = 0;
  first_ngram = 1;
  
  while (!rr_feof(stdin)) {
    
    if (!first_ngram)
      ngram_copy(&previous_ngram,&current_ngram,n);

    if (get_ngram(stdin,&current_ngram,is_ascii)) {

      nlines++;
      show_idngram_nlines(nlines, verbosity);
    
      /* Test for where this ngram differs from last - do we have an
	 out-of-order ngram? */
    
      if (!first_ngram)
        pos_of_novelty = ngram_find_pos_of_novelty(&current_ngram,&previous_ngram,n,nlines);
      else
        pos_of_novelty = 0;

      /* Add new N-gram */
     
      num_kgrams[n-2]++;
      if (current_ngram.count <= fof_size) 
	fof_array[n-2][current_ngram.count]++;

      /* For each shorter order whose prefix changed with this n-gram,
         close off the running count of the previous prefix (skipped for
         the very first n-gram, which has no predecessor) and restart it
         from the current count. */
      if (!first_ngram) {
	for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
	  num_kgrams[i-1]++;
	  if (ng_count[i-1] <= fof_size) 
	    fof_array[i-1][ng_count[i-1]]++;
	  
	  ng_count[i-1] = current_ngram.count;
	}
      } else {
	for (i=n-2;i>=MAX(1,pos_of_novelty);i--) 
	  ng_count[i-1] = current_ngram.count;
      }
	
      /* Orders whose prefix is unchanged keep accumulating. */
      for (i=0;i<=pos_of_novelty-2;i++) 
	ng_count[i] += current_ngram.count;
	
      if (first_ngram)
        first_ngram = 0;
    }
  }

  /* Process last ngram */

  for (i=n-2;i>=MAX(1,pos_of_novelty);i--) {
    num_kgrams[i-1]++;
    if (ng_count[i-1] <= fof_size) {
      fof_array[i-1][ng_count[i-1]]++;
    }
    ng_count[i-1] = current_ngram.count;
  }
  /* The remainder (final accumulation and the report written to stderr) is
     compiled only when __clang_analyzer__ is not defined or
     STATICANALYZEDEPENDENCIES is defined. */
 #import "OpenEarsStaticAnalysisToggle.h"
#ifdef STATICANALYZEDEPENDENCIES
#define __clang_analyzer__ 1
#endif
#if !defined(__clang_analyzer__) || defined(STATICANALYZEDEPENDENCIES)
#undef __clang_analyzer__ 
  for (i=0;i<=pos_of_novelty-2;i++)
    ng_count[i] += current_ngram.count;

  display_fof_array(num_kgrams,fof_array,fof_size,stderr, n);
#endif
  pc_message(verbosity,0,"idngram2stats : Done.\n");

  exit(0);
  
}
Пример #13
0
/*
 * wngram2idngram entry point.
 *
 * Reads word n-grams with counts from stdin, maps every word to its
 * vocabulary id (index2() yields 0 for a word that is not in the
 * vocabulary) and produces an id n-gram file:
 *   - n-grams made only of in-vocabulary words are written straight to
 *     temporary file number 1;
 *   - n-grams containing at least one id 0 are collected in an in-memory
 *     buffer which is sorted, merged and written to a further temporary
 *     file each time it fills up, and once more at the end;
 *   - finally merge_idngramfiles() is called on temporary files
 *     1..number_of_tempfiles with the -idngram file as output.
 *
 * Command-line switches read here: -help, -n, -hash, -buffer, -write_ascii,
 * -verbosity, -files, -fof_size, -vocab, -idngram, -compress, -gzip.
 *
 * NOTE(review): n is assigned but not declared in this function, so it is
 * presumably a file-scope variable shared with compare_ngrams2 - confirm.
 *
 * Returns 0 on completion; the help path calls exit(1).
 */
int main(int argc, char *argv[]) {

  int verbosity;
  int vocab_size;
  FILE *vocab_file;
  int buffer_size;
  flag write_ascii;
  int max_files;
  int number_of_tempfiles;
  char *vocab_filename;
  char *idngram_filename;
  char temp_word[MAX_WORD_LENGTH];
  char temp_word2[MAX_WORD_LENGTH];
  char temp_word3[MAX_WORD_LENGTH];
  flag contains_unks;
  int position_in_buffer;
  FILE *outfile;
  FILE *tempfile;
  FILE *non_unk_fp;
  ngram_rec *buffer;
  flag same_ngram;
  int i;
  int j;
  int fof_size;
  int size_of_rec;

  char temp_directory[1000];
  char *temp_file_ext;

  /* Vocab hash table things */

  struct idngram_hash_table vocabulary;
  unsigned long hash_size;
  unsigned long M;

  wordid_t *current_ngram;
  int current_count;
  wordid_t *sort_ngram;
  int sort_count;
  
  /* Process command line */

  report_version(&argc,argv);
  
  if (argc == 1 || pc_flagarg(&argc, argv,"-help")) {    
    /* Display help message */    
    help_message();
    exit(1);
  }


  n = pc_intarg( &argc, argv, "-n",DEFAULT_N);
  hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE);
  buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM);
  write_ascii = pc_flagarg(&argc,argv,"-write_ascii");
  verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);
  max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES);
  fof_size = pc_intarg(&argc,argv,"-fof_size",10);
  vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" ));
  idngram_filename = salloc(pc_stringarg( &argc, argv, "-idngram", "" ));
  
  if (!strcmp("",vocab_filename)) 
    quit(-1,"Error : Must specify a vocabulary file.\n");

  if (!strcmp("",idngram_filename)) 
    quit(-1,"wngram2idngram : Error : Must specify idngram file.\n");
    
  if (pc_flagarg(&argc,argv,"-compress")) 
    temp_file_ext = salloc(".Z");
  else {
    if (pc_flagarg(&argc,argv,"-gzip")) 
      temp_file_ext = salloc(".gz");
    else 
      temp_file_ext = salloc("");
  }

  strcpy(temp_directory, "cmuclmtk-XXXXXX");
  if (mkdtemp(temp_directory) == NULL) {
     quit(-1, "Failed to create temporary folder: %s\n", strerror(errno));
  }

  pc_report_unk_args(&argc,argv,verbosity);

  outfile = rr_fopen(idngram_filename,"wb");
  
  pc_message(verbosity,2,"Vocab           : %s\n",vocab_filename);
  pc_message(verbosity,2,"Output idngram  : %s\n",idngram_filename);
  pc_message(verbosity,2,"Buffer size     : %d\n",buffer_size);
  pc_message(verbosity,2,"Hash table size : %lu\n",hash_size);
  pc_message(verbosity,2,"Max open files  : %d\n",max_files);
  pc_message(verbosity,2,"n               : %d\n",n);
  pc_message(verbosity,2,"FOF size               : %d\n",fof_size);  

  /* Convert the -buffer value into a number of records: the id array of a
     record is rounded up to the next multiple of 16 bytes for this
     estimate. */
  size_of_rec = (sizeof(wordid_t) * n) + 16 - (( n* sizeof(wordid_t)) % 16);
  buffer_size *= (1000000/((sizeof(ngram_rec) + size_of_rec)));
  fprintf(stderr,"buffer size = %d\n",buffer_size);

  /* Allocate memory for hash table */

  fprintf(stderr,"Initialising hash table...\n");

  M = nearest_prime(hash_size);

  new_idngram_hashtable(&vocabulary,M);

  /* Read in the vocabulary */

  vocab_size = 0;

  vocab_file = rr_iopen(vocab_filename);

  pc_message(verbosity,2,"Reading vocabulary...\n");

  while (fgets (temp_word, sizeof(temp_word),vocab_file)) {
    /* Lines starting with "##" are skipped. */
    if (strncmp(temp_word,"##",2)==0) continue;
    sscanf (temp_word, "%s ",temp_word2);

    /* Check for vocabulary order */
    if (vocab_size > 0 && strcmp(temp_word2,temp_word3)<0) 
      quit(-1,"wngram2idngram : Error : Vocabulary is not alphabetically ordered.\n");

    /* Check for repeated words in the vocabulary */

    if (index2(&vocabulary,temp_word2) != 0) 
      warn_on_repeated_words(temp_word);

    warn_on_wrong_vocab_comments(temp_word);

    vocab_size++;
    
    add_to_idngram_hashtable(&vocabulary,idngram_hash(temp_word2,M),temp_word2,vocab_size);
    /* Remember this word for the ordering check on the next line. */
    strcpy(temp_word3,temp_word2);
  }

  if (vocab_size > MAX_VOCAB_SIZE) 
    quit(-1,"Error : Vocabulary size exceeds maximum.\n");
  
  pc_message(verbosity,2,"Allocating memory for the buffer...\n");

  buffer=(ngram_rec *) rr_malloc((buffer_size+1)*sizeof(ngram_rec));
  
  for (i=0;i<=buffer_size;i++) 
    buffer[i].word = (wordid_t *) rr_malloc(n*sizeof(wordid_t));

  /* Open the "non-OOV" tempfile */

  sprintf(temp_word, "%s/1%s", temp_directory, temp_file_ext);
  
  non_unk_fp = rr_fopen(temp_word,"w");

  pc_message(verbosity,2,"Writing non-OOV counts to temporary file %s\n",
	     temp_word);
  number_of_tempfiles = 1;

  current_ngram = (wordid_t *) rr_malloc(n*sizeof(wordid_t));
  sort_ngram = (wordid_t *) rr_malloc(n*sizeof(wordid_t));

  /* Read text into buffer */
  position_in_buffer = 0;

  while (!rr_feof(stdin)) {
    
    for (i=0;i<=n-1;i++) {
      get_word(stdin,temp_word);
      current_ngram[i]=index2(&vocabulary,temp_word);
    }
    if (scanf("%d",&current_count) != 1) 
      if (!rr_feof(stdin)) 
	quit(-1,"Error reading n-gram count from stdin.\n");

    if (!rr_feof(stdin)) {

      contains_unks = 0;
      for (i=0;i<=n-1;i++) {
	if (!current_ngram[i]) 
	  contains_unks = 1;
      }

      if (contains_unks) {
	/* Write to buffer */

	position_in_buffer++;

	if (position_in_buffer >= buffer_size) {

	  /* The new n-gram has been counted in position_in_buffer but is
	     not stored yet, so only records 0 .. position_in_buffer-2 hold
	     data.  Sort and write exactly those. */

	  /* Sort buffer */
	  pc_message(verbosity,2,
		     "Sorting n-grams which include an OOV word...\n");

	  qsort((void*) buffer,(size_t) (position_in_buffer-1),
		sizeof(ngram_rec),compare_ngrams2);

	  pc_message(verbosity,2,"Done.\n");

	  /* Write buffer to temporary file */

	  number_of_tempfiles++;
	  
	  sprintf(temp_word,"%s/%hu%s", temp_directory,
		  number_of_tempfiles,temp_file_ext);
	  
	  pc_message(verbosity,2,
		     "Writing sorted OOV-counts buffer to temporary file %s\n",
		     temp_word);

	  tempfile = rr_fopen(temp_word,"w");
	  
	  for (i=0;i<=n-1;i++) 
	    sort_ngram[i] = buffer[0].word[i];

	  sort_count = buffer[0].count;

	  /* Record 0 seeds sort_ngram/sort_count, so merging starts at
	     record 1 (the same scheme as the final flush below). */
	  for (i=1;i<=position_in_buffer-2;i++) {
	    
	    same_ngram = 1;
	    for (j=n-1;j>=0;j--) {
	      if (buffer[i].word[j] != sort_ngram[j]) {
		same_ngram = 0;
		j = -1;
	      }
	    }

	    if (same_ngram) 
	      sort_count += buffer[i].count;
	    else {
	      for (j=0;j<=n-1;j++) {
		rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1,
			  tempfile,"temporary n-gram ids");
		sort_ngram[j] = buffer[i].word[j];
	      }
	      rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile,
			"temporary n-gram counts");
	      sort_count = buffer[i].count;
	    }
	  }	    
	  for (j=0;j<=n-1;j++) 
	    rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1,
		      tempfile,"temporary n-gram ids");

	  rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile,
		    "temporary n-gram counts");
	  rr_oclose(tempfile);
	  /* The pending n-gram becomes record 0 of the emptied buffer. */
	  position_in_buffer = 1;

	}
	
	for (i=0;i<=n-1;i++) 
	  buffer[position_in_buffer-1].word[i] = current_ngram[i];

	buffer[position_in_buffer-1].count = current_count;

      }else {
	/* Write to temporary file */
	for (i=0;i<=n-1;i++) 
	  rr_fwrite((char*)&current_ngram[i],sizeof(wordid_t),1,
		    non_unk_fp,"temporary n-gram ids");

	rr_fwrite((char*)&current_count,sizeof(int),1,non_unk_fp,
		  "temporary n-gram counts");
      }
    }
  }

  if (position_in_buffer > 0) {

    /* Only do this bit if we have actually seen some OOVs */
    /* Sort final buffer */    
    pc_message(verbosity,2,"Sorting final buffer...\n");

    qsort((void*) buffer,(size_t) position_in_buffer,
	  sizeof(ngram_rec),compare_ngrams2);
    
    /* Write final buffer */
    
    number_of_tempfiles++;
  
    sprintf(temp_word,"%s/%hu%s", temp_directory,
	    number_of_tempfiles,temp_file_ext);
    
    pc_message(verbosity,2,"Writing sorted buffer to temporary file %s\n", temp_word);

    tempfile = rr_fopen(temp_word,"w");
    
    for (i=0;i<=n-1;i++) 
      sort_ngram[i] = buffer[0].word[i];

    sort_count = buffer[0].count;
    
    for (i=1;i<=position_in_buffer-1;i++) {
      
      same_ngram = 1;
      for (j=n-1;j>=0;j--) {
	if (buffer[i].word[j] != sort_ngram[j]) {
	  same_ngram = 0;
	  j = -1;
	}
      }
      
      if (same_ngram) 
	sort_count += buffer[i].count;
      else {
	for (j=0;j<=n-1;j++) {
	  rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1,
		    tempfile,"temporary n-gram ids");
	  sort_ngram[j] = buffer[i].word[j];
	}
	rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile,
		  "temporary n-gram counts");
	sort_count = buffer[i].count;
      }
    }	    
    for (j=0;j<=n-1;j++) 
      rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1,
		tempfile,"temporary n-gram ids");

    rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile,
	      "temporary n-gram counts");
    fclose(tempfile);
    

  }
  

  /* Merge the temporary files, and output the result */
  fclose(non_unk_fp);
  pc_message(verbosity,2,"Merging temporary files...\n");
  merge_idngramfiles(1,
		     number_of_tempfiles,
		     temp_directory,
		     temp_file_ext,
		     max_files,
		     outfile,
		     write_ascii,
		     fof_size,
		     n);

  fclose(outfile);

  rmdir(temp_directory);
  pc_message(verbosity,0,"wngram2idngram : Done.\n");

  return 0;
}
Пример #14
0
ng_t * init_ng(
	     int* argc,
	     char **argv,
	     int verbosity
	     )
{
  int i;
  ng_t* ng;
  
  ng=(ng_t*) rr_calloc(1,sizeof(ng_t));

  ng->disc_meth=NULL;
  /* -n */
  ng->n = pc_intarg(argc, argv,"-n",DEFAULT_N); 

  if (ng->n<1) 
    quit(-1,"Error: Value of n must be larger than zero.\n");

  /* -cutoffs */
  ng->cutoffs = (cutoff_t *) pc_shortarrayarg(argc, argv, "-cutoffs",ng->n-1,ng->n-1);

  if (ng->cutoffs == NULL) 
    ng->cutoffs = (cutoff_t *) rr_calloc((ng->n-1)+1,sizeof(cutoff_t)); /* +1 for the sake of the correction in writing in write_lms.c */

  for (i=0;i<=ng->n-3;i++) {
    if (ng->cutoffs[i+1] < ng->cutoffs[i]) {
      quit(-1,"Error - cutoffs for (n+1)-gram must be greater than or equal to those for \nn-gram. You have %d-gram cutoff = %d > %d-gram cutoff = %d.\n",i+2,ng->cutoffs[i],i+3,ng->cutoffs[i+1]);
    }
  }

  /* -min_unicount */
  ng->min_unicount = pc_intarg(argc, argv, "-min_unicount",0);

  /* -idngram */
  ng->id_gram_filename = salloc(pc_stringarg(argc, argv,"-idngram",""));

  if (!strcmp(ng->id_gram_filename,""))
    quit(-1,"Error: id ngram file not specified. Use the -idngram flag.\n");

  /* -arpa & -bin */
  ng->arpa_filename = salloc(pc_stringarg(argc, argv,"-arpa",""));
  ng->bin_filename = salloc(pc_stringarg(argc, argv,"-binary",""));
  
  ng->write_arpa = strcmp("",ng->arpa_filename);
  ng->write_bin = strcmp("",ng->bin_filename);
  
  if (!(ng->write_arpa || ng->write_bin)) 
    quit(-1,"Error : must specify either an arpa, or a binary output file.\n");

  ng->count_table_size = DEFAULT_COUNT_TABLE_SIZE;

  /* -vocab */
  ng->vocab_filename = salloc(pc_stringarg(argc,argv,"-vocab",""));
 
  if (!strcmp("",ng->vocab_filename))
    quit(-1,"Error : vocabulary file not specified. Use the -vocab option.\n");  

  /* -context */
  ng->context_cues_filename = salloc(pc_stringarg(argc,argv,"-context",""));

  ng->context_set = strcmp("", ng->context_cues_filename);

  /* -vocab_type */
  ng->vocab_type = pc_intarg(argc,argv,"-vocab_type",1);
  
  /* -oov_fraction */
  ng->oov_fraction = pc_doublearg(argc, argv,"-oov_fraction",-1.0);


  if (ng->oov_fraction == -1.0)
    ng->oov_fraction=DEFAULT_OOV_FRACTION;
  else {
    if (ng->vocab_type != 2)
      pc_message(verbosity,1,"Warning : OOV fraction specified, but will not be used, since vocab type is not 2.\n");
  }

  if (ng->vocab_type == 0) 
    ng->first_id = 1;
  else
    ng->first_id = 0;

  /* Allow both "min_alpha" etc and "min_bo_weight" etc as valid
     syntax. The "bo_weight" form is preferred, but the "alpha" form is
     maintained as it was present in version 2.00 */

  ng->min_alpha = pc_doublearg(argc,argv,"-min_alpha",DEFAULT_MIN_ALPHA);
  ng->max_alpha = pc_doublearg(argc,argv,"-max_alpha",DEFAULT_MAX_ALPHA);
  ng->out_of_range_alphas = pc_intarg(argc,argv,"-out_of_range_alphas",
				      DEFAULT_OUT_OF_RANGE_ALPHAS);

  ng->min_alpha = pc_doublearg(argc,argv,"-min_bo_weight",ng->min_alpha);
  ng->max_alpha = pc_doublearg(argc,argv,"-max_bo_weight",ng->max_alpha);
  ng->out_of_range_alphas = pc_intarg(argc,argv,"-out_of_range_bo_weights",
				      ng->out_of_range_alphas);
  
  if (ng->min_alpha >= ng->max_alpha)
    quit(-1,"Error : Minimum of alpha range must be less than the maximum.\n");


  init_ng_disc_method(ng,
		      pc_flagarg(argc, argv,"-linear"),
		      pc_flagarg(argc,argv,"-absolute"),
		      pc_flagarg(argc,argv,"-witten_bell"),
		      pc_flagarg(argc,argv,"-good_turing"));		      
		      
  ng->disc_range = (unsigned short *) pc_shortarrayarg(argc, argv, "-disc_ranges",ng->n,ng->n);

  ng->disc_range_set = (ng->disc_range != NULL);

  if (ng->discounting_method == GOOD_TURING) {
    if (!ng->disc_range_set) {
      ng->disc_range = (unsigned short *) rr_malloc(sizeof(unsigned short) * ng->n);
      ng->disc_range[0] = DEFAULT_DISC_RANGE_1;
      for (i=1;i<=ng->n-1;i++) 
	ng->disc_range[i] = DEFAULT_DISC_RANGE_REST;
    }
    ng->fof_size = (fof_sz_t *) rr_malloc(sizeof(fof_sz_t) * ng->n);
    for (i=0;i<=ng->n-1;i++) 
      ng->fof_size[i] = ng->disc_range[i]+1;

  }else {
    if (ng->disc_range_set) 
      pc_message(verbosity,2,"Warning : discount ranges specified will be ignored, since they only apply\nto Good Turing discounting.\n");
  }

  ng->four_byte_alphas = !(pc_flagarg(argc, argv, "-two_byte_alphas") || 
			   pc_flagarg(argc, argv, "-two_byte_bo_weights"));

  ng->four_byte_counts = pc_flagarg(argc, argv, "-four_byte_counts");
  if(ng->four_byte_counts){
      pc_message(verbosity,2,"Using Four byte counts.\n");
  }

  ng->zeroton_fraction = pc_doublearg(argc,argv,"-zeroton_fraction",1.0);

  /* Attempt to open all the files that we will need for input and
     output. It is better to do it here than to spend a few hours of
     CPU processing id-gram counts, only to find that the output path
     is invalid. */

  ng->id_gram_fp = rr_iopen(ng->id_gram_filename);

  /* Vocab is read by Roni's function which does the file opening for
     us, so no need to do it here. Don't need to worry about time
     being lost if file doesn't exist, since vocab is first thing to
     be read anyway. */

  if (ng->context_set)
    ng->context_cues_fp = rr_iopen(ng->context_cues_filename);

  if (ng->write_arpa)
    ng->arpa_fp = rr_oopen(ng->arpa_filename);

  if (ng->write_bin) 
    ng->bin_fp = rr_oopen(ng->bin_filename);

  return ng;
}
Пример #15
0
/*
 * ngram2mgram entry point: convert a sorted n-gram stream on stdin into an
 * m-gram stream on stdout (m < n) by summing the counts of consecutive
 * n-grams that share their first m words.
 *
 * Switches: -n N, -m M (both required, N > M), -verbosity, and at most one
 * of -binary, -ascii, -words (binary when none is given).  Output uses the
 * same format as the input.
 *
 * Returns 0 on completion; the help path calls exit(1) and errors are
 * reported through quit().
 *
 * NOTE(review): numeric ids are stored as unsigned short but read and
 * written with sizeof(id__t); this assumes the two have the same size -
 * confirm.
 * NOTE(review): the "%s" reads in -words mode are not length-limited
 * against MAX_WORD_LENGTH.
 */
int main(int argc, char *argv[]) {

  int verbosity;
  int n;
  int m;
  int i;
  int input_type;
  int storage_type;
  unsigned short *current_ngram_int;
  unsigned short *previous_ngram_int;
  unsigned short *last_ngram_int;
  char **current_ngram_text;
  char **previous_ngram_text;
  char **last_ngram_text;
  int current_count;
  int running_total;
  flag same;
  flag first_one;
  flag got_to_eof;
   
  running_total = 0;

  report_version(&argc,argv);

  if (pc_flagarg( &argc, argv,"-help") || argc==1) {
    fprintf(stderr,"ngram2mgram - Convert an n-gram file to an m-gram file, where m<n\n");
    fprintf(stderr,"Usage : ngram2mgram   -n N -m M\n");
    fprintf(stderr,"                    [ -binary | -ascii | -words ]\n");
    fprintf(stderr,"                    < .ngram > .mgram\n");
    exit(1);
  }
 
  n = pc_intarg( &argc, argv,"-n",0);
  m = pc_intarg( &argc, argv,"-m",0);
  verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);
  

  /* 0 means "no format chosen yet". */
  input_type = 0;
  
  if (pc_flagarg( &argc, argv,"-binary")) {
    input_type = BINARY;
  }

  if (pc_flagarg( &argc, argv,"-ascii")) {
    if (input_type != 0) {
      quit(-1,"Error : more than one file format specified.\n");
    }
    input_type = ASCII;
  }

  if (pc_flagarg( &argc, argv,"-words")) {  
    if (input_type != 0) {
      quit(-1,"Error : more than one file format specified.\n");
    }
    input_type = WORDS;
  }    

  if (input_type == 0) {
    pc_message(verbosity,2,"Warning : no input type specified. Defaulting to binary.\n");
    input_type = BINARY;
  }

  if (n == 0) {
    quit(-1,"Must specify a value for n. Use the -n switch.\n");
  }

  if (m == 0) {
    quit(-1,"Must specify a value for m. Use the -m switch.\n");
  }
  
  if (n<=m) {
    quit(-1,"n must be greater than m.\n");
  }

  pc_report_unk_args(&argc,argv,verbosity);

  /* Binary and ascii inputs carry numeric ids; -words carries strings. */
  if (input_type == BINARY || input_type == ASCII) {
    storage_type = NUMERIC;
  }
  else {
    storage_type = ALPHA;
  }

  if (storage_type == NUMERIC) {
    current_ngram_int = (unsigned short *) 
      rr_malloc(n*sizeof(unsigned short));
    previous_ngram_int = (unsigned short *) 
      rr_malloc(n*sizeof(unsigned short));

    /* And to prevent compiler warnings ... */

    current_ngram_text = NULL;
    previous_ngram_text = NULL;
  }
  else {
    current_ngram_text = (char **) rr_malloc(n*sizeof(char *));
    previous_ngram_text = (char **) rr_malloc(n*sizeof(char *));
    for (i=0;i<=n-1;i++) {
      current_ngram_text[i] = (char *) rr_malloc(MAX_WORD_LENGTH*sizeof(char));
      previous_ngram_text[i] = (char *) rr_malloc(MAX_WORD_LENGTH*sizeof(char));
    }

    /* And to prevent compiler warnings ... */

    current_ngram_int = NULL;
    previous_ngram_int = NULL;

  }

  got_to_eof = 0;
  first_one = 1;

  while (!rr_feof(stdin)) {

    /* Store previous n-gram */

    if (!first_one) {

      if (storage_type == NUMERIC) {
	for (i=0;i<=n-1;i++) {
	  previous_ngram_int[i] = current_ngram_int[i];
	}
      }
      else {
	for (i=0;i<=n-1;i++) {
	  strcpy(previous_ngram_text[i],current_ngram_text[i]);
	}
      }

    }

    /* Read new n-gram */

    switch(input_type) {
    case BINARY:
      for (i=0;i<=n-1;i++) {
	rr_fread(&current_ngram_int[i],sizeof(id__t),1,stdin,
		 "from id_ngrams at stdin",0);
      }
      rr_fread(&current_count,sizeof(count_t),1,stdin,
	       "from id_ngrams file at stdin",0);
      break;
    case ASCII:
      for (i=0;i<=n-1;i++) {
	if (fscanf(stdin,"%hu",&current_ngram_int[i]) != 1) {
	  if (!rr_feof(stdin)) {
	    quit(-1,"Error reading id_ngram.\n");
	  }
	  else {
	    got_to_eof = 1;
	  }
	}
      }
      if (fscanf(stdin,"%d",&current_count) != 1) {
	if (!rr_feof(stdin)) {
	  quit(-1,"Error reading id_ngram.\n");
	}
	else {
	  got_to_eof = 1;
	}
      }
      break;
    case WORDS:
      for (i=0;i<=n-1;i++) {
	if (fscanf(stdin,"%s",current_ngram_text[i]) != 1) {
	  if (!rr_feof(stdin)) {
	    quit(-1,"Error reading id_ngram.\n");
	  }
	  else {
	    got_to_eof = 1;
	  }
	}
      }
      if (fscanf(stdin,"%d",&current_count) != 1) {
	if (!rr_feof(stdin)) {
	  quit(-1,"Error reading id_ngram.\n");
	}
	else {
	  got_to_eof = 1;
	}
      }
      break;
    }

    if (!got_to_eof) {

      /* Check for correct sorting */

      if (!first_one) {

	switch(storage_type) {
	case NUMERIC:
	  for (i=0;i<=n-1;i++) {
	    if (current_ngram_int[i]<previous_ngram_int[i]) {
	      quit(-1,"Error : ngrams not correctly sorted.\n");
	    }
	    else {
	      /* Strictly greater at this position: the order is settled,
	         leave the loop. */
	      if (current_ngram_int[i]>previous_ngram_int[i]) {
		i=n;
	      }
	    }
	  }
	  break;
	case ALPHA:
	  for (i=0;i<=n-1;i++) {
	    if (strcmp(current_ngram_text[i],previous_ngram_text[i])<0) {
	      quit(-1,"Error : ngrams not correctly sorted.\n");
	    }
	    else {
	      if (strcmp(current_ngram_text[i],previous_ngram_text[i])>0) {
		i=n;
	      }
	    }
	  }
	  break;
	}
      }

      /* Compare this m-gram with previous m-gram */

      if (!first_one) {

	switch(storage_type) {
	case NUMERIC:
	  same = 1;
	  for (i=0;i<=m-1;i++) {
	    if (current_ngram_int[i] != previous_ngram_int[i]) {
	      same = 0;
	    }
	  }
	  if (same) {
	    running_total += current_count;
	  }
	  else {
	    if (input_type == ASCII) {
	      for (i=0;i<=m-1;i++) {
		printf("%d ",previous_ngram_int[i]);
	      }
	      printf("%d\n",running_total);
	    }
	    else {
	      for (i=0;i<=m-1;i++) {
		rr_fwrite(&previous_ngram_int[i],sizeof(id__t),1,stdout,
			  "to id_ngrams at stdout");
	      }
	      rr_fwrite(&running_total,sizeof(count_t),1,stdout,
			"to id n-grams at stdout");
	    }
	    running_total = current_count;
	  }
	  break;
	case ALPHA:
	  same = 1;
	  for (i=0;i<=m-1;i++) {
	    if (strcmp(current_ngram_text[i],previous_ngram_text[i])) {
	      same = 0;
	    }
	  }
	  if (same) {
	    running_total += current_count;
	  }
	  else {
	    for (i=0;i<=m-1;i++) {
	      printf("%s ",previous_ngram_text[i]);
	    }
	    printf("%d\n",running_total);
	    running_total = current_count;
	  
	  }
	  break;
	}
      
      }
      else {
	running_total = current_count;
      } 
    
      first_one = 0;
    
    }
  }

  /* Write out final m-gram.

     Nothing is written when no n-gram was read at all.  The last complete
     n-gram is in the "previous" storage when the loop ended on a failed
     read (the copy at the top of that iteration put it there and the read
     may have partly overwritten "current"), and in the "current" storage
     when the loop ended straight after a successful read - which is always
     the case for binary input. */

  if (!first_one) {

    if (storage_type == NUMERIC) {
      last_ngram_int = got_to_eof ? previous_ngram_int : current_ngram_int;

      if (input_type == ASCII) {
	for (i=0;i<=m-1;i++) {
	  printf("%d ",last_ngram_int[i]);
	}
	printf("%d\n",running_total);
      }
      else {
	for (i=0;i<=m-1;i++) {
	  rr_fwrite(&last_ngram_int[i],sizeof(id__t),1,stdout,
		    "to id_ngrams at stdout");
	}
	rr_fwrite(&running_total,sizeof(count_t),1,stdout,
		  "to id n-grams at stdout");
      }
    }
    else {
      last_ngram_text = got_to_eof ? previous_ngram_text : current_ngram_text;

      for (i=0;i<=m-1;i++) {
	printf("%s ",last_ngram_text[i]);
      }
      printf("%d\n",running_total);
    }
  }

  pc_message(verbosity,0,"ngram2mgram : Done.\n");

  return 0;

}	  
Пример #16
0
/*
 * Compute the back-off weights stored at tree level n-1 of the model.
 *
 * The n-gram tree is walked depth-first.  For every level-(n-1) entry the
 * function sums, over that entry's children at level n:
 *   - the discounted conditional probability of each child (sum_cond_prob),
 *   - the probability bo_ng_prob() reports for the same word sequence when
 *     called with n-1 (sum_bo_prob).
 * The weight is alpha = (1 - sum_cond_prob) / (1 - sum_bo_prob); it is
 * written to ng->bo_weight4[n-1][...] when ng->four_byte_alphas is set and
 * otherwise to ng->bo_weight[n-1][...] via short_alpha().
 *
 * ng        : model whose tree, counts and discounting parameters are read
 *             and whose back-off weight arrays are written.
 * n         : tree level whose entries are summed; level 0 is the unigram
 *             table, so n == 1 computes the unigram back-off weights.
 * verbosity : passed to pc_message() and bo_ng_prob().
 */
void compute_back_off(ng_t *ng,int n, int verbosity) {

  /* current_pos[k] / end_pos[k] : cursor into, and last index of, the
     level-k entries that belong to the current level-(k-1) parent. */
  int *current_pos;
  int *end_pos;
  id__t *sought_ngram;
  int current_table;
  int ng_count;
  int i;
  double sum_cond_prob;
  double sum_bo_prob;
  double discounted_ngcount;
  double cond_prob;
  double bo_prob;
  double discount_mass;
  double leftout_bo_prob;
  double alpha;

  int bo_case;

  sum_cond_prob = 0.0;
  sum_bo_prob = 0.0;

  /* For the sake of warning-free compilation... */

  discounted_ngcount = 0.0;
  
  /* rr_calloc is relied on to zero these arrays: current_pos[1..n] start
     at 0 and only ever move forward during the walk below. */
  current_pos = (int *)rr_calloc(n+1,sizeof(int));
  sought_ngram = (id__t *) rr_calloc(n+1,sizeof(id__t));
  end_pos = (int *)rr_calloc(n+1,sizeof(int)); 
  
  /* Process the tree so that we get all the n-grams out in the right
     order. */
  
  for (current_pos[0]=ng->first_id;
       current_pos[0]<=ng->vocab_size;
       current_pos[0]++) {
    
    /* Only unigrams with a non-zero marginal count are descended into. */
    if (return_count(ng->four_byte_counts,
		     ng->count_table[0],
		     ng->marg_counts,
		     ng->marg_counts4,
		     current_pos[0]) > 0) {

      current_table = 1;
      
      /* Last child of this unigram: either the end of the level-1 table
         (for the last vocabulary entry) or one before the index obtained
         from get_full_index() for the next unigram. */
      if (current_pos[0] == ng->vocab_size) {
	end_pos[1] = ng->num_kgrams[1]-1;
      }
      else {
 	end_pos[1] = get_full_index(ng->ind[0][current_pos[0]+1],
				    ng->ptr_table[0],
				    ng->ptr_table_size[0],
				    current_pos[0]+1)-1;
      }

      /* current_table is the level being visited; it drops back to 0
         once every descendant of this unigram has been handled. */
      while (current_table > 0) {

	if (current_table == n) {

	  if (current_pos[n] <= end_pos[n]){

	    /* Still inside the children of the current level-(n-1)
	       entry: accumulate this child's contribution. */
	    ng_count = return_count(ng->four_byte_counts,
				    ng->count_table[n],
				    ng->count[n],
				    ng->count4[n],
				    current_pos[n]);

	    /* NOTE(review): there is no default case, so for any other
	       discounting method discounted_ngcount keeps its previous
	       value. */
	    switch (ng->discounting_method) {
	    case GOOD_TURING:
	      if (ng_count <= ng->disc_range[n]) {
		discounted_ngcount = ng->gt_disc_ratio[n][ng_count] * ng_count;
	      }
	      else {
		discounted_ngcount = ng_count;
	      }
	      break;
	    case LINEAR:
	      discounted_ngcount = ng->lin_disc_ratio[n] * ng_count;
	      break;
	    case ABSOLUTE:
	      discounted_ngcount = ng_count - ng->abs_disc_const[n];
	      break;
	    case WITTEN_BELL:
	      /* context_count * ng_count / (context_count + num_of_types),
	         where the context count is the marginal unigram count when
	         n == 1 and the level-(n-1) count otherwise. */
	      if (n==1) {

		discounted_ngcount = ((double) 
				      return_count(ng->four_byte_counts,
						   ng->count_table[0],
						   ng->marg_counts,
						   ng->marg_counts4,
						   current_pos[0]) * ng_count)
		  / (return_count(ng->four_byte_counts,
				  ng->count_table[0],
				  ng->marg_counts,
				  ng->marg_counts4,
				  current_pos[0]) + 
		     num_of_types(0,current_pos[0],ng));
	      }
	      else {
		
		discounted_ngcount = ((double) 
				      return_count(ng->four_byte_counts,
						   ng->count_table[n-1],
						   ng->count[n-1],
						   ng->count4[n-1],
						   current_pos[n-1])* ng_count)
		  / (return_count(ng->four_byte_counts,
				  ng->count_table[n-1],
				  ng->count[n-1],
				  ng->count4[n-1],
				  current_pos[n-1]) + 
		     num_of_types(n-1,current_pos[n-1],ng));

	      }	  
	      
	      break;
	    }

	    /* Conditional probability = discounted count / context count,
	       using the same choice of context count as above. */
	    if (n==1) {
	      cond_prob = ((double) discounted_ngcount / 
			   return_count(ng->four_byte_counts,
					ng->count_table[0],
					ng->marg_counts,
					ng->marg_counts4,
					current_pos[0]));
	    }
	    else {
	      cond_prob = ((double) discounted_ngcount /  
			   return_count(ng->four_byte_counts,
					ng->count_table[n-1],
					ng->count[n-1],
					ng->count4[n-1],
					current_pos[n-1]));

	    }
	    sum_cond_prob += cond_prob;

	    /* Fill up sought ngram array with correct stuff */

	    /* Word ids from levels 1..n only; the level-0 word is left
	       out of the sequence handed to bo_ng_prob(). */
	    for (i=1;i<=n;i++) {
	      sought_ngram[i-1] = ng->word_id[i][current_pos[i]];
	    }


	    bo_ng_prob(n-1,sought_ngram,ng,verbosity,&bo_prob,&bo_case);
	    sum_bo_prob += bo_prob;
	    current_pos[n]++;			
					       
	  }
	  else {

	    /* All children of the current level-(n-1) entry have been
	       summed: turn the two sums into its back-off weight. */
	    discount_mass = 1.0 - sum_cond_prob;

	    /* Values below 1e-10 (including negative ones) are clamped
	       to zero and reported. */
	    if (discount_mass < 1e-10) {
	      discount_mass = 0.0;
	      pc_message(verbosity,2,"Warning : Back off weight for %s(id %d) ",
			 ng->vocab[current_pos[0]],current_pos[0]);
	      for (i=1;i<=n-1;i++) {
		pc_message(verbosity,2,"%s(id %d) ",ng->vocab[ng->word_id[i][current_pos[i]]],ng->word_id[i][current_pos[i]]);
	      }
	      pc_message(verbosity,2,
			 "is set to 0 (sum of probs = %f).\nMay cause problems with zero probabilities.\n",sum_cond_prob);
	    }

	    leftout_bo_prob = 1.0 - sum_bo_prob;
	    if (leftout_bo_prob < 1e-10) {
	      leftout_bo_prob = 0.0;
	    }

	    /* The division is skipped when the denominator was clamped to
	       zero; alpha is then forced to 0 with a warning. */
	    if (leftout_bo_prob > 0.0) {
	      alpha = discount_mass / leftout_bo_prob;
	    }
	    else {
	      alpha = 0.0;	/* Will not be used. Should happen very rarely. */
	      pc_message(verbosity,2,"Warning : Back off weight for %s(id %d) ",
			 ng->vocab[current_pos[0]],current_pos[0]);
	      for (i=1;i<=n-1;i++) {
		pc_message(verbosity,2,"%s(id %d) ",ng->vocab[ng->word_id[i][current_pos[i]]],ng->word_id[i][current_pos[i]]);
	      }
	      pc_message(verbosity,2,
			 "is set to 0.\nMay cause problems with zero probabilities.\n");

	    }
	  
	    /* Store alpha directly, or the value short_alpha() returns for
	       it (presumably a compact index into ng->alpha_array; confirm
	       against short_alpha). */
	    if (ng->four_byte_alphas) {
	      ng->bo_weight4[n-1][current_pos[n-1]] = alpha;
	    }
	    else {
	      ng->bo_weight[n-1][current_pos[n-1]] = 
		short_alpha(alpha,
			    ng->alpha_array,
			    &(ng->size_of_alpha_array),
			    65535 - ng->out_of_range_alphas,
			    ng->min_alpha,
			    ng->max_alpha);
	    }
	  
	    /* Finished current (n-1)-gram */

	    /* Reset the sums, step up one level and advance to the next
	       sibling there (level 0 is advanced by the outer for loop). */
	    sum_cond_prob = 0.0;
	    sum_bo_prob = 0.0;
	    current_table--;
	    if (current_table > 0) {
	      current_pos[current_table]++;
	    }
	  }
	}
	else {

	  /* Intermediate level: descend into the next entry if one is
	     left under the current parent, otherwise climb back up. */
	  if (current_pos[current_table] <= end_pos[current_table]) {
	    current_table++;
	    /* end_pos for the new level is derived the same way as
	       end_pos[1] above. */
	    if (current_pos[current_table-1] == ng->num_kgrams[current_table-1]-1) {
	      end_pos[current_table] = ng->num_kgrams[current_table]-1;
	    }
	    else {
	      end_pos[current_table] = get_full_index(ng->ind[current_table-1][current_pos[current_table-1]+1],ng->ptr_table[current_table-1],ng->ptr_table_size[current_table-1],current_pos[current_table-1]+1)-1;
	    }
	  }
	  else {
	    current_table--;
	    if (current_table > 0) {
	      current_pos[current_table]++;
	    }
	  }
	}
      }
    }

    /* Now deal with zeroton unigrams */

    /* A unigram with zero marginal count gets weight 1.0, and only when
       the unigram level itself is being processed (n == 1). */
    else {
      if (n == 1) {
	if (ng->four_byte_alphas) {
	  ng->bo_weight4[0][current_pos[0]] = 1.0;
	}
	else {
	  ng->bo_weight[0][current_pos[0]] = 
	    short_alpha(1.0,
			ng->alpha_array,
			&(ng->size_of_alpha_array),
			65535 - ng->out_of_range_alphas,
			ng->min_alpha,
			ng->max_alpha);
	}
      }
    }
  }
  free(end_pos);
  free(current_pos);
  free(sought_ngram);
  
}
Пример #17
0
/*
 * Report the model-building parameters through pc_message at level 2.
 *
 * Printed in order: n, the input id n-gram file and its format, the
 * requested output files, the vocabulary and context-cue files, the
 * per-order cutoffs, the vocabulary type, the minimum unigram count and
 * zeroton fraction, the count storage width, the discounting method, the
 * tree memory-allocation strategy and the back-off weight storage settings.
 *
 * verbosity        : forwarded to pc_message and the discounting method's
 *                    verbose_method callback.
 * ng               : model structure the settings are read from.
 * is_ascii         : non-zero when the input file is in ascii format.
 * mem_alloc_method : TWO_PASSES, BUFFER or SPECIFIED; any other value
 *                    prints only the section heading.
 * buffer_size      : size in MB, reported only for BUFFER.
 */
void report_param(int verbosity, ng_t *ng, 
		  flag is_ascii,
		  flag mem_alloc_method,
		  int buffer_size
		  )
{
  int order;

  pc_message(verbosity,2,"  n : %d\n",ng->n);
  pc_message(verbosity,2,"  Input file : %s",ng->id_gram_filename);
  if (!is_ascii) {
    pc_message(verbosity,2,"     (binary format)\n");
  }
  else {
    pc_message(verbosity,2,"     (ascii format)\n");
  }

  pc_message(verbosity,2,"  Output files :\n");

  if (ng->write_arpa) {
    pc_message(verbosity,2,"     ARPA format   : %s\n",ng->arpa_filename);
  }
  if (ng->write_bin) {
    pc_message(verbosity,2,"     Binary format : %s\n",ng->bin_filename);
  }

  pc_message(verbosity,2,"  Vocabulary file : %s\n",ng->vocab_filename);
  if (ng->context_set) {
    pc_message(verbosity,2,"  Context cues file : %s\n",ng->context_cues_filename);
  }

  /* Cutoffs exist for orders 2..n; cutoffs[0] belongs to the 2-grams. */
  pc_message(verbosity,2,"  Cutoffs :\n     ");
  for (order=2;order<=ng->n;order++) {
    pc_message(verbosity,2,"%d-gram : %d     ",order,ng->cutoffs[order-2]);
  }
  pc_message(verbosity,2,"\n");

  if (ng->vocab_type == CLOSED_VOCAB) {
    pc_message(verbosity,2,"  Vocabulary type : Closed\n");
  }
  else if (ng->vocab_type == OPEN_VOCAB_1) {
    pc_message(verbosity,2,"  Vocabulary type : Open - type 1\n");
  }
  else if (ng->vocab_type == OPEN_VOCAB_2) {
    pc_message(verbosity,2,"  Vocabulary type : Open - type 2\n");
    pc_message(verbosity,2,"     OOV fraction = %g\n",ng->oov_fraction);
  }

  pc_message(verbosity,2,"  Minimum unigram count : %d\n",ng->min_unicount);
  pc_message(verbosity,2,"  Zeroton fraction : %g\n",ng->zeroton_fraction);

  /* The count table size is only reported for two-byte counts. */
  if (!ng->four_byte_counts) {
    pc_message(verbosity,2,"  Counts will be stored in two bytes.\n");
    pc_message(verbosity,2,"  Count table size : %d\n",ng->count_table_size);
  }
  else {
    pc_message(verbosity,2,"  Counts will be stored in four bytes.\n");
  }

  pc_message(verbosity,2,"  Discounting method : ");
  NG_DISC_METH(ng)->verbose_method(ng, verbosity);

  pc_message(verbosity,2,"  Memory allocation for tree structure : \n");
  if (mem_alloc_method == TWO_PASSES) {
    pc_message(verbosity,2,"     Perform a preliminary pass over the id n-gram file to determine \n     the amount of memory to allocate\n");
  }
  else if (mem_alloc_method == BUFFER) {
    pc_message(verbosity,2,"     Allocate %d MB of memory, shared equally between all n-gram tables.\n",buffer_size);
  }
  else if (mem_alloc_method == SPECIFIED) {
    /* table_sizes is indexed by tree level, so order k uses entry k-1. */
    pc_message(verbosity,2,"     Memory requirement specified.\n          ");
    for (order=2;order<=ng->n;order++) {
      pc_message(verbosity,2,"%d-gram : %d     ",order,ng->table_sizes[order-1]);
    }
    pc_message(verbosity,2,"\n");
  }

  pc_message(verbosity,2,"  Back-off weight storage : \n");

  /* The alpha range details are only reported for two-byte weights. */
  if (!ng->four_byte_alphas) {
    pc_message(verbosity,2,"     Back-off weights will be stored in two bytes.\n");
    pc_message(verbosity,2,"        Minimum back-off weight : %g\n",ng->min_alpha);
    pc_message(verbosity,2,"        Maximum back-off weight : %g\n",ng->max_alpha);
    pc_message(verbosity,2,"        Maximum number of out of range back-off weights : %d\n",ng->out_of_range_alphas);
  }
  else {
    pc_message(verbosity,2,"     Back-off weights will be stored in four bytes.\n");
  }

}
Пример #18
0
/*
 * Write the language model in ng to ng->arpa_fp as an ARPA-style text file.
 *
 * Output order: the header lines, the 1-gram section (log10 unigram
 * probability, word and, when ng->n > 1, log10 back-off weight), then one
 * section per higher order produced by a depth-first walk of the n-gram
 * tree.  Each higher-order line holds the log10 probability, the words and,
 * for every order except the highest, the log10 back-off weight.
 *
 * Two kinds of entry are deliberately not written:
 *   - any entry whose first word contains "</s>";
 *   - any entry of order three or more whose last word contains "<s>".
 * Skipped entries are tallied in the file-level skipped_bigrams and
 * skipped_trigrams counters (presumably consumed by
 * final_ngram_count_replacement(); confirm there).
 *
 * After the "\end\" marker the stream is closed,
 * final_ngram_count_replacement() is called, and the file-level n-gram and
 * skip counters are reset to zero.
 *
 * ng        : model to write; ng->disc_meth is created on demand if NULL.
 * verbosity : forwarded to pc_message.
 *
 * Calls quit() if a computed n-gram probability exceeds 1.
 */
void write_arpa_lm(ng_t *ng,int verbosity) {
    
    int *current_pos;
    int *end_pos;
    ngram_sz_t i;
    double log_10_of_e = 1.0 / log(10.0);
    
    /* HEADER */
    
    pc_message(verbosity,1,"ARPA-style %d-gram will be written to %s\n",ng->n,ng->arpa_filename);
    
    write_arpa_copyright(ng->arpa_fp,ng->n,ng->vocab_size, ng->vocab[1],ng->vocab[2],ng->vocab[3]);
    
    display_vocabtype(ng->vocab_type,ng->oov_fraction, ng->arpa_fp);  
    display_discounting_method(ng,ng->arpa_fp);
    write_arpa_format(ng->arpa_fp,ng->n);
    write_arpa_num_grams(ng->arpa_fp,ng,NULL,0);
    write_arpa_k_gram_header(ng->arpa_fp,1);
    
    /* 1-GRAMS */
    
    for (i=ng->first_id; i<= (int) ng->vocab_size;i++) {
        
        double log10_uniprob;
        double log10_alpha;
        double alpha;
        
        /* uni_log_probs is scaled by 1/ln(10) to get a base-10 log;
           non-positive probabilities are written as BAD_LOG_PROB. */
        log10_uniprob = ng->uni_log_probs[i]*log_10_of_e;
        
        if (ng->uni_probs[i]<=0.0)
            log10_uniprob = BAD_LOG_PROB;
        
        alpha=ng_double_alpha(ng,0,i);
        
        if(alpha > 0.0)
            log10_alpha = log10(alpha);
        else
            log10_alpha = BAD_LOG_PROB;
        
        fprintf(ng->arpa_fp,"%.4f %s",log10_uniprob,ng->vocab[i]);
        if (ng->n>1)
            fprintf(ng->arpa_fp,"\t%.4f\n",log10_alpha);
        else
            fprintf(ng->arpa_fp,"\n");
    }
    
    /* One cursor / last-child index per tree level 0..n-1. */
    current_pos = (int *) rr_malloc(ng->n*sizeof(int));
    end_pos = (int *) rr_malloc(ng->n*sizeof(int)); 
    
    /* Print 2-gram, ... n-gram info; tree level i holds the (i+1)-grams. */
    
    for (i=1;i<=ng->n-1;i++) {
        
        /* Print out the (i+1)-gram */
        
        int current_table, j;
        count_t ngcount, marg_count;
        double discounted_ngcount;    
        double ngprob, log_10_ngprob, ngalpha, log_10_ngalpha;
        
        /* Initialise variables for the sake of warning-free compilation */
#ifdef STATICANALYZEDEPENDENCIES
#define __clang_analyzer__ 1
#endif
#if !defined(__clang_analyzer__) || defined(STATICANALYZEDEPENDENCIES)
#undef __clang_analyzer__
        
        discounted_ngcount = 0.0;
        log_10_ngalpha = 0.0;
#endif
        write_arpa_k_gram_header(ng->arpa_fp,i+1);
        
        /* Go through the n-gram list in order */
        
        for (j=0;j<=ng->n-1;j++) {
            current_pos[j] = 0;
            end_pos[j] = 0;
        }
        
        for (current_pos[0]=ng->first_id;
             current_pos[0]<=(int) ng->vocab_size;
             current_pos[0]++) {
            
            /* Unigrams with a zero marginal count have no subtree to print. */
            if (return_count(ng->four_byte_counts,
                             ng->count_table[0], 
                             ng->marg_counts,
                             ng->marg_counts4,
                             current_pos[0]) > 0) {
                
                current_table = 1;
                
                /* Last bigram under this unigram: the end of the table for
                   the last vocabulary entry, otherwise one before the index
                   obtained for the next unigram. */
                if (current_pos[0] == (int) ng->vocab_size)
                    end_pos[1] = (int ) ng->num_kgrams[1]-1;
                else {
                    end_pos[1] = get_full_index(ng->ind[0][current_pos[0]+1],
                                                ng->ptr_table[0],
                                                ng->ptr_table_size[0],
                                                current_pos[0]+1)-1;
                }
                
                /* current_table is the level being visited; it returns to 0
                   when every descendant of this unigram has been handled. */
                while (current_table > 0) {
                    
                    if (current_table == i) {
                        
                        if (current_pos[i] <= end_pos[i]) {
                            
                            ngcount = return_count(ng->four_byte_counts,
                                                   ng->count_table[i],
                                                   ng->count[i],
                                                   ng->count4[i],
                                                   current_pos[i]);
                            
                            /* Context count: the marginal unigram count for
                               bigrams, the parent entry's count otherwise. */
                            if (i==1) {
                                marg_count = return_count(ng->four_byte_counts,
                                                          ng->count_table[0], 
                                                          ng->marg_counts,
                                                          ng->marg_counts4,
                                                          current_pos[0]);
                            }else {
                                marg_count = return_count(ng->four_byte_counts,
                                                          ng->count_table[i-1],
                                                          ng->count[i-1],
                                                          ng->count4[i-1],
                                                          current_pos[i-1]);
                            }
                            
                            if(ng->disc_meth==NULL)
                                ng->disc_meth=(disc_meth_t*) disc_meth_init(ng->discounting_method);
                            
                            assert(ng->disc_meth);
                            discounted_ngcount = 
                            NG_DISC_METH(ng)->dump_discounted_ngram_count(ng,i,ngcount,marg_count,current_pos);
                            
                            ngprob = (double) discounted_ngcount / marg_count;
                            
                            if (ngprob > 1.0) {
                                /* Print only positions 0..i.  current_pos has
                                   ng->n elements, so always printing
                                   current_pos[2] read past the end of the
                                   array for a bigram model. */
                                fprintf(stderr,
                                        "discounted_ngcount = %f marg_count = %d",
                                        discounted_ngcount,marg_count);
                                for (j=0;j<=i;j++)
                                    fprintf(stderr," %d",current_pos[j]);
                                fprintf(stderr,"\n");
                                quit(-1,"Error : probablity of ngram is greater than one.\n");
                            }
                            
                            if (ngprob > 0.0) 
                                log_10_ngprob = log10(ngprob);
                            else 
                                log_10_ngprob = BAD_LOG_PROB;
                            
                            /* Only orders below the highest carry a back-off
                               weight. */
                            if (i <= ng->n-2) {
                                ngalpha = ng_double_alpha(ng, i, current_pos[i]);
                                
                                if (ngalpha > 0.0)
                                    log_10_ngalpha = log10(ngalpha);
                                else
                                    log_10_ngalpha = BAD_LOG_PROB;
                            }
                            
                            /* HLW: write the entry unless its first word
                               contains "</s>" or, for trigrams and above
                               (i > 1), its last word contains "<s>". */
                            if ((strstr(ng->vocab[current_pos[0]],"</s>") == NULL) &&
                                ((i <= 1) ||
                                 (strstr(ng->vocab[(unsigned int) ng->word_id[i][current_pos[i]]],"<s>") == NULL))) {
                                
                                fprintf(ng->arpa_fp,"%.4f ",log_10_ngprob);
                                fprintf(ng->arpa_fp,"%s ",ng->vocab[current_pos[0]]);
                                for (j=1;j<=i;j++){
                                    
                                    fprintf(ng->arpa_fp,"%s ",ng->vocab[(unsigned int) ng->word_id[j][current_pos[j]]]);
                                }
                                
                                if (i <= ng->n-2){
                                    fprintf(ng->arpa_fp,"%.4f\n",log_10_ngalpha);
                                } else{
                                    fprintf(ng->arpa_fp,"\n");
                                }
                            } else {
                                /* HLW: entry suppressed; record it.  i is
                                   never 0 in this loop, so only the bigram
                                   and trigram counters can be affected. */
                                if (i==1) {
                                    skipped_bigrams++;
                                } else if (i==2) {
                                    skipped_trigrams++;
                                }
                            }
                            
                            current_pos[i]++;        
                        }else {
                            /* No children left under this parent: climb one
                               level and move to the next sibling there. */
                            current_table--;
                            if (current_table > 0)
                                current_pos[current_table]++;
                        }
                    }else {
                        
                        /* Intermediate level: descend if an entry remains
                           under the current parent, otherwise climb. */
                        if (current_pos[current_table] <= end_pos[current_table]) {
                            current_table++;
                            if (current_pos[current_table-1] == (int) ng->num_kgrams[current_table-1]-1)
                                end_pos[current_table] = (int) ng->num_kgrams[current_table]-1;
                            else {
                                end_pos[current_table] = get_full_index(ng->ind[current_table-1][current_pos[current_table-1]+1],
                                                                        ng->ptr_table[current_table-1],
                                                                        ng->ptr_table_size[current_table-1],
                                                                        current_pos[current_table-1]+1) - 1;
                            }
                        }else {
                            current_table--;
                            if (current_table > 0)
                                current_pos[current_table]++;
                        }
                    }
                }
            }
        }
    } 
    
    free(current_pos);
    free(end_pos);
    
    fprintf(ng->arpa_fp,"\n\\end\\\n");
    
    rr_oclose(ng->arpa_fp);
    
    // BEGIN HLW ADDITION
    
    // Now that the file is complete, let's go back and replace the placeholder ngram counts with the real final counts  -- HLW
    
    final_ngram_count_replacement(ng->n,ng);
    
    unigram_count = 0;
    bigram_count = 0;
    trigram_count = 0;
    skipped_unigrams = 0;
    skipped_bigrams = 0;
    skipped_trigrams = 0;
    
    // END HLW ADDITION
} 
Пример #19
0
/*
 * Convert a text stream into sorted id n-gram temporary files.
 *
 * Words are read from infp with get_word(), mapped to ids with index2() and
 * stored in buffer as overlapping n-grams; n-gram k occupies
 * buffer[k*n] .. buffer[k*n+n-1].  Whenever buffer_size n-grams have been
 * collected, or the input ends, slots 0 .. position_in_buffer-1 are sorted
 * with compare_ngrams and written to the file
 * "<temp_file_root>/<file number><temp_file_ext>" as records of n word ids
 * followed by an int count of identical n-grams.  The n-gram in slot
 * position_in_buffer is not part of the sorted region; it is carried over
 * to slot 0 of the next buffer-load.
 *
 * buffer         : caller-provided storage; slot buffer_size is written, so
 *                  it must hold at least buffer_size+1 n-grams.
 * temp_file      : used only as a local handle; the caller's value is
 *                  overwritten by rr_oopen().
 *
 * Side effect: sets the file-level variable ng to n (presumably read by the
 * buffer helpers and compare_ngrams; confirm against their definitions).
 *
 * @return number_of_tempfiles
 */
int  read_txt2ngram_buffer(FILE* infp, 
			   struct idngram_hash_table *vocabulary, 
			   int32 verbosity,
			   wordid_t *buffer,
			   int buffer_size,
			   unsigned int n,
			   char* temp_file_root,
			   char* temp_file_ext,
			   FILE* temp_file
			   )
{
  /* Read text into buffer */
  char temp_word[MAX_WORD_LENGTH];
  int position_in_buffer;
  int number_of_tempfiles;
  int path_len;
  unsigned int i,j;
  wordid_t *placeholder;
  wordid_t *temp_ngram;
  int temp_count;

  temp_ngram  = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);
  placeholder = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);

  ng=n;

  position_in_buffer = 0;
  number_of_tempfiles = 0;

  //tk: looks like things may croak if the corpus has less than n words
  //not that such a corpus would be useful anyway
  /* Loops use i<n rather than i<=n-1: n is unsigned, so n-1 would wrap to
     a huge bound if n were ever 0. */
  for (i=0;i<n;i++) {
    get_word(infp,temp_word);
    add_to_buffer(index2(vocabulary,temp_word),0,i,buffer);
  }

  while (!rr_feof(infp)) {
    /* Fill up the buffer */
    pc_message(verbosity,2,"Reading text into the n-gram buffer...\n");
    pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");

    while ((position_in_buffer<buffer_size) && (!rr_feof(infp))) {
      position_in_buffer++;
      show_idngram_nlines(position_in_buffer,verbosity);

      /* The new n-gram is the previous one shifted left by one word... */
      for (i=1;i<n;i++) 
	add_to_buffer(buffer_contents(position_in_buffer-1,i,buffer),
		      position_in_buffer,i-1,buffer);
      
      /* ...with the next word of the text appended, if there is one. */
      if (get_word(infp,temp_word) == 1) {
	add_to_buffer(index2(vocabulary,temp_word),position_in_buffer,
		      n-1,buffer);
      }
    }

    /* Save the n-gram in progress; the sort below does not touch its slot
       and it becomes slot 0 of the next buffer-load. */
    for (i=0;i<n;i++) 
      placeholder[i] = buffer_contents(position_in_buffer,i,buffer);

    /* Sort buffer */
    
    pc_message(verbosity,2,"\nSorting n-grams...\n");    
    
    qsort((void*) buffer,(size_t) position_in_buffer,n*sizeof(wordid_t),compare_ngrams);

    /* Output the buffer to temporary BINARY file */    
    number_of_tempfiles++;

    /* temp_word doubles as the path buffer, so the write is bounded and a
       truncated path is treated as fatal instead of overflowing it. */
    path_len = snprintf(temp_word,sizeof(temp_word),"%s/%hu%s",temp_file_root,
			(unsigned short) number_of_tempfiles,temp_file_ext);
    if (path_len < 0 || (size_t) path_len >= sizeof(temp_word))
      quit(-1,"Temporary file name is too long.\nAborting");

    pc_message(verbosity,2,"Writing sorted n-grams to temporary file %s\n",
	       temp_word);

    temp_file = rr_oopen(temp_word);

    for (i=0;i<n;i++) {
      temp_ngram[i] = buffer_contents(0,i,buffer);
#if MAX_VOCAB_SIZE < 65535
      /* This check is well-meaning but completely useless since
	 buffer_contents() can never return something greater than
	 MAX_VOCAB_SIZE (dhuggins@cs, 2006-03) */
      if (temp_ngram[i] > MAX_VOCAB_SIZE)
	quit(-1,"Invalid trigram in buffer.\nAborting");
#endif
    }
    temp_count = 1;

    /* Collapse runs of identical n-grams in the sorted region into one
       record each.  A run ends where the next sorted n-gram differs or at
       the end of the sorted region.  The last run is always written: it
       must not be compared against slot position_in_buffer, which holds the
       unsorted carry-over n-gram; when the two happened to be equal the
       last run used to be dropped. */
    for (i=1;i<=(unsigned int) position_in_buffer;i++) {
      int at_end = (i == (unsigned int) position_in_buffer);

      if (!at_end && !compare_ngrams(temp_ngram,&buffer[i*n])) {
	temp_count++;
	continue;
      }

      for (j=0;j<n;j++) {
	rr_fwrite((char*) &temp_ngram[j],sizeof(wordid_t),1,
		  temp_file,"temporary n-gram ids");
      }
      rr_fwrite((char*)&temp_count,sizeof(int),1,temp_file,
		"temporary n-gram counts");

      if (!at_end) {
	/* Start the next run from the n-gram that broke this one. */
	for (j=0;j<n;j++)
	  temp_ngram[j] = buffer_contents(i,j,buffer);
      }
      temp_count = 1;
    }
    
    rr_oclose(temp_file);

    for (i=0;i<n;i++) 
      add_to_buffer(placeholder[i],0,i,buffer);

    position_in_buffer = 0;

  }

  /* Both scratch n-grams are owned by this function. */
  free(temp_ngram);
  free(placeholder);

  return number_of_tempfiles;
}
Пример #20
0
/*
 * Write the language model in ng to ng->bin_fp in the binary format.
 *
 * The file is written in this fixed order: scalar parameters, the
 * vocabulary hash table, marginal counts, the alpha array, the count tables
 * (two-byte counts only), the pointer tables, unigram statistics, cutoffs,
 * the parameters of the discounting method in use, and finally the n-gram
 * tree.  The unigram level of the tree is written in single calls; the
 * higher levels are written in chunks of at most 100000 records, between
 * two swap_struct() calls.  The stream is closed before the second
 * swap_struct() call.
 *
 * ng        : model to write; ng->version is set to BBO_FILE_VERSION.
 * verbosity : forwarded to pc_message.
 */
void write_bin_lm(ng_t *ng,int verbosity) {
    
    int chunk_len;   /* records written by the current chunked call */
    int offset;      /* first record of the current chunk */
    int k;           /* table / tree level index */
    
    pc_message(verbosity,1,"Binary %d-gram language model will be written to %s\n",ng->n,ng->bin_filename);
    
    ng->version = BBO_FILE_VERSION;
    
    /* ---- Scalar parameters ---- */
    
    rr_fwrite((char*)&ng->version,sizeof(int),1,ng->bin_fp,"version");
    rr_fwrite((char*)&ng->n,sizeof(unsigned short),1,ng->bin_fp,"n");
    
    rr_fwrite((char*)&ng->vocab_size,sizeof(wordid_t),1,ng->bin_fp,"vocab_size");
    rr_fwrite((char*)&ng->no_of_ccs,sizeof(unsigned short),1,ng->bin_fp,"no_of_ccs");
    rr_fwrite((char*)&ng->vocab_type,sizeof(unsigned short),1,ng->bin_fp,"vocab_type");
    
    rr_fwrite((char*)&ng->count_table_size,sizeof(count_ind_t),1,ng->bin_fp,"count_table_size");
    rr_fwrite((char*)&ng->discounting_method,sizeof(unsigned short),1,ng->bin_fp,"discounting_method");
    
    rr_fwrite((char*)&ng->min_alpha,sizeof(double),1,ng->bin_fp,"min_alpha");
    rr_fwrite((char*)&ng->max_alpha,sizeof(double),1,ng->bin_fp,"max_alpha");
    rr_fwrite((char*)&ng->out_of_range_alphas,sizeof(unsigned short),1,ng->bin_fp,"out_of_range_alphas");
    rr_fwrite((char*)&ng->size_of_alpha_array,sizeof(unsigned short),1,ng->bin_fp,"size_of_alpha_array");  
    
    rr_fwrite((char*)&ng->n_unigrams,sizeof(ngram_sz_t),1,ng->bin_fp,"n_unigrams");
    rr_fwrite((char*)&ng->zeroton_fraction,sizeof(double),1,ng->bin_fp,"zeroton_fraction");
    rr_fwrite((char*)&ng->oov_fraction,sizeof(double),1,ng->bin_fp,"oov_fraction");
    rr_fwrite((char*)&ng->four_byte_counts,sizeof(flag),1,ng->bin_fp,"four_byte_counts");
    rr_fwrite((char*)&ng->four_byte_alphas,sizeof(flag),1,ng->bin_fp,"four_byte_alphas");
    
    rr_fwrite((char*)&ng->first_id,sizeof(unsigned short),1,
              ng->bin_fp,"first_id");
    
    /* ---- Short and shortish arrays ---- */
    
    /* ng->vocab itself is not stored; it will be derived from ng->vocab_ht. */
    sih_val_write_to_file(ng->vocab_ht,ng->bin_fp,ng->bin_filename,0);
    
    if (ng->four_byte_counts!=1) {
        assert(ng->marg_counts);
        rr_fwrite((char*)ng->marg_counts,sizeof(count_ind_t),
                  ng->vocab_size+1,ng->bin_fp,"marg_counts");
    }
    else {
        assert(ng->marg_counts4);
        rr_fwrite((char*)ng->marg_counts4,sizeof(count_t),
                  ng->vocab_size+1,ng->bin_fp,"marg_counts");
    }
    
    rr_fwrite((char*)ng->alpha_array,sizeof(double),
              ng->size_of_alpha_array,ng->bin_fp,"alpha_array");
    
    /* Count tables are only written for two-byte counts.  They could go
       out as one block, but are written one table at a time to be safe
       (same motivation as the chunked tree writes further down). */
    if (!ng->four_byte_counts) {
        for (k=0;k<ng->n;k++) {
            rr_fwrite((char*)ng->count_table[k],sizeof(count_t),
                      ng->count_table_size+1,ng->bin_fp,"count_table");
        }
    }
    
    rr_fwrite((char*)ng->ptr_table_size,sizeof(ptr_tab_sz_t),ng->n,ng->bin_fp,"ptr_table_size");
    
    for (k=0;k<ng->n;k++) {
        rr_fwrite((char*)ng->ptr_table[k],sizeof(ptr_tab_t),ng->ptr_table_size[k],ng->bin_fp,"ptr_table");
    }
    
    /* ---- Unigram statistics ---- */
    
    rr_fwrite((char*)ng->uni_probs,sizeof(uni_probs_t), ng->vocab_size+1,
              ng->bin_fp,"uni_probs");
    rr_fwrite((char*)ng->uni_log_probs,sizeof(uni_probs_t),ng->vocab_size+1,
              ng->bin_fp,"uni_log_probs");
    rr_fwrite((char*)ng->context_cue,sizeof(flag),ng->vocab_size+1,
              ng->bin_fp,"context_cue");
    
    rr_fwrite((char*)ng->cutoffs,sizeof(cutoff_t),ng->n,ng->bin_fp,"cutoffs");
    
    /* ---- Discounting parameters (Witten-Bell stores nothing) ---- */
    
    switch (ng->discounting_method) {
        case WITTEN_BELL:
            break;
        case LINEAR:
            rr_fwrite((char*)ng->lin_disc_ratio,sizeof(disc_val_t),
                      ng->n,ng->bin_fp,"lin_disc_ratio");
            break;
        case ABSOLUTE:
            rr_fwrite((char*)ng->abs_disc_const,sizeof(double),
                      ng->n,ng->bin_fp,"abs_disc_const");
            break;
        case GOOD_TURING:
            rr_fwrite((char*)ng->fof_size,sizeof(fof_sz_t),ng->n,ng->bin_fp,"fof_size");
            rr_fwrite((char*)ng->disc_range,sizeof(unsigned short),ng->n,
                      ng->bin_fp,"disc_range");
            for (k=0;k<ng->n;k++) {
                rr_fwrite((char*)ng->freq_of_freq[k],sizeof(fof_t),
                          ng->fof_size[k]+1,ng->bin_fp,"freq_of_freq");
            }
            for (k=0;k<ng->n;k++) {
                rr_fwrite((char*)ng->gt_disc_ratio[k],sizeof(disc_val_t),
                          ng->disc_range[k]+1,ng->bin_fp,"gt_disc_ratio");
            }
            break;
    }
    
    /* ---- Tree information ---- */
    
    /* Unigram level first, since it can be dumped all in one go. */
    
    rr_fwrite((char*)ng->num_kgrams,sizeof(ngram_sz_t),ng->n,ng->bin_fp,"num_kgrams");
    
    if (!ng->four_byte_counts) {
        rr_fwrite((char*)ng->count[0],sizeof(count_ind_t),ng->vocab_size+1,
                  ng->bin_fp,"unigram counts");
    }
    else {
        rr_fwrite((char*)ng->count4[0],sizeof(count_t),ng->vocab_size+1,
                  ng->bin_fp,"unigram counts");
    }
    
    if (!ng->four_byte_alphas) {
        rr_fwrite((char*)ng->bo_weight[0],sizeof(bo_weight_t),ng->vocab_size+1,
                  ng->bin_fp,"unigram backoff weights");
    }
    else {
        rr_fwrite((char*)ng->bo_weight4[0],sizeof(four_byte_t),ng->vocab_size+1,
                  ng->bin_fp,"unigram backoff weights");
    }
    
    if (ng->n > 1) {
        rr_fwrite((char*)ng->ind[0],sizeof(index__t),ng->vocab_size+1,
                  ng->bin_fp,"unigram -> bigram pointers");
    }
    
    /* The rest of the tree is written in chunks, otherwise the kernel
       buffers are too big.  These writes pass an element size of 1, so
       byte swapping is done on the structure itself beforehand. */
    swap_struct(ng);
    
    /* Word ids, levels 1..n-1. */
    for (k=1;k<ng->n;k++) {
        chunk_len = 100000;
        for (offset=0; offset < ng->num_kgrams[k]; offset += chunk_len) {
            /* The final chunk is shortened to whatever is left. */
            if (offset+chunk_len > ng->num_kgrams[k]) {
                chunk_len = ng->num_kgrams[k] - offset;
            }
            rr_fwrite((char*)&ng->word_id[k][offset],1,sizeof(id__t)*chunk_len,ng->bin_fp,"word ids");
        }
    }
    
    /* Counts, levels 1..n-1. */
    for (k=1;k<ng->n;k++) {
        chunk_len = 100000;
        for (offset=0; offset < ng->num_kgrams[k]; offset += chunk_len) {
            if (offset+chunk_len > ng->num_kgrams[k]) {
                chunk_len = ng->num_kgrams[k] - offset;
            }
            if (!ng->four_byte_counts) {
                rr_fwrite((char*)&ng->count[k][offset],1,sizeof(count_ind_t)*chunk_len,ng->bin_fp,"counts");
            }
            else {
                rr_fwrite((char*)&ng->count4[k][offset],1,sizeof(count_t)*chunk_len,ng->bin_fp,"counts");
            }
        }
    }
    
    /* Back-off weights, levels 1..n-2 (the top level has none). */
    for (k=1;k+1<ng->n;k++) {
        chunk_len = 100000;
        for (offset=0; offset < ng->num_kgrams[k]; offset += chunk_len) {
            if (offset+chunk_len > ng->num_kgrams[k]) {
                chunk_len = ng->num_kgrams[k] - offset;
            }
            if (!ng->four_byte_alphas) {
                rr_fwrite((char*)&ng->bo_weight[k][offset],1,sizeof(bo_weight_t)*chunk_len,
                          ng->bin_fp,"backoff weights");
            }
            else {
                rr_fwrite((char*)&ng->bo_weight4[k][offset],1,sizeof(four_byte_t)*chunk_len,
                          ng->bin_fp,"backoff weights");
            }
        }
    }
    
    /* Child indices, levels 1..n-2. */
    for (k=1;k+1<ng->n;k++) {
        chunk_len = 100000;
        for (offset=0; offset < ng->num_kgrams[k]; offset += chunk_len) {
            if (offset+chunk_len > ng->num_kgrams[k]) {
                chunk_len = ng->num_kgrams[k] - offset;
            }
            rr_fwrite((char*)&ng->ind[k][offset],1,sizeof(index__t)*chunk_len,ng->bin_fp,
                      "indices");
        }
    }
    
    rr_oclose(ng->bin_fp);
    
    /* Swap back so the in-memory model is usable again. */
    swap_struct(ng); 
}
Пример #21
0
/*
 * wfreq2vocab_impl: build a vocabulary from a word-frequency stream.
 *
 * Reads whitespace-separated "<word> <count>" records from ifp, keeps either
 * every word whose count is greater than `cutoff` or the `vocab_size` most
 * frequent words, and writes a "##" comment header followed by the selected
 * words (one per line, sorted with sort_alpha) to ofp.
 *
 *   ifp        stream the records are read from
 *   ofp        stream the header and the vocabulary are written to
 *   cutoff     count threshold; -1 means "not given"
 *   vocab_size number of most frequent words to keep; -1 means "not given".
 *              When neither option is given, 20000 is used.
 *   num_recs   capacity of the record table; reading more records is fatal
 *   verbosity  passed through to pc_message
 *
 * Giving both cutoff and vocab_size is a fatal error. Returns 0.
 *
 * To make this function less dependent on the input stream, records could be
 * pulled out through a separate interface.
 */
int wfreq2vocab_impl(FILE* ifp, FILE* ofp, int cutoff, int vocab_size, int num_recs, int verbosity)
{
  flag gt_set;
  flag top_set;
  int current_rec;
  int num_above_threshold;
  int num_to_output;
  int temp_count;
  int i;
  word_rec *records;
  char temp_word[750];

  gt_set = (cutoff != -1);
  top_set = (vocab_size != -1);
  if(cutoff==-1) cutoff=0;
  if(vocab_size==-1) vocab_size=0;

  if (gt_set && top_set) 
    quit(-1,"wfreq2vocab : Error : Can't use both the -top and the -gt options.\n");

  if (!gt_set && !top_set) 
    vocab_size = 20000;

  if (gt_set) 
    pc_message(verbosity,2,"wfreq2vocab : Will generate a vocabulary containing all words which\n              occurred more that %d times. Reading wfreq stream from stdin...\n",cutoff);
  else 
    pc_message(verbosity,2,"wfreq2vocab : Will generate a vocabulary containing the most\n              frequent %d words. Reading wfreq stream from stdin...\n",vocab_size);

  current_rec = 0;
  num_above_threshold = 0;

  records = (word_rec *) rr_malloc(sizeof(word_rec)*num_recs);

  while (!rr_feof(ifp)) {
    /* The field width keeps an over-long token from overrunning temp_word,
       and the count goes to a temporary so that nothing is stored in
       records[] until the capacity check below has passed. */
    if (fscanf(ifp, "%749s %d",temp_word,&temp_count) != 2) {
      if (!rr_feof(ifp)) 
	quit(-1,"Error reading unigram counts from standard input.\n");

    }else {
      /* Check BEFORE storing: records[] only has num_recs slots, so slot
         number num_recs must never be written. */
      if (current_rec >= num_recs) {
	quit2(-1,"The number of records %d reach the user-defined limit %d, consider to increase the number of records by -records\n",current_rec+1,num_recs);
      }

      records[current_rec].count = temp_count;
      records[current_rec].word = salloc(temp_word);
      if (gt_set && records[current_rec].count > cutoff) 
	num_above_threshold++;

      current_rec++;
    }
  }

  /* Sort records in descending order of count */

  qsort((void*) records,(size_t) current_rec, sizeof(word_rec),sort_by_count);

  if (gt_set) 
    num_to_output = num_above_threshold;
  else 
    num_to_output = vocab_size;

  if (current_rec<num_to_output) 
    num_to_output = current_rec;

  /* A negative request would turn into a huge size_t in the qsort below. */
  if (num_to_output<0) 
    num_to_output = 0;

  /* Now sort the relevant records alphabetically */

  qsort((void*) records,(size_t) num_to_output, sizeof(word_rec),sort_alpha);

  if (gt_set) 
    pc_message(verbosity,2,"Size of vocabulary = %d\n",num_to_output);

  if (num_to_output>MAX_UNIGRAM) {
    pc_message(verbosity,1,"Warning : Vocab size exceeds %d. This might cause problems with \n",MAX_UNIGRAM);
    pc_message(verbosity,1,"other tools, since word id's are stored in 2 bytes.\n");
  }

  if (num_to_output == 0) 
    pc_message(verbosity,1,"Warning : Vocab size = 0.\n");

  /* Write the header and the vocab to the same stream, so that a vocabulary
     sent to a file carries its "##" header with it. */

  fprintf(ofp,"## Vocab generated by v2 of the CMU-Cambridge Statistcal\n");
  fprintf(ofp,"## Language Modeling toolkit.\n");
  fprintf(ofp,"##\n");
  fprintf(ofp,"## Includes %d words ",num_to_output);
  fprintf(ofp,"##\n");

  for (i=0;i<=num_to_output-1;i++) 
    fprintf(ofp,"%s\n",records[i].word);

  pc_message(verbosity,0,"wfreq2vocab : Done.\n");

  /* Release the record table (rr_malloc'd memory is released with free()
     elsewhere in this file).
     NOTE(review): the word strings returned by salloc are not released
     here because salloc's allocator is not visible from this function;
     if it is malloc-based they should be freed as well. */
  free(records);

  return 0;
}
Пример #22
0
/*
 * compute_unigram: fill ng->uni_probs[] and ng->uni_log_probs[] for the ids
 * ng->first_id .. ng->vocab_size.
 *
 * Steps, in order:
 *   1. sanity checks (no OOV count with OPEN_VOCAB_2, context cues must have
 *      a zero count);
 *   2. discounted relative frequency for every id, using the method selected
 *      by ng->discounting_method;
 *   3. the mass removed by discounting is shared among the zero-count words
 *      that are not context cues, capped at ng->zeroton_fraction times the
 *      probability of a singleton;
 *   4. whatever is left is added to id 0 (OPEN_VOCAB_2) or absorbed by
 *      renormalising;
 *   5. the result must sum to 1 (fatal beyond 1e-6), then logs are stored.
 *
 * verbosity is handed to pc_message; at 4 or more the final table is also
 * dumped on stderr.
 */
void compute_unigram(ng_t *ng,int verbosity) {

    int id;
    int cnt;
    int zeroton_total;
    int type_total;
    double n_tokens;
    double p;
    double p_sum;
    double disc_mass;
    double zeroton_mass;
    double p_zeroton;
    double p_singleton;
    double residual_mass;
    double renorm;
    double sum_error;

    /* A type 2 open vocabulary cannot coexist with a counted id 0 */

    if (ng->vocab_type==OPEN_VOCAB_2) {
        if (return_count(ng->four_byte_counts, ng->count_table[0],
                         ng->count[0], ng->count4[0], 0) != 0) {
            quit(-1,"Error : Open vocabulary type 2 requested, but there were OOVs in the \ntraining data.\n");
        }
    }

    if (ng->vocab_type == CLOSED_VOCAB) {
        ng->uni_probs[0] = 1e-99;
    }

    /* Every context cue is required to have a zero count */

    if (ng->no_of_ccs > 0) {
        for (id = ng->first_id; id <= ng->vocab_size; id++) {
            if (!ng->context_cue[id]) {
                continue;
            }
            if (return_count(ng->four_byte_counts, ng->count_table[0],
                             ng->count[0], ng->count4[0], id) != 0) {
                quit(-1,"Error : Context cue word has a non zero count.\n");
            }
        }
    }

    n_tokens = (double) ng->n_unigrams;

    /* Number of ids that were seen at least once (used by Witten-Bell) */

    type_total = 0;
    for (id = ng->first_id; id <= ng->vocab_size; id++) {
        if (return_count(ng->four_byte_counts, ng->count_table[0],
                         ng->count[0], ng->count4[0], id) > 0) {
            type_total++;
        }
    }

    /* Discounted probability of each id, and their running total */

    p_sum = 0.0;
    for (id = ng->first_id; id <= ng->vocab_size; id++) {

        cnt = return_count(ng->four_byte_counts, ng->count_table[0],
                           ng->count[0], ng->count4[0], id);
        p = cnt/n_tokens;

        switch (ng->discounting_method) {
        case GOOD_TURING:
            if (cnt == 0) {
                p = 1e-99;
            }
            else if (cnt > 0 && cnt <= ng->disc_range[0]) {
                p *= ng->gt_disc_ratio[0][cnt];
            }
            break;
        case LINEAR:
            if (cnt <= 0) {
                p = 1e-99;
            }
            else {
                p *= ng->lin_disc_ratio[0];
            }
            break;
        case ABSOLUTE:
            if (cnt <= 0) {
                p = 1e-99;
            }
            else {
                p *= (cnt - ng->abs_disc_const[0])/cnt;
            }
            break;
        case WITTEN_BELL:
            if (cnt <= 0) {
                p = 1e-99;
            }
            else {
                p *= n_tokens/(n_tokens+type_total);
            }
            break;
        default:
            /* other methods leave the relative frequency untouched */
            break;
        }

        pc_message(verbosity,4,"   prob[%d] = %.8g count = %d \n",id,p,cnt);
        ng->uni_probs[id] = p;
        p_sum += p;
    }

    /* Mass freed by discounting */

    disc_mass = 1.0 - p_sum;

    pc_message(verbosity,2,"Unigrams's discount mass is %g (n1/N = %g)\n",
               disc_mass,ng->freq_of_freq[0][1]/n_tokens);

    if (disc_mass != 0.0 && disc_mass < 1e-10) {
        disc_mass = 0.0;
        pc_message(verbosity,2,"Discount mass was rounded to zero.\n");
    }

    /* Give each zero-count word that is not a context cue an equal share */

    residual_mass = disc_mass;
    zeroton_total = ng->freq_of_freq[0][0] - ng->no_of_ccs;

    if ((zeroton_total > 0) && (disc_mass > 0.0)) {

        if (ng->vocab_type == OPEN_VOCAB_2) {
            zeroton_mass = (1.0 - ng->oov_fraction)*disc_mass;
        }
        else {
            zeroton_mass = disc_mass;
        }
        p_zeroton = zeroton_mass / zeroton_total;

        /* Discounted probability of a word seen exactly once */
        p_singleton = 1 / n_tokens;
        switch (ng->discounting_method) {
        case GOOD_TURING:
            if (ng->disc_range[0] >= 1) {
                p_singleton *= ng->gt_disc_ratio[0][1];
            }
            break;
        case LINEAR:
            p_singleton *= ng->lin_disc_ratio[0];
            break;
        case ABSOLUTE:
            p_singleton *= (1-ng->abs_disc_const[0]);
            break;
        case WITTEN_BELL:
            p_singleton *= n_tokens/(n_tokens + type_total);
            break;
        default:
            break;
        }

        pc_message(verbosity,2,"%d zerotons, P(zeroton) = %g P(singleton) = %g\n",
                   zeroton_total,p_zeroton,p_singleton);

        /* An unseen word may get at most zeroton_fraction of a singleton */
        if (p_zeroton > ng->zeroton_fraction*p_singleton) {
            p_zeroton = ng->zeroton_fraction*p_singleton;
            pc_message(verbosity,1,"P(zeroton) was reduced to %.10f (%.3f of P(singleton))\n",p_zeroton,ng->zeroton_fraction);
        }

        for (id = ng->first_id; id <= ng->vocab_size; id++) {
            if (return_count(ng->four_byte_counts, ng->count_table[0],
                             ng->count[0], ng->count4[0], id) != 0) {
                continue;
            }
            if (!ng->context_cue[id]) {
                ng->uni_probs[id] = p_zeroton;
            }
        }

        zeroton_mass = zeroton_total * p_zeroton;
        residual_mass = disc_mass - zeroton_mass;
    }

    /* Dispose of the mass nobody received */

    if (ng->vocab_type == OPEN_VOCAB_2) {
        ng->uni_probs[0] += residual_mass;
        if (ng->uni_probs[0] <= 0.0) {
            ng->uni_probs[0] = 1e-99;
        }
    }
    else if (fabs(residual_mass) > 1e-10) {
        renorm = 1.0 - residual_mass;
        for (id = ng->first_id; id <= ng->vocab_size; id++) {
            ng->uni_probs[id] /= renorm;
        }
        if (fabs(residual_mass)>1e-8) {
            pc_message(verbosity,1,"Unigram was renormalized to absorb a mass of %g\n",residual_mass);
        }
    }

    pc_message(verbosity,1,"prob[UNK] = %g\n",ng->uni_probs[0]);

    if ((zeroton_total>0) && (disc_mass<=0.0)) {
        pc_message(verbosity,1,"WARNING: %d non-context-cue words have zero probability\n\n",zeroton_total);
    }

    if (verbosity>=4) {
        fprintf(stderr,"THE FINAL UNIGRAM:\n");
        for (id = ng->first_id; id <= ng->vocab_size; id++) {
            fprintf(stderr," unigram[%d]=%g\n",id,ng->uni_probs[id]);
        }
    }

    /* The finished distribution has to sum to one */

    p_sum = 0.0;
    for (id = ng->first_id; id <= ng->vocab_size; id++) {
        p_sum += ng->uni_probs[id];
    }

    sum_error = fabs(1.0-p_sum);
    if (sum_error > 1e-6) {
        quit(-1,"ERROR: sum[P(w)] = %.10f\n",p_sum);
    }
    if (sum_error > 1e-9) {
        pc_message(verbosity,1,"WARNING: sum[P(w)] = %.10f\n\n",p_sum);
    }

    /* Store the logs alongside the probabilities */

    for (id = ng->first_id; id <= ng->vocab_size; id++) {
        ng->uni_log_probs[id] = log(ng->uni_probs[id]);
    }

}
Пример #23
0
void merge_tempfiles (int start_file, 
		      int end_file, 
		      char *temp_file_root,
		      char *temp_file_ext,
		      int max_files,
		      FILE *outfile,
		      int n,
		      int verbosity) {

  FILE *new_temp_file;
  char *new_temp_filename;
  
  FILE **temp_file;
  char **temp_filename;
  char **current_ngram;
  char smallest_ngram[1000];
  int *current_ngram_count;
  flag *finished;
  flag all_finished;
  int temp_count;
  char temp_word[500];
  int i,j;
  
  pc_message(verbosity,2,"Merging temp files %d through %d...\n", start_file,
 	  end_file);
   /*
    * If we try to do more than max_files, then merge into groups,
    * then merge groups recursively.
    */
    if (end_file-start_file+1 > max_files) {
       int new_start_file, new_end_file;
       int n_file_groups = 1 + (end_file-start_file)/max_files;
 
       fprintf(stderr, "%d files to do, in %d groups\n", end_file-start_file,
 	      n_file_groups);
 
       new_temp_filename = (char *) rr_malloc(300*sizeof(char));
 
       /*
        * These n_file_groups sets of files will be done in groups of
        * max_files batches each, as temp files numbered
        * end_file+1 ... end_file+n_file_groups,
        * and then these will be merged into the final result.
        */
 
       for (i = 0; i < n_file_groups; i++) {
 	  /* do files i*max_files through min((i+1)*max_files-1,end_file); */
 	  new_start_file = start_file + (i*max_files);
 	  new_end_file = start_file + ((i+1)*max_files) - 1;
 	  if (new_end_file > end_file) new_end_file = end_file;
 	  
 	  sprintf(new_temp_filename,
 		  "%s/%hu%s",
 		  temp_file_root,
 		  end_file+i+1,
 		  temp_file_ext);
 
 	  new_temp_file = rr_oopen(new_temp_filename);
 
 	  merge_tempfiles(new_start_file,
 			  new_end_file,
 			  temp_file_root,
			  temp_file_ext,
 			  max_files,
 			  new_temp_file,
 			  n,
			  verbosity);
 
 	  rr_iclose(new_temp_file);
 
       }
 
       merge_tempfiles(end_file+1,
		       end_file+n_file_groups,
		       temp_file_root,
		       temp_file_ext,
		       max_files,
		       outfile,
		       n,
		       verbosity);
 
       return;
    }
    
   /*
    * We know we are now doing <= max_files.
    */
 
   temp_file = (FILE **) rr_malloc((end_file+1)*sizeof(FILE *));
   temp_filename = (char **) rr_malloc((end_file+1)*sizeof(char *));
   for (i=start_file;i<=end_file;i++) {
     temp_filename[i] = (char *) rr_malloc(300*sizeof(char));
   }
   current_ngram = (char **) rr_malloc((end_file+1)*sizeof(char *));
   for (i=start_file;i<=end_file;i++) {
     current_ngram[i] = (char *) rr_malloc(1000*sizeof(char));
   }
   current_ngram_count = (int *) rr_malloc((end_file+1)*sizeof(int));
   finished = (flag *) rr_malloc(sizeof(flag)*(end_file+1));
  
   /* Open all the temp files for reading */
   for (i=start_file;i<=end_file;i++) {
     sprintf(temp_filename[i],"%s/%hu%s",
	     temp_file_root,i,temp_file_ext);
     temp_file[i] = rr_iopen(temp_filename[i]);
   }
 
   /* Now go through the files simultaneously, and write out the appropriate
      ngram counts to the output file. */
 
   for (i=start_file;i<=end_file;i++) {
     finished[i] = 0;
     if (!rr_feof(temp_file[i])) {
       for (j=0;j<=n-1;j++) {
 	if (fscanf(temp_file[i],"%s",temp_word) != 1) {
 	  if (!rr_feof(temp_file[i]))
 	    quit(-1,"Error reading temp file %s\n",temp_filename[i]);
 	}else {
 	  if (j==0)
 	    strcpy(current_ngram[i],temp_word);
  	  else {
 	    strcat(current_ngram[i]," ");
 	    strcat(current_ngram[i],temp_word);
  	  }
  	}
       }
       if (fscanf(temp_file[i],"%d",&current_ngram_count[i]) != 1) {
	 if (!rr_feof(temp_file[i]))
	   quit(-1,"Error reading temp file %s\n",temp_filename[i]);
       }
     }
   }
   
   all_finished = 0;
   
   while (!all_finished) {
  
     /* Find the smallest current ngram */
 
     strcpy(smallest_ngram,"");
 
     for (i=start_file;i<=end_file;i++) {
       if (!finished[i]) {
	 if (strcmp(smallest_ngram,current_ngram[i]) > 0 ||
	     (smallest_ngram[0] == '\0'))
	   strcpy(smallest_ngram,current_ngram[i]);
       }
     }
     
     /* For each of the files that are currently holding this ngram,
        add its count to the temporary count, and read in a new ngram
        from the files. */
  
     temp_count = 0;
 
     for (i=start_file;i<=end_file;i++) {
       if (!finished[i]) {
 	if (!strcmp(smallest_ngram,current_ngram[i])) {
 	  temp_count += current_ngram_count[i];
 	  if (!rr_feof(temp_file[i])) {
 	    for (j=0;j<=n-1;j++) {
 	      if (fscanf(temp_file[i],"%s",temp_word) != 1) {
 		if (!rr_feof(temp_file[i])) {
 		  quit(-1,"Error reading temp file %s\n",temp_filename[i]);
 		}
 	      }else {
 		if (j==0)
 		  strcpy(current_ngram[i],temp_word);
  		else {
 		  strcat(current_ngram[i]," ");
 		  strcat(current_ngram[i],temp_word);
  		}
  	      }
 	    }
 	    if (fscanf(temp_file[i],"%d",&current_ngram_count[i]) != 1) {
 	      if (!rr_feof(temp_file[i])) {
 		quit(-1,"Error reading temp file count %s\n",
 		     temp_filename[i]);
  	      }
  	    }
 	  }
 
 	  /*
 	   * PWP: Note that the fscanf may have changed the state of
 	   * temp_file[i], so we re-ask the question rather than just
 	   * doing an "else".
 	   */
 	  if (rr_feof(temp_file[i])) {
 	    finished[i] = 1;
 	    all_finished = 1;
 	    for (j=start_file;j<=end_file;j++) {
 	      if (!finished[j]) {
 		all_finished = 0;
  	      }
  	    }
  	  }
  	}
        }
      }
 
     /*
      * PWP: We cannot conditionalize this on (!all_finished) because
      * if we do we may have lost the very last count.  (Consider the
      * case when several files have ran out of data, but the last
      * couple have the last count in them.)
      */
     if (fprintf(outfile,"%s %d\n",smallest_ngram,temp_count) < 0) {
       quit(-1,"Write error encountered while attempting to merge temporary files.\nAborting, but keeping temporary files.\n");
     }
   }
 
   for (i=start_file;i<=end_file;i++) {
     rr_iclose(temp_file[i]);
     remove(temp_filename[i]);
   }
    
   free(temp_file);
   for (i=start_file;i<=end_file;i++) {
      free(temp_filename[i]);
    }
   free(temp_filename);  

   for (i=start_file;i<=end_file;i++) {
      free(current_ngram[i]);
   }
   free(current_ngram);

  free(current_ngram_count);
  free(finished);
}