Example #1
0
int main(int argc, char *argv[]) {

  int i,j;

  char *vocab_filename;
  FILE *tempfile;
  char tempfiles_directory[1000];
  int vocab_size;
  FILE *vocab_file;

  int verbosity;

  int buffer_size;
  int position_in_buffer;
  int number_of_tempfiles;
  int max_files;
  int fof_size;

  unsigned short *buffer;
  unsigned short *placeholder;
  unsigned short *temp_ngram;
  int temp_count;
  
  char temp_word[500];
  char temp_word2[500];

  char *temp_file_root;
  char *temp_file_ext;
  char *host_name;
  int proc_id;
  struct utsname uname_info;

  flag write_ascii;

  /* Vocab hash table things */

  struct hash_table vocabulary;
  unsigned long hash_size;
  unsigned long M;

  tempfile = NULL; /* Just to prevent compilation warnings. */

  report_version(&argc,argv);

  verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);

  /* Process command line */
  
  if (pc_flagarg( &argc, argv,"-help") || argc==1) {
    fprintf(stderr,"text2idngram - Convert a text stream to an id n-gram stream.\n");
    fprintf(stderr,"Usage : text2idngram  -vocab .vocab \n");
    fprintf(stderr,"                    [ -buffer 100 ]\n");
    fprintf(stderr,"                    [ -hash %d ]\n",DEFAULT_HASH_SIZE);
    fprintf(stderr,"                    [ -temp %s ]\n",DEFAULT_TEMP);
    fprintf(stderr,"                    [ -files %d ]\n",DEFAULT_MAX_FILES);
    fprintf(stderr,"                    [ -gzip | -compress ]\n");
    fprintf(stderr,"                    [ -verbosity %d ]\n",
	    DEFAULT_VERBOSITY);
    fprintf(stderr,"                    [ -n 3 ]\n");
    fprintf(stderr,"                    [ -write_ascii ]\n");
    fprintf(stderr,"                    [ -fof_size 10 ]\n");
    exit(1);
  }

  pc_message(verbosity,2,"text2idngram\n");

  n = pc_intarg( &argc, argv, "-n",DEFAULT_N);   /* n is a file-scope global: the n-gram order */

  placeholder = (unsigned short *) rr_malloc(sizeof(unsigned short)*n);
  temp_ngram = (unsigned short *) rr_malloc(sizeof(unsigned short)*n);
  hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE);
  buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM);

  write_ascii = pc_flagarg(&argc,argv,"-write_ascii");

  fof_size = pc_intarg(&argc,argv,"-fof_size",10);

  max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES);

  vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" ));
  
  if (!strcmp("",vocab_filename)) {
    quit(-1,"text2idngram : Error : Must specify a vocabulary file.\n");
  }
    
  strcpy(tempfiles_directory,pc_stringarg( &argc, argv, "-temp", 
					   DEFAULT_TEMP));

  if (pc_flagarg(&argc,argv,"-compress")) {
    temp_file_ext = salloc(".Z");
  }
  else {
    if (pc_flagarg(&argc,argv,"-gzip")) {
      temp_file_ext = salloc(".gz");
    }
    else {
      temp_file_ext = salloc("");
    }
  }

  uname(&uname_info);

  host_name = salloc(uname_info.nodename);

  proc_id = getpid();

  sprintf(temp_word,"%s%s.%d.",TEMP_FILE_ROOT,host_name,proc_id);

  temp_file_root = salloc(temp_word);

  pc_report_unk_args(&argc,argv,verbosity);
  
  /* If the last character in the directory name isn't a '/', then add one. */
  
  if (tempfiles_directory[strlen(tempfiles_directory)-1] != '/') {
    strcat(tempfiles_directory,"/");
  }
  
  pc_message(verbosity,2,"Vocab                  : %s\n",vocab_filename);
  pc_message(verbosity,2,"N-gram buffer size     : %d\n",buffer_size);
  pc_message(verbosity,2,"Hash table size        : %lu\n",hash_size);
  pc_message(verbosity,2,"Temp directory         : %s\n",tempfiles_directory);
  pc_message(verbosity,2,"Max open files         : %d\n",max_files);
  pc_message(verbosity,2,"FOF size               : %d\n",fof_size);  
  pc_message(verbosity,2,"n                      : %d\n",n);

  /* Convert buffer_size from (decimal) megabytes into the number of n-grams
     that fit in that much memory. */
  buffer_size *= (1000000/(sizeof(unsigned short)*n));

  /* Allocate memory for hash table */

  fprintf(stderr,"Initialising hash table...\n");

  M = nearest_prime(hash_size);

  new_hashtable(&vocabulary,M);

  /* Read in the vocabulary */

  vocab_size = 0;

  vocab_file = rr_iopen(vocab_filename);

  pc_message(verbosity,2,"Reading vocabulary...\n");

  while (fgets (temp_word, sizeof(temp_word),vocab_file)) {
    if (strncmp(temp_word,"##",2)==0) continue;
    sscanf (temp_word, "%s ",temp_word2);

    /* Check for repeated words in the vocabulary */

    if (index2(&vocabulary,temp_word2) != 0) {
      fprintf(stderr,"=======================================================\n");
      fprintf(stderr,"WARNING: word %s is repeated in the vocabulary.\n",temp_word2);
      fprintf(stderr,"=======================================================\n");
    }
    if (strncmp(temp_word,"#",1)==0) {
      fprintf(stderr,"\n\n===========================================================\n");
      fprintf(stderr,"WARNING: line assumed NOT a comment:\n");
      fprintf(stderr,     ">>> %s <<<\n",temp_word);
      fprintf(stderr,     "         '%s' will be included in the vocabulary.\n",temp_word2);
      fprintf(stderr,     "         (comments must start with '##')\n");
      fprintf(stderr,"===========================================================\n\n");
    }
    vocab_size++;
    add_to_hashtable(&vocabulary,hash(temp_word2,M),temp_word2,vocab_size);
  }

  if (vocab_size > MAX_VOCAB_SIZE) {
    quit(-1,"text2idngram : Error : Vocabulary size exceeds maximum.\n");
  }   
  
  pc_message(verbosity,2,"Allocating memory for the n-gram buffer...\n");

  /* Allocate buffer_size n-gram rows plus one extra row for the n-gram that
     is carried over between buffer passes. */
  buffer=(unsigned short*) rr_malloc(n*(buffer_size+1)*sizeof(unsigned short));

  number_of_tempfiles = 0;

  /* Read text into buffer */

  /* Read in the first ngram */
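  /* The first n words from stdin are mapped to vocabulary ids and stored in
     positions 0..n-1 of buffer row 0.  As the duplicate-word check above
     suggests, index2() returns 0 for words that are not in the hash table,
     so out-of-vocabulary words are recorded with id 0. */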

  position_in_buffer = 0;

  for (i=0;i<=n-1;i++) {
    get_word(stdin,temp_word);
    add_to_buffer(index2(&vocabulary,temp_word),0,i,buffer);
  }

  while (!rr_feof(stdin)) {

    /* Fill up the buffer */
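    /* For each word read, the previous n-gram is shifted left by one slot and
       the new word's id is placed in slot n-1, so every buffer row holds one
       complete n-gram of the running text. */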

    pc_message(verbosity,2,"Reading text into the n-gram buffer...\n");
    pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");
    while ((position_in_buffer<buffer_size) && (!rr_feof(stdin))) {
      position_in_buffer++;
      if (position_in_buffer % 20000 == 0) {
	if (position_in_buffer % 1000000 == 0) {
	  pc_message(verbosity,2,".\n");
	}
	else {
	  pc_message(verbosity,2,".");
	}
      }
      for (i=1;i<=n-1;i++) {
	add_to_buffer(buffer_contents(position_in_buffer-1,i,buffer),
		      position_in_buffer,i-1,buffer);
      }
      if (get_word(stdin,temp_word) == 1) {
	add_to_buffer(index2(&vocabulary,temp_word),position_in_buffer,
		      n-1,buffer);
      }
    }

    /* The n-gram in row position_in_buffer lies just past the range that is
       about to be sorted; save it so it can seed row 0 of the next pass. */
    for (i=0;i<=n-1;i++) {
      placeholder[i] = buffer_contents(position_in_buffer,i,buffer);
    }

    /* Sort buffer */
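    /* Sorting brings identical n-grams next to each other so that they can be
       counted in a single pass below. */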
    
    pc_message(verbosity,2,"\nSorting n-grams...\n");
    
    qsort((void*) buffer,(size_t) position_in_buffer,
	  n*sizeof(unsigned short),compare_ngrams);

    /* Output the buffer to temporary BINARY file */
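    /* Record format: each distinct n-gram is written as n unsigned short ids
       followed by its count as an int, i.e. a run-length encoding of the
       sorted buffer. */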
    
    number_of_tempfiles++;

    sprintf(temp_word,"%s%s%d%s",tempfiles_directory,temp_file_root,
            number_of_tempfiles,temp_file_ext);

    pc_message(verbosity,2,"Writing sorted n-grams to temporary file %s\n",
	       temp_word);

    tempfile = rr_oopen(temp_word);

    for (i=0;i<=n-1;i++) {
      temp_ngram[i] = buffer_contents(0,i,buffer);
      if (temp_ngram[i] > MAX_VOCAB_SIZE) {
	quit(-1,"Invalid n-gram in buffer.\nAborting\n");
      }
    }
    temp_count = 1;

    for (i=1;i<=position_in_buffer;i++) {
 
      if (!compare_ngrams(temp_ngram,&buffer[i*n])) {
	temp_count++;
      }
      else {
	for (j=0;j<=n-1;j++) {
	  rr_fwrite(&temp_ngram[j],sizeof(unsigned short),1,
		    tempfile,"temporary n-gram ids");
	  temp_ngram[j] = buffer_contents(i,j,buffer);
	}
	rr_fwrite(&temp_count,sizeof(int),1,tempfile,
		  "temporary n-gram counts");
	temp_count = 1;
      }
    }
    
    rr_oclose(tempfile);

    for (i=0;i<=n-1;i++) {
      add_to_buffer(placeholder[i],0,i,buffer);
    }

    position_in_buffer = 0;

  }

  /* Merge the temporary files, and output the result to standard output */
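  /* merge_tempfiles() is expected to do a multi-way merge of the sorted
     temporary files, opening at most max_files of them at a time, summing the
     counts of identical n-grams, and writing the merged id n-gram stream to
     stdout (in ASCII if -write_ascii was given, otherwise in binary). */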

  pc_message(verbosity,2,"Merging temporary files...\n");
  
  merge_tempfiles(1,
		  number_of_tempfiles,
		  temp_file_root,
		  temp_file_ext,
		  max_files,
		  tempfiles_directory,
		  stdout,
		  write_ascii,
		  fof_size); 

  pc_message(verbosity,0,"text2idngram : Done.\n");

  exit(0);
  
}
Example #2
0
/*
  Read word-id n-grams from infp into buffer, sort each full buffer, and write
  the run-length-encoded result to numbered temporary files under
  temp_file_root.
  @return number_of_tempfiles
 */
int  read_txt2ngram_buffer(FILE* infp, 
			   struct idngram_hash_table *vocabulary, 
			   int32 verbosity,
			   wordid_t *buffer,
			   int buffer_size,
			   unsigned int n,
			   char* temp_file_root,
			   char* temp_file_ext,
			   FILE* temp_file
			   )
{
  /* Read text into buffer */
  char temp_word[MAX_WORD_LENGTH];
  int position_in_buffer;
  int number_of_tempfiles;
  unsigned int i,j;
  wordid_t *placeholder;
  wordid_t *temp_ngram;
  int temp_count;

  int tmpval;

  temp_ngram  = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);
  placeholder = (wordid_t *) rr_malloc(sizeof(wordid_t)*n);

  ng = n;   /* keep a file-scope copy of the n-gram order */

  position_in_buffer = 0;
  number_of_tempfiles = 0;

  //tk: looks like things may croak if the corpus has less than n words
  //not that such a corpus would be useful anyway
  for (i=0;i<=n-1;i++) {
    get_word(infp,temp_word);
    /*
        fprintf(stderr,"%s \n",temp_word);
	fprintf(stderr,"%d \n",index2(vocabulary,temp_word));
        fflush(stderr);
    */
    add_to_buffer(index2(vocabulary,temp_word),0,i,buffer);
  }

  while (!rr_feof(infp)) {
    /* Fill up the buffer */
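    /* Shift the previous n-gram left by one slot and store the id of the
       newly read word in slot n-1, one complete n-gram per buffer row. */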
    pc_message(verbosity,2,"Reading text into the n-gram buffer...\n");
    pc_message(verbosity,2,"20,000 n-grams processed for each \".\", 1,000,000 for each line.\n");

    while ((position_in_buffer<buffer_size) && (!rr_feof(infp))) {
      position_in_buffer++;
      show_idngram_nlines(position_in_buffer,verbosity);

      for (i=1;i<=n-1;i++) 
	add_to_buffer(buffer_contents(position_in_buffer-1,i,buffer),
		      position_in_buffer,i-1,buffer);
      
      if (get_word(infp,temp_word) == 1) {
      /*
	fprintf(stderr,"%s \n",temp_word);
	fprintf(stderr,"%d \n",index2(vocabulary,temp_word));
	fflush(stderr);
      */
	add_to_buffer(index2(vocabulary,temp_word),position_in_buffer,
		      n-1,buffer);
      }
    }

    for (i=0;i<=n-1;i++) 
      placeholder[i] = buffer_contents(position_in_buffer,i,buffer);

    /* Sort buffer */
    
    pc_message(verbosity,2,"\nSorting n-grams...\n");    
    
    qsort((void*) buffer,(size_t) position_in_buffer,n*sizeof(wordid_t),compare_ngrams);

    /* Output the buffer to temporary BINARY file */    
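    /* Each distinct n-gram is written as n wordid_t ids followed by its count
       as an int (a run-length encoding of the sorted buffer). */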
    number_of_tempfiles++;

    sprintf(temp_word,"%s/%d%s",temp_file_root,
	    number_of_tempfiles,temp_file_ext);

    pc_message(verbosity,2,"Writing sorted n-grams to temporary file %s\n",
	       temp_word);

    temp_file = rr_oopen(temp_word);

    for (i=0;i<=n-1;i++) {
      temp_ngram[i] = buffer_contents(0,i,buffer);
#if MAX_VOCAB_SIZE < 65535
      /* This check is well-meaning but completely useless since
	 buffer_contents() can never return something greater than
	 MAX_VOCAB_SIZE (dhuggins@cs, 2006-03) */
      if (temp_ngram[i] > MAX_VOCAB_SIZE)
	quit(-1,"Invalid n-gram in buffer.\nAborting\n");
#endif
    }
    temp_count = 1;

    for (i=1;i<=position_in_buffer;i++) {

      tmpval = compare_ngrams(temp_ngram,&buffer[i*n]);

      /*      for(k=0;k<=n-1;k++){
	fprintf(stderr, "tmpval: %d k %d, temp_ngram %d, &buffer[i*n] %d\n",tmpval, k, temp_ngram[k], (&buffer[i*n])[k]);
	}*/

      if (!tmpval)
	temp_count++;
      else {
	/*	printf("Have been here?\n");*/
	for (j=0;j<=n-1;j++) {
	  rr_fwrite((char*) &temp_ngram[j],sizeof(wordid_t),1,
		    temp_file,"temporary n-gram ids");
	  temp_ngram[j] = buffer_contents(i,j,buffer);
	}
	rr_fwrite((char*)&temp_count,sizeof(int),1,temp_file,
		  "temporary n-gram counts");

	/*	for(j=0 ; j<=n-1;j++)
	  fprintf(stderr,"%d ",temp_ngram[j]);
	  fprintf(stderr,"%d\n",temp_count);*/

	temp_count = 1;
      }
    }
    
    rr_oclose(temp_file);

    for (i=0;i<=n-1;i++) 
      add_to_buffer(placeholder[i],0,i,buffer);

    position_in_buffer = 0;

  }

  return number_of_tempfiles;
}