Пример #1
0
FILE *rr_iopen(char *path)
{
  static char rname[]="rr_iopen";
  FILE *fp;
  char pipe[256], is_Z;
  size_t lpath;

  if (strcmp(path,"-")==0) return(stdin);

  lpath = strlen(path);
  if (lpath > sizeof(pipe) - strlen("cat | gunzip ") - 4)
    quit(-1,"%s: pathname '%s' is too long\n",rname,path);

  if (strcmp(&path[lpath-2],".Z")==0) {
     /* popen() does not report error if file doesn't exist, so: */
     if (!rr_fexists(path)) quit(-1,"%s: file '%s' not found\n",rname,path);
     sprintf(pipe,"zcat %s",path);
     goto Z;
  }

  else if (strcmp(&path[lpath-3],".gz")==0) {
     /* popen() does not report error if file doesn't exist, so: */
     if (!rr_fexists(path)) quit(-1,"%s: file '%s' not found\n",rname,path);
     sprintf(pipe,"cat %s | gunzip",path);
     goto Z;
  }

  else if (!rr_fexists(path)) {
     sprintf(pipe,"%s.Z",path);
     /* popen() does not report error if file doesn't exist, so: */
     if (!rr_fexists(pipe)) {
       sprintf(pipe,"%s.gz",path);
       if (!rr_fexists(pipe)) {
	 quit(-1,"%s: None of '%s' '%s.Z' or '%s.gz' exist.\n",rname,path,path,path);
       }
       sprintf(pipe,"cat %s.gz | gunzip",path);
       goto Z;
     }
     sprintf(pipe,"zcat %s.Z",path);
     goto Z;
  }
  else {
     fp = rr_fopen(path,"rb");
     is_Z = 0;
     goto record;
  }

Z:
  fp = popen(pipe,"r");
  if (!fp) quit(-1,"%s: problems opening the pipe '%s' for input.\n", rname,pipe);
  is_Z = 1;

record:
  if (fileno(fp) > sizeof(RRi_is_Z)-1) quit(-1,"%s: fileno = %d is too large\n",rname,fileno(fp));
  RRi_is_Z[fileno(fp)] = is_Z;

  return(fp);
}
Пример #2
0
FILE *rr_oopen(char *path)
{
  static char rname[]="rr_oopen";
  FILE *fp;
  char pipe[256], is_Z;
  int  lpath;

  if (strcmp(path,"-")==0) return(stdout);

  lpath = strlen(path);
  if (strcmp(&path[lpath-2],".Z")==0) {
    if (lpath > sizeof(pipe) - strlen("compress >!  ") - 4)
      quit(-1,"%s: pathname '%s' is too long\n",rname,path);
     sprintf(pipe,"compress > %s",path);
     fp = popen(pipe,"wb");
     if (!fp) quit(-1,"%s: problems opening the pipe '%s' for output.\n", rname,pipe);
     is_Z = 1;
  }
  else {
    if (strcmp(&path[lpath-3],".gz")==0) {
      if (lpath > sizeof(pipe) - strlen("gzip >!  ") -4)
	quit(-1,"%s: pathname '%s' is too long\n",rname,path);
      sprintf(pipe,"gzip > %s",path);
      fp = popen(pipe,"wb");
      if (!fp) quit(-1,"%s: problems opening the pipe '%s' for output.\n", rname,pipe);
      is_Z = 1;
    }
    else {
      fp = rr_fopen(path,"wb");
      is_Z = 0;
    }
  }

  if (fileno(fp) > sizeof(RRo_is_Z)-1) quit(-1,"%s: fileno = %d is too large\n",rname,fileno(fp));
  RRo_is_Z[fileno(fp)] = is_Z;

  return(fp);
}
Пример #3
0
int main(int argc, char *argv[]) {

  int verbosity;
  int vocab_size;
  FILE *vocab_file;
  int buffer_size;
  flag write_ascii;
  int max_files;
  int number_of_tempfiles;
  char *vocab_filename;
  char *idngram_filename;
  char temp_word[MAX_WORD_LENGTH];
  char temp_word2[MAX_WORD_LENGTH];
  char temp_word3[MAX_WORD_LENGTH];
  flag contains_unks;
  int position_in_buffer;
  FILE *outfile;
  FILE *tempfile;
  FILE *non_unk_fp;
  ngram_rec *buffer;
  flag same_ngram;
  int i;
  int j;
  int fof_size;
  int size_of_rec;

  char temp_directory[1000];
  char *temp_file_ext;

  /* Vocab hash table things */

  struct idngram_hash_table vocabulary;
  unsigned long hash_size;
  unsigned long M;

  wordid_t *current_ngram;
  int current_count;
  wordid_t *sort_ngram;
  int sort_count;
  
  /* Process command line */

  report_version(&argc,argv);
  
  if (argc == 1 || pc_flagarg(&argc, argv,"-help")) {    
    /* Display help message */    
    help_message();
    exit(1);
  }


  n = pc_intarg( &argc, argv, "-n",DEFAULT_N);
  hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE);
  buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM);
  write_ascii = pc_flagarg(&argc,argv,"-write_ascii");
  verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY);
  max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES);
  fof_size = pc_intarg(&argc,argv,"-fof_size",10);
  vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" ));
  idngram_filename = salloc(pc_stringarg( &argc, argv, "-idngram", "" ));
  
  if (!strcmp("",vocab_filename)) 
    quit(-1,"Error : Must specify a vocabulary file.\n");

  if (!strcmp("",idngram_filename)) 
    quit(-1,"text2idngram : Error : Must specify idngram file.\n");
    
  if (pc_flagarg(&argc,argv,"-compress")) 
    temp_file_ext = salloc(".Z");
  else {
    if (pc_flagarg(&argc,argv,"-gzip")) 
      temp_file_ext = salloc(".gz");
    else 
      temp_file_ext = salloc("");
  }

  strcpy(temp_directory, "cmuclmtk-XXXXXX");
  if (mkdtemp(temp_directory) == NULL) {
     quit(-1, "Failed to create temporary folder: %s\n", strerror(errno));
  }

  pc_report_unk_args(&argc,argv,verbosity);

  outfile = rr_fopen(idngram_filename,"wb");
  
  pc_message(verbosity,2,"Vocab           : %s\n",vocab_filename);
  pc_message(verbosity,2,"Output idngram  : %s\n",idngram_filename);
  pc_message(verbosity,2,"Buffer size     : %d\n",buffer_size);
  pc_message(verbosity,2,"Hash table size : %d\n",hash_size);
  pc_message(verbosity,2,"Max open files  : %d\n",max_files);
  pc_message(verbosity,2,"n               : %d\n",n);
  pc_message(verbosity,2,"FOF size               : %d\n",fof_size);  

  size_of_rec = (sizeof(wordid_t) * n) + 16 - (( n* sizeof(wordid_t)) % 16);
  buffer_size *= (1000000/((sizeof(ngram_rec) + size_of_rec)));
  fprintf(stderr,"buffer size = %d\n",buffer_size);

  /* Allocate memory for hash table */

  fprintf(stderr,"Initialising hash table...\n");

  M = nearest_prime(hash_size);

  new_idngram_hashtable(&vocabulary,M);

  /* Read in the vocabulary */

  vocab_size = 0;

  vocab_file = rr_iopen(vocab_filename);

  pc_message(verbosity,2,"Reading vocabulary...\n");

  while (fgets (temp_word, sizeof(temp_word),vocab_file)) {
    if (strncmp(temp_word,"##",2)==0) continue;
    sscanf (temp_word, "%s ",temp_word2);

    /* Check for vocabulary order */
    if (vocab_size > 0 && strcmp(temp_word2,temp_word3)<0) 
      quit(-1,"wngram2idngram : Error : Vocabulary is not alphabetically ordered.\n");

    /* Check for repeated words in the vocabulary */

    if (index2(&vocabulary,temp_word2) != 0) 
      warn_on_repeated_words(temp_word);

    warn_on_wrong_vocab_comments(temp_word);

    vocab_size++;
    
    add_to_idngram_hashtable(&vocabulary,idngram_hash(temp_word2,M),temp_word2,vocab_size);
    strcpy(temp_word3,temp_word2);
  }

  if (vocab_size > MAX_VOCAB_SIZE) 
    quit(-1,"Error : Vocabulary size exceeds maximum.\n");
  
  pc_message(verbosity,2,"Allocating memory for the buffer...\n");

  buffer=(ngram_rec *) rr_malloc((buffer_size+1)*sizeof(ngram_rec));
  
  for (i=0;i<=buffer_size;i++) 
    buffer[i].word = (wordid_t *) rr_malloc(n*sizeof(wordid_t));

  /* Open the "non-OOV" tempfile */

  sprintf(temp_word, "%s/1%s", temp_directory, temp_file_ext);
  
  non_unk_fp = rr_fopen(temp_word,"w");

  pc_message(verbosity,2,"Writing non-OOV counts to temporary file %s\n",
	     temp_word);
  number_of_tempfiles = 1;

  current_ngram = (wordid_t *) rr_malloc(n*sizeof(wordid_t));
  sort_ngram = (wordid_t *) rr_malloc(n*sizeof(wordid_t));

  /* Read text into buffer */
  position_in_buffer = 0;

  while (!rr_feof(stdin)) {
    
    for (i=0;i<=n-1;i++) {
      get_word(stdin,temp_word);
      current_ngram[i]=index2(&vocabulary,temp_word);
    }
    if (scanf("%d",&current_count) != 1) 
      if (!rr_feof(stdin)) 
	quit(-1,"Error reading n-gram count from stdin.\n");

    if (!rr_feof(stdin)) {

      contains_unks = 0;
      for (i=0;i<=n-1;i++) {
	if (!current_ngram[i]) 
	  contains_unks = 1;
      }

      if (contains_unks) {
	/* Write to buffer */

	position_in_buffer++;

	if (position_in_buffer >= buffer_size) {

	  /* Sort buffer */
	  pc_message(verbosity,2,
		     "Sorting n-grams which include an OOV word...\n");

	  qsort((void*) buffer,(size_t) position_in_buffer,
		sizeof(ngram_rec),compare_ngrams2);

	  pc_message(verbosity,2,"Done.\n");

	  /* Write buffer to temporary file */

	  number_of_tempfiles++;
	  
	  sprintf(temp_word,"%s/%hu%s", temp_directory,
		  number_of_tempfiles,temp_file_ext);
	  
	  pc_message(verbosity,2,
		     "Writing sorted OOV-counts buffer to temporary file %s\n",
		     temp_word);

	  tempfile = rr_fopen(temp_word,"w");
	  
	  for (i=0;i<=n-1;i++) 
	    sort_ngram[i] = buffer[0].word[i];

	  sort_count = buffer[0].count;

	  for (i=0;i<=position_in_buffer-2;i++) {
	    
	    same_ngram = 1;
	    for (j=n-1;j>=0;j--) {
	      if (buffer[i].word[j] != sort_ngram[j]) {
		same_ngram = 0;
		j = -1;
	      }
	    }

	    if (same_ngram) 
	      sort_count += buffer[i].count;
	    else {
	      for (j=0;j<=n-1;j++) {
		rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1,
			  tempfile,"temporary n-gram ids");
		sort_ngram[j] = buffer[i].word[j];
	      }
	      rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile,
			"temporary n-gram counts");
	      sort_count = buffer[i].count;
	    }
	  }	    
	  for (j=0;j<=n-1;j++) 
	    rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1,
		      tempfile,"temporary n-gram ids");

	  rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile,
		    "temporary n-gram counts");
	  rr_oclose(tempfile);
	  position_in_buffer = 1;

	}
	
	for (i=0;i<=n-1;i++) 
	  buffer[position_in_buffer-1].word[i] = current_ngram[i];

	buffer[position_in_buffer-1].count = current_count;

      }else {
	/* Write to temporary file */
	for (i=0;i<=n-1;i++) 
	  rr_fwrite((char*)&current_ngram[i],sizeof(wordid_t),1,
		    non_unk_fp,"temporary n-gram ids");

	rr_fwrite((char*)&current_count,sizeof(int),1,non_unk_fp,
		  "temporary n-gram counts");
      }
    }
  }

  if (position_in_buffer > 0) {

    /* Only do this bit if we have actually seen some OOVs */
    /* Sort final buffer */    
    pc_message(verbosity,2,"Sorting final buffer...\n");

    qsort((void*) buffer,(size_t) position_in_buffer,
	  sizeof(ngram_rec),compare_ngrams2);
    
    /* Write final buffer */
    
    number_of_tempfiles++;
  
    sprintf(temp_word,"%s/%hu%s", temp_directory,
	    number_of_tempfiles,temp_file_ext);
    
    pc_message(verbosity,2,"Writing sorted buffer to temporary file %s\n", temp_word);

    tempfile = rr_fopen(temp_word,"w");
    
    for (i=0;i<=n-1;i++) 
      sort_ngram[i] = buffer[0].word[i];

    sort_count = buffer[0].count;
    
    for (i=1;i<=position_in_buffer-1;i++) {
      
      same_ngram = 1;
      for (j=n-1;j>=0;j--) {
	if (buffer[i].word[j] != sort_ngram[j]) {
	  same_ngram = 0;
	  j = -1;
	}
      }
      
      if (same_ngram) 
	sort_count += buffer[i].count;
      else {
	for (j=0;j<=n-1;j++) {
	  rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1,
		    tempfile,"temporary n-gram ids");
	  sort_ngram[j] = buffer[i].word[j];
	}
	rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile,
		  "temporary n-gram counts");
	sort_count = buffer[i].count;
      }
    }	    
    for (j=0;j<=n-1;j++) 
      rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1,
		tempfile,"temporary n-gram ids");

    rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile,
	      "temporary n-gram counts");
    fclose(tempfile);
    

  }
  

  /* Merge the temporary files, and output the result */
  fclose(non_unk_fp);
  pc_message(verbosity,2,"Merging temporary files...\n");
  merge_idngramfiles(1,
		     number_of_tempfiles,
		     temp_directory,
		     temp_file_ext,
		     max_files,
		     outfile,
		     write_ascii,
		     fof_size,
		     n);

  fclose(outfile);

  rmdir(temp_directory);
  pc_message(verbosity,0,"wngram2idngram : Done.\n");

  return 0;
}