int read_vocab(char* vocab_filename, int verbosity, struct idngram_hash_table* vocabulary, int M ) { FILE *vocab_file; int vocab_size; char temp_word[MAX_WORD_LENGTH]; char temp_word2[MAX_WORD_LENGTH]; vocab_size = 0; vocab_file = rr_iopen(vocab_filename); pc_message(verbosity,2,"Reading vocabulary... \n"); while (fgets (temp_word, sizeof(temp_word),vocab_file)) { if (strncmp(temp_word,"##",2)==0) continue; sscanf (temp_word, "%s ",temp_word2); /* printf("hey hey %s %d\n ", temp_word2, idngram_hash(temp_word2,M));*/ /* Check for repeated words in the vocabulary */ if (index2(vocabulary,temp_word2) != 0) warn_on_repeated_words(temp_word2); warn_on_wrong_vocab_comments(temp_word); vocab_size++; /* printf("%s %d\n ", temp_word2, idngram_hash(temp_word2,M));*/ add_to_idngram_hashtable(vocabulary,idngram_hash(temp_word2,M),temp_word2,vocab_size); if(vocab_size == M){ quit(-1, "Number of entries reached the size of the hash. Run the program again with a larger has size -hash \n"); } } if (vocab_size > MAX_VOCAB_SIZE) fprintf(stderr,"text2idngram : vocab_size %d\n is larger than %d\n",vocab_size,MAX_VOCAB_SIZE); return 0; }
/*
 * Look a word up in the vocabulary hash table.
 *
 * vocab : table to search.
 * word  : NUL-terminated word to find.
 *
 * Returns the `ind` value stored with the first matching node of the
 * word's chain, or 0 when the word is absent.  0 is also returned, after
 * a warning on stderr, when the hash value falls outside the table.
 */
wordid_t index2(struct idngram_hash_table *vocab, char *word)
{
  unsigned long bucket = idngram_hash(word, vocab->size);
  struct idngram_node *node;

  if (bucket >= vocab->size) {
    fprintf(stderr, "WARNING : invalid hash address\n");
    fprintf(stderr, "%s ignored\n", word);
    return 0;
  }

  /* The node stored in the chain slot itself is never compared; only
     the nodes reachable through ->next are candidates. */
  for (node = vocab->chain[bucket]; node->next != NULL; node = node->next) {
    if (strcmp(word, node->next->word) == 0)
      return node->next->ind;
    fflush(stderr);
  }

  return 0;
}
int main(int argc, char *argv[]) { int verbosity; int vocab_size; FILE *vocab_file; int buffer_size; flag write_ascii; int max_files; int number_of_tempfiles; char *vocab_filename; char *idngram_filename; char temp_word[MAX_WORD_LENGTH]; char temp_word2[MAX_WORD_LENGTH]; char temp_word3[MAX_WORD_LENGTH]; flag contains_unks; int position_in_buffer; FILE *outfile; FILE *tempfile; FILE *non_unk_fp; ngram_rec *buffer; flag same_ngram; int i; int j; int fof_size; int size_of_rec; char temp_directory[1000]; char *temp_file_ext; /* Vocab hash table things */ struct idngram_hash_table vocabulary; unsigned long hash_size; unsigned long M; wordid_t *current_ngram; int current_count; wordid_t *sort_ngram; int sort_count; /* Process command line */ report_version(&argc,argv); if (argc == 1 || pc_flagarg(&argc, argv,"-help")) { /* Display help message */ help_message(); exit(1); } n = pc_intarg( &argc, argv, "-n",DEFAULT_N); hash_size = pc_intarg( &argc, argv, "-hash",DEFAULT_HASH_SIZE); buffer_size = pc_intarg( &argc, argv, "-buffer",STD_MEM); write_ascii = pc_flagarg(&argc,argv,"-write_ascii"); verbosity = pc_intarg(&argc,argv,"-verbosity",DEFAULT_VERBOSITY); max_files = pc_intarg( &argc, argv, "-files",DEFAULT_MAX_FILES); fof_size = pc_intarg(&argc,argv,"-fof_size",10); vocab_filename = salloc(pc_stringarg( &argc, argv, "-vocab", "" )); idngram_filename = salloc(pc_stringarg( &argc, argv, "-idngram", "" )); if (!strcmp("",vocab_filename)) quit(-1,"Error : Must specify a vocabulary file.\n"); if (!strcmp("",idngram_filename)) quit(-1,"text2idngram : Error : Must specify idngram file.\n"); if (pc_flagarg(&argc,argv,"-compress")) temp_file_ext = salloc(".Z"); else { if (pc_flagarg(&argc,argv,"-gzip")) temp_file_ext = salloc(".gz"); else temp_file_ext = salloc(""); } strcpy(temp_directory, "cmuclmtk-XXXXXX"); if (mkdtemp(temp_directory) == NULL) { quit(-1, "Failed to create temporary folder: %s\n", strerror(errno)); } pc_report_unk_args(&argc,argv,verbosity); outfile = 
rr_fopen(idngram_filename,"wb"); pc_message(verbosity,2,"Vocab : %s\n",vocab_filename); pc_message(verbosity,2,"Output idngram : %s\n",idngram_filename); pc_message(verbosity,2,"Buffer size : %d\n",buffer_size); pc_message(verbosity,2,"Hash table size : %d\n",hash_size); pc_message(verbosity,2,"Max open files : %d\n",max_files); pc_message(verbosity,2,"n : %d\n",n); pc_message(verbosity,2,"FOF size : %d\n",fof_size); size_of_rec = (sizeof(wordid_t) * n) + 16 - (( n* sizeof(wordid_t)) % 16); buffer_size *= (1000000/((sizeof(ngram_rec) + size_of_rec))); fprintf(stderr,"buffer size = %d\n",buffer_size); /* Allocate memory for hash table */ fprintf(stderr,"Initialising hash table...\n"); M = nearest_prime(hash_size); new_idngram_hashtable(&vocabulary,M); /* Read in the vocabulary */ vocab_size = 0; vocab_file = rr_iopen(vocab_filename); pc_message(verbosity,2,"Reading vocabulary...\n"); while (fgets (temp_word, sizeof(temp_word),vocab_file)) { if (strncmp(temp_word,"##",2)==0) continue; sscanf (temp_word, "%s ",temp_word2); /* Check for vocabulary order */ if (vocab_size > 0 && strcmp(temp_word2,temp_word3)<0) quit(-1,"wngram2idngram : Error : Vocabulary is not alphabetically ordered.\n"); /* Check for repeated words in the vocabulary */ if (index2(&vocabulary,temp_word2) != 0) warn_on_repeated_words(temp_word); warn_on_wrong_vocab_comments(temp_word); vocab_size++; add_to_idngram_hashtable(&vocabulary,idngram_hash(temp_word2,M),temp_word2,vocab_size); strcpy(temp_word3,temp_word2); } if (vocab_size > MAX_VOCAB_SIZE) quit(-1,"Error : Vocabulary size exceeds maximum.\n"); pc_message(verbosity,2,"Allocating memory for the buffer...\n"); buffer=(ngram_rec *) rr_malloc((buffer_size+1)*sizeof(ngram_rec)); for (i=0;i<=buffer_size;i++) buffer[i].word = (wordid_t *) rr_malloc(n*sizeof(wordid_t)); /* Open the "non-OOV" tempfile */ sprintf(temp_word, "%s/1%s", temp_directory, temp_file_ext); non_unk_fp = rr_fopen(temp_word,"w"); pc_message(verbosity,2,"Writing non-OOV counts to 
temporary file %s\n", temp_word); number_of_tempfiles = 1; current_ngram = (wordid_t *) rr_malloc(n*sizeof(wordid_t)); sort_ngram = (wordid_t *) rr_malloc(n*sizeof(wordid_t)); /* Read text into buffer */ position_in_buffer = 0; while (!rr_feof(stdin)) { for (i=0;i<=n-1;i++) { get_word(stdin,temp_word); current_ngram[i]=index2(&vocabulary,temp_word); } if (scanf("%d",¤t_count) != 1) if (!rr_feof(stdin)) quit(-1,"Error reading n-gram count from stdin.\n"); if (!rr_feof(stdin)) { contains_unks = 0; for (i=0;i<=n-1;i++) { if (!current_ngram[i]) contains_unks = 1; } if (contains_unks) { /* Write to buffer */ position_in_buffer++; if (position_in_buffer >= buffer_size) { /* Sort buffer */ pc_message(verbosity,2, "Sorting n-grams which include an OOV word...\n"); qsort((void*) buffer,(size_t) position_in_buffer, sizeof(ngram_rec),compare_ngrams2); pc_message(verbosity,2,"Done.\n"); /* Write buffer to temporary file */ number_of_tempfiles++; sprintf(temp_word,"%s/%hu%s", temp_directory, number_of_tempfiles,temp_file_ext); pc_message(verbosity,2, "Writing sorted OOV-counts buffer to temporary file %s\n", temp_word); tempfile = rr_fopen(temp_word,"w"); for (i=0;i<=n-1;i++) sort_ngram[i] = buffer[0].word[i]; sort_count = buffer[0].count; for (i=0;i<=position_in_buffer-2;i++) { same_ngram = 1; for (j=n-1;j>=0;j--) { if (buffer[i].word[j] != sort_ngram[j]) { same_ngram = 0; j = -1; } } if (same_ngram) sort_count += buffer[i].count; else { for (j=0;j<=n-1;j++) { rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1, tempfile,"temporary n-gram ids"); sort_ngram[j] = buffer[i].word[j]; } rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile, "temporary n-gram counts"); sort_count = buffer[i].count; } } for (j=0;j<=n-1;j++) rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1, tempfile,"temporary n-gram ids"); rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile, "temporary n-gram counts"); rr_oclose(tempfile); position_in_buffer = 1; } for (i=0;i<=n-1;i++) 
buffer[position_in_buffer-1].word[i] = current_ngram[i]; buffer[position_in_buffer-1].count = current_count; }else { /* Write to temporary file */ for (i=0;i<=n-1;i++) rr_fwrite((char*)¤t_ngram[i],sizeof(wordid_t),1, non_unk_fp,"temporary n-gram ids"); rr_fwrite((char*)¤t_count,sizeof(int),1,non_unk_fp, "temporary n-gram counts"); } } } if (position_in_buffer > 0) { /* Only do this bit if we have actually seen some OOVs */ /* Sort final buffer */ pc_message(verbosity,2,"Sorting final buffer...\n"); qsort((void*) buffer,(size_t) position_in_buffer, sizeof(ngram_rec),compare_ngrams2); /* Write final buffer */ number_of_tempfiles++; sprintf(temp_word,"%s/%hu%s", temp_directory, number_of_tempfiles,temp_file_ext); pc_message(verbosity,2,"Writing sorted buffer to temporary file %s\n", temp_word); tempfile = rr_fopen(temp_word,"w"); for (i=0;i<=n-1;i++) sort_ngram[i] = buffer[0].word[i]; sort_count = buffer[0].count; for (i=1;i<=position_in_buffer-1;i++) { same_ngram = 1; for (j=n-1;j>=0;j--) { if (buffer[i].word[j] != sort_ngram[j]) { same_ngram = 0; j = -1; } } if (same_ngram) sort_count += buffer[i].count; else { for (j=0;j<=n-1;j++) { rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1, tempfile,"temporary n-gram ids"); sort_ngram[j] = buffer[i].word[j]; } rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile, "temporary n-gram counts"); sort_count = buffer[i].count; } } for (j=0;j<=n-1;j++) rr_fwrite((char*)&sort_ngram[j],sizeof(wordid_t),1, tempfile,"temporary n-gram ids"); rr_fwrite((char*)&sort_count,sizeof(int),1,tempfile, "temporary n-gram counts"); fclose(tempfile); } /* Merge the temporary files, and output the result */ fclose(non_unk_fp); pc_message(verbosity,2,"Merging temporary files...\n"); merge_idngramfiles(1, number_of_tempfiles, temp_directory, temp_file_ext, max_files, outfile, write_ascii, fof_size, n); fclose(outfile); rmdir(temp_directory); pc_message(verbosity,0,"wngram2idngram : Done.\n"); return 0; }