void LearnVocabFromTrainFile() { char word[MAX_STRING]; FILE *fin; long long a, i; for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; fin = fopen(train_file, "rb"); if (fin == NULL) { printf("ERROR: training data file not found!\n"); exit(1); } vocab_size = 0; AddWordToVocab((char *)"</s>"); while (1) { ReadWord(word, fin); if (feof(fin)) break; train_words++; if ((debug_mode > 1) && (train_words % 100000 == 0)) { fprintf(stderr, "%lldK%c", train_words / 1000, 13); } i = SearchVocab(word); if (i == -1) { a = AddWordToVocab(word); vocab[a].cn = 1; } else vocab[i].cn++; if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); } SortVocab(); if (debug_mode > 0) { fprintf(stderr, "Vocab size: %lld\n", vocab_size); fprintf(stderr, "Words in train file: %lld\n", train_words); } file_size = ftell(fin); fclose(fin); }
void ReadVocab() { long long a, i = 0; char c; char word[MAX_STRING]; FILE *fin = fopen(read_vocab_file, "rb"); if (fin == NULL) { printf("Vocabulary file not found\n"); exit(1); } for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; vocab_size = 0; while (1) { ReadWord(word, fin); if (feof(fin)) break; a = AddWordToVocab(word); fscanf(fin, "%lld%c", &vocab[a].cn, &c); i++; } SortVocab(); if (debug_mode > 0) { printf("Vocab size: %lld\n", vocab_size); printf("Words in train file: %lld\n", train_words); } fin = fopen(train_file, "rb"); if (fin == NULL) { printf("ERROR: training data file not found!\n"); exit(1); } fseek(fin, 0, SEEK_END); file_size = ftell(fin); fclose(fin); }
void ReadVocab() { long long a, i = 0; char c; char word[MAX_STRING]; FILE *fin = fopen(model_file, "rb"); if (fin == NULL) { fprintf(stderr, "Vocabulary file not found\n"); exit(1); } for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; vocab_size = 0; while (1) { ReadWord(word, fin); if (feof(fin)) break; a = AddWordToVocab(word); fscanf(fin, "%lld%c", &vocab[a].cn, &c); i++; } SortVocab(); if(recompute_train_counts) { // If training file changed, e.g. in fine-tuning FILE *fi = fopen(train_file, "rb"); if (fi == NULL) { fprintf(stderr, "ERROR: training data file not found!\n"); exit(1); } train_words = 0; while (1) { ReadWordIndex(fi); ++train_words; if (feof(fi)) break; } fclose(fi); } if (debug_mode > 0) { fprintf(stderr, "Vocab size: %lld\n", vocab_size); fprintf(stderr, "Words in train file: %lld\n", train_words); } if(test_file[0] != 0 || gen != 0) return; fin = fopen(train_file, "rb"); if (fin == NULL) { fprintf(stderr, "ERROR: training data file not found!\n"); exit(1); } fseek(fin, 0, SEEK_END); file_size = ftell(fin); fclose(fin); }
void LearnVocabFromTrainFile() { char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2]; FILE *fin; long long a, i, start = 1; for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; fin = fopen(train_file, "rb"); if (fin == NULL) { printf("ERROR: training data file not found!\n"); exit(1); } vocab_size = 0; AddWordToVocab((char *)"</s>"); while (1) { ReadWord(word, fin); if (feof(fin)) break; if (!strcmp(word, "</s>")) { start = 1; continue; } else start = 0; train_words++; if ((debug_mode > 1) && (train_words % 100000 == 0)) { printf("Words processed: %lldK Vocab size: %lldK %c", train_words / 1000, vocab_size / 1000, 13); fflush(stdout); } i = SearchVocab(word); if (i == -1) { a = AddWordToVocab(word); vocab[a].cn = 1; } else vocab[i].cn++; if (start) continue; sprintf(bigram_word, "%s_%s", last_word, word); bigram_word[MAX_STRING - 1] = 0; strcpy(last_word, word); i = SearchVocab(bigram_word); if (i == -1) { a = AddWordToVocab(bigram_word); vocab[a].cn = 1; } else vocab[i].cn++; if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); } SortVocab(); if (debug_mode > 0) { printf("\nVocab size (unigrams + bigrams): %lld\n", vocab_size); printf("Words in train file: %lld\n", train_words); } fclose(fin); }
void LearnVocabFromTrainFile() { char word[MAX_STRING]; FILE *fin; long long a, i; for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; fin = fopen(train_file, "rb"); if (fin == NULL) { printf("ERROR: training data file not found!\n"); exit(1); } vocab_size = 0; AddWordToVocab((char *)"</s>"); while (1) { ReadWord(word, fin); if(strlen(word)>1)if(word[0]=='_'&&word[1]=='*')continue; if (feof(fin)) break; train_words++; if ((debug_mode > 1) && (train_words % 100000 == 0)) { printf("%lldK%c", train_words / 1000, 13); fflush(stdout); } i = SearchVocab(word); if (i == -1) { a = AddWordToVocab(word); vocab[a].cn = 1; } else vocab[i].cn++; if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); } SortVocab(); if (debug_mode > 0) { printf("Vocab size: %lld\n", vocab_size); printf("Words in train file: %lld\n", train_words); } long long freq_count=0; //for(a=0;a<vocab_size;a++){ // if(vocab[a].cn > 50000){freq_count+=vocab[a].cn;printf("%s\n",vocab[a].word);} //} //printf("freq_count: %lld\n", freq_count); file_size = ftell(fin); fclose(fin); }