void ReadVocab() { long long a, i = 0; char c; char word[MAX_STRING]; FILE *fin = fopen(read_vocab_file, "rb"); if (fin == NULL) { printf("Vocabulary file not found\n"); exit(1); } for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; vocab_size = 0; while (1) { ReadWord(word, fin); if (feof(fin)) break; a = AddWordToVocab(word); fscanf(fin, "%lld%c", &vocab[a].cn, &c); i++; } SortVocab(); if (debug_mode > 0) { printf("Vocab size: %lld\n", vocab_size); printf("Words in train file: %lld\n", train_words); } fin = fopen(train_file, "rb"); if (fin == NULL) { printf("ERROR: training data file not found!\n"); exit(1); } fseek(fin, 0, SEEK_END); file_size = ftell(fin); fclose(fin); }
// Sorts the vocabulary by frequency using word counts void SortVocab() { int a, size; unsigned int hash; // Sort the vocabulary and keep </s> at the first position qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; size = vocab_size; train_words = 0; for (a = 0; a < size; a++) { // Words occuring less than min_count times will be discarded from the vocab if (vocab[a].cn < min_count && !(StartsWith((char *)"_*", vocab[a].word) && sentence_vectors)) { vocab_size--; free(vocab[vocab_size].word); } else { // Hash will be re-computed, as after the sorting it is not actual hash=GetWordHash(vocab[a].word); while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; vocab_hash[hash] = a; train_words += vocab[a].cn; } } a = AddWordToVocab((char *)"<UNKNOWN>"); vocab[a].cn = 1; //vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); // Allocate memory for the binary tree construction for (a = 0; a < vocab_size; a++) { vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); } }
void LearnVocabFromTrainFile() { char word[MAX_STRING], last_word[MAX_STRING], bigram_word[MAX_STRING * 2]; FILE *fin; long long a, i, start = 1; for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; fin = fopen(train_file, "rb"); if (fin == NULL) { printf("ERROR: training data file not found!\n"); exit(1); } vocab_size = 0; AddWordToVocab((char *)"</s>"); while (1) { ReadWord(word, fin); if (feof(fin)) break; if (!strcmp(word, "</s>")) { start = 1; continue; } else start = 0; train_words++; if ((debug_mode > 1) && (train_words % 100000 == 0)) { printf("Words processed: %lldK Vocab size: %lldK %c", train_words / 1000, vocab_size / 1000, 13); fflush(stdout); } i = SearchVocab(word); if (i == -1) { a = AddWordToVocab(word); vocab[a].cn = 1; } else vocab[i].cn++; if (start) continue; sprintf(bigram_word, "%s_%s", last_word, word); bigram_word[MAX_STRING - 1] = 0; strcpy(last_word, word); i = SearchVocab(bigram_word); if (i == -1) { a = AddWordToVocab(bigram_word); vocab[a].cn = 1; } else vocab[i].cn++; if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); } SortVocab(); if (debug_mode > 0) { printf("\nVocab size (unigrams + bigrams): %lld\n", vocab_size); printf("Words in train file: %lld\n", train_words); } fclose(fin); }
//从分词文件中统计每个单词的词频 void LearnVocabFromTrainFile() { char word[MAX_STRING]; FILE *fin; long long a, i; for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; fin = fopen(train_file, "rb"); if (fin == NULL) { printf("ERROR: training data file not found!\n"); exit(1); } vocab_size = 0; AddWordToVocab((char *)"</s>"); while (1) { ReadWord(word, fin); if (feof(fin)) break; train_words++; if ((debug_mode > 1) && (train_words % 100000 == 0)) { printf("%lldK%c", train_words / 1000, 13); fflush(stdout); } i = SearchVocab(word);//返回该词在词汇表中的位置 if (i == -1)//该词之前不存在 { a = AddWordToVocab(word);//把该词添加到词汇表中 vocab[a].cn = 1; } else vocab[i].cn++;//更新词频 if (vocab_size > vocab_hash_size * 0.7)//如果词汇表太庞大,就缩减 ReduceVocab(); } SortVocab();//根据词频排序词汇表 if (debug_mode > 0) { printf("Vocab size: %lld\n", vocab_size); printf("Words in train file: %lld\n", train_words); } file_size = ftell(fin); fclose(fin); }
void LearnVocabFromTrainFile() { char word[MAX_STRING]; FILE *fin; long long a, i; for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; fin = fopen(train_file, "rb"); if (fin == NULL) { printf("ERROR: training data file not found!\n"); exit(1); } vocab_size = 0; AddWordToVocab((char *)"</s>"); while (1) { ReadWord(word, fin); if(strlen(word)>1)if(word[0]=='_'&&word[1]=='*')continue; if (feof(fin)) break; train_words++; if ((debug_mode > 1) && (train_words % 100000 == 0)) { printf("%lldK%c", train_words / 1000, 13); fflush(stdout); } i = SearchVocab(word); if (i == -1) { a = AddWordToVocab(word); vocab[a].cn = 1; } else vocab[i].cn++; if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); } SortVocab(); if (debug_mode > 0) { printf("Vocab size: %lld\n", vocab_size); printf("Words in train file: %lld\n", train_words); } long long freq_count=0; //for(a=0;a<vocab_size;a++){ // if(vocab[a].cn > 50000){freq_count+=vocab[a].cn;printf("%s\n",vocab[a].word);} //} //printf("freq_count: %lld\n", freq_count); file_size = ftell(fin); fclose(fin); }
void LearnVocabFromTrainFile() { char word[MAX_STRING]; FILE *fin; long long a, i; for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; fin = fopen(train_file, "rb"); if (fin == NULL) { printf("ERROR: training data file not found!\n"); exit(1); } vocab_size = 0; AddWordToVocab((char *)"</s>"); while (1) { ReadWord(word, fin); if (feof(fin)) break; train_words++; if ((debug_mode > 1) && (train_words % 100000 == 0)) { printf("%lldK%c", train_words / 1000, 13); //13 for k; fflush(stdout); } //if word is not in, add it; or increase the times; i = SearchVocab(word); if (i == -1) { a = AddWordToVocab(word); vocab[a].cn = 1; } else vocab[i].cn++; if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); } SortVocab(); if (debug_mode > 0) { printf("Vocab size: %lld\n", vocab_size); printf("Words in train file: %lld\n", train_words); } file_size = ftell(fin); fclose(fin); }
void ReadVocab() { long long a, i = 0; char c; char word[MAX_STRING]; FILE *fin = fopen(model_file, "rb"); if (fin == NULL) { fprintf(stderr, "Vocabulary file not found\n"); exit(1); } for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; vocab_size = 0; while (1) { ReadWord(word, fin); if (feof(fin)) break; a = AddWordToVocab(word); fscanf(fin, "%lld%c", &vocab[a].cn, &c); i++; } SortVocab(); if(recompute_train_counts) { // If training file changed, e.g. in fine-tuning FILE *fi = fopen(train_file, "rb"); if (fi == NULL) { fprintf(stderr, "ERROR: training data file not found!\n"); exit(1); } train_words = 0; while (1) { ReadWordIndex(fi); ++train_words; if (feof(fi)) break; } fclose(fi); } if (debug_mode > 0) { fprintf(stderr, "Vocab size: %lld\n", vocab_size); fprintf(stderr, "Words in train file: %lld\n", train_words); } if(test_file[0] != 0 || gen != 0) return; fin = fopen(train_file, "rb"); if (fin == NULL) { fprintf(stderr, "ERROR: training data file not found!\n"); exit(1); } fseek(fin, 0, SEEK_END); file_size = ftell(fin); fclose(fin); }
// 装载训练文件到词汇表数据结构 void LearnVocabFromTrainFile() { char word[MAX_STRING]; FILE * fin; long long a, i; for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; fin = fopen(train_file, "rb"); if (fin == NULL) { printf("ERROR: training data file not found!\n"); exit(1); } vocab_size = 0; // 首先添加的是回车 AddWordToVocab((char *)"</s>"); while (1) { ReadWord(word, fin); if (feof(fin)) break; train_words++; if ((debug_mode > 1) && (train_words % 100000 == 0)) { printf("%lldK%c", train_words / 1000, 13); fflush(stdout); } i = SearchVocab(word); if (i == -1) {// 如果这个单词不存在,我们将其加入hash表 a = AddWordToVocab(word); vocab[a].cn = 1; } else vocab[i].cn++;// 否则词频加一 // 如果超出装填系数,将词汇表扩容 if (vocab_size > vocab_hash_size * 0.7) ReduceVocab(); } SortVocab(); if (debug_mode > 0) { printf("Vocab size: %lld\n", vocab_size); printf("Words in train file: %lld\n", train_words); } file_size = ftell(fin);//文件大小 fclose(fin); }
/*
 * Reads a vocabulary file into a freshly created vocabulary.
 *
 * vocabfile: path of the file to read. Each entry is read as a word (via
 *            ReadWord) followed by its count, which is parsed with "%lld%c".
 *
 * Returns the vocabulary obtained from CreateVocabulary() after all entries
 * have been added and SortAndReduceVocab(v, 0) has been applied. Its size and
 * word count are printed to stdout.
 *
 * Exits with status 1 if the file cannot be opened or a count cannot be
 * parsed.
 */
struct vocabulary *ReadVocab(char *vocabfile) {
  long long a;
  char c;
  char word[MAX_STRING];
  FILE *fin = fopen(vocabfile, "rb");
  if (fin == NULL) {
    printf("Vocabulary file not found\n");
    exit(1);
  }
  struct vocabulary *v = CreateVocabulary();
  while (1) {
    ReadWord(word, fin, MAX_STRING);
    if (feof(fin)) break;
    a = AddWordToVocab(v, word);
    // The trailing %c may legitimately fail on the last entry (no final
    // newline), so only a missing count is treated as an error.
    if (fscanf(fin, "%lld%c", &v->vocab[a].cn, &c) < 1) {
      printf("ERROR: malformed vocabulary file\n");
      exit(1);
    }
  }
  // The stream is no longer needed once every entry has been read.
  fclose(fin);
  SortAndReduceVocab(v, 0);
  printf("Vocab size: %ld\n", v->vocab_size);
  printf("Word count: %lld\n", v->word_count);
  return v;
}