// Sorts the vocabulary by frequency using word counts void SortVocab() { int a, size; unsigned int hash; // Sort the vocabulary and keep </s> at the first position qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; size = vocab_size; train_words = 0; for (a = 0; a < size; a++) { // Words occuring less than min_count times will be discarded from the vocab if (vocab[a].cn < min_count) { vocab_size--; free(vocab[vocab_size].word); } else { // Hash will be re-computed, as after the sorting it is not actual hash=GetWordHash(vocab[a].word); while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; vocab_hash[hash] = a; train_words += vocab[a].cn; } } vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); // Allocate memory for the binary tree construction for (a = 0; a < vocab_size; a++) { vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); } }
//根据单词词频排序 void SortVocab() { int a, size; unsigned int hash; // 排序 // 并且保证</s>在第一位 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);//词汇表快排 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;//词汇重排了,哈希记录的index也乱了,所有的hash记录清除,下面会重建 size = vocab_size; train_words = 0;// 用于训练的词汇总数(词频累加) for (a = 0; a < size; a++) { // 删除特别低频的词 if (vocab[a].cn < min_count) { vocab_size--; free(vocab[vocab_size].word); } else { //原来的hash失效需要重新计算 hash=GetWordHash(vocab[a].word); while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; vocab_hash[hash] = a; train_words += vocab[a].cn; } } vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word)); // 给霍夫曼编码和路径的词汇表索引分配空间 for (a = 0; a < vocab_size; a++) { vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char)); vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int)); } }
void ReduceVocab() { // reduces the vocabulary by removing infrequent tokens. int a, b = 0; unsigned int hash; // 最后剩下b个词,词频均大于min_reduce for (a = 0; a < vocab_size; a++) { if (vocab[a].cn > min_reduce) { vocab[b].cn = vocab[a].cn; vocab[b].word = vocab[a].word; b++; } else { free (vocab[a].word); } } vocab_size = b; // 重新分配hash索引 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; for (a = 0; a < vocab_size; a++) { hash = GetWordHash(vocab[a].word); while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; vocab_hash[hash] = a; } fflush(stdout); min_reduce++; }
// Reduces the vocabulary by removing infrequent tokens void ReduceVocab () { int a, b = 0; unsigned int hash; for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) { vocab[b].cn = vocab[a].cn; vocab[b].word = vocab[a].word; b++; } else free (vocab[a].word); vocab_size = b; for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; for (a = 0; a < vocab_size; a++) { // Hash will be re-computed, as it is not actual hash = GetWordHash (vocab[a].word); while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; vocab_hash[hash] = a; } fflush (stdout); min_reduce++; }
// Sorts the vocabulary by frequency using word counts void SortVocab () { int a; unsigned int hash; // Sort the vocabulary and keep </s> at the first position qsort (&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare); for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1; for (a = 0; a < vocab_size; a++) { // Words occuring less than min_count times will be discarded from the vocab if (vocab[a].cn < min_count) { vocab_size--; free (vocab[vocab_size].word); } else { // Hash will be re-computed, as after the sorting it is not actual hash = GetWordHash (vocab[a].word); while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; vocab_hash[hash] = a; } } vocab = (struct vocab_word *) realloc (vocab, vocab_size * sizeof(struct vocab_word)); }
// Returns position of a word in the vocabulary; if the word is not found, returns -1 int SearchVocab(char *word) { unsigned int hash = GetWordHash(word); while (1) { if (vocab_hash[hash] == -1) return -1; if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash]; hash = (hash + 1) % vocab_hash_size; } return -1; }
// Returns position of a word in the vocabulary; if the word is not found, returns -1 int SearchVocab(struct vocabulary *v, char *word) { unsigned int hash = GetWordHash(v, word); while (1) { if ((v->vocab_hash)[hash] == -1) return -1; if (!strcmp(word, v->vocab[v->vocab_hash[hash]].word)) return v->vocab_hash[hash]; hash = (hash + 1) % vocab_hash_size; } return -1; }
int AddWordToVocab(char * word){ unsigned int hash, lengh = strlen(word) + 1; if (lengh > MAX_STRING) lengh = MAX_STRING; vocab[vocab_size].word = (char *) calloc(lengh, sizeof(char)); strcpy(vocab[vocab_size].word, word); vocab[vocab_size].cn = 0; vocab_size++; if (vocab_size + 2 >= vocab_max_size ) { vocab_max_size += 1000;// 每次增加1000个词位 vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); } hash = GetWordHash(word); while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; // 线性探索hash vocab_hash[hash] = vocab_size - 1;// 记录在词汇表中的存储位置 return vocab_size - 1;// 返回添加的单词在词汇表中的存储位置 }
// Adds a word to the vocabulary int AddWordToVocab(char *word) { unsigned int hash, length = strlen(word) + 1; if (length > MAX_STRING) length = MAX_STRING; vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); strcpy(vocab[vocab_size].word, word); vocab[vocab_size].cn = 0; vocab_size++; // Reallocate memory if needed if (vocab_size + 2 >= vocab_max_size) { vocab_max_size *= 1.5; // was += 1000, modified to have fewer reallocations vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); } hash = GetWordHash(word); while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; vocab_hash[hash] = vocab_size - 1; return vocab_size - 1; }
// Adds a word to the vocabulary // 将词添加到词汇表 int AddWordToVocab(char *word) { unsigned int hash, length = strlen(word) + 1; if (length > MAX_STRING) length = MAX_STRING; //词的长度不能超MAX_STRING vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); strcpy(vocab[vocab_size].word, word); vocab[vocab_size].cn = 0; //初始词频为0 vocab_size++; // Reallocate memory if needed if (vocab_size + 2 >= vocab_max_size) { vocab_max_size += 1000; vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); } hash = GetWordHash(word); while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; //如果hash值冲突,采用线性探测的开放定址法,顺序向下查找 vocab_hash[hash] = vocab_size - 1; return vocab_size - 1; }
// Adds a word to the vocabulary int AddWordToVocab(char *word) { unsigned int hash, length = strlen(word) + 1; //加1是为了存储末尾的结束符 if (length > MAX_STRING) length = MAX_STRING; vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); strcpy(vocab[vocab_size].word, word); vocab[vocab_size].cn = 0; vocab_size++; //词汇表大小 // Reallocate memory if needed if (vocab_size + 2 >= vocab_max_size) { vocab_max_size += 1000; vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));//realloc把vocab所在的内存块重新分配一块堆内存,之前的内存被释放, } hash = GetWordHash(word); //word的hash值 while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; //不为1表示出现冲突,采用线性探测开放定址法,顺序向下查找未被占用的位置 vocab_hash[hash] = vocab_size - 1; //hash表中记录word在词汇表中的下标 return vocab_size - 1; }
// Adds a word to the vocabulary 将一个词添加到一个词汇中 int AddWordToVocab(char *word) { unsigned int hash, length = strlen(word) + 1; if (length > MAX_STRING) length = MAX_STRING; vocab[vocab_size].word = (char *)calloc(length, sizeof(char)); strcpy(vocab[vocab_size].word, word); vocab[vocab_size].cn = 0; vocab_size++; // Reallocate memory if needed if (vocab_size + 2 >= vocab_max_size) { vocab_max_size += 1000; vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word)); } hash = GetWordHash(word); while (vocab_hash[hash] != -1)//如果hash值冲突了 hash = (hash + 1) % vocab_hash_size;//使用开放地址法解决冲突 vocab_hash[hash] = vocab_size - 1;//由词的hash值找到她所在词汇表的排序位置 return vocab_size - 1; }
// Adds a word to the vocabulary int AddWordToVocab(struct vocabulary *v, char *word) { //static long collide = 0; //static long nocollide = 0; unsigned int hash, length = strlen(word) + 1; if (length > MAX_STRING) length = MAX_STRING; v->vocab[v->vocab_size].word = (char *)calloc(length, sizeof(char)); strcpy(v->vocab[v->vocab_size].word, word); v->vocab[v->vocab_size].cn = 0; v->vocab_size++; // Reallocate memory if needed if (v->vocab_size + 2 >= v->vocab_max_size) { v->vocab_max_size += 1000; v->vocab = (struct vocab_word *)realloc(v->vocab, v->vocab_max_size * sizeof(struct vocab_word)); } hash = GetWordHash(v, word); //if (v->vocab_hash[hash] != -1) { collide += 1; } else { nocollide += 1; } //if ((collide + nocollide) % 100000 == 0) printf("%d %d %f collisions\n\n",collide, nocollide, (float)collide/(collide+nocollide)); while (v->vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; v->vocab_hash[hash] = v->vocab_size - 1; return v->vocab_size - 1; }
// Reduces the vocabulary by removing infrequent tokens void ReduceVocab(struct vocabulary *v) { static int min_reduce = 1; printf("reducevocab\n"); int a, b = 0; unsigned int hash; for (a = 0; a < v->vocab_size; a++) if (v->vocab[a].cn > min_reduce) { v->vocab[b].cn = v->vocab[a].cn; v->vocab[b].word = v->vocab[a].word; b++; } else free(v->vocab[a].word); v->vocab_size = b; for (a = 0; a < vocab_hash_size; a++) v->vocab_hash[a] = -1; for (a = 0; a < v->vocab_size; a++) { // Hash will be re-computed, as it is not actual hash = GetWordHash(v, v->vocab[a].word); while (v->vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; v->vocab_hash[hash] = a; } fflush(stdout); min_reduce++; }
// Sorts the vocabulary by frequency using word counts void SortAndReduceVocab(struct vocabulary *v, int min_count) { int a, size; unsigned int hash; // Sort the vocabulary and keep </s> at the first position qsort(&(v->vocab[1]), v->vocab_size - 1, sizeof(struct vocab_word), VocabCompare); for (a = 0; a < vocab_hash_size; a++) v->vocab_hash[a] = -1; size = v->vocab_size; v->word_count = 0; for (a = 0; a < size; a++) { // Words occuring less than min_count times will be discarded from the vocab if (v->vocab[a].cn < min_count) { v->vocab_size--; free(v->vocab[v->vocab_size].word); } else { // Hash will be re-computed, as after the sorting it is not actual hash=GetWordHash(v, v->vocab[a].word); while (v->vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; v->vocab_hash[hash] = a; v->word_count += v->vocab[a].cn; } } v->vocab = (struct vocab_word *)realloc(v->vocab, (v->vocab_size + 1) * sizeof(struct vocab_word)); }
// Initializing the network and read the embeddings void initializeEmbeddings(char* embedPath){ FILE* filePt = fopen(embedPath, "rb"); long long i, j, offset; long dims; float value; char word[MAX_STRING]; unsigned int hash, length; //------------------------------------------------------------ // Read vocab size and number of hidden layers if (!fscanf(filePt, "%lld %ld\n", &vocab_size, &dims)){ printf("Error reading the embed file!"); exit(1); } if(layer1_size != dims){ printf("Number of dimensions not consistent with embedding file!\n"); exit(1); } //------------------------------------------------------------ // Allocate memory for the intput-to-hidden parameters printf("(vocab size, hidden dims): %lld %lld\n", vocab_size, layer1_size); int flag = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(float)); if (syn0 == NULL || flag){ printf("Memory allocation failed\n"); exit(1); } //------------------------------------------------------------ // Allocating memory for reading vocab, initialize hash vocab = (struct vocab_word*) malloc(vocab_size * sizeof(struct vocab_word)); vocab_hash = (int*) malloc(sizeof(int) * vocab_hash_size); for (i = 0; i < vocab_hash_size; i++) vocab_hash[i] = -1; // Initializing with random values //unsigned long long next_random = 1; // Reading the words and feature and store them sequentially for (i = 0; i < vocab_size; i++){ // Store the word if (!fscanf(filePt, "%s", word)){ printf("Error reading the embed file!"); exit(1); } //printf("%lld, %lld, %s\n", i, vocab_size, word); length = strlen(word) + 1; // Truncate if needed if (length > MAX_STRING) length = MAX_STRING; vocab[i].word = (char*) calloc(length, sizeof(char)); strcpy(vocab[i].word, word); vocab[i].cn = 0; hash = GetWordHash(word); while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size; vocab_hash[hash] = i; // Store feature offset = layer1_size * i; for (j = 0; j < layer1_size; j++){ if (!fscanf(filePt, "%f", &value)){ printf("Error reading the embed file!"); exit(1); } // Initializing with random //next_random = next_random * (unsigned long long)25214903917 + 11; //syn0[offset + j] = (((next_random & 0xffff) / (real)65536) - 0.5) / layer1_size; // Storing the value syn0[offset + j] = value; } } // Close the file and exit fclose(filePt); printf("Done reading and initializing embeddings...\n"); }