void ReadVocab() {
  long long a, i = 0;
  char c;
  char word[MAX_STRING];
  FILE *fin = fopen(model_file, "rb");
  if (fin == NULL) {
    fprintf(stderr, "Vocabulary file not found\n");
    exit(1);
  }
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  vocab_size = 0;
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    a = AddWordToVocab(word);
    fscanf(fin, "%lld%c", &vocab[a].cn, &c);
    i++;
  }
  SortVocab();
  if (recompute_train_counts) { // If the training file changed, e.g. for fine-tuning
    FILE *fi = fopen(train_file, "rb");
    if (fi == NULL) {
      fprintf(stderr, "ERROR: training data file not found!\n");
      exit(1);
    }
    train_words = 0;
    while (1) {
      ReadWordIndex(fi);
      ++train_words;
      if (feof(fi)) break;
    }
    fclose(fi);
  }
  if (debug_mode > 0) {
    fprintf(stderr, "Vocab size: %lld\n", vocab_size);
    fprintf(stderr, "Words in train file: %lld\n", train_words);
  }
  if (test_file[0] != 0 || gen != 0) return;
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    fprintf(stderr, "ERROR: training data file not found!\n");
    exit(1);
  }
  fseek(fin, 0, SEEK_END);
  file_size = ftell(fin);
  fclose(fin);
}
/* Converts one line of a file from text to an array of vocabulary indices,
   the input form for forward propagation. The caller owns the returned buffer. */
long long *FileToSen(int length, FILE *fi) {
  long long *sen = (long long *)calloc(MAX_SENTENCE_LENGTH, sizeof(long long));
  long long word = 0, sentence_length = 0;
  // The original wrapped this loop in `if (sentence_length == 0)`, which was
  // always true and left a path with no return value; the loop alone suffices.
  while (1) {
    word = ReadWordIndex(fi);
    if (feof(fi)) break;
    if (word == -1) continue;  // OOV: skip the word
    if (word == 0) break;      // </s>: end of sentence
    sen[sentence_length] = word;
    sentence_length++;
    if (sentence_length >= length) break;
  }
  return sen;
}
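// A minimal usage sketch for FileToSen (not part of the original file): the helper
// name PrintFirstSentence and the file handling around it are illustrative, and the
// trailing zeros left by calloc serve as the sentence terminator.
void PrintFirstSentence(const char *path) {
  FILE *fi = fopen(path, "rb");
  if (fi == NULL) return;
  long long *sen = FileToSen(MAX_SENTENCE_LENGTH, fi);
  for (int i = 0; sen[i] != 0; ++i)   // index 0 (</s>) terminates the array
    printf("%s ", vocab[sen[i]].word);
  printf("\n");
  free(sen);  // the caller owns the buffer returned by FileToSen
  fclose(fi);
}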
void TrainModel() {
  long a;
  pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
  if (model_file[0] == 0) return;
  int iter = 0;
  FILE *t1 = fopen(model_file, "rb");
  FILE *t2 = fopen(model_file_nnet, "rb");
  if (t1 != NULL && t2 != NULL) {
    fclose(t1);
    fclose(t2);
    fprintf(stderr, "Restoring nnet from existing files %s, %s\n", model_file, model_file_nnet);
    LoadNnet();
  } else {
    LearnVocabFromTrainFile();
    if (maxent_hash_size) {
      maxent_hash_size *= 1000000;
      maxent_hash_size -= maxent_hash_size % vocab_size;
    }
    InitNet();
    SaveNnet();
  }
  if (test_file[0] != 0) {
    counter = 0;
    real sumlogprob = EvaluateModel(test_file, 1);
    fprintf(stderr, "Test entropy %f\n", sumlogprob / log10(2) / (real)counter);
    return;
  }
  if (gen > 0) {
    Sample(gen, 0);
    return;
  } else if (gen < 0) {
    while (1) { Sample(-gen, 1); }
    return;
  }
  fprintf(stderr, "Starting training using file %s\n", train_file);
  FILE *fi = fopen(valid_file, "rb");
  valid_words = 0;
  while (1) {
    ReadWordIndex(fi);
    ++valid_words;
    if (feof(fi)) break;
  }
  valid_file_size = ftell(fi);
  fclose(fi);
  real old_entropy = 1e99;
  real entropy;
  real diff = 1e99;
  int retry = 0;
  int decay = 0;
  while (retry < max_retry) {
    if (iter != 0) {
      if (decay) {
        alpha /= 2.0;
        maxent_alpha /= 2.0;
      }
      word_count_actual = 0;
      counter = 0;
      start = clock();
      for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
      for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
    }
    fprintf(stderr, "Iteration %d\t", iter);
    sumlogprob_valid = 0;
    counter = 0;
    sumlogprob_valid = EvaluateModel(valid_file, 0);
    entropy = sumlogprob_valid / log10(2) / (real)counter;
    fprintf(stderr, "Valid Entropy %f", entropy);
    ++iter;
    diff = old_entropy / entropy;
    if (isnan(entropy) || isinf(entropy) || diff < stop) {
      if (decay == 1) {
        ++retry;
        fprintf(stderr, "\tRetry %d/%d", retry, max_retry);
      } else {
        decay = 1;
        fprintf(stderr, "\tDecay started");
      }
      if (isnan(entropy) || isinf(entropy) || diff < reject_threshold) {
        fprintf(stderr, "\tNnet rejected");
        FreeNnet();
        int debug_ = debug_mode;
        debug_mode = 0;
        LoadNnet();
        debug_mode = debug_;
      }
    }
    fprintf(stderr, "\n");
    if (diff > 1.0) {
      SaveNnet();
      old_entropy = entropy;
    }
  }
}
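// The two entropy printouts above convert a sum of base-10 word log-probabilities
// into bits per word: EvaluateModel accumulates -log10 P(w_i | history_i), so with
// counter = N scored words,
//
//   entropy = sumlogprob / log10(2) / counter = -(1/N) * sum_i log2 P(w_i | history_i)
//
// i.e. dividing by log10(2) changes the logarithm base from 10 to 2.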
void Sample(int num_sentences, int interactive) {
  long long last_word;
  long long sen[MAX_SENTENCE_LENGTH + 1];
  long long l2;
  real f;
  real *neu1;
  int begin = 0;
  posix_memalign((void **)&neu1, 128, (long long)layer1_size * MAX_SENTENCE_LENGTH * sizeof(real));
  sen[0] = 0;
  if (interactive) {
    printf("Enter the phrase to be continued:\n");
    while (1) {
      int word = ReadWordIndex(stdin);
      if (word == 0) break;
      if (word == -1) word = SearchVocab("<unk>");
      ++begin;
      sen[begin] = word;
    }
  }
  int sentence = 0;
  while (sentence < num_sentences) {
    memset(neu1, 0, (long long)layer1_size * MAX_SENTENCE_LENGTH * sizeof(real)); // clear activations
    for (int i = 1; i <= begin; ++i) printf("%s ", vocab[sen[i]].word);
    if (begin) printf("| ");
    int input = 0;
    real logprob = 0.0;
    while (1) {
      if (input != 0) {
        // Recurrent hidden->hidden activation
        for (int c = 0; c < layer1_size; ++c) {
          for (int d = 0; d < layer1_size; ++d) {
            neu1[input*layer1_size + c] += nnet.synRec[c*layer1_size + d] * neu1[(input-1)*layer1_size + d];
          }
        }
      }
      last_word = sen[input];
      for (int c = 0; c < layer1_size; ++c) {
        neu1[input*layer1_size + c] += nnet.syn0[last_word*layer1_size + c]; // Input to hidden
      }
      ApplySigmoid(neu1 + layer1_size*input, layer1_size);
      if (input < begin) {
        ++input;
        continue;
      }
      long long feature_hashes[MAX_NGRAM_ORDER] = {0};
      if (maxent_order) {
        for (int order = 0; order < maxent_order && input >= order; ++order) {
          feature_hashes[order] = PRIMES[0]*PRIMES[1];
          for (int b = 1; b <= order; ++b)
            feature_hashes[order] += PRIMES[(order*PRIMES[b] + b) % PRIMES_SIZE] * (unsigned long long)(sen[input-b] + 1);
          feature_hashes[order] = feature_hashes[order] % (maxent_hash_size - vocab_size);
        }
      }
      int node = vocab_size - 2;
      while (node > 0) { // Propagate hidden -> output
        f = 0.0;
        l2 = node * layer1_size;
        for (int c = 0; c < layer1_size; ++c) {
          f += neu1[input*layer1_size + c] * nnet.syn1[l2 + c];
        }
        for (int order = 0; order < maxent_order && input >= order; ++order) {
          f += nnet.synMaxent[feature_hashes[order] + node];
        }
        f = exp(f) / (1 + exp(f)); // sigmoid
        real random = rand() / (real)RAND_MAX;
        if (f > random) {
          node = tree[node].child0;
          logprob += log10(f);
        } else {
          node = tree[node].child1;
          logprob += log10(1 - f);
        }
      }
      ++input;
      sen[input] = node + vocab_size;
      printf("%s ", vocab[sen[input]].word);
      if (sen[input] == 0 || input == MAX_SENTENCE_LENGTH) {
        printf("%f %f\n", logprob, logprob / (input - begin));
        break;
      }
    }
    ++sentence;
  }
  free(neu1);
}
real EvaluateModel(char *filename, int printLoglikes) {
  long long d, word = -1, last_word, sentence_length = 0;
  long long sen[MAX_SENTENCE_LENGTH + 1];
  long long l2;
  real f;
  real *neu1;
  posix_memalign((void **)&neu1, 128, (long long)layer1_size * MAX_SENTENCE_LENGTH * sizeof(real));
  memset(neu1, 0, (long long)layer1_size * MAX_SENTENCE_LENGTH * sizeof(real));
  FILE *fi = fopen(filename, "rb");
  real my_sumlogprob = 0;
  while (1) {
    if (feof(fi)) break;
    sen[0] = 0;
    int good = 1;
    sentence_length = 1;
    while (sentence_length < MAX_SENTENCE_LENGTH) {
      word = ReadWordIndex(fi);
      sen[sentence_length] = word;
      if (feof(fi) || word == 0) break;
      if (word == -1) good = 0;
      ++sentence_length;
    }
    if (good == 0) {
      if (printLoglikes) printf("OOV\n");
      continue;
    }
    if (sentence_length == 1 && feof(fi)) break;
    real sentence_logprob = 0.0;
    memset(neu1, 0, (long long)layer1_size * sentence_length * sizeof(real));
    for (int input = 0; input < sentence_length; ++input) { // Forward pass (not including final softmax)
      if (input != 0) {
        // Recurrent hidden->hidden activation
        for (int c = 0; c < layer1_size; ++c) {
          for (int d = 0; d < layer1_size; ++d) {
            neu1[input*layer1_size + c] += nnet.synRec[c*layer1_size + d] * neu1[(input-1)*layer1_size + d];
          }
        }
      }
      last_word = sen[input];
      for (int c = 0; c < layer1_size; ++c) {
        neu1[input*layer1_size + c] += nnet.syn0[last_word*layer1_size + c]; // Input to hidden
      }
      ApplySigmoid(neu1 + layer1_size*input, layer1_size);
    }
    for (int target = 1; target <= sentence_length; ++target) { // Forward pass (softmax)
      word = sen[target];
      long long feature_hashes[MAX_NGRAM_ORDER] = {0};
      if (maxent_order) {
        for (int order = 0; order < maxent_order && target - order >= 0; ++order) {
          feature_hashes[order] = PRIMES[0]*PRIMES[1];
          for (int b = 1; b <= order; ++b)
            feature_hashes[order] += PRIMES[(order*PRIMES[b] + b) % PRIMES_SIZE] * (unsigned long long)(sen[target-b] + 1);
          feature_hashes[order] = feature_hashes[order] % (maxent_hash_size - vocab_size);
        }
      }
      real logprob = 0.0;
      for (d = 0; d < vocab[word].codelen; d++) { // Propagate hidden -> output
        f = 0.0;
        l2 = vocab[word].point[d] * layer1_size;
        for (int c = 0; c < layer1_size; ++c) {
          f += neu1[layer1_size*(target - 1) + c] * nnet.syn1[l2 + c];
        }
        for (int order = 0; order < maxent_order && target - order >= 0; ++order) {
          f += nnet.synMaxent[feature_hashes[order] + vocab[word].point[d]];
        }
        logprob += log10(1 + (vocab[word].code[d] == 1 ? exp(f) : exp(-f)));
      }
      sentence_logprob += logprob;
      ++counter;
    }
    if (printLoglikes) printf("%f\n", -sentence_logprob);
    my_sumlogprob += sentence_logprob;
  }
  fclose(fi);
  free(neu1);
  return my_sumlogprob;
}
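// The per-node term above uses the softplus form of the sigmoid log-likelihood:
// with code bit c and activation f, the probability of the observed branch is
// sigmoid(f) for c = 0 and sigmoid(-f) for c = 1, and
//
//   -log10 sigmoid(f)  = log10(1 + exp(-f))
//   -log10 sigmoid(-f) = log10(1 + exp(f))
//
// which is exactly log10(1 + (code[d] == 1 ? exp(f) : exp(-f))); my_sumlogprob
// therefore accumulates the total negative base-10 log-likelihood.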
void *TrainModelThread(void *id) {
  long long d, word = -1, last_word, sentence_length = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l2;
  real f, g;
  clock_t now;
  real *neu1, *neu1e;
  int full_block = bptt_block + bptt;
  posix_memalign((void **)&neu1, 128, (long long)layer1_size * MAX_SENTENCE_LENGTH * sizeof(real));
  posix_memalign((void **)&neu1e, 128, (long long)layer1_size * MAX_SENTENCE_LENGTH * sizeof(real));
  FILE *fi = fopen(train_file, "rb");
  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
#ifdef DEBUG
  real my_sumlogprob = 0;
#endif
  if ((long long)id != 0) while (word != 0 && !feof(fi)) { // skip to the next newline
    word = ReadWordIndex(fi);
  }
  while (1) {
    if (word_count - last_word_count > 10000) {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if (debug_mode > 1) {
        now = clock();
        fprintf(stderr, "%cAlpha: %f ME-alpha: %f Progress: %.2f%% Words/thread/sec: %.2fk\t", 13,
            alpha, maxent_alpha, word_count_actual / (real)(train_words + 1) * 100,
            word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stderr); // the progress line goes to stderr
      }
    }
    if (feof(fi) || word_count > train_words / num_threads) break;
    sen[0] = 0; // <s> token -- beginning of sentence
    int good = 1;
    sentence_length = 1;
    while (sentence_length < MAX_SENTENCE_LENGTH) {
      word = ReadWordIndex(fi);
      ++word_count;
      sen[sentence_length] = word;
      if (feof(fi) || word == 0) break;
      if (word == -1) good = 0;
      ++sentence_length;
    }
    if (good == 0) continue;
    if (sentence_length == 1 && feof(fi)) break;
    memset(neu1e, 0, (long long)layer1_size * sentence_length * sizeof(real)); // clear gradients
    memset(neu1, 0, (long long)layer1_size * sentence_length * sizeof(real));  // clear activations
#ifdef DEBUG
    real sentence_logprob = 0.0;
#endif
    for (int input = 0; input < sentence_length; ++input) { // Forward pass (not including final softmax)
      if (input != 0) {
        // Recurrent hidden->hidden activation
        for (int c = 0; c < layer1_size; ++c) {
          for (int d = 0; d < layer1_size; ++d) {
            neu1[input*layer1_size + c] += nnet.synRec[c*layer1_size + d] * neu1[(input-1)*layer1_size + d];
          }
        }
      }
      last_word = sen[input];
      for (int c = 0; c < layer1_size; ++c) {
        neu1[input*layer1_size + c] += nnet.syn0[last_word*layer1_size + c]; // Input to hidden
      }
      ApplySigmoid(neu1 + layer1_size*input, layer1_size);
    }
    for (int target = 1; target <= sentence_length; ++target) { // Forward pass (softmax)
      word = sen[target];
      long long feature_hashes[MAX_NGRAM_ORDER] = {0};
      if (maxent_order) {
        for (int order = 0; order < maxent_order && target - order >= 0; ++order) {
          feature_hashes[order] = PRIMES[0]*PRIMES[1];
          for (int b = 1; b <= order; ++b)
            feature_hashes[order] += PRIMES[(order*PRIMES[b] + b) % PRIMES_SIZE] * (unsigned long long)(sen[target-b] + 1);
          feature_hashes[order] = feature_hashes[order] % (maxent_hash_size - vocab_size);
        }
      }
      for (d = 0; d < vocab[word].codelen; d++) { // Propagate hidden -> output
        f = 0.0;
        l2 = vocab[word].point[d] * layer1_size;
        for (int c = 0; c < layer1_size; ++c) {
          f += neu1[layer1_size*(target - 1) + c] * nnet.syn1[l2 + c];
        }
        for (int order = 0; order < maxent_order && target - order >= 0; ++order) {
          f += nnet.synMaxent[feature_hashes[order] + vocab[word].point[d]];
        }
#ifdef DEBUG
        sentence_logprob += log10(1 + (vocab[word].code[d] == 1 ? exp(f) : exp(-f)));
#endif
        f = exp(f) / (1 + exp(f)); // sigmoid
        g = (1 - vocab[word].code[d] - f);
        g = g > MAX_GRAD ? MAX_GRAD : g;
        g = g < MIN_GRAD ? MIN_GRAD : g;
        real g_alpha = g * alpha; // 'g_alpha' is the gradient multiplied by the learning rate
        real g_maxentalpha = g * maxent_alpha;
        // Propagate errors output -> hidden
        for (int c = 0; c < layer1_size; ++c) {
          neu1e[layer1_size * (target - 1) + c] += g_alpha * nnet.syn1[l2 + c];
        }
        // Learn weights hidden -> output
        for (int c = 0; c < layer1_size; ++c) {
          nnet.syn1[l2 + c] += g_alpha * neu1[layer1_size*(target - 1) + c] - beta * nnet.syn1[l2 + c];
        }
        for (int order = 0; order < maxent_order && target - order >= 0; ++order) {
          nnet.synMaxent[feature_hashes[order] + vocab[word].point[d]] +=
              g_maxentalpha - maxent_beta * nnet.synMaxent[feature_hashes[order] + vocab[word].point[d]];
        }
      }
    }
#ifdef DEBUG
    my_sumlogprob += sentence_logprob;
#endif
    // Backpropagation-through-time pass
    int my_bptt = 0;
    for (int input = sentence_length - 1; input >= 0; --input) {
      MultiplySigmoidDerivative(neu1 + layer1_size*input, layer1_size, neu1e + layer1_size*input);
      last_word = sen[input];
      for (int c = 0; c < layer1_size; ++c) {
        nnet.syn0[layer1_size*last_word + c] += neu1e[layer1_size*input + c] - beta * nnet.syn0[layer1_size*last_word + c]; // Input weight update
      }
      long long word_num = word_count - (input - sentence_length);
      if (full_block == 0 || word_num % full_block == 0) {
        my_bptt = bptt;
      }
      if (input > 0 && (bptt == 0 || my_bptt > 0)) {
        // Work with recurrent weights: backpropagate
        for (int c = 0; c < layer1_size; ++c) {
          for (int d = 0; d < layer1_size; ++d) {
            neu1e[(input-1)*layer1_size + d] += nnet.synRec[c*layer1_size + d] * neu1e[input*layer1_size + c]; // Recurrent hidden->hidden backprop
          }
        }
        --my_bptt;
      }
    } // End BPTT loop
    for (int input = sentence_length - 1; input > 0; --input) {
      // Work with recurrent weights: update
      for (int c = 0; c < layer1_size; ++c) {
        for (int d = 0; d < layer1_size; ++d) {
          nnet.synRec[c*layer1_size + d] += neu1e[input*layer1_size + c] * neu1[(input-1)*layer1_size + d] - beta * nnet.synRec[c*layer1_size + d]; // Recurrent hidden->hidden weight update
        }
      }
    }
  } // End main training loop
#ifdef DEBUG
  if ((long long)id == 0)
    fprintf(stderr, "Train Entropy (thread %lld, word count %lld) %f\t",
        (long long)id, word_count, my_sumlogprob / log10(2) / (real)word_count);
#endif
  fclose(fi);
  free(neu1);
  free(neu1e);
  pthread_exit(NULL);
}
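// The hashed maximum-entropy features above map each n-gram history to a row of
// nnet.synMaxent. A standalone sketch of the same hashing scheme follows; the
// PRIMES_DEMO values and the ContextHash name are illustrative stand-ins for the
// file's real PRIMES table, and hash collisions are simply tolerated, as is usual
// for feature hashing.
static const unsigned long long PRIMES_DEMO[] = {108641969ULL, 116049371ULL, 125925907ULL, 133333309ULL};
#define PRIMES_DEMO_SIZE 4

// Hash of the `order`-word history ending just before position `pos` of `sen`
// (valid for order < PRIMES_DEMO_SIZE in this sketch); hash_range corresponds
// to maxent_hash_size - vocab_size in the code above.
unsigned long long ContextHash(const long long *sen, int pos, int order, unsigned long long hash_range) {
  unsigned long long h = PRIMES_DEMO[0] * PRIMES_DEMO[1];
  for (int b = 1; b <= order; ++b)
    h += PRIMES_DEMO[(order * PRIMES_DEMO[b] + b) % PRIMES_DEMO_SIZE] * (unsigned long long)(sen[pos - b] + 1);
  return h % hash_range;
}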
void *TrainModelThread(void *id) {
  long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l1, l2, c, target, label, local_iter = iter;
  unsigned long long next_random = (long long)id;
  real f, g;
  clock_t now;
  real *neu1 = (real *)calloc(layer1_size, sizeof(real));
  real *neu1e = (real *)calloc(layer1_size, sizeof(real));
  FILE *fi = fopen(train_file, "rb");
  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
  while (1) {
    if (word_count - last_word_count > 10000) {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if ((debug_mode > 1)) {
        now = clock();
        printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha,
            word_count_actual / (real)(iter * train_words + 1) * 100,
            word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
      if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
    }
    if (sentence_length == 0) {
      while (1) {
        word = ReadWordIndex(fi);
        if (feof(fi)) break;
        if (word == -1) continue;
        word_count++;
        if (word == 0) break;
        // The subsampling randomly discards frequent words while keeping the ranking same
        if (sample > 0) {
          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
          next_random = next_random * (unsigned long long)25214903917 + 11;
          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }
        sen[sentence_length] = word;
        sentence_length++;
        if (sentence_length >= MAX_SENTENCE_LENGTH) break;
      }
      sentence_position = 0;
    }
    if (feof(fi) || (word_count > train_words / num_threads)) {
      word_count_actual += word_count - last_word_count;
      local_iter--;
      if (local_iter == 0) break;
      word_count = 0;
      last_word_count = 0;
      sentence_length = 0;
      fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
      continue;
    }
    word = sen[sentence_position];
    if (word == -1) continue;
    for (c = 0; c < layer1_size; c++) neu1[c] = 0;
    for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
    next_random = next_random * (unsigned long long)25214903917 + 11;
    b = next_random % window;
    if (cbow) { // train the cbow architecture
      // in -> hidden
      cw = 0;
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
        cw++;
      }
      if (cw) {
        for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
        // HIERARCHICAL SOFTMAX
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * layer1_size;
          // Propagate hidden -> output
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size];
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
        }
        // hidden -> in
        for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
          c = sentence_position - window + a;
          if (c < 0) continue;
          if (c >= sentence_length) continue;
          last_word = sen[c];
          if (last_word == -1) continue;
          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
        }
      }
    } else { // train skip-gram
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        l1 = last_word * layer1_size;
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        // HIERARCHICAL SOFTMAX
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * layer1_size;
          // Propagate hidden -> output
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size];
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
        }
        // Learn weights input -> hidden
        for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
      }
    }
    // Advance to the next word; when the sentence is exhausted, read a new one
    sentence_position++;
    if (sentence_position >= sentence_length) {
      sentence_length = 0;
      continue;
    }
  }
  fclose(fi);
  free(neu1);
  free(neu1e);
  pthread_exit(NULL);
}
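// Both the hs and negative-sampling branches above avoid calling exp() per dot
// product by indexing a precomputed sigmoid table. In the reference word2vec
// implementation the table is filled once at startup, roughly as below (wrapped
// in a helper here for self-containment; word2vec does this inline in main):
static real *InitExpTable(void) {
  real *expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
  for (int i = 0; i < EXP_TABLE_SIZE; i++) {
    expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // e^f for f in [-MAX_EXP, MAX_EXP)
    expTable[i] = expTable[i] / (expTable[i] + 1);                   // sigmoid(f) = e^f / (e^f + 1)
  }
  return expTable;
}
// The lookup (int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2)) then maps
// f in (-MAX_EXP, MAX_EXP) linearly onto table indices 0 .. EXP_TABLE_SIZE - 1.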
// learning: hs (hierarchical softmax) vs. negative sampling
// model: cbow vs. skip-gram
void *TrainModelThread(void *id) {
  // word               used while filling sen; once the sentence is built, the current word of the sentence
  // last_word          the previous (context) word while scanning the window
  // sentence_length    length of the current sentence (in words)
  // sentence_position  index of the current word within the current sentence
  long long a, b, d, word, last_word, sentence_length = 0, sentence_position = 0;
  // word_count         total number of training words consumed so far
  // last_word_count    saved count, so progress can be printed whenever the consumed count passes a threshold
  // sen                array of word indices representing the sentence
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  // l1      in negative sampling, the start offset of a word within the concatenated word vectors;
  //         the next layer1_size entries are its vector (the matrix is flattened into one long array)
  // l2      start offset of a weight vector in cbow or negative sampling; the next layer1_size
  //         entries belong to syn1 or syn1neg (again a flattened matrix)
  // c       loop counter
  // target  the current sample in negative sampling
  // label   the label of the current sample in negative sampling
  long long l1, l2, c, target, label;
  // id is passed in at thread creation and seeds the random number generator
  unsigned long long next_random = (long long)id;
  // f  the sigmoid e^x / (1 + e^x); in hs, the probability that the current code bit is 0
  //    (a left child is coded 0, a right child 1); in negative sampling, the probability that the label is 1
  // g  the error (deviation of f from the true value) times the learning rate
  real f, g; // function and gradient
  clock_t now;
  // hidden-layer units
  real *neu1 = (real *)calloc(layer1_size, sizeof(real));
  // accumulated error terms; effectively the gradient with respect to neu1
  real *neu1e = (real *)calloc(layer1_size, sizeof(real));
  // split the file among the threads
  FILE *fi = fopen(train_file, "rb");
  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
  while (1) {
    if (word_count - last_word_count > 10000) {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if (debug_mode > 1) {
        now = clock();
        printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha,
            word_count_actual / (real)(train_words + 1) * 100,
            word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      alpha = starting_alpha * (1 - word_count_actual / (real)(train_words + 1)); // anneal the learning rate automatically
      if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;       // the learning rate has a lower bound
    }
    if (sentence_length == 0) { // the current sentence is empty
      while (1) {
        word = ReadWordIndex(fi);
        if (feof(fi)) break;      // reached end of file
        if (word == -1) continue; // word not in the vocabulary
        word_count++;             // one more word consumed
        if (word == 0) break;     // a newline, i.e. </s>
        // This is the sub-sampling of Mikolov's paper: it brings a 2x-10x speedup and improves
        // the representation accuracy of rare words. Rare words are discarded with low
        // probability, frequent words with high probability.
        if (sample > 0) {
          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
          next_random = next_random * (unsigned long long)25214903917 + 11;
          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }
        sen[sentence_length] = word;
        sentence_length++;
        if (sentence_length >= MAX_SENTENCE_LENGTH) break;
      }
      sentence_position = 0; // index of the current word within the sentence, starting at 0
    }
    // matches the break in the loop above: exit at end of file
    if (feof(fi)) break;
    // this thread has done its share of the work, so exit
    if (word_count > train_words / num_threads) break;
    // take the current word of the sentence and run backpropagation
    word = sen[sentence_position];
    if (word == -1) continue;
    // reset the hidden-layer activations and the accumulated error terms
    for (c = 0; c < layer1_size; c++) neu1[c] = 0;
    for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
    next_random = next_random * (unsigned long long)25214903917 + 11;
    // b is a random number in [0, window-1] that sets the effective window size for this step
    b = next_random % window;
    if (cbow) { // train the cbow architecture - HS or NS
      // IN -> HIDDEN
      // sum the word vectors inside the window onto the hidden-layer units
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue; // word not in the vocabulary
        for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
      }
      // HIERARCHICAL SOFTMAX
      // For every node on the Huffman path from the root towards the current center word,
      // run a sigmoid binary classification against neu1; the label is 1 - code[d].
      if (hs) for (d = 0; d < vocab[word].codelen; d++) {
        // codelen is one short of the full path, so the final negative entry of point is never touched
        f = 0;
        l2 = vocab[word].point[d] * layer1_size;
        // Propagate hidden -> output: accumulate f
        for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
        // values outside the expTable range are simply dropped -- rather brutal
        if (f <= -MAX_EXP) continue;
        else if (f >= MAX_EXP) continue;
        // table lookup for a fast sigmoid
        else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
        // 'g' is the gradient multiplied by the learning rate
        g = (1 - vocab[word].code[d] - f) * alpha;
        // Propagate errors output -> hidden: accumulate the error term
        for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
        // Learn weights hidden -> output: update the hidden-to-Huffman-inner-node weights
        for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
      }
      // NEGATIVE SAMPLING
      if (negative > 0) for (d = 0; d < negative + 1; d++) {
        if (d == 0) {
          // the classifier for the current word should output 1
          target = word;
          label = 1;
        } else {
          // draw a sample different from target (otherwise continue) with label 0,
          // i.e. at most `negative` negative samples are used
          next_random = next_random * (unsigned long long)25214903917 + 11;
          target = table[(next_random >> 16) % table_size];
          if (target == 0) target = next_random % (vocab_size - 1) + 1;
          if (target == word) continue;
          label = 0;
        }
        l2 = target * layer1_size;
        f = 0;
        for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
        // out-of-range activations are clamped straight to the 0/1 asymptotes,
        // without worrying about precision...
        if (f > MAX_EXP) g = (label - 1) * alpha;
        else if (f < -MAX_EXP) g = (label - 0) * alpha;
        else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
        for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
        for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
      }
      // hidden -> in
      // update the word vectors from the accumulated hidden-layer error
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
      }
    } else { // train skip-gram
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        // predict the non-center words (the words in the neighborhood)
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        l1 = last_word * layer1_size;
        // reset the accumulated error term
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        // HIERARCHICAL SOFTMAX
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * layer1_size;
          // Propagate hidden -> output
          // inner product of the predicted word's vector with the hidden-to-inner-node weights
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
          // cf. the discussion of hs in cbow above:
          // if (f <= -MAX_EXP) continue;
          // else if (f >= MAX_EXP) continue;
          if (f <= -MAX_EXP) f = 0;
          else if (f >= MAX_EXP) f = 1;
          // from here on, the same as cbow above
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          // note: code[d] here really belongs to the next level down -- point and code are offset by one!
          g = (1 - vocab[word].code[d] - f) * alpha;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size];
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
          // same as cbow above
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
        }
        // Learn weights input -> hidden
        for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
      }
    }
    // Advance to the next word; when the sentence is exhausted, read a new one
    sentence_position++;
    if (sentence_position >= sentence_length) {
      sentence_length = 0;
      continue;
    }
  }
  fclose(fi);
  free(neu1);
  free(neu1e);
  pthread_exit(NULL);
}
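// The sub-sampling rule described in the comments above can be checked in
// isolation. A small standalone sketch of the survival probability, with purely
// illustrative numbers (the helper name KeepProbability is not in the original):
#include <math.h>
#include <stdio.h>

// cn = word count, train_words = corpus size, sample = threshold (e.g. 1e-3 .. 1e-5);
// this is exactly the `ran` computed in the training threads, i.e. the probability
// (capped at 1) that an occurrence of the word is kept.
double KeepProbability(long long cn, long long train_words, double sample) {
  double f = sample * train_words;
  return (sqrt(cn / f) + 1) * f / cn;
}

int main(void) {
  // A word seen 1,000,000 times in a 1,000,000,000-word corpus with sample = 1e-4
  // survives with probability ~0.42; rarer words survive with probability near 1.
  printf("%.4f\n", KeepProbability(1000000, 1000000000, 1e-4));
  return 0;
}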
// Before this thread function runs, some groundwork is already done: the vocabulary
// is sorted by word frequency and every word has its Huffman code.
void *TrainModelThread(void *id) {
  long long a, b, d, word, last_word, sentence_length = 0, sentence_position = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l1, l2, c, target, label;
  unsigned long long next_random = (long long)id;
  real f, g;
  clock_t now;
  real *neu1 = (real *)calloc(layer1_size, sizeof(real));
  real *neu1e = (real *)calloc(layer1_size, sizeof(real));
  FILE *fi = fopen(train_file, "rb");
  // Each thread handles one segment of the text; use the thread id to seek to the
  // start of this thread's segment.
  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
  while (1) {
    if (word_count - last_word_count > 10000) {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if ((debug_mode > 1)) {
        now = clock();
        printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha,
            word_count_actual / (real)(train_words + 1) * 100,
            word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      alpha = starting_alpha * (1 - word_count_actual / (real)(train_words + 1));
      if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
    }
    if (sentence_length == 0) {
      while (1) {
        word = ReadWordIndex(fi); // read one word and return its position in the vocabulary
        if (feof(fi)) break;
        if (word == -1) continue;
        word_count++;
        if (word == 0) break;
        // The subsampling randomly discards frequent words while keeping the ranking same
        if (sample > 0) { // down-sample frequent words, keeping the frequency ranking unchanged
          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
          next_random = next_random * (unsigned long long)25214903917 + 11;
          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }
        sen[sentence_length] = word;
        sentence_length++;
        // treat every 1000 words as one sentence?
        if (sentence_length >= MAX_SENTENCE_LENGTH) break;
      }
      sentence_position = 0;
    }
    if (feof(fi)) break;
    if (word_count > train_words / num_threads) break; // this thread has processed its quota of words, so exit
    word = sen[sentence_position];
    if (word == -1) continue;
    for (c = 0; c < layer1_size; c++) neu1[c] = 0;
    for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
    next_random = next_random * (unsigned long long)25214903917 + 11;
    b = next_random % window;
    if (cbow) { // train the cbow architecture
      // in -> hidden
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { // scan the words to the left and right of the target word
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        for (c = 0; c < layer1_size; c++) // layer1_size is the word-vector dimensionality, 100 by default
          neu1[c] += syn0[c + last_word * layer1_size]; // the fabled vector sum?
      }
      if (hs) for (d = 0; d < vocab[word].codelen; d++) { // walk the Huffman tree, one node at a time
        f = 0;
        l2 = vocab[word].point[d] * layer1_size; // point records the Huffman path; locate the current node and compute its offset
        // Propagate hidden -> output
        for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; // inner product
        if (f <= -MAX_EXP) continue; // discard when the inner product is out of range
        else if (f >= MAX_EXP) continue;
        else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; // sigmoid of the inner product
        // 'g' is the gradient multiplied by the learning rate
        g = (1 - vocab[word].code[d] - f) * alpha; // part of the partial derivative
        // layer1_size is the vector dimensionality
        // Propagate errors output -> hidden: pass the current inner node's error back to
        // the hidden layer; syn1[c + l2] is part of the partial derivative.
        for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
        // Learn weights hidden -> output: update this inner node's vector; the trailing
        // neu1[c] is part of the partial derivative.
        for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
      }
      // NEGATIVE SAMPLING
      if (negative > 0) for (d = 0; d < negative + 1; d++) {
        if (d == 0) {
          target = word; // the target word
          label = 1;     // positive sample
        } else {
          next_random = next_random * (unsigned long long)25214903917 + 11;
          target = table[(next_random >> 16) % table_size];
          if (target == 0) target = next_random % (vocab_size - 1) + 1;
          if (target == word) continue;
          label = 0;     // negative sample
        }
        l2 = target * layer1_size;
        f = 0;
        for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2]; // inner product
        if (f > MAX_EXP) g = (label - 1) * alpha;
        else if (f < -MAX_EXP) g = (label - 0) * alpha;
        else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
        for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; // hidden-layer error
        for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];  // update the sample's vector
      }
      // hidden -> in
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { // cbow updates not the center word's vector but the vectors of the surrounding words
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c]; // update the word vectors
      }
    } else { // train skip-gram
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) { // scan the surrounding words
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        l1 = last_word * layer1_size;
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        // HIERARCHICAL SOFTMAX
        if (hs) for (d = 0; d < vocab[word].codelen; d++) { // walk the path towards the leaf
          f = 0;
          l2 = vocab[word].point[d] * layer1_size; // point records the Huffman path
          // Propagate hidden -> output -- the original English comment is a bit misleading here:
          // the "hidden layer" is just the input layer, i.e. the word vector itself.
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2]; // inner product of the two word vectors
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha; // part of the partial derivative
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; // hidden-layer error
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1]; // update the tree node's vector
        }
        // NEGATIVE SAMPLING (much the same as in cbow)
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size];
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
        }
        // Learn weights input -> hidden
        for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c]; // update the vectors of the surrounding words
      }
    }
    // Advance to the next word; when the sentence is exhausted, read a new one
    sentence_position++;
    if (sentence_position >= sentence_length) {
      sentence_length = 0;
      continue;
    }
  }
  fclose(fi);
  free(neu1);
  free(neu1e);
  pthread_exit(NULL);
}
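// The negative samples above are drawn from `table`, which is built once before
// training. In the reference word2vec implementation (InitUnigramTable) each word
// receives a share of the table proportional to count^0.75:
void InitUnigramTable() {
  int a, i;
  double train_words_pow = 0, d1, power = 0.75;
  table = (int *)malloc(table_size * sizeof(int));
  for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
  i = 0;
  d1 = pow(vocab[i].cn, power) / train_words_pow; // cumulative share covered so far
  for (a = 0; a < table_size; a++) {
    table[a] = i;
    if (a / (double)table_size > d1) { // crossed this word's share: move to the next word
      i++;
      d1 += pow(vocab[i].cn, power) / train_words_pow;
    }
    if (i >= vocab_size) i = vocab_size - 1;
  }
}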
// Model-training thread: the training procedure.
void *TrainModelThread(void *id) {
  long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l1, l2, c, target, label, local_iter = iter;
  unsigned long long next_random = (long long)id;
  real f, g;
  clock_t now;
  real *neu1 = (real *)calloc(layer1_size, sizeof(real));  // corresponds to Xw, the summed context vector
  real *neu1e = (real *)calloc(layer1_size, sizeof(real)); // the accumulated error
  FILE *fi = fopen(train_file, "rb"); // each thread handles one segment of the text
  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
  while (1) {
    if (word_count - last_word_count > 10000) {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if ((debug_mode > 1)) {
        now = clock();
        printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha,
            word_count_actual / (real)(iter * train_words + 1) * 100,
            word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
      if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
    }
    if (sentence_length == 0) {
      while (1) {
        word = ReadWordIndex(fi); // read one word and return its index in the vocabulary
        if (feof(fi)) break;
        if (word == -1) continue;
        word_count++;
        if (word == 0) break;
        // The subsampling randomly discards frequent words while keeping the ranking same.
        // A word is dropped with probability p = 1 - [sqrt(t/f(w)) + t/f(w)], which keeps
        // the frequency ranking intact: first compute ran = sqrt(t/f(w)) + t/f(w), then
        // draw r uniformly on (0,1) and discard the word if r > ran.
        if (sample > 0) {
          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
          next_random = next_random * (unsigned long long)25214903917 + 11;
          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }
        sen[sentence_length] = word;
        sentence_length++;
        // treat 1000 words as one sentence
        if (sentence_length >= MAX_SENTENCE_LENGTH) break;
      }
      sentence_position = 0;
    }
    // this thread has processed more words than its quota
    if (feof(fi) || (word_count > train_words / num_threads)) {
      word_count_actual += word_count - last_word_count;
      local_iter--;
      if (local_iter == 0) break;
      word_count = 0;
      last_word_count = 0;
      sentence_length = 0;
      fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
      continue;
    }
    word = sen[sentence_position];
    if (word == -1) continue;
    for (c = 0; c < layer1_size; c++) neu1[c] = 0;
    for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
    next_random = next_random * (unsigned long long)25214903917 + 11;
    // randomly shrink the window: b lies in [0, window-1]
    b = next_random % window;
    if (cbow) { // train the cbow architecture
      // in -> hidden
      cw = 0;
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        // sum the context word vectors to obtain Xw
        for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
        cw++;
      }
      if (cw) {
        // normalize? (average over the cw context vectors)
        for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
        // hs: use the Huffman tree
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * layer1_size; // an inner node on the path
          // Propagate hidden -> output: compute the error gradient;
          // neu1 corresponds to Xw, syn1 to the inner node's vector theta
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; // inner product
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; // sigmoid
          // 'g' is the gradient multiplied by the learning rate:
          // the gradient for inner node theta is (1 - d - sigmoid(Xw . theta)) Xw, and g is the leading factor
          g = (1 - vocab[word].code[d] - f) * alpha;
          // Propagate errors output -> hidden: backpropagate from the Huffman tree
          // to the hidden layer, accumulating the gradient update
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
          // Learn weights hidden -> output: update the inner node's vector
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word; // the target word
            label = 1;     // positive sample
          } else {         // draw a negative sample
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size];
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2]; // inner product
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha; // sigmoid
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; // accumulate the error gradient
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];  // update the parameter vector
        }
        // hidden -> in: update the vectors of the context words
        for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
          c = sentence_position - window + a;
          if (c < 0) continue;
          if (c >= sentence_length) continue;
          last_word = sen[c];
          if (last_word == -1) continue;
          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
        }
      }
    } else { // train skip-gram
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        l1 = last_word * layer1_size;
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        // HIERARCHICAL SOFTMAX
        if (hs) for (d = 0; d < vocab[word].codelen; d++) { // walk the path to the leaf
          f = 0;
          l2 = vocab[word].point[d] * layer1_size; // point holds the nodes on the path
          // Propagate hidden -> output
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2]; // inner product
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]; // sigmoid
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha; // part of the gradient
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; // hidden-layer error
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1]; // update the inner-node vector
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size];
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
        }
        // Learn weights input -> hidden: update the center word's vector
        for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
      }
    }
    // Advance to the next word; when the sentence is exhausted, read a new one
    sentence_position++;
    if (sentence_position >= sentence_length) {
      sentence_length = 0;
      continue;
    }
  }
  fclose(fi);
  free(neu1);
  free(neu1e);
  pthread_exit(NULL);
}
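// All the threads above share the same inline pseudo-random generator:
// next_random = next_random * 25214903917 + 11 is the 48-bit linear congruential
// generator of java.util.Random. A sketch isolating it and the dynamic-window
// draw (the function names are illustrative):
static unsigned long long NextRandom(unsigned long long *state) {
  *state = *state * (unsigned long long)25214903917 + 11; // java.util.Random LCG constants
  return *state;
}

// b lies in [0, window-1]; the effective window is window - b words on each side,
// so context words closer to the center are used more often on average.
static long long DrawWindowOffset(unsigned long long *state, long long window) {
  return (long long)(NextRandom(state) % window);
}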
void *TrainCBOWModelThread(void *arg) {
  /* Get parameters */
  threadParameters *params = arg;
  vocabulary *voc = params->voc;                                // shared
  int id = params->threadNumber;
  int MAX_EXP = params->max_exp;
  int layer1_size = params->layer1_size;
  int num_threads = params->num_threads;
  int file_size = params->file_size;
  int window = params->window;
  int hs = params->hs;
  int negative = params->negative;
  int EXP_TABLE_SIZE = params->exp_table_size;
  int table_size = params->table_size;
  long long int *word_count_actual = params->word_count_actual; // shared
  int *table = params->table;
  char *train_file = params->train_file;
  real starting_alpha = params->starting_alpha;
  real sample = params->sample;
  real *alpha = params->alpha;                                  // shared
  real *syn0 = params->syn0;                                    // shared
  real *syn1 = params->syn1;                                    // shared
  real *syn1neg = params->syn1neg;                              // shared
  real *expTable = params->expTable;                            // shared
  free(arg); // arg is not needed anymore
  long long a, b, d, word, last_word, sentence_length = 0, sentence_position = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l2, c, target, label;
  unsigned long long next_random = (long long)id;
  real f, g;
  clock_t now;
  int start = 0;
  real *neu1 = (real *)calloc(layer1_size, sizeof(real)); // one vector
  real *neu1e = (real *)calloc(layer1_size, sizeof(real));
  FILE *fi = fopen(train_file, "rb");
  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
  while (1) {
    if (word_count - last_word_count > 10000) {
      (*word_count_actual) += word_count - last_word_count;
      last_word_count = word_count;
      if ((DEBUG_MODE > 1)) {
        now = clock();
        printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, (*alpha),
            (*word_count_actual) / (real)(voc->train_words + 1) * 100,
            (*word_count_actual) / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      (*alpha) = starting_alpha * (1 - (*word_count_actual) / (real)(voc->train_words + 1));
      if ((*alpha) < starting_alpha * 0.0001) (*alpha) = starting_alpha * 0.0001;
    }
    if (sentence_length == 0) {
      while (1) {
        if (feof(fi)) break;
        word = ReadWordIndex(voc, fi);
        if (word == -1) continue;
        word_count++;
        if (word == 0) break;
        // The subsampling randomly discards frequent words while keeping the ranking same
        if (sample > 0) {
          real ran = (sqrt(voc->vocab[word].cn / (sample * voc->train_words)) + 1) * (sample * voc->train_words) / voc->vocab[word].cn;
          next_random = next_random * (unsigned long long)25214903917 + 11;
          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }
        sen[sentence_length] = word;
        sentence_length++;
        if (sentence_length >= MAX_SENTENCE_LENGTH) break;
      }
      sentence_position = 0;
    }
    if (feof(fi)) break;                                    // end of file
    if (word_count > voc->train_words / num_threads) break; // trained this thread's share of words
    word = sen[sentence_position]; // index
    if (word == -1) continue;
    for (c = 0; c < layer1_size; c++) neu1[c] = 0;
    for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
    next_random = next_random * (unsigned long long)25214903917 + 11;
    b = next_random % window;
    /* --- Training --- */
    // in -> hidden
    for (a = b; a < window * 2 + 1 - b; a++) // a = [0, window] .. [(window*2+1) - rand] -> dynamic window
      if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c]; // index of word
        if (last_word == -1) continue;
        for (c = 0; c < layer1_size; c++) // c is each vector index
          neu1[c] += syn0[c + last_word * layer1_size]; // sum all vectors in the input window (cf. the cbow figure) onto the hidden layer
      }
    if (hs) for (d = 0; d < voc->vocab[word].codelen; d++) {
      f = 0;
      l2 = voc->vocab[word].point[d] * layer1_size; // offset of word
      // Propagate hidden -> output
      for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2]; // summed window vectors times the word's weights in syn1 -> output
      if (f <= -MAX_EXP) continue; // sigmoid activation function, precalculated in expTable
      else if (f >= MAX_EXP) continue;
      else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
      // 'g' is the gradient multiplied by the learning rate
      g = (1 - voc->vocab[word].code[d] - f) * (*alpha);
      // Propagate errors output -> hidden
      for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2]; // saved, to modify the input vectors later
      // Learn weights hidden -> output
      for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c]; // modify the weights
    }
    // NEGATIVE SAMPLING
    if (negative > 0) for (d = 0; d < negative + 1; d++) {
      if (d == 0) {
        target = word;
        label = 1; // (w,c) in corpus
      } else {
        next_random = next_random * (unsigned long long)25214903917 + 11;
        target = table[(next_random >> 16) % table_size];
        if (target == 0) target = next_random % (voc->vocab_size - 1) + 1;
        if (target == word) continue;
        label = 0; // (w,c) not in corpus
      }
      l2 = target * layer1_size; // word-vector offset
      f = 0;
      for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2]; // vector * weights
      if (f > MAX_EXP) g = (label - 1) * (*alpha); // sigmoid
      else if (f < -MAX_EXP) g = (label - 0) * (*alpha);
      else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * (*alpha);
      for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2]; // save the error
      for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
    }
    // hidden -> in
    for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
      c = sentence_position - window + a;
      if (c < 0) continue;
      if (c >= sentence_length) continue;
      last_word = sen[c];
      if (last_word == -1) continue;
      for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c]; // modify the word vectors with the error
    }
    sentence_position++;
    if (sentence_position >= sentence_length) {
      sentence_length = 0;
      continue;
    }
  }
  fclose(fi);
  free(neu1);
  free(neu1e);
  pthread_exit(NULL);
}
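// This refactor passes all former globals through a heap-allocated parameter
// struct that each worker unpacks and then frees. A minimal launch sketch under
// that contract; the caller-side variables (voc, pt, word_count_actual, alpha,
// and so on) are assumed to exist in the surrounding code:
for (long t = 0; t < num_threads; t++) {
  threadParameters *p = malloc(sizeof(threadParameters));
  p->voc = voc;
  p->threadNumber = (int)t;
  p->max_exp = MAX_EXP;
  p->layer1_size = layer1_size;
  p->num_threads = num_threads;
  p->file_size = file_size;
  p->window = window;
  p->hs = hs;
  p->negative = negative;
  p->exp_table_size = EXP_TABLE_SIZE;
  p->table_size = table_size;
  p->word_count_actual = &word_count_actual; // shared counter
  p->table = table;
  p->train_file = train_file;
  p->starting_alpha = starting_alpha;
  p->sample = sample;
  p->alpha = &alpha;                         // shared learning rate
  p->syn0 = syn0;
  p->syn1 = syn1;
  p->syn1neg = syn1neg;
  p->expTable = expTable;
  pthread_create(&pt[t], NULL, TrainCBOWModelThread, p); // the worker frees p
}
for (long t = 0; t < num_threads; t++) pthread_join(pt[t], NULL);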
void *TrainModelThread(void *id) {
  long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0, sen_count = 0, layer1_size_all = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l1, l2, c, target, label, local_iter = iter;
  unsigned long long next_random = (long long)id;
  real f, g;
  clock_t now;
  real *neu1, *neu1e;
  if (global_image) {
    neu1 = (real *)calloc(layer1_size + layer1_image_size, sizeof(real));
    neu1e = (real *)calloc(layer1_size + layer1_image_size, sizeof(real));
  } else {
    neu1 = (real *)calloc(layer1_size, sizeof(real));
    neu1e = (real *)calloc(layer1_size, sizeof(real));
  }
  FILE *fi = fopen(train_file, "rb");
  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
  while (1) {
    if (word_count - last_word_count > 10000) {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if ((debug_mode > 1)) {
        now = clock();
        printf("%cAlpha: %f Progress: %.3f%% Words/thread/sec: %.2fk ", 13, alpha,
            word_count_actual / (real)(iter * train_words + 1) * 100,
            word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
      if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
    }
    // Take one sentence/paragraph into sen, then iterate over each word in the sentence
    if (sentence_length == 0) {
      while (1) {
        word = ReadWordIndex(fi);
        if (feof(fi)) break;
        if (word == -1) continue;
        word_count++;
        if (word == 0) break;
        // The subsampling randomly discards frequent words while keeping the ranking same
        if (sample > 0) {
          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
          next_random = next_random * (unsigned long long)25214903917 + 11;
          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }
        // put the sentence's words into sen
        sen[sentence_length] = word;
        sentence_length++;
        if (sentence_length >= MAX_SENTENCE_LENGTH) break;
      }
      if (!feof(fi)) sen_count++;
      //printf("Id, sentence num/length is %lld, %lld,%lld, %lld\n", (long long)id, sen_count, sentence_length, feof(fi));
      sentence_position = 0;
    }
    if (feof(fi) || (word_count > train_words / num_threads)) {
      word_count_actual += word_count - last_word_count;
      local_iter--;
      printf("id/ word count/ word_count_actual / iter/: %lld,%lld, %lld, %lld\n", (long long)id, word_count, word_count_actual, local_iter);
      // End training
      if (local_iter == 0) break;
      word_count = 0;
      last_word_count = 0;
      sentence_length = 0;
      // Only a single thread should reset sen_count to 0
      sen_count = 0;
      fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
      continue;
    }
    //printf("Sentence position is %lld\n", sentence_position);
    word = sen[sentence_position];
    if (word == -1) continue;
    if (global_image) {
      for (c = 0; c < layer1_size + layer1_image_size; c++) neu1[c] = 0;
      for (c = 0; c < layer1_size + layer1_image_size; c++) neu1e[c] = 0;
    } else {
      for (c = 0; c < layer1_size; c++) neu1[c] = 0;
      for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
    }
    next_random = next_random * (unsigned long long)25214903917 + 11;
    b = next_random % window;
    if (cbow) { // train the cbow architecture
      // in -> hidden
      cw = 0;
      for (a = b; a < window * 1 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        // word vectors start at index 1; index 0 is the sentence/paragraph
        if (sentence_vectors && (c == 0)) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        // Sum over word vectors in the local window
        for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
        cw++;
      }
      // To train sentence vectors: the sentence-vector id is in sen[0].
      if (sentence_vectors) {
        last_word = sen[0];
        if (last_word == -1) continue;
        for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
        cw++;
      }
      if (cw) {
        // average of the word vectors in a sentence
        for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
        if (global_image) {
          for (c = layer1_size; c < layer1_size + layer1_image_size; c++)
            neu1[c] += syn0_im[c + (sen_count - 1) * layer1_image_size];
        }
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          if (global_image) layer1_size_all = layer1_size + layer1_image_size;
          else layer1_size_all = layer1_size;
          // for each node along the path
          f = 0;
          // l2 is the location of each point on the Huffman-tree path towards word
          l2 = vocab[word].point[d] * layer1_size_all;
          // Propagate hidden -> output
          // neu1: averaged word vector; syn1: model; syn0: original word vectors
          //for (c = 0; c < layer1_size_all; c++) f += neu1[c] * syn1[c + l2];
          // apply different weights to the image and word parts
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
          if (global_image) {
            for (c = layer1_size; c < layer1_size_all; c++) f += factor_image * neu1[c] * syn1[c + l2];
          }
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          // f = sigmoid(neu1 * syn1)
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha;
          //printf("d is %f\n", 1 - vocab[word].code[d]);
          //printf("f and gt, and g is %f, %f\n", f, g);
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size_all; c++) neu1e[c] += g * syn1[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size_all; c++) syn1[c + l2] += g * neu1[c];
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (global_image) layer1_size_all = layer1_size + layer1_image_size;
          else layer1_size_all = layer1_size;
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size];
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size_all;
          f = 0;
          //for (c = 0; c < layer1_size_all; c++) f += neu1[c] * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
          if (global_image) {
            for (c = layer1_size; c < layer1_size_all; c++) f += factor_image * neu1[c] * syn1neg[c + l2];
          }
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < layer1_size_all; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size_all; c++) syn1neg[c + l2] += g * neu1[c];
        }
        // hidden -> in
        for (a = b; a < window * 1 + 1 - b; a++) if (a != window) {
          c = sentence_position - window + a;
          if (c < 0) continue;
          if (c >= sentence_length) continue;
          // back-propagation: skip the sentence vector
          if (sentence_vectors && (c == 0)) continue;
          last_word = sen[c];
          if (last_word == -1) continue;
          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
        }
        // For every word in the sentence, update the sentence vector
        if (sentence_vectors) {
          last_word = sen[0];
          if (last_word == -1) continue;
          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
        }
        if (global_image) {
          for (c = layer1_size; c < layer1_size + layer1_image_size; c++)
            syn0_im[c + (sen_count - 1) * layer1_image_size] += neu1e[c];
        }
      }
    } else { // train skip-gram
      for (a = b; a < window * 2 + 1 + sentence_vectors - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (sentence_vectors)
          if (a >= window * 2 + sentence_vectors - b) c = 0;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        l1 = last_word * layer1_size;
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        // HIERARCHICAL SOFTMAX
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          if (global_image) layer1_size_all = layer1_size + layer1_image_size;
          else layer1_size_all = layer1_size;
          f = 0;
          l2 = vocab[word].point[d] * layer1_size_all;
          // Propagate hidden -> output
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
          if (global_image) {
            for (c = layer1_size; c < layer1_size_all; c++) f += factor_image * neu1[c] * syn1[c + l2];
          }
          //for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            target = table[(next_random >> 16) % table_size];
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
        }
        // Learn weights input -> hidden
        for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
      }
    }
    // Advance to the next word; when the sentence is exhausted, read a new one
    sentence_position++;
    if (sentence_position >= sentence_length) {
      sentence_length = 0;
      continue;
    }
  }
  fclose(fi);
  free(neu1);
  free(neu1e);
  pthread_exit(NULL);
}
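// In this variant sen[0] holds a per-sentence id whose syn0 row acts as a
// paragraph vector, doc2vec-style, and syn0_im holds a per-sentence image vector
// appended to the hidden layer. A sketch of retrieving a learned sentence vector
// after training, assuming the ids were prepended to each input line as ordinary
// vocabulary tokens (the token format shown is an assumption of this sketch):
const real *SentenceVector(char *sent_token) {
  long long idx = SearchVocab(sent_token); // e.g. a token such as "_*42"
  if (idx == -1) return NULL;              // id never entered the vocabulary
  return syn0 + idx * layer1_size;         // rows of syn0 are the learned vectors
}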