void FNeuralNetLMBase::BatchSGDTrain(FNNLMDataReader &train_data, FNNLMDataReader &validation_data,
                                     const string &outbase, bool nce_ppl) {
  const size_t eos_widx = word_vocab_.eos_idx();
  const size_t unk_widx = word_vocab_.unk_idx();
  const vector<size_t> eos_fidx = { factor_vocab_.eos_idx() };

  vector<pair<size_t, vector<size_t>>> sentence;

  double last_logp = -numeric_limits<double>::max();
  double curr_logp = -numeric_limits<double>::max();
  bool halve_alpha = false;
  // Set the current learning rate.
  float curr_learning_rate = algopts_.init_learning_rate_;

  size_t sents_processed = 0;
  int iteration = 0;
  clock_t start_time = clock();
  clock_t end_time = start_time;
  while (true) {
    cout << "******************************* ITERATION " << iteration++
         << " *******************************" << endl;

    train_data.StartEpoch();
    ResetActivations();
    cout << "learning_rate = " << curr_learning_rate << endl;

    int bpos = 0;
    double logp = 0.0;
    nce_obj_ = 0;
    size_t ivcount = 0;
    size_t oovcount = 0;
    // NOTE: the vector "sentence" does not include </s> at the end!
    while (train_data.GetSentence(sentence)) {
      assert(!sentence.empty());
      if (independent_) {
        ResetActivations();
      }
      ForwardPropagate(eos_widx, eos_fidx);
      for (vector<pair<size_t, vector<size_t>>>::const_iterator it = sentence.begin();
           it != sentence.end(); ++it) {
        // Train on every word, OOVs included, since <unk> is in the vocabulary;
        // OOVs are only excluded from scoring when unk_ is false.
        if (!unk_ && it->first == unk_widx) {
          oovcount++;
        } else {
          logp += GetLogProb(it->first, !nce_);
          ivcount++;
        }
        BackPropagate(it->first, it->second);
        if (++bpos == algopts_.batch_size_) {
          FastUpdateWeightsMajor(curr_learning_rate);
          bpos = 0;
        }
        ForwardPropagate(it->first, it->second);
      }
      logp += GetLogProb(eos_widx, !nce_);
      ivcount++;
      BackPropagate(eos_widx, eos_fidx);

      sents_processed++;
      if ((sents_processed % 500) == 0)
        cout << "." << flush;
    }
    // Apply any updates accumulated since the last full minibatch.
    FastUpdateWeightsMajor(curr_learning_rate);
    bpos = 0;
    FastUpdateWeightsMinor();

    cout << "\nnumber of IV words (including </s>) in training: " << ivcount << endl;
    cout << "number of OOV words in training: " << oovcount << endl;
    if (!nce()) {
      cout << "training entropy (base 2): " << -logp / log(2) / ivcount << endl;
      cout << "model perplexity on training: " << exp(-logp / ivcount) << endl;
      cout << "log-likelihood (base e) on training is: " << logp << endl;
    } else {
      cout << "NCE objective value on training is: " << nce_obj_ << endl;
      cout << "un-normalized training entropy (base 2): " << -logp / log(2) / ivcount << endl;
      cout << "un-normalized model perplexity on training: " << exp(-logp / ivcount) << endl;
      cout << "un-normalized log-likelihood (base e) on training is: " << logp << endl;
    }

    cout << "epoch finished" << endl << flush;

    if (!outbase.empty()) {
      if (debug_ > 0) {
        WriteLM(outbase + ".ITER_" + to_string(iteration - 1));
      }
    }

    cout << "----------VALIDATION----------" << endl;
    // Assign to the outer curr_logp; the original re-declaration shadowed it.
    curr_logp = EvalLM(validation_data, nce_ppl);
    cout << "log-likelihood (base e) on validation is: " << curr_logp << endl;

    clock_t last_end_time = end_time;
    end_time = clock();
    cout << "time elapsed " << static_cast<double>(end_time - last_end_time) / CLOCKS_PER_SEC
         << " secs for this iteration out of "
         << static_cast<double>(end_time - start_time) / CLOCKS_PER_SEC
         << " secs in total." << endl;

    if (curr_logp < last_logp) {
      cout << "validation log-likelihood decreased; restoring parameters" << endl;
      RestoreLastParams();
    } else {
      CacheCurrentParams();
    }

    if (curr_logp * algopts_.min_improvement_ <= last_logp) {
      if (!halve_alpha) {
        halve_alpha = true;
      } else {
        if (!outbase.empty()) {
          WriteLM(outbase);
        }
        break;
      }
    }
    if (halve_alpha) {
      curr_learning_rate /= 2;
    }
    last_logp = curr_logp;
  }
}
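The stopping rule at the bottom of BatchSGDTrain is the familiar rnnlm-style schedule: the learning rate stays fixed until the relative validation improvement falls below min_improvement_, is halved every epoch after that, and training stops the first time the threshold is missed again once halving has begun. A minimal standalone sketch of just that schedule, with hypothetical values for the threshold and the validation log-likelihoods, is:

#include <iostream>
#include <limits>

// Hypothetical driver showing the schedule in isolation; the fake
// log-likelihoods stand in for one epoch of training plus validation.
int main() {
  const double min_improvement = 1.003;  // hypothetical threshold
  double last_logp = -std::numeric_limits<double>::max();
  double learning_rate = 0.1;            // hypothetical initial rate
  bool halve_alpha = false;

  // Fake validation log-likelihoods, for illustration only.
  const double valid_logp[] = { -9000.0, -8500.0, -8300.0, -8280.0, -8275.0 };
  for (double curr_logp : valid_logp) {
    if (curr_logp * min_improvement <= last_logp) {  // improvement too small
      if (halve_alpha) break;                        // second miss: stop training
      halve_alpha = true;                            // first miss: start halving
    }
    if (halve_alpha) learning_rate /= 2;
    last_logp = curr_logp;
    std::cout << "rate now " << learning_rate << "\n";
  }
}

Since the log-likelihoods are negative, scaling curr_logp by a factor slightly above 1 makes it look slightly worse, so the test only passes when the new epoch failed to improve on the old one by a real margin; the same comparison appears verbatim in the loop above.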
int main(int argc, char **argv) {
  int i, k = 0; /* counters */

  if (argc == 1) {
    /* printing instructions */
    printf("\n");
    printf("Forward propagation of sentences in a file delimited by \\n\n\n");
    printf("Parameters:\n");
    printf("\tValue for the vocabulary size that resulted from training (first number in the output file of word2vec):\n");
    printf("\t\t-vocab_size <int>\n");
    printf("\tValue for the layer size used in training (second number in the output file of word2vec):\n");
    printf("\t\t-layer_size <int>\n");
    printf("\tValue for the window size:\n");
    printf("\t\t-window <int>\n\n");
    return 0;
  }

  /* reading command line arguments */
  if ((i = ArgPos((char *)"-layer_size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-vocab_size", argc, argv)) > 0) vocab_size = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);

  /* allocating memory to store the network elements */
  syn0 = (real *)calloc(layer1_size * vocab_size, sizeof(real));
  syn1 = (real *)calloc(layer1_size * vocab_size, sizeof(real));
  neu1 = (real *)calloc(layer1_size, sizeof(real));
  index_buff = (char *)calloc(MAX_INDEX_BUFF_SIZE, sizeof(char));

  /* reading the network from file */
  read_syn0();
  read_syn1();

  /* allocating memory for expTable */
  expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
  for (i = 0; i < EXP_TABLE_SIZE; i++) {
    /* precompute the exp() table in the same way as in word2vec */
    expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP);
    /* precompute the sigmoid f(x) = x / (x + 1), where x is the value above */
    expTable[i] = expTable[i] / (expTable[i] + 1);
  }

  /* building the vocabulary and the vocabulary hash from the files they were stored in */
  BuildVocabFromFile();
  BuildVocabHashFromFile();

  int length = 0;          /* word length of the current sentence */
  int syno_length = 0;     /* how many synonyms/replacements */
  long long *sen;          /* sentence, with words represented as vocabulary indices */
  long long *sen_temp;     /* temporary sentence, with words represented as vocabulary indices */
  sen_temp = (long long *)calloc(MAX_SENTENCE_LENGTH, sizeof(long long));
  long long *synonym;      /* replacement words (in vocabulary index form) */
  long double prob = 0;    /* probability variable */
  long long ptr = 0;       /* file offset used to go through the sentences file */
  long long syno_ptr = 0;  /* file offset used to go through the synonyms/replacements file */

  FILE *sentfile = fopen("sentences", "r");
  FILE *indices = fopen("indices", "r");
  FILE *synfile = fopen("synonyms", "r");
  FILE *fo = fopen("wordprobs", "w");

  int lines = 0;
  char line[MAX_SENTENCE_LENGTH];    /* buffer to store the current sentence */
  char synline[MAX_SENTENCE_LENGTH]; /* buffer to store the synonyms */

  /* Count the lines in the sentences file; this bounds the outer loop, which works
     because "sentences", "synonyms" and "indices" all have the same number of
     "\n"-delimited lines. */
  lines = Lines(sentfile);
  rewind(sentfile);
  rewind(synfile);

  /* outer loop iterating through "sentences", "synonyms" and "indices" line by line */
  for (i = 0; i < lines; i++) {
    /* read the sentence */
    ptr = ftell(sentfile); /* store the beginning of the line */
    if (readLine(sentfile, line) < 0) break;
    length = LineWordCount(line);

    /* read the word replacements */
    syno_ptr = ftell(synfile); /* store the beginning of the line */
    if (readLine(synfile, synline) < 0) break;
    syno_length = LineWordCount(synline);
    printf("synline %s\n", synline);

    fseek(sentfile, ptr, SEEK_SET); /* move back to the beginning of the line */
    sen = FileToSen(length, sentfile); /* the words of the sentence as vocabulary indices */
    fseek(synfile, syno_ptr, SEEK_SET);
    synonym = FileToSen(syno_length, synfile); /* the replacements/synonyms as vocabulary indices */
    fseek(sentfile, 1, SEEK_CUR); /* skip past the newline */
    fseek(synfile, 1, SEEK_CUR);

    ReadIndexFromFile(indices); /* reads the index into the char array "index_buff" */
    target_index = GetIndex();  /* converts the contents of "index_buff" to a number */

    /* repeat forward propagation for each synonym in the line */
    for (k = 0; k < syno_length; k++) {
      /* copy the sentence into sen_temp, where the target word will be swapped out;
         assumes FileToSen returns a buffer of at least MAX_SENTENCE_LENGTH entries,
         otherwise copy only length entries here */
      memcpy(sen_temp, sen, MAX_SENTENCE_LENGTH * sizeof(long long));
      sen_temp[target_index] = synonym[k]; /* replace the target word with a synonym/replacement */
      prob = ForwardPropagate(length, sen_temp); /* forward propagation to get the probability */
      /* scaling the probability by 100000 or taking the negative log would be done here */
      /* prob = prob * 100000; */
      fprintf(fo, "%s %Lf\n", vocab[synonym[k]].word, prob); /* write the replacement word and its probability */
    }
  }

  fclose(fo);
  fclose(sentfile);
  fclose(synfile);
  fclose(indices);
  return 0;
}
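The tool above reads "sentences", "synonyms", and "indices" from the working directory and writes one word/probability pair per replacement to "wordprobs". ForwardPropagate itself is not shown in this file, but given how expTable is precomputed in main(), the usual word2vec pattern for turning a layer activation f into a probability is a clamped table lookup rather than a per-word exp() call. A sketch of that lookup, mirroring the global names above (an assumption about ForwardPropagate's internals, not a quote from it):

#include <math.h>

#define EXP_TABLE_SIZE 1000 /* hypothetical: must match the trained model */
#define MAX_EXP 6
typedef float real;

static real expTable[EXP_TABLE_SIZE + 1]; /* filled exactly as in main() above */

void InitExpTable(void) {
  int i;
  for (i = 0; i < EXP_TABLE_SIZE; i++) {
    expTable[i] = (real)exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP);
    expTable[i] = expTable[i] / (expTable[i] + 1);
  }
}

/* Table-based sigmoid: expTable[i] holds sigmoid(x) for x in [-MAX_EXP, MAX_EXP]. */
real TableSigmoid(real f) {
  if (f >= MAX_EXP) return 1.0;  /* saturate instead of indexing out of range */
  if (f <= -MAX_EXP) return 0.0;
  return expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
}

Clamping at +/-MAX_EXP keeps the index in bounds and costs essentially nothing in accuracy, since the sigmoid is already within about 0.0025 of 0 or 1 there.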
double FNeuralNetLMBase::EvalLM(FNNLMDataReader &data, bool nce_ppl) {
  CheckParams();

  if (nce_ppl && !nce_) {
    cerr << "nce_ppl == true but nce_ is false!" << endl;
    exit(EXIT_FAILURE);
  }

  const size_t eos_widx = word_vocab_.eos_idx();
  const size_t unk_widx = word_vocab_.unk_idx();
  const vector<size_t> eos_fidx = { factor_vocab_.eos_idx() };

  vector<pair<size_t, vector<size_t>>> sentence;

  double total_logp = 0.0;
  size_t sents_processed = 0;
  size_t ivcount = 0;
  size_t oovcount = 0;
  data.StartEpoch();
  ResetActivations();
  while (data.GetSentence(sentence)) {
    assert(!sentence.empty());
    if (independent_) {
      ResetActivations();
    }
    double curr_logp = 0.0;
    ForwardPropagate(eos_widx, eos_fidx);
    for (vector<pair<size_t, vector<size_t>>>::const_iterator it = sentence.begin();
         it != sentence.end(); ++it) {
      if (!unk_ && it->first == unk_widx) {
        oovcount++;
      } else {
        curr_logp += GetLogProb(it->first, !nce_ppl);
        ivcount++;
      }
      ForwardPropagate(it->first, it->second);
    }
    curr_logp += GetLogProb(eos_widx, !nce_ppl);
    ivcount++;

    total_logp += curr_logp;
    sents_processed++;
    if ((sents_processed % 200) == 0) {
      cout << "." << flush;
    }
    if (debug_ > 1) {
      if (nce_ppl) {
        cerr << "unnormalized log-likelihood (base e) on " << sents_processed
             << "-th sentence is: " << curr_logp << endl;
      } else {
        cerr << "log-likelihood (base e) on " << sents_processed
             << "-th sentence is: " << curr_logp << endl;
      }
    }
  }

  if (ivcount == 0) {
    cerr << "zero IV words!" << endl;
    exit(EXIT_FAILURE);
  }

  cout << "\nnumber of IV words (including </s>): " << ivcount << endl;
  cout << "number of OOV words: " << oovcount << endl;
  cout << "entropy (base 2): " << -total_logp / log(2) / ivcount << endl;
  if (nce_ppl) {
    cout << "unnormalized model perplexity: " << exp(-total_logp / ivcount) << endl;
  } else {
    cout << "model perplexity: " << exp(-total_logp / ivcount) << endl;
  }

  return total_logp;
}
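EvalLM returns the total base-e log-likelihood; the entropy and perplexity lines it prints are direct transforms of that single number. A small self-contained check of the arithmetic, with hypothetical values for the log-likelihood and word count:

#include <cmath>
#include <cstdio>

int main() {
  // Hypothetical values: total base-e log-likelihood and IV word count.
  double total_logp = -6931.47;
  double ivcount = 1000.0;

  double entropy_bits = -total_logp / std::log(2.0) / ivcount; // base-2 entropy per word
  double perplexity   = std::exp(-total_logp / ivcount);       // = 2^entropy_bits

  std::printf("entropy (base 2): %.4f bits/word\n", entropy_bits); // ~10.0 bits
  std::printf("perplexity: %.2f\n", perplexity);                   // ~1024
}

When nce_ppl is set, GetLogProb is called without softmax normalization, which is why the printed perplexity is labeled unnormalized: it is only comparable against other NCE-trained models scored the same way.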