// Train one iteration over a data block:
//   Step 1: copy this trainer's shard of the parameters from the multiverso
//           parameter server into WordEmbedding_;
//   Step 2: train this trainer's share of the block's sentences;
//   Step 3: push the local parameter deltas back to multiverso.
// data_block: the block to train; actually a DataBlock* passed through the
//             multiverso::DataBlockBase interface.
void Trainer::TrainIteration(multiverso::DataBlockBase *data_block)
{
    // Lazily resolve this process's rank on first call.
    if (process_id_ == -1)
        process_id_ = multiverso::Multiverso::ProcessRank();

    if (trainer_id_ == 0)
    {
        // Record the starting time of the TrainIteration (seconds).
        fprintf(log_file_, "%lf\n", (clock()) / (double)CLOCKS_PER_SEC);
        fflush(log_file_);
    }

    multiverso::Log::Info("Rank %d [Trainer]------Train %d Begin TrainIteration%d ...\n",
        process_id_, trainer_id_, train_count_);
    ++train_count_;

    // Compute the total number of processes (lazily cached).
    if (process_count_ == -1)
        process_count_ = multiverso::Multiverso::TotalProcessCount();

    DataBlock *data = reinterpret_cast<DataBlock*>(data_block);
    std::vector<int> input_nodes(data->input_nodes.begin(), data->input_nodes.end());
    std::vector<int> output_nodes(data->output_nodes.begin(), data->output_nodes.end());

    // A trainer only copies or adds a part of the parameters: trainer t takes
    // every thread_cnt-th node, so the trainers partition the node lists evenly.
    // This trainer copies/adds according to local_input_nodes and local_output_nodes.
    std::vector<int> local_input_nodes;
    std::vector<int> local_output_nodes;
    for (int i = trainer_id_; i < (int)input_nodes.size(); i += option_->thread_cnt)
        local_input_nodes.push_back(input_nodes[i]);
    for (int i = trainer_id_; i < (int)output_nodes.size(); i += option_->thread_cnt)
        local_output_nodes.push_back(output_nodes[i]);

    if (trainer_id_ == 0)
    {
        // BUGFIX: .size() returns size_t, which does not match the %d varargs
        // specifier on 64-bit targets; cast to int explicitly.
        multiverso::Log::Info("Rank %d [Trainer]------ input_size=%d, output_size=%d, negativesample_size=%d \n",
            process_id_, (int)input_nodes.size(), (int)output_nodes.size(),
            (int)data->negativesample_pools.size());
    }

    // Step 1: copy the parameters from multiverso to WordEmbedding_.
    // One trainer only copies a part of the parameters.
    multiverso::Log::Info("Rank %d [Trainer]------Train %d Copyparameter Begin TrainIteration%d ...\n",
        process_id_, trainer_id_, train_count_);
    CopyParameter(local_input_nodes, local_output_nodes);
    if (trainer_id_ == 0)
    {
        // BUGFIX: "&copy_row" had been mangled into the HTML-entity character
        // for (c) followed by "_row"; restore the reference binding.
        multiverso::Row<int64> &copy_row = GetRow<int64>(kWordCountActualTableId, 0);
        WordEmbedding_->word_count_actual = copy_row.At(0);
        WordEmbedding_->UpdateLearningRate();
    }
    multiverso::Log::Info("Rank %d [Trainer]------Train %d Copyparameter end TrainIteration%d ...\n",
        process_id_, trainer_id_, train_count_);
    // Wait for all the trainers to finish copying the parameters.
    barrier_->Wait();

    // Step 2: after finishing copying parameters,
    // use WordEmbedding_ to train a part of the data_block.
    int64 last_word_count = word_count;
    clock_t start = clock();
    multiverso::Log::Info("Rank %d [Trainer]------Train %d TrainNN Begin TrainIteration%d ...\n",
        process_id_, trainer_id_, train_count_);
    if (trainer_id_ == 0)
        multiverso::Log::Info("Rank %d [Trainer]------Train %d Datablock's sentence number:%d ...\n",
            process_id_, trainer_id_, data->Size());
    WordEmbedding_->Train(data, trainer_id_, option_->thread_cnt,
        word_count, hidden_act_, hidden_err_);
    if (word_count > last_word_count)
    {
        multiverso::Log::Info("[Trainer]------TrainNNSpeed: Words/thread/second %lfk\n",
            ((double)word_count - last_word_count) /
            (clock() - start) * (double)CLOCKS_PER_SEC / 1000);
    }
    multiverso::Log::Info("Rank %d [Trainer]------Train %d TrainNN end TrainIteration%d ...\n",
        process_id_, trainer_id_, train_count_);
    // Wait for all the trainers to finish training.
    barrier_->Wait();

    multiverso::Log::Info("Rank %d [Trainer]------Train %d AddDeltaParameter Begin TrainIteration%d ...\n",
        process_id_, trainer_id_, train_count_);
    // Step 3: after finishing training, add the delta of parameters to multiverso.
    AddDeltaParameter(local_input_nodes, local_output_nodes);
    if (trainer_id_ == 0)
    {
        // BUGFIX: same "&copy_row" entity-mangling as above.
        multiverso::Row<int64> &copy_row = GetRow<int64>(kWordCountActualTableId, 0);
        Add<int64>(kWordCountActualTableId, 0, 0,
            WordEmbedding_->word_count_actual - copy_row.At(0));
    }
    multiverso::Log::Info("Rank %d [Trainer]------Train %d AddDeltaParameter end TrainIteration%d ...\n",
        process_id_, trainer_id_, train_count_);

    // If the data_block is the last one, dump the input-embedding weights.
    if (data->Type() == DataBlockType::Test && trainer_id_ == 0)
    {
        SaveEmbedding(option_->output_file, option_->output_binary);
    }
    if (data->Type() == DataBlockType::Test && process_id_ == 0 && trainer_id_ == 0)
    {
        // Dump a binary snapshot and run an external checker script on it.
        SaveEmbedding("tmp.bin", 1);
        char s[128] = { 0 };
        // BUGFIX: clock_t / CLOCKS_PER_SEC is not necessarily int; %d expects int.
        sprintf_s(s, "check.py tmp.bin %d >> records.txt", (int)(clock() / CLOCKS_PER_SEC));
        system(s);
    }
    if (trainer_id_ == 0)
    {
        // Record the finishing time of the TrainIteration.
        fprintf(log_file_, "%lf\n", (clock()) / (double)CLOCKS_PER_SEC);
        fflush(log_file_);
    }
}
//Train one datablock void Trainer<T>::TrainIteration(multiverso::DataBlockBase *data_block) { if (m_train_count == 0) { m_start_time = clock(); m_process_id = multiverso::Multiverso::ProcessRank(); } printf("Rank %d Begin TrainIteration...%d\n", m_process_id, m_train_count); clock_t train_interation_start = clock(); fflush(stdout); m_process_count = multiverso::Multiverso::TotalProcessCount(); DataBlock *data = reinterpret_cast<DataBlock*>(data_block); SkipGramMixtureNeuralNetwork<T>* word2vector_neural_network = reinterpret_cast<SkipGramMixtureNeuralNetwork<T>*>(m_sgmixture_neural_networks[m_train_count % 2]); ++m_train_count; std::vector<int>& input_layer_nodes = word2vector_neural_network->GetInputLayerNodes(); std::vector<int>& output_layer_nodes = word2vector_neural_network->GetOutputLayerNodes(); std::vector<int> local_input_layer_nodes, local_output_layer_nodes; assert(word2vector_neural_network->status == 2); if (m_trainer_id == 0) { multiverso::Log::Info("Rank %d input_layer_size=%d, output_layer_size=%d\n", m_process_id, input_layer_nodes.size(), output_layer_nodes.size()); } for (int i = m_trainer_id; i < input_layer_nodes.size(); i += m_option->thread_cnt) { local_input_layer_nodes.push_back(input_layer_nodes[i]); } for (int i = m_trainer_id; i < output_layer_nodes.size(); i += m_option->thread_cnt) { local_output_layer_nodes.push_back(output_layer_nodes[i]); } CopyParameterFromMultiverso(local_input_layer_nodes, local_output_layer_nodes, word2vector_neural_network); multiverso::Row<int64_t>& word_count_actual_row = GetRow<int64_t>(kWordCountActualTableId, 0); T learning_rate = m_option->init_learning_rate * (1 - word_count_actual_row.At(0) / (T)(m_option->total_words * m_option->epoch + 1)); if (learning_rate < m_option->init_learning_rate * (real)0.0001) learning_rate = m_option->init_learning_rate * (real)0.0001; word2vector_neural_network->learning_rate = learning_rate; //Linearly increase the momentum from init_sense_prior_momentum to 1 
word2vector_neural_network->sense_prior_momentum = m_option->init_sense_prior_momentum + (1 - m_option->init_sense_prior_momentum) * word_count_actual_row.At(0) / (T)(m_option->total_words * m_option->epoch + 1); m_barrier->Wait(); for (int i = m_trainer_id; i < data->Size(); i += m_option->thread_cnt) //i iterates over all sentences { int sentence_length; int64_t word_count_deta; int *sentence; uint64_t next_random; data->Get(i, sentence, sentence_length, word_count_deta, next_random); word2vector_neural_network->Train(sentence, sentence_length, gamma, fTable, input_backup); m_word_count += word_count_deta; if (m_word_count - m_last_word_count > 10000) { multiverso::Row<int64_t>& word_count_actual_row = GetRow<int64_t>(kWordCountActualTableId, 0); Add<int64_t>(kWordCountActualTableId, 0, 0, m_word_count - m_last_word_count); m_last_word_count = m_word_count; m_now_time = clock(); if (m_trainer_id % 3 == 0) { multiverso::Log::Info("Rank %d Trainer %d lr: %.5f Mom: %.4f Progress: %.2f%% Words/thread/sec(total): %.2fk W/t/sec(executive): %.2fk\n", m_process_id, m_trainer_id, word2vector_neural_network->learning_rate, word2vector_neural_network->sense_prior_momentum, word_count_actual_row.At(0) / (real)(m_option->total_words * m_option->epoch + 1) * 100, m_last_word_count / ((real)(m_now_time - m_start_time + 1) / (real)CLOCKS_PER_SEC * 1000), m_last_word_count / ((real)(m_executive_time + clock() - train_interation_start + 1) / (real)CLOCKS_PER_SEC * 1000)); fflush(stdout); } T learning_rate = m_option->init_learning_rate * (1 - word_count_actual_row.At(0) / (T)(m_option->total_words * m_option->epoch + 1)); if (learning_rate < m_option->init_learning_rate * (real)0.0001) learning_rate = m_option->init_learning_rate * (real)0.0001; word2vector_neural_network->learning_rate = learning_rate; word2vector_neural_network->sense_prior_momentum = m_option->init_sense_prior_momentum + (1 - m_option->init_sense_prior_momentum) * word_count_actual_row.At(0) / 
(T)(m_option->total_words * m_option->epoch + 1); } } m_barrier->Wait(); AddParameterToMultiverso(local_input_layer_nodes, local_output_layer_nodes, word2vector_neural_network); m_executive_time += clock() - train_interation_start; multiverso::Log::Info("Rank %d Train %d end at %lfs, cost %lfs, total cost %lfs\n", m_process_id, m_trainer_id, clock() / (double)CLOCKS_PER_SEC, (clock() - train_interation_start) / (double)CLOCKS_PER_SEC, m_executive_time / (double)CLOCKS_PER_SEC); fflush(stdout); if (data->GetTables().size() > 0 && m_trainer_id == 0) //Dump model files { SaveMultiInputEmbedding(data->GetEpochId()); SaveOutputEmbedding(data->GetEpochId()); if (data->GetEpochId() == 0) SaveHuffEncoder(); fprintf(m_log_file, "%d %lf\t %lf\n", data->GetEpochId(), (clock() - m_start_time) / (double)CLOCKS_PER_SEC, m_executive_time / (double)CLOCKS_PER_SEC); } assert(word2vector_neural_network->status == 2); word2vector_neural_network->status = 0; multiverso::Log::Info("Rank %d Train %d are leaving training iter with nn status:%d\n", m_process_id, m_trainer_id, word2vector_neural_network->status); fflush(stdout); }