// Prepares one data block for training and requests the parameters it needs.
//
// Steps, in order:
//   1. On the very first call, record m_start_time; every call appends the
//      elapsed seconds since then to m_log_file (once on entry, once on exit).
//   2. Select the network stored in slot (m_parse_and_request_count % 2),
//      then advance the counter.
//   3. Refresh the block's random state and let the network prepare its
//      parameters from the block.
//   4. Request rows for the input embeddings, the output embeddings, the
//      actual word count, and the sense priors, followed by every table
//      listed by the block.
//
// The network's `status` field is asserted to be 0 on entry, set to 1 while
// the requests are issued, and set to 2 just before returning.
//
// data_block: reinterpreted as a DataBlock*; the caller must pass one.
void ParameterLoader<T>::ParseAndRequest(multiverso::DataBlockBase *data_block)
{
    // The first invocation fixes the reference point for all elapsed-time logs.
    if (m_parse_and_request_count == 0)
    {
        m_start_time = clock();
    }

    fprintf(m_log_file, "%lf\n", (clock() - m_start_time) / static_cast<double>(CLOCKS_PER_SEC));
    multiverso::Log::Info("Rank %d ParameterLoader begin %d\n",
        multiverso::Multiverso::ProcessRank(), m_parse_and_request_count);

    DataBlock *block = reinterpret_cast<DataBlock*>(data_block);

    // Pick the network slot for this call before bumping the call counter.
    SkipGramMixtureNeuralNetwork<T> *network =
        reinterpret_cast<SkipGramMixtureNeuralNetwork<T>*>(
            m_sgmixture_neural_networks[m_parse_and_request_count % 2]);
    ++m_parse_and_request_count;

    block->UpdateNextRandom();
    network->PrepareParmeter(block);

    std::vector<int>& input_nodes = network->GetInputLayerNodes();
    std::vector<int>& output_nodes = network->GetOutputLayerNodes();

    assert(network->status == 0);
    network->status = 1;

    // Input embeddings: one row per (word, sense index) pair, where the number
    // of rows per word comes from word_sense_cnts_info.
    for (size_t in_idx = 0; in_idx < input_nodes.size(); ++in_idx)
    {
        const int word_id = input_nodes[in_idx];
        for (int sense = 0; sense < m_words_sense_info->word_sense_cnts_info[word_id]; ++sense)
        {
            RequestRow(kInputEmbeddingTableId, m_words_sense_info->p_input_embedding[word_id] + sense);
        }
    }

    // Output embeddings: one row per output-layer node.
    for (size_t out_idx = 0; out_idx < output_nodes.size(); ++out_idx)
    {
        RequestRow(kEmbeddingOutputTableId, output_nodes[out_idx]);
    }

    // Single row holding the actual word count.
    RequestRow(kWordCountActualTableId, 0);

    // Sense priors are requested only for words whose sense count exceeds one.
    for (size_t in_idx = 0; in_idx < input_nodes.size(); ++in_idx)
    {
        const int word_id = input_nodes[in_idx];
        if (m_words_sense_info->word_sense_cnts_info[word_id] > 1)
        {
            RequestRow(kWordSensePriorTableId, m_words_sense_info->p_wordidx2sense_idx[word_id]);
        }
    }

    // Whole tables listed by the data block.
    std::vector<int>& requested_tables = block->GetTables();
    for (size_t table_idx = 0; table_idx < requested_tables.size(); ++table_idx)
    {
        RequestTable(requested_tables[table_idx]);
    }

    multiverso::Log::Info("Rank %d ParameterLoader finish %d\n",
        multiverso::Multiverso::ProcessRank(), m_parse_and_request_count - 1);
    fprintf(m_log_file, "%lf\n", (clock() - m_start_time) / static_cast<double>(CLOCKS_PER_SEC));

    assert(network->status == 1);
    network->status = 2;
}
//Train one datablock void Trainer<T>::TrainIteration(multiverso::DataBlockBase *data_block) { if (m_train_count == 0) { m_start_time = clock(); m_process_id = multiverso::Multiverso::ProcessRank(); } printf("Rank %d Begin TrainIteration...%d\n", m_process_id, m_train_count); clock_t train_interation_start = clock(); fflush(stdout); m_process_count = multiverso::Multiverso::TotalProcessCount(); DataBlock *data = reinterpret_cast<DataBlock*>(data_block); SkipGramMixtureNeuralNetwork<T>* word2vector_neural_network = reinterpret_cast<SkipGramMixtureNeuralNetwork<T>*>(m_sgmixture_neural_networks[m_train_count % 2]); ++m_train_count; std::vector<int>& input_layer_nodes = word2vector_neural_network->GetInputLayerNodes(); std::vector<int>& output_layer_nodes = word2vector_neural_network->GetOutputLayerNodes(); std::vector<int> local_input_layer_nodes, local_output_layer_nodes; assert(word2vector_neural_network->status == 2); if (m_trainer_id == 0) { multiverso::Log::Info("Rank %d input_layer_size=%d, output_layer_size=%d\n", m_process_id, input_layer_nodes.size(), output_layer_nodes.size()); } for (int i = m_trainer_id; i < input_layer_nodes.size(); i += m_option->thread_cnt) { local_input_layer_nodes.push_back(input_layer_nodes[i]); } for (int i = m_trainer_id; i < output_layer_nodes.size(); i += m_option->thread_cnt) { local_output_layer_nodes.push_back(output_layer_nodes[i]); } CopyParameterFromMultiverso(local_input_layer_nodes, local_output_layer_nodes, word2vector_neural_network); multiverso::Row<int64_t>& word_count_actual_row = GetRow<int64_t>(kWordCountActualTableId, 0); T learning_rate = m_option->init_learning_rate * (1 - word_count_actual_row.At(0) / (T)(m_option->total_words * m_option->epoch + 1)); if (learning_rate < m_option->init_learning_rate * (real)0.0001) learning_rate = m_option->init_learning_rate * (real)0.0001; word2vector_neural_network->learning_rate = learning_rate; //Linearly increase the momentum from init_sense_prior_momentum to 1 
word2vector_neural_network->sense_prior_momentum = m_option->init_sense_prior_momentum + (1 - m_option->init_sense_prior_momentum) * word_count_actual_row.At(0) / (T)(m_option->total_words * m_option->epoch + 1); m_barrier->Wait(); for (int i = m_trainer_id; i < data->Size(); i += m_option->thread_cnt) //i iterates over all sentences { int sentence_length; int64_t word_count_deta; int *sentence; uint64_t next_random; data->Get(i, sentence, sentence_length, word_count_deta, next_random); word2vector_neural_network->Train(sentence, sentence_length, gamma, fTable, input_backup); m_word_count += word_count_deta; if (m_word_count - m_last_word_count > 10000) { multiverso::Row<int64_t>& word_count_actual_row = GetRow<int64_t>(kWordCountActualTableId, 0); Add<int64_t>(kWordCountActualTableId, 0, 0, m_word_count - m_last_word_count); m_last_word_count = m_word_count; m_now_time = clock(); if (m_trainer_id % 3 == 0) { multiverso::Log::Info("Rank %d Trainer %d lr: %.5f Mom: %.4f Progress: %.2f%% Words/thread/sec(total): %.2fk W/t/sec(executive): %.2fk\n", m_process_id, m_trainer_id, word2vector_neural_network->learning_rate, word2vector_neural_network->sense_prior_momentum, word_count_actual_row.At(0) / (real)(m_option->total_words * m_option->epoch + 1) * 100, m_last_word_count / ((real)(m_now_time - m_start_time + 1) / (real)CLOCKS_PER_SEC * 1000), m_last_word_count / ((real)(m_executive_time + clock() - train_interation_start + 1) / (real)CLOCKS_PER_SEC * 1000)); fflush(stdout); } T learning_rate = m_option->init_learning_rate * (1 - word_count_actual_row.At(0) / (T)(m_option->total_words * m_option->epoch + 1)); if (learning_rate < m_option->init_learning_rate * (real)0.0001) learning_rate = m_option->init_learning_rate * (real)0.0001; word2vector_neural_network->learning_rate = learning_rate; word2vector_neural_network->sense_prior_momentum = m_option->init_sense_prior_momentum + (1 - m_option->init_sense_prior_momentum) * word_count_actual_row.At(0) / 
(T)(m_option->total_words * m_option->epoch + 1); } } m_barrier->Wait(); AddParameterToMultiverso(local_input_layer_nodes, local_output_layer_nodes, word2vector_neural_network); m_executive_time += clock() - train_interation_start; multiverso::Log::Info("Rank %d Train %d end at %lfs, cost %lfs, total cost %lfs\n", m_process_id, m_trainer_id, clock() / (double)CLOCKS_PER_SEC, (clock() - train_interation_start) / (double)CLOCKS_PER_SEC, m_executive_time / (double)CLOCKS_PER_SEC); fflush(stdout); if (data->GetTables().size() > 0 && m_trainer_id == 0) //Dump model files { SaveMultiInputEmbedding(data->GetEpochId()); SaveOutputEmbedding(data->GetEpochId()); if (data->GetEpochId() == 0) SaveHuffEncoder(); fprintf(m_log_file, "%d %lf\t %lf\n", data->GetEpochId(), (clock() - m_start_time) / (double)CLOCKS_PER_SEC, m_executive_time / (double)CLOCKS_PER_SEC); } assert(word2vector_neural_network->status == 2); word2vector_neural_network->status = 0; multiverso::Log::Info("Rank %d Train %d are leaving training iter with nn status:%d\n", m_process_id, m_trainer_id, word2vector_neural_network->status); fflush(stdout); }