Example no. 1
0
        void Trainer::TrainIteration(multiverso::DataBlockBase *data_block)
        {
            if (process_id_ == -1)
                process_id_ = multiverso::Multiverso::ProcessRank();

            //Record the starting time of the TrainIteration
            if (trainer_id_ == 0)
            {
                fprintf(log_file_, "%lf\n", (clock()) / (double)CLOCKS_PER_SEC);
                fflush(log_file_);
            }


            multiverso::Log::Info("Rank %d [Trainer]------Train %d Begin TrainIteration%d ...\n",
                process_id_, trainer_id_, train_count_);
            ++train_count_;
            //Get the total number of processes (initialized lazily on the first call)
            if (process_count_ == -1)
                process_count_ = multiverso::Multiverso::TotalProcessCount();

            DataBlock *data = reinterpret_cast<DataBlock*>(data_block);
            std::vector<int> input_nodes(data->input_nodes.begin(), data->input_nodes.end());
            std::vector<int> output_nodes(data->output_nodes.begin(), data->output_nodes.end());

            //Each trainer only copies or adds a part of the parameters:
            //those selected into local_input_nodes and local_output_nodes (round-robin by trainer id)
            std::vector<int> local_input_nodes;
            std::vector<int> local_output_nodes;

            for (int i = trainer_id_; i < input_nodes.size(); i += option_->thread_cnt)
                local_input_nodes.push_back(input_nodes[i]);
            for (int i = trainer_id_; i < output_nodes.size(); i += option_->thread_cnt)
                local_output_nodes.push_back(output_nodes[i]);
		
            if (trainer_id_ == 0)
            {
                multiverso::Log::Info("Rank %d [Trainer]------ input_size=%d, output_size=%d, negativesample_size=%d \n",
                    process_id_, input_nodes.size(), output_nodes.size(), data->negativesample_pools.size());
            }

            //Step 1: copy the parameters from Multiverso to WordEmbedding_
            //Each trainer only copies a part of the parameters
            multiverso::Log::Info("Rank %d [Trainer]------Train %d Copyparameter Begin TrainIteration%d ...\n",
                process_id_, trainer_id_, train_count_);
			
            CopyParameter(local_input_nodes, local_output_nodes);
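            //Trainer 0 syncs the global actual word count from the server and refreshes the learning rate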
            if (trainer_id_ == 0)
            {
                multiverso::Row<int64> &copy_row = GetRow<int64>(kWordCountActualTableId, 0);
                WordEmbedding_->word_count_actual = copy_row.At(0);
                WordEmbedding_->UpdateLearningRate();
            }
            multiverso::Log::Info("Rank %d [Trainer]------Train %d Copyparameter end TrainIteration%d ...\n",
                process_id_, trainer_id_, train_count_);
            //Wait for all the trainers to finish copying parameters
            barrier_->Wait();
		
            //Step 2: after the parameters are copied,
            //use WordEmbedding_ to train this trainer's part of the data_block
            int64 last_word_count = word_count;
            clock_t start = clock();
            multiverso::Log::Info("Rank %d [Trainer]------Train %d TrainNN Begin TrainIteration%d ...\n",
                process_id_, trainer_id_, train_count_);
            if (trainer_id_ == 0)
                multiverso::Log::Info("Rank %d [Trainer]------Train %d Datablock's sentence number:%d ...\n",
                    process_id_, trainer_id_, data->Size());
            WordEmbedding_->Train(data, trainer_id_, option_->thread_cnt,
                word_count, hidden_act_, hidden_err_);
            if (word_count > last_word_count)
            {
                multiverso::Log::Info("[Trainer]------TrainNNSpeed: Words/thread/second %lfk\n",
                    ((double)word_count - last_word_count) / 
                    (clock() - start) * (double)CLOCKS_PER_SEC / 1000);
            }
            multiverso::Log::Info("Rank %d [Trainer]------Train %d TrainNN end TrainIteration%d ...\n",
                process_id_, trainer_id_, train_count_);
            //Wait for all the trainers to finish training
            barrier_->Wait();
            multiverso::Log::Info("Rank %d [Trainer]------Train %d AddDeltaParameter Begin TrainIteration%d ...\n",
                process_id_, trainer_id_, train_count_);
            //Step 3: after training, add the parameter deltas back to Multiverso
            AddDeltaParameter(local_input_nodes, local_output_nodes);
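            //Trainer 0 pushes the locally accumulated word-count delta back to the server table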
            if (trainer_id_ == 0)
            {
                multiverso::Row<int64> &copy_row = GetRow<int64>(kWordCountActualTableId, 0);
                Add<int64>(kWordCountActualTableId, 0, 0, WordEmbedding_->word_count_actual - copy_row.At(0));
            }
            multiverso::Log::Info("Rank %d [Trainer]------Train %d AddDeltaParameter end TrainIteration%d ...\n",
                process_id_, trainer_id_, train_count_);

            //If the data_block is the last one (the test block), dump the input-embedding weights
            if (data->Type() == DataBlockType::Test && trainer_id_ == 0)
            {
                SaveEmbedding(option_->output_file, option_->output_binary);
            }

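            //On rank 0, also dump a temporary binary embedding and run an external evaluation script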
            if (data->Type() == DataBlockType::Test && process_id_ == 0 && trainer_id_ == 0)
            {
                SaveEmbedding("tmp.bin", 1);
                char s[128] = { 0 };
                sprintf_s(s, "check.py tmp.bin %d >> records.txt", clock() / CLOCKS_PER_SEC);
                system(s);
            }

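            //Record the finishing time of the TrainIteration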
            if (trainer_id_ == 0)
            {
                fprintf(log_file_, "%lf\n",
                    (clock()) / (double)CLOCKS_PER_SEC);
                fflush(log_file_);
            }
        }
//Train one datablock
template <typename T>
void Trainer<T>::TrainIteration(multiverso::DataBlockBase *data_block)
{
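	//On the first call, record the start time and cache the process rank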
	if (m_train_count == 0)
	{
		m_start_time = clock();
		m_process_id = multiverso::Multiverso::ProcessRank();
	}

	printf("Rank %d Begin TrainIteration...%d\n", m_process_id, m_train_count);
	clock_t train_interation_start = clock();
	fflush(stdout);

	m_process_count = multiverso::Multiverso::TotalProcessCount();

	DataBlock *data = reinterpret_cast<DataBlock*>(data_block);
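	//Alternate between the two neural network instances (indexed by m_train_count % 2)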
	SkipGramMixtureNeuralNetwork<T>* word2vector_neural_network = reinterpret_cast<SkipGramMixtureNeuralNetwork<T>*>(m_sgmixture_neural_networks[m_train_count % 2]);
	++m_train_count;
	std::vector<int>& input_layer_nodes = word2vector_neural_network->GetInputLayerNodes();
	std::vector<int>& output_layer_nodes = word2vector_neural_network->GetOutputLayerNodes();
	std::vector<int> local_input_layer_nodes, local_output_layer_nodes;
	assert(word2vector_neural_network->status == 2);
	if (m_trainer_id == 0)
	{
		multiverso::Log::Info("Rank %d input_layer_size=%d, output_layer_size=%d\n", m_process_id, input_layer_nodes.size(), output_layer_nodes.size());
	}

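	//Partition the input/output layer nodes across trainers round-robin; each trainer copies and updates only its own share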
	for (int i = m_trainer_id; i < input_layer_nodes.size(); i += m_option->thread_cnt)
	{
		local_input_layer_nodes.push_back(input_layer_nodes[i]);
	}

	for (int i = m_trainer_id; i < output_layer_nodes.size(); i += m_option->thread_cnt)
	{
		local_output_layer_nodes.push_back(output_layer_nodes[i]);
	}

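	//Copy this trainer's share of parameters from the Multiverso server to the local network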
	CopyParameterFromMultiverso(local_input_layer_nodes, local_output_layer_nodes, word2vector_neural_network);

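	//Decay the learning rate linearly as training progresses, with a floor of 1e-4 of the initial rate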
	multiverso::Row<int64_t>& word_count_actual_row = GetRow<int64_t>(kWordCountActualTableId, 0);
	T learning_rate = m_option->init_learning_rate * (1 - word_count_actual_row.At(0) / (T)(m_option->total_words * m_option->epoch + 1));
	if (learning_rate < m_option->init_learning_rate * (real)0.0001)
		learning_rate = m_option->init_learning_rate * (real)0.0001;
	word2vector_neural_network->learning_rate = learning_rate;

	//Linearly increase the momentum from init_sense_prior_momentum to 1
	word2vector_neural_network->sense_prior_momentum = m_option->init_sense_prior_momentum +
		(1 - m_option->init_sense_prior_momentum) * word_count_actual_row.At(0) / (T)(m_option->total_words * m_option->epoch + 1);

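	//Wait for all trainers to finish copying parameters before training starts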
	m_barrier->Wait();

	for (int i = m_trainer_id; i < data->Size(); i += m_option->thread_cnt)  //i iterates over this trainer's share of the sentences
	{
		int sentence_length;
		int64_t word_count_delta;
		int *sentence;
		uint64_t next_random;
		data->Get(i, sentence, sentence_length, word_count_delta, next_random);

		word2vector_neural_network->Train(sentence, sentence_length, gamma, fTable, input_backup);

		m_word_count += word_count_delta;
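		//Roughly every 10k words, push the word-count delta to the server table and refresh the learning rate and momentum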
		if (m_word_count - m_last_word_count > 10000)
		{
			multiverso::Row<int64_t>& word_count_actual_row = GetRow<int64_t>(kWordCountActualTableId, 0);
			Add<int64_t>(kWordCountActualTableId, 0, 0, m_word_count - m_last_word_count);
			m_last_word_count = m_word_count;
			m_now_time = clock();

			if (m_trainer_id % 3 == 0)
			{
				multiverso::Log::Info("Rank %d Trainer %d lr: %.5f Mom: %.4f Progress: %.2f%% Words/thread/sec(total): %.2fk  W/t/sec(executive): %.2fk\n",
					m_process_id, m_trainer_id,
					word2vector_neural_network->learning_rate, word2vector_neural_network->sense_prior_momentum,
					word_count_actual_row.At(0) / (real)(m_option->total_words * m_option->epoch + 1) * 100,
					m_last_word_count / ((real)(m_now_time - m_start_time + 1) / (real)CLOCKS_PER_SEC * 1000),
					m_last_word_count / ((real)(m_executive_time + clock() - train_interation_start + 1) / (real)CLOCKS_PER_SEC * 1000));

				fflush(stdout);
			}

			T learning_rate = m_option->init_learning_rate * (1 - word_count_actual_row.At(0) / (T)(m_option->total_words * m_option->epoch + 1));
			if (learning_rate < m_option->init_learning_rate * (real)0.0001)
				learning_rate = m_option->init_learning_rate * (real)0.0001;
			word2vector_neural_network->learning_rate = learning_rate;

			word2vector_neural_network->sense_prior_momentum = m_option->init_sense_prior_momentum + (1 - m_option->init_sense_prior_momentum) * word_count_actual_row.At(0) / (T)(m_option->total_words * m_option->epoch + 1);
		}
	}

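	//Wait for all trainers to finish training, then add this trainer's parameter deltas back to Multiverso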
	m_barrier->Wait();
	AddParameterToMultiverso(local_input_layer_nodes, local_output_layer_nodes, word2vector_neural_network);

	m_executive_time += clock() - train_interation_start;

	multiverso::Log::Info("Rank %d Train %d end at %lfs, cost %lfs, total cost %lfs\n",
		m_process_id,
		m_trainer_id, clock() / (double)CLOCKS_PER_SEC,
		(clock() - train_interation_start) / (double)CLOCKS_PER_SEC,
		m_executive_time / (double)CLOCKS_PER_SEC);
	fflush(stdout);

	if (data->GetTables().size() > 0 && m_trainer_id == 0) //Dump model files
	{
		SaveMultiInputEmbedding(data->GetEpochId());
		SaveOutputEmbedding(data->GetEpochId());
		if (data->GetEpochId() == 0)
			SaveHuffEncoder();

		fprintf(m_log_file, "%d %lf\t %lf\n", data->GetEpochId(), (clock() - m_start_time) / (double)CLOCKS_PER_SEC, m_executive_time / (double)CLOCKS_PER_SEC);
	}

	assert(word2vector_neural_network->status == 2);

	word2vector_neural_network->status = 0;

	multiverso::Log::Info("Rank %d Train %d are leaving training iter with nn status:%d\n", m_process_id, m_trainer_id, word2vector_neural_network->status);
	fflush(stdout);
}