void trainWithBuiltInRNNOp(const string file, int batch_size, int max_epoch, int start_epoch) {
  Context device(DeviceType::kGPU, 0);
  BucketSentenceIter dataIter(file, batch_size, device);
  string prefix = file.substr(0, file.rfind("."));
  dataIter.saveCharIndices(prefix + ".dictionary");
  input_dim = static_cast<int>(dataIter.characterSize());
  sequence_length_max = dataIter.maxSequenceLength();

  auto RNN = LSTMWithBuiltInRNNOp(num_lstm_layer, sequence_length_max, input_dim,
                                  num_hidden, num_embed, dropout);

  map<string, NDArray> args_map;
  args_map["data"] = NDArray(Shape(batch_size, sequence_length_max), device, false);
  // Avoiding SwapAxis, batch_size is of second dimension.
  args_map["LSTM_init_c"] = NDArray(Shape(num_lstm_layer, batch_size, num_hidden), device, false);
  args_map["LSTM_init_h"] = NDArray(Shape(num_lstm_layer, batch_size, num_hidden), device, false);
  args_map["softmax_label"] = NDArray(Shape(batch_size, sequence_length_max), device, false);
  // Zero buffer used to reset the LSTM begin-state before every batch.
  vector<mx_float> zeros(batch_size * num_lstm_layer * num_hidden, 0);
  Executor* exe = RNN.SimpleBind(device, args_map);

  if (start_epoch == -1) {
    // Fresh run: Xavier-initialize every argument array.
    RNNXavier xavier = RNNXavier(Xavier::gaussian, Xavier::in, 2.34);
    for (auto &arg : exe->arg_dict())
      xavier(arg.first, &arg.second);
  } else {
    // Resume training from the saved epoch's parameters.
    LoadCheckpoint(prefix + "-" + to_string(start_epoch) + ".params", exe);
  }
  start_epoch++;

  mx_float learning_rate = 0.0002;
  mx_float weight_decay = 0.000002;
  Optimizer* opt = OptimizerRegistry::Find("ccsgd");
  // opt->SetParam("momentum", 0.9)->SetParam("rescale_grad", 1.0 / batch_size)
  //   ->SetParam("clip_gradient", 10);

  for (int epoch = start_epoch; epoch < max_epoch; ++epoch) {
    dataIter.Reset();
    auto tic = chrono::system_clock::now();
    while (dataIter.Next()) {
      auto data_batch = dataIter.GetDataBatch();
      data_batch.data.CopyTo(&exe->arg_dict()["data"]);
      data_batch.label.CopyTo(&exe->arg_dict()["softmax_label"]);
      // Reset the LSTM cell and hidden begin-states to zero for each batch.
      exe->arg_dict()["LSTM_init_c"].SyncCopyFromCPU(zeros);
      exe->arg_dict()["LSTM_init_h"].SyncCopyFromCPU(zeros);
      NDArray::WaitAll();

      exe->Forward(true);
      exe->Backward();
      exe->UpdateAll(opt, learning_rate, weight_decay);
      NDArray::WaitAll();
    }
    auto toc = chrono::system_clock::now();
    cout << "Epoch[" << epoch << "] Time Cost:"
         << chrono::duration_cast<chrono::seconds>(toc - tic).count() << " seconds ";
    OutputPerplexity(&exe->arg_dict()["softmax_label"], &exe->outputs[0]);
    string filepath = prefix + "-" + to_string(epoch) + ".params";
    SaveCheckpoint(filepath, RNN, exe);
  }
}
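// --- Sketch: the checkpoint helpers called above ---
// SaveCheckpoint/LoadCheckpoint are defined elsewhere in this example; the
// sketch below is one plausible implementation on top of the real
// NDArray::Save/NDArray::Load calls. The names (suffixed "Sketch"), the
// filtering of non-parameter inputs, and the omission of the Symbol argument
// (which the real SaveCheckpoint receives, e.g. to persist the network
// definition) are assumptions, not the canonical code.
void SaveCheckpointSketch(const string &filepath, Executor *exe) {
  map<string, NDArray> params;
  for (auto &arg : exe->arg_dict()) {
    // Inputs and LSTM begin-state initializers are not trainable; keep them out.
    if (arg.first == "data" || arg.first == "softmax_label" ||
        arg.first == "LSTM_init_c" || arg.first == "LSTM_init_h")
      continue;
    params[arg.first] = arg.second;
  }
  NDArray::Save(filepath, params);
}

void LoadCheckpointSketch(const string &filepath, Executor *exe) {
  map<string, NDArray> params;
  NDArray::Load(filepath, nullptr, &params);
  map<string, NDArray> args = exe->arg_dict();
  for (auto &p : params) {
    auto it = args.find(p.first);
    if (it != args.end())
      p.second.CopyTo(&it->second);  // NDArray handles share device storage
  }
  NDArray::WaitAll();
}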
void Run() {
  /*
   * LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner.
   * "Gradient-based learning applied to document recognition."
   * Proceedings of the IEEE (1998)
   */

  /*define the symbolic net*/
  Symbol data = Symbol::Variable("data");
  Symbol data_label = Symbol::Variable("data_label");
  Symbol conv1_w("conv1_w"), conv1_b("conv1_b");
  Symbol conv2_w("conv2_w"), conv2_b("conv2_b");
  Symbol conv3_w("conv3_w"), conv3_b("conv3_b");
  Symbol fc1_w("fc1_w"), fc1_b("fc1_b");
  Symbol fc2_w("fc2_w"), fc2_b("fc2_b");

  Symbol conv1 = Convolution("conv1", data, conv1_w, conv1_b, Shape(5, 5), 20);
  Symbol tanh1 = Activation("tanh1", conv1, ActivationActType::tanh);
  Symbol pool1 = Pooling("pool1", tanh1, Shape(2, 2), PoolingPoolType::max,
                         false, false, PoolingPoolingConvention::valid, Shape(2, 2));

  Symbol conv2 = Convolution("conv2", pool1, conv2_w, conv2_b, Shape(5, 5), 50);
  Symbol tanh2 = Activation("tanh2", conv2, ActivationActType::tanh);
  Symbol pool2 = Pooling("pool2", tanh2, Shape(2, 2), PoolingPoolType::max,
                         false, false, PoolingPoolingConvention::valid, Shape(2, 2));

  Symbol conv3 = Convolution("conv3", pool2, conv3_w, conv3_b, Shape(2, 2), 500);
  Symbol tanh3 = Activation("tanh3", conv3, ActivationActType::tanh);
  Symbol pool3 = Pooling("pool3", tanh3, Shape(2, 2), PoolingPoolType::max,
                         false, false, PoolingPoolingConvention::valid, Shape(1, 1));

  Symbol flatten = Flatten("flatten", pool3);
  Symbol fc1 = FullyConnected("fc1", flatten, fc1_w, fc1_b, 500);
  Symbol tanh4 = Activation("tanh4", fc1, ActivationActType::tanh);
  Symbol fc2 = FullyConnected("fc2", tanh4, fc2_w, fc2_b, 10);
  Symbol lenet = SoftmaxOutput("softmax", fc2, data_label);

  for (auto s : lenet.ListArguments()) {
    LG << s;
  }

  /*setup basic configs*/
  int val_fold = 1;
  int W = 28;
  int H = 28;
  int batch_size = 42;
  int max_epoch = 100000;
  float learning_rate = 1e-4;
  float weight_decay = 1e-4;

  /*prepare the data*/
  vector<float> data_vec, label_vec;
  size_t data_count = GetData(&data_vec, &label_vec);
  const float *dptr = data_vec.data();
  const float *lptr = label_vec.data();
  NDArray data_array = NDArray(Shape(data_count, 1, W, H), ctx_cpu,
                               false);  // store in main memory, and copy to
                                        // device memory while training
  NDArray label_array =
      NDArray(Shape(data_count), ctx_cpu,
              false);  // it's also ok if just store them all in device memory
  data_array.SyncCopyFromCPU(dptr, data_count * W * H);
  label_array.SyncCopyFromCPU(lptr, data_count);
  data_array.WaitToRead();
  label_array.WaitToRead();

  size_t train_num = data_count * (1 - val_fold / 10.0);
  train_data = data_array.Slice(0, train_num);
  train_label = label_array.Slice(0, train_num);
  val_data = data_array.Slice(train_num, data_count);
  val_label = label_array.Slice(train_num, data_count);

  LG << "here read fin";

  /*init some of the args*/
  // map<string, NDArray> args_map;
  args_map["data"] = data_array.Slice(0, batch_size).Copy(ctx_dev);
  args_map["data_label"] = label_array.Slice(0, batch_size).Copy(ctx_dev);
  NDArray::WaitAll();

  LG << "here slice fin";
  /*
   * we can also feed in some of the args other than the input all by
   * ourselves,
   * fc2-w , fc1-b for example:
   * */
  // args_map["fc2_w"] =
  //   NDArray(mshadow::Shape2(500, 4 * 4 * 50), ctx_dev, false);
  // NDArray::SampleGaussian(0, 1, &args_map["fc2_w"]);
  // args_map["fc1_b"] = NDArray(mshadow::Shape1(10), ctx_dev, false);
  // args_map["fc1_b"] = 0;

  lenet.InferArgsMap(ctx_dev, &args_map, args_map);

  Optimizer* opt = OptimizerRegistry::Find("ccsgd");
  opt->SetParam("momentum", 0.9)
     ->SetParam("rescale_grad", 1.0)
     ->SetParam("clip_gradient", 10);

  for (int ITER = 0; ITER < max_epoch; ++ITER) {
    size_t start_index = 0;
    while (start_index < train_num) {
      if (start_index + batch_size > train_num) {
        start_index = train_num - batch_size;
      }
      args_map["data"] =
          train_data.Slice(start_index, start_index + batch_size).Copy(ctx_dev);
      args_map["data_label"] =
          train_label.Slice(start_index, start_index + batch_size).Copy(ctx_dev);
      start_index += batch_size;
      NDArray::WaitAll();

      // Rebind for every batch, since the NDArray handles in args_map were
      // replaced by the fresh slices above.
      Executor *exe = lenet.SimpleBind(ctx_dev, args_map);
      exe->Forward(true);
      exe->Backward();
      exe->UpdateAll(opt, learning_rate, weight_decay);
      delete exe;
    }

    LG << "Iter " << ITER
       << ", accuracy: " << ValAccuracy(batch_size * 10, lenet);
  }
}
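// --- Sketch: the ValAccuracy helper called above ---
// ValAccuracy is defined elsewhere in this example; the sketch below shows one
// plausible implementation (name suffixed "Sketch" to mark it as illustrative).
// It assumes the val_data/val_label slices, args_map, ctx_cpu, and ctx_dev
// globals used throughout this file, and scores by argmax over the softmax
// outputs.
float ValAccuracySketch(int batch_size, Symbol lenet) {
  size_t val_num = val_data.GetShape()[0];
  size_t correct_count = 0;
  size_t all_count = 0;
  size_t start_index = 0;
  while (start_index + batch_size <= val_num) {
    args_map["data"] =
        val_data.Slice(start_index, start_index + batch_size).Copy(ctx_dev);
    args_map["data_label"] =
        val_label.Slice(start_index, start_index + batch_size).Copy(ctx_dev);
    NDArray::WaitAll();

    Executor *exe = lenet.SimpleBind(ctx_dev, args_map);
    exe->Forward(false);  // inference pass: no gradient bookkeeping

    // Copy predictions and labels back to the CPU for the argmax comparison.
    NDArray out_cpu = exe->outputs[0].Copy(ctx_cpu);
    NDArray label_cpu =
        val_label.Slice(start_index, start_index + batch_size).Copy(ctx_cpu);
    NDArray::WaitAll();

    const mx_float *out_ptr = out_cpu.GetData();
    const mx_float *label_ptr = label_cpu.GetData();
    int class_num = out_cpu.GetShape()[1];  // 10 softmax outputs per sample
    for (int i = 0; i < batch_size; ++i) {
      int pred = 0;
      for (int j = 1; j < class_num; ++j)
        if (out_ptr[i * class_num + j] > out_ptr[i * class_num + pred]) pred = j;
      if (static_cast<int>(label_ptr[i]) == pred) ++correct_count;
      ++all_count;
    }
    delete exe;
    start_index += batch_size;
  }
  return all_count == 0 ? 0.0f : correct_count * 1.0f / all_count;
}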
void Run() {
  /*
   * LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner.
   * "Gradient-based learning applied to document recognition."
   * Proceedings of the IEEE (1998)
   */

  /*define the symbolic net*/
  // (The LeNet symbol definition is elided in this variant; it matches the one
  // built in the previous Run(), ending with Symbol lenet = SoftmaxOutput(...).
  // lenet, W, H, ctx_cpu, ctx_dev, args_map, and the train/val slices are
  // assumed to be defined at file scope here.)
  for (auto s : lenet.ListArguments()) {
    LG << s;
  }

  /*setup basic configs*/
  int val_fold = 3;
  int batch_size = 20;
  int max_epoch = 50;
  float learning_rate = 4e-6;
  float weight_decay = 1e-5;

  /*prepare the data*/
  vector<float> data_vec, label_vec;
  size_t data_count = GetData(&data_vec, &label_vec);
  const float *dptr = data_vec.data();
  const float *lptr = label_vec.data();
  NDArray data_array = NDArray(Shape(data_count, 1, W, H), ctx_cpu,
                               false);  // store in main memory, and copy to
                                        // device memory while training
  NDArray label_array =
      NDArray(Shape(data_count), ctx_cpu,
              false);  // it's also ok if just store them all in device memory
  data_array.SyncCopyFromCPU(dptr, data_count * W * H);
  label_array.SyncCopyFromCPU(lptr, data_count);
  data_array.WaitToRead();
  label_array.WaitToRead();

  size_t train_num = data_count * (1 - val_fold / 10.0);
  train_data = data_array.Slice(0, train_num);
  train_label = label_array.Slice(0, train_num);
  val_data = data_array.Slice(train_num, data_count);
  val_label = label_array.Slice(train_num, data_count);

  LG << "here read fin";

  /*init some of the args*/
  // map<string, NDArray> args_map;
  args_map["data"] = data_array.Slice(0, batch_size).Copy(ctx_dev);
  // args_map["data_label"] = label_array.Slice(0, batch_size).Copy(ctx_dev);
  /*args_map["fc1_weight"] = NDArray(Shape(500, 4 * 4 * 50), ctx_dev, false);
  //SampleGaussian(0, 1, &args_map["fc1_weight"]);
  args_map["fc1_weight"] = 0.3;
  args_map["fc2_bias"] = NDArray(Shape(10), ctx_dev, false);
  args_map["fc2_bias"] = 0;*/
  NDArray::WaitAll();

  LG << "here slice fin";
  /*
   * we can also feed in some of the args other than the input all by
   * ourselves,
   * fc2-w , fc1-b for example:
   * */
  // args_map["fc2_w"] =
  //   NDArray(mshadow::Shape2(500, 4 * 4 * 50), ctx_dev, false);
  // NDArray::SampleGaussian(0, 1, &args_map["fc2_w"]);
  // args_map["fc1_b"] = NDArray(mshadow::Shape1(10), ctx_dev, false);
  // args_map["fc1_b"] = 0;

  Optimizer* opt = OptimizerRegistry::Find("ccsgd");
  opt->SetParam("momentum", 0.9)
     ->SetParam("rescale_grad", 1.0)
     ->SetParam("clip_gradient", 10);

  for (int ITER = 0; ITER < max_epoch; ++ITER) {
    size_t start_index = 0;
    while (start_index < train_num) {
      if (start_index + batch_size > train_num) {
        start_index = train_num - batch_size;
      }
      args_map["data"] =
          train_data.Slice(start_index, start_index + batch_size).Copy(ctx_dev);
      args_map["data_label"] =
          train_label.Slice(start_index, start_index + batch_size).Copy(ctx_dev);
      start_index += batch_size;
      NDArray::WaitAll();

      Executor *exe = lenet.SimpleBind(ctx_dev, args_map);
      exe->Forward(true);
      exe->Backward();
      exe->UpdateAll(opt, learning_rate, weight_decay);
      delete exe;
    }

    LG << "Iter " << ITER
       << ", accuracy: " << ValAccuracy(batch_size, lenet);
  }
}
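// --- Sketch: a minimal driver for either Run() above ---
// A sketch assuming the usual structure of the MXNet C++ examples.
// MXNotifyShutdown() is the real C API call the shipped examples invoke to
// flush the async engine before the process exits.
int main(int argc, char const *argv[]) {
  Run();
  MXNotifyShutdown();  // let the MXNet engine drain pending operations
  return 0;
}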