void predictWithBuiltInRNNOp(wstring* ptext, int sequence_length, const string param_file,
                             const string dictionary_file) {
  Context device(DeviceType::kGPU, 0);
  auto results = BucketSentenceIter::loadCharIndices(dictionary_file);
  auto dictionary = get<0>(results);
  auto charIndices = get<1>(results);
  input_dim = static_cast<int>(charIndices.size());
  auto RNN = LSTMWithBuiltInRNNOp(num_lstm_layer, 1, input_dim, num_hidden, num_embed, 0);

  map<string, NDArray> args_map;
  args_map["data"] = NDArray(Shape(1, 1), device, false);
  args_map["softmax_label"] = NDArray(Shape(1, 1), device, false);
  vector<mx_float> zeros(1 * num_lstm_layer * num_hidden, 0);
  // Avoiding SwapAxis: with the built-in RNN op, batch_size (= 1) is the second dimension.
  args_map["LSTM_init_c"] = NDArray(Shape(num_lstm_layer, 1, num_hidden), device, false);
  args_map["LSTM_init_h"] = NDArray(Shape(num_lstm_layer, 1, num_hidden), device, false);
  args_map["LSTM_init_c"].SyncCopyFromCPU(zeros);
  args_map["LSTM_init_h"].SyncCopyFromCPU(zeros);
  Executor* exe = RNN.SimpleBind(device, args_map);
  LoadCheckpoint(param_file, exe);

  mx_float index = 0;
  wchar_t next = 0;
  vector<mx_float> softmax;
  softmax.resize(input_dim);
  // Feed the seed text through the network, carrying the LSTM state forward.
  for (auto c : *ptext) {
    exe->arg_dict()["data"].SyncCopyFromCPU(&dictionary[c], 1);
    exe->Forward(false);

    exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim);
    exe->outputs[1].CopyTo(&args_map["LSTM_init_h"]);
    exe->outputs[2].CopyTo(&args_map["LSTM_init_c"]);

    size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin();
    index = (mx_float) n;
    next = charIndices[n];
  }
  ptext->push_back(next);

  // Generate sequence_length more characters greedily (argmax decoding).
  for (int i = 0; i < sequence_length; i++) {
    exe->arg_dict()["data"].SyncCopyFromCPU(&index, 1);
    exe->Forward(false);

    exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim);
    exe->outputs[1].CopyTo(&args_map["LSTM_init_h"]);
    exe->outputs[2].CopyTo(&args_map["LSTM_init_c"]);

    size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin();
    index = (mx_float) n;
    next = charIndices[n];
    ptext->push_back(next);
  }
}
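// Illustrative variant (an addition, not in the original example): the loops above
// decode greedily with argmax. Sampling the next character from the softmax
// distribution instead usually produces more varied text. Requires <random>.
static size_t SampleNextChar(const std::vector<mx_float>& softmax, std::mt19937* rng) {
  // discrete_distribution normalizes its weights, so the raw softmax outputs can be fed in.
  std::discrete_distribution<size_t> dist(softmax.begin(), softmax.end());
  return dist(*rng);  // replaces: max_element(softmax.begin(), softmax.end()) - softmax.begin()
}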
void trainWithBuiltInRNNOp(const string file, int batch_size, int max_epoch, int start_epoch) {
  Context device(DeviceType::kGPU, 0);
  BucketSentenceIter dataIter(file, batch_size, device);
  string prefix = file.substr(0, file.rfind("."));
  dataIter.saveCharIndices(prefix + ".dictionary");

  input_dim = static_cast<int>(dataIter.characterSize());
  sequence_length_max = dataIter.maxSequenceLength();

  auto RNN = LSTMWithBuiltInRNNOp(num_lstm_layer, sequence_length_max, input_dim,
                                  num_hidden, num_embed, dropout);
  map<string, NDArray> args_map;
  args_map["data"] = NDArray(Shape(batch_size, sequence_length_max), device, false);
  // Avoiding SwapAxis: with the built-in RNN op, batch_size is the second dimension.
  args_map["LSTM_init_c"] = NDArray(Shape(num_lstm_layer, batch_size, num_hidden), device, false);
  args_map["LSTM_init_h"] = NDArray(Shape(num_lstm_layer, batch_size, num_hidden), device, false);
  args_map["softmax_label"] = NDArray(Shape(batch_size, sequence_length_max), device, false);
  vector<mx_float> zeros(batch_size * num_lstm_layer * num_hidden, 0);
  Executor* exe = RNN.SimpleBind(device, args_map);

  if (start_epoch == -1) {
    RNNXavier xavier = RNNXavier(Xavier::gaussian, Xavier::in, 2.34);
    for (auto &arg : exe->arg_dict())
      xavier(arg.first, &arg.second);
  } else {
    LoadCheckpoint(prefix + "-" + to_string(start_epoch) + ".params", exe);
  }
  start_epoch++;

  mx_float learning_rate = 0.0002;
  mx_float weight_decay = 0.000002;
  Optimizer* opt = OptimizerRegistry::Find("ccsgd");
  // opt->SetParam("momentum", 0.9)->SetParam("rescale_grad", 1.0 / batch_size)
  //    ->SetParam("clip_gradient", 10);

  for (int epoch = start_epoch; epoch < max_epoch; ++epoch) {
    dataIter.Reset();
    auto tic = chrono::system_clock::now();
    while (dataIter.Next()) {
      auto data_batch = dataIter.GetDataBatch();
      data_batch.data.CopyTo(&exe->arg_dict()["data"]);
      data_batch.label.CopyTo(&exe->arg_dict()["softmax_label"]);
      // Reset the recurrent state at the start of every batch.
      exe->arg_dict()["LSTM_init_c"].SyncCopyFromCPU(zeros);
      exe->arg_dict()["LSTM_init_h"].SyncCopyFromCPU(zeros);
      NDArray::WaitAll();

      exe->Forward(true);
      exe->Backward();
      exe->UpdateAll(opt, learning_rate, weight_decay);
      NDArray::WaitAll();
    }
    auto toc = chrono::system_clock::now();
    cout << "Epoch[" << epoch << "] Time Cost:"
         << chrono::duration_cast<chrono::seconds>(toc - tic).count() << " seconds ";
    OutputPerplexity(&exe->arg_dict()["softmax_label"], &exe->outputs[0]);
    string filepath = prefix + "-" + to_string(epoch) + ".params";
    SaveCheckpoint(filepath, RNN, exe);
  }
}
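// A minimal driver sketch for the two functions above. The corpus path and checkpoint
// names are placeholders, and the globals they rely on (num_lstm_layer, num_hidden,
// num_embed, dropout, input_dim, sequence_length_max) are assumed to be defined
// elsewhere in the file.
int main() {
  // Train from scratch for 25 epochs; start_epoch == -1 triggers Xavier initialization.
  trainWithBuiltInRNNOp("corpus.txt", 32, 25, -1);
  // Generate text from the last checkpoint and the saved dictionary.
  wstring seed = L"The ";
  predictWithBuiltInRNNOp(&seed, 600, "corpus-24.params", "corpus.dictionary");
  wcout << seed << endl;  // the seed plus the generated continuation
  return 0;
}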
void Extract(NDArray data) {
  /* Normalize the pictures by subtracting the mean image. */
  data.Slice(0, 1) -= mean_img;
  data.Slice(1, 2) -= mean_img;
  args_map["data"] = data;
  /* Bind the executor. */
  executor = net.SimpleBind(global_ctx, args_map, map<string, NDArray>(),
                            map<string, OpReqType>(), aux_map);
  executor->Forward(false);
  /* Print out the features. */
  auto array = executor->outputs[0].Copy(Context(kCPU, 0));
  NDArray::WaitAll();
  for (int i = 0; i < 1024; ++i) {
    cout << array.At(0, i) << ",";
  }
  cout << endl;
}
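// A usage sketch for Extract(). Assumptions: two images are already decoded, resized
// to the network's input resolution, and laid out NCHW as floats; the 2x3x224x224
// shape and the LoadTwoImagesNCHW() helper are hypothetical, not from the original.
std::vector<float> pixels = LoadTwoImagesNCHW();  // hypothetical loader, size 2*3*224*224
NDArray batch(Shape(2, 3, 224, 224), global_ctx, false);
batch.SyncCopyFromCPU(pixels.data(), pixels.size());
NDArray::WaitAll();
Extract(batch);  // prints 1024 comma-separated features for the first image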
void Test(vector<float> &vec, int data_count) {
  args_map["data"] = NDArray(Shape(data_count, 1, W, H), ctx_dev, false);
  args_map["data_label"] = NDArray(Shape(data_count), ctx_dev, false);

  const float *dptr = vec.data();
  args_map["data"].SyncCopyFromCPU(dptr, data_count * W * H);
  NDArray::WaitAll();

  Executor *exe = lenet.SimpleBind(ctx_dev, args_map);
  exe->Forward(false);

  const auto &out = exe->outputs;
  NDArray out_cpu = out[0].Copy(ctx_cpu);
  NDArray::WaitAll();

  const mx_float *dptr_out = out_cpu.GetData();
  // For each sample, print the index of the class with the highest probability.
  for (int i = 0; i < data_count; ++i) {
    int cat_num = out_cpu.GetShape()[1];
    float p_label = 0, max_p = dptr_out[i * cat_num];
    for (int j = 0; j < cat_num; ++j) {
      float p = dptr_out[i * cat_num + j];
      if (max_p < p) {
        p_label = j;
        max_p = p;
      }
    }
    cout << p_label << endl;
  }
  delete exe;
}
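// Test() only prints each sample's argmax label. A small assumed helper (not in the
// original) that turns the same raw probability buffer into a top-1 accuracy when
// ground-truth labels are available:
float Accuracy(const mx_float* probs, const float* labels, int n, int cat_num) {
  int correct = 0;
  for (int i = 0; i < n; ++i) {
    int best = 0;
    for (int j = 1; j < cat_num; ++j)
      if (probs[i * cat_num + j] > probs[i * cat_num + best]) best = j;
    if (static_cast<float>(best) == labels[i]) ++correct;
  }
  return n > 0 ? static_cast<float>(correct) / n : 0.0f;
}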
void MLP() {
  auto sym_x = Symbol::Variable("X");
  auto sym_label = Symbol::Variable("label");

  const int nLayers = 2;
  vector<int> layerSizes({512, 10});
  vector<Symbol> weights(nLayers);
  vector<Symbol> biases(nLayers);
  vector<Symbol> outputs(nLayers);

  for (int i = 0; i < nLayers; i++) {
    string istr = to_string(i);
    weights[i] = Symbol::Variable(string("w") + istr);
    biases[i] = Symbol::Variable(string("b") + istr);
    Symbol fc = FullyConnected(string("fc") + istr,
                               i == 0 ? sym_x : outputs[i - 1],
                               weights[i], biases[i], layerSizes[i]);
    outputs[i] = LeakyReLU(string("act") + istr, fc, LeakyReLUActType::leaky);
  }
  auto sym_out = SoftmaxOutput("softmax", outputs[nLayers - 1], sym_label);

  Context ctx_dev(DeviceType::kCPU, 0);

  NDArray array_x(Shape(128, 28), ctx_dev, false);
  NDArray array_y(Shape(128), ctx_dev, false);

  mx_float* aptr_x = new mx_float[128 * 28];
  mx_float* aptr_y = new mx_float[128];

  // We make the data by hand, in 10 classes, with some pattern.
  for (int i = 0; i < 128; i++) {
    for (int j = 0; j < 28; j++) {
      aptr_x[i * 28 + j] = i % 10 * 1.0f;
    }
    aptr_y[i] = i % 10;
  }
  array_x.SyncCopyFromCPU(aptr_x, 128 * 28);
  array_x.WaitToRead();
  array_y.SyncCopyFromCPU(aptr_y, 128);
  array_y.WaitToRead();

  // Init the parameters.
  NDArray array_w_1(Shape(512, 28), ctx_dev, false);
  NDArray array_b_1(Shape(512), ctx_dev, false);
  NDArray array_w_2(Shape(10, 512), ctx_dev, false);
  NDArray array_b_2(Shape(10), ctx_dev, false);

  // The parameters should be initialized from some distribution so the network
  // learns fast; here we just assign constant values by hand.
  array_w_1 = 0.5f;
  array_b_1 = 0.0f;
  array_w_2 = 0.5f;
  array_b_2 = 0.0f;

  // The grads.
  NDArray array_w_1_g(Shape(512, 28), ctx_dev, false);
  NDArray array_b_1_g(Shape(512), ctx_dev, false);
  NDArray array_w_2_g(Shape(10, 512), ctx_dev, false);
  NDArray array_b_2_g(Shape(10), ctx_dev, false);

  // Bind the symbolic network with the NDArrays.
  // All the input args.
  std::vector<NDArray> in_args;
  in_args.push_back(array_x);
  in_args.push_back(array_w_1);
  in_args.push_back(array_b_1);
  in_args.push_back(array_w_2);
  in_args.push_back(array_b_2);
  in_args.push_back(array_y);
  // All the grads.
  std::vector<NDArray> arg_grad_store;
  arg_grad_store.push_back(NDArray());  // we don't need the grad of the input
  arg_grad_store.push_back(array_w_1_g);
  arg_grad_store.push_back(array_b_1_g);
  arg_grad_store.push_back(array_w_2_g);
  arg_grad_store.push_back(array_b_2_g);
  arg_grad_store.push_back(NDArray());  // neither do we need the grad of the loss
  // How to handle the grads.
  std::vector<OpReqType> grad_req_type;
  grad_req_type.push_back(kNullOp);
  grad_req_type.push_back(kWriteTo);
  grad_req_type.push_back(kWriteTo);
  grad_req_type.push_back(kWriteTo);
  grad_req_type.push_back(kWriteTo);
  grad_req_type.push_back(kNullOp);
  std::vector<NDArray> aux_states;

  cout << "make the Executor" << endl;
  Executor* exe = new Executor(sym_out, ctx_dev, in_args, arg_grad_store,
                               grad_req_type, aux_states);

  cout << "Training" << endl;
  int max_iters = 20000;
  mx_float learning_rate = 0.0001;
  for (int iter = 0; iter < max_iters; ++iter) {
    exe->Forward(true);

    if (iter % 100 == 0) {
      cout << "epoch " << iter << endl;
      std::vector<NDArray>& out = exe->outputs;
      float* cptr = new float[128 * 10];
      out[0].SyncCopyToCPU(cptr, 128 * 10);
      NDArray::WaitAll();
      OutputAccuracy(cptr, aptr_y);
      delete[] cptr;
    }

    // Update the parameters with plain SGD.
    exe->Backward();
    for (int i = 1; i < 5; ++i) {
      in_args[i] -= arg_grad_store[i] * learning_rate;
    }
    NDArray::WaitAll();
  }

  delete exe;
  delete[] aptr_x;
  delete[] aptr_y;
}
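// OutputAccuracy() is called inside MLP() but not defined in this excerpt. A minimal
// sketch consistent with the call site (128 samples, 10 classes, both taken from the
// hand-made data above); in a real file it would have to be declared before MLP():
void OutputAccuracy(mx_float* pred, mx_float* target) {
  int right = 0;
  for (int i = 0; i < 128; ++i) {
    int p_y = 0;
    for (int j = 1; j < 10; ++j)
      if (pred[i * 10 + j] > pred[i * 10 + p_y]) p_y = j;
    if (p_y == static_cast<int>(target[i])) ++right;
  }
  cout << "Accuracy: " << right / 128.0 << endl;
}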
void Run() {
  /*
   * LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner.
   * "Gradient-based learning applied to document recognition."
   * Proceedings of the IEEE (1998)
   */

  /* Define the symbolic net. */
  Symbol data = Symbol::Variable("data");
  Symbol data_label = Symbol::Variable("data_label");
  Symbol conv1_w("conv1_w"), conv1_b("conv1_b");
  Symbol conv2_w("conv2_w"), conv2_b("conv2_b");
  Symbol conv3_w("conv3_w"), conv3_b("conv3_b");
  Symbol fc1_w("fc1_w"), fc1_b("fc1_b");
  Symbol fc2_w("fc2_w"), fc2_b("fc2_b");

  Symbol conv1 = Convolution("conv1", data, conv1_w, conv1_b, Shape(5, 5), 20);
  Symbol tanh1 = Activation("tanh1", conv1, ActivationActType::tanh);
  Symbol pool1 = Pooling("pool1", tanh1, Shape(2, 2), PoolingPoolType::max,
                         false, false, PoolingPoolingConvention::valid, Shape(2, 2));

  Symbol conv2 = Convolution("conv2", pool1, conv2_w, conv2_b, Shape(5, 5), 50);
  Symbol tanh2 = Activation("tanh2", conv2, ActivationActType::tanh);
  Symbol pool2 = Pooling("pool2", tanh2, Shape(2, 2), PoolingPoolType::max,
                         false, false, PoolingPoolingConvention::valid, Shape(2, 2));

  Symbol conv3 = Convolution("conv3", pool2, conv3_w, conv3_b, Shape(2, 2), 500);
  Symbol tanh3 = Activation("tanh3", conv3, ActivationActType::tanh);
  Symbol pool3 = Pooling("pool3", tanh3, Shape(2, 2), PoolingPoolType::max,
                         false, false, PoolingPoolingConvention::valid, Shape(1, 1));

  Symbol flatten = Flatten("flatten", pool3);
  Symbol fc1 = FullyConnected("fc1", flatten, fc1_w, fc1_b, 500);
  Symbol tanh4 = Activation("tanh4", fc1, ActivationActType::tanh);
  Symbol fc2 = FullyConnected("fc2", tanh4, fc2_w, fc2_b, 10);

  Symbol lenet = SoftmaxOutput("softmax", fc2, data_label);

  for (auto s : lenet.ListArguments()) {
    LG << s;
  }

  /* Setup basic configs. */
  int val_fold = 1;
  int W = 28;
  int H = 28;
  int batch_size = 42;
  int max_epoch = 100000;
  float learning_rate = 1e-4;
  float weight_decay = 1e-4;

  /* Prepare the data. */
  vector<float> data_vec, label_vec;
  size_t data_count = GetData(&data_vec, &label_vec);
  const float *dptr = data_vec.data();
  const float *lptr = label_vec.data();
  NDArray data_array = NDArray(Shape(data_count, 1, W, H), ctx_cpu,
                               false);  // store in main memory, and copy to
                                        // device memory while training
  NDArray label_array =
      NDArray(Shape(data_count), ctx_cpu,
              false);  // it's also ok to just store them all in device memory
  data_array.SyncCopyFromCPU(dptr, data_count * W * H);
  label_array.SyncCopyFromCPU(lptr, data_count);
  data_array.WaitToRead();
  label_array.WaitToRead();

  size_t train_num = data_count * (1 - val_fold / 10.0);
  train_data = data_array.Slice(0, train_num);
  train_label = label_array.Slice(0, train_num);
  val_data = data_array.Slice(train_num, data_count);
  val_label = label_array.Slice(train_num, data_count);

  LG << "here read fin";

  /* Init some of the args. */
  args_map["data"] = data_array.Slice(0, batch_size).Copy(ctx_dev);
  args_map["data_label"] = label_array.Slice(0, batch_size).Copy(ctx_dev);
  NDArray::WaitAll();

  LG << "here slice fin";
  /*
   * We can also feed in some of the args other than the input all by
   * ourselves; fc2_w and fc1_b, for example:
   *
   * args_map["fc2_w"] =
   *     NDArray(mshadow::Shape2(500, 4 * 4 * 50), ctx_dev, false);
   * NDArray::SampleGaussian(0, 1, &args_map["fc2_w"]);
   * args_map["fc1_b"] = NDArray(mshadow::Shape1(10), ctx_dev, false);
   * args_map["fc1_b"] = 0;
   */
  lenet.InferArgsMap(ctx_dev, &args_map, args_map);

  Optimizer* opt = OptimizerRegistry::Find("ccsgd");
  opt->SetParam("momentum", 0.9)
      ->SetParam("rescale_grad", 1.0)
      ->SetParam("clip_gradient", 10);

  for (int ITER = 0; ITER < max_epoch; ++ITER) {
    size_t start_index = 0;
    while (start_index < train_num) {
      if (start_index + batch_size > train_num) {
        start_index = train_num - batch_size;
      }
      args_map["data"] =
          train_data.Slice(start_index, start_index + batch_size).Copy(ctx_dev);
      args_map["data_label"] =
          train_label.Slice(start_index, start_index + batch_size).Copy(ctx_dev);
      start_index += batch_size;
      NDArray::WaitAll();

      Executor *exe = lenet.SimpleBind(ctx_dev, args_map);
      exe->Forward(true);
      exe->Backward();
      exe->UpdateAll(opt, learning_rate, weight_decay);
      delete exe;
    }

    LG << "Iter " << ITER
       << ", accuracy: " << ValAccuracy(batch_size * 10, lenet);
  }
}
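// ValAccuracy() is used above but not defined in this excerpt. A minimal sketch under
// stated assumptions: val_data / val_label are the validation slices created in Run(),
// and evaluation reuses the same bind-per-batch pattern as the training loop. The body
// below is an illustration, not necessarily the original implementation.
float ValAccuracy(int batch_size, Symbol lenet) {
  size_t val_num = val_data.GetShape()[0];
  size_t correct_count = 0;
  size_t all_count = 0;
  size_t start_index = 0;
  while (start_index + batch_size <= val_num) {
    args_map["data"] =
        val_data.Slice(start_index, start_index + batch_size).Copy(ctx_dev);
    args_map["data_label"] =
        val_label.Slice(start_index, start_index + batch_size).Copy(ctx_dev);
    NDArray::WaitAll();

    Executor *exe = lenet.SimpleBind(ctx_dev, args_map);
    exe->Forward(false);

    NDArray out_cpu = exe->outputs[0].Copy(ctx_cpu);
    NDArray label_cpu =
        val_label.Slice(start_index, start_index + batch_size).Copy(ctx_cpu);
    NDArray::WaitAll();

    const mx_float *dptr_out = out_cpu.GetData();
    const mx_float *dptr_label = label_cpu.GetData();
    int cat_num = out_cpu.GetShape()[1];
    for (int i = 0; i < batch_size; ++i) {
      int p_label = 0;
      for (int j = 1; j < cat_num; ++j)
        if (dptr_out[i * cat_num + j] > dptr_out[i * cat_num + p_label]) p_label = j;
      if (static_cast<int>(dptr_label[i]) == p_label) ++correct_count;
      ++all_count;
    }
    delete exe;
    start_index += batch_size;
  }
  return all_count == 0 ? 0.0f : static_cast<float>(correct_count) / all_count;
}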
void Run() {
  /*
   * LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner.
   * "Gradient-based learning applied to document recognition."
   * Proceedings of the IEEE (1998)
   */

  /* The symbolic net (lenet) is defined elsewhere; list its arguments. */
  for (auto s : lenet.ListArguments()) {
    LG << s;
  }

  /* Setup basic configs. */
  int val_fold = 3;
  int batch_size = 20;
  int max_epoch = 50;
  float learning_rate = 4e-6;
  float weight_decay = 1e-5;

  /* Prepare the data. */
  vector<float> data_vec, label_vec;
  size_t data_count = GetData(&data_vec, &label_vec);
  const float *dptr = data_vec.data();
  const float *lptr = label_vec.data();
  NDArray data_array = NDArray(Shape(data_count, 1, W, H), ctx_cpu,
                               false);  // store in main memory, and copy to
                                        // device memory while training
  NDArray label_array =
      NDArray(Shape(data_count), ctx_cpu,
              false);  // it's also ok to just store them all in device memory
  data_array.SyncCopyFromCPU(dptr, data_count * W * H);
  label_array.SyncCopyFromCPU(lptr, data_count);
  data_array.WaitToRead();
  label_array.WaitToRead();

  size_t train_num = data_count * (1 - val_fold / 10.0);
  train_data = data_array.Slice(0, train_num);
  train_label = label_array.Slice(0, train_num);
  val_data = data_array.Slice(train_num, data_count);
  val_label = label_array.Slice(train_num, data_count);

  LG << "here read fin";

  /* Init some of the args. */
  args_map["data"] = data_array.Slice(0, batch_size).Copy(ctx_dev);
  // args_map["data_label"] = label_array.Slice(0, batch_size).Copy(ctx_dev);
  // args_map["fc1_weight"] = NDArray(Shape(500, 4 * 4 * 50), ctx_dev, false);
  // args_map["fc1_weight"] = 0.3;
  // args_map["fc2_bias"] = NDArray(Shape(10), ctx_dev, false);
  // args_map["fc2_bias"] = 0;
  NDArray::WaitAll();

  LG << "here slice fin";
  /*
   * We can also feed in some of the args other than the input all by
   * ourselves; fc2_w and fc1_b, for example:
   *
   * args_map["fc2_w"] =
   *     NDArray(mshadow::Shape2(500, 4 * 4 * 50), ctx_dev, false);
   * NDArray::SampleGaussian(0, 1, &args_map["fc2_w"]);
   * args_map["fc1_b"] = NDArray(mshadow::Shape1(10), ctx_dev, false);
   * args_map["fc1_b"] = 0;
   */
  Optimizer opt("ccsgd", learning_rate, weight_decay);
  opt.SetParam("momentum", 0.9)
      .SetParam("rescale_grad", 1.0)
      .SetParam("clip_gradient", 10);

  for (int ITER = 0; ITER < max_epoch; ++ITER) {
    size_t start_index = 0;
    while (start_index < train_num) {
      if (start_index + batch_size > train_num) {
        start_index = train_num - batch_size;
      }
      args_map["data"] =
          train_data.Slice(start_index, start_index + batch_size).Copy(ctx_dev);
      args_map["data_label"] =
          train_label.Slice(start_index, start_index + batch_size).Copy(ctx_dev);
      start_index += batch_size;
      NDArray::WaitAll();

      Executor *exe = lenet.SimpleBind(ctx_dev, args_map);
      exe->Forward(true);
      exe->Backward();
      exe->UpdateAll(&opt, learning_rate, weight_decay);
      delete exe;
    }

    LG << "Iter " << ITER
       << ", accuracy: " << ValAccuracy(batch_size, lenet);
  }
}
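// Design note: both Run() variants above call SimpleBind once per mini-batch, which
// re-creates and re-allocates the executor every iteration. A sketch of an alternative
// inner loop (an assumption about usage, not code from the original; it relies on the
// same globals as Run()): bind once, then copy each batch into the bound arrays
// through arg_dict().
Executor *exe = lenet.SimpleBind(ctx_dev, args_map);
for (size_t s = 0; s + batch_size <= train_num; s += batch_size) {
  train_data.Slice(s, s + batch_size).CopyTo(&exe->arg_dict()["data"]);
  train_label.Slice(s, s + batch_size).CopyTo(&exe->arg_dict()["data_label"]);
  NDArray::WaitAll();
  exe->Forward(true);
  exe->Backward();
  exe->UpdateAll(&opt, learning_rate, weight_decay);
}
delete exe;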
void predict(wstring* ptext, int sequence_length, const string param_file,
             const string dictionary_file) {
  Context device(DeviceType::kGPU, 0);
  auto results = BucketSentenceIter::loadCharIndices(dictionary_file);
  auto dictionary = get<0>(results);
  auto charIndices = get<1>(results);
  input_dim = static_cast<int>(charIndices.size());
  auto RNN = LSTMUnroll(num_lstm_layer, 1, input_dim, num_hidden, num_embed, 0);

  map<string, NDArray> args_map;
  args_map["data"] = NDArray(Shape(1, 1), device, false);
  args_map["softmax_label"] = NDArray(Shape(1, 1), device, false);
  vector<mx_float> zeros(1 * num_hidden, 0);
  // The unrolled network keeps one (c, h) state pair per LSTM layer.
  for (int l = 0; l < num_lstm_layer; l++) {
    string key = "l" + to_string(l) + "_init_";
    args_map[key + "c"] = NDArray(Shape(1, num_hidden), device, false);
    args_map[key + "h"] = NDArray(Shape(1, num_hidden), device, false);
    args_map[key + "c"].SyncCopyFromCPU(zeros);
    args_map[key + "h"].SyncCopyFromCPU(zeros);
  }
  Executor* exe = RNN.SimpleBind(device, args_map);
  LoadCheckpoint(param_file, exe);

  mx_float index = 0;
  wchar_t next = 0;
  vector<mx_float> softmax;
  softmax.resize(input_dim);
  // Feed the seed text through the network, carrying every layer's state forward.
  for (auto c : *ptext) {
    exe->arg_dict()["data"].SyncCopyFromCPU(&dictionary[c], 1);
    exe->Forward(false);

    exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim);
    for (int l = 0; l < num_lstm_layer; l++) {
      string key = "l" + to_string(l) + "_init_";
      exe->outputs[l * 2 + 1].CopyTo(&args_map[key + "c"]);
      exe->outputs[l * 2 + 2].CopyTo(&args_map[key + "h"]);
    }

    size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin();
    index = (mx_float) n;
    next = charIndices[n];
  }
  ptext->push_back(next);

  // Generate sequence_length more characters greedily (argmax decoding).
  for (int i = 0; i < sequence_length; i++) {
    exe->arg_dict()["data"].SyncCopyFromCPU(&index, 1);
    exe->Forward(false);

    exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim);
    for (int l = 0; l < num_lstm_layer; l++) {
      string key = "l" + to_string(l) + "_init_";
      exe->outputs[l * 2 + 1].CopyTo(&args_map[key + "c"]);
      exe->outputs[l * 2 + 2].CopyTo(&args_map[key + "h"]);
    }

    size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin();
    index = (mx_float) n;
    next = charIndices[n];
    ptext->push_back(next);
  }
}
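// Usage sketch for the unrolled-LSTM predictor. File names are placeholders, and the
// checkpoint/dictionary are assumed to come from a matching unrolled training run.
// Unlike predictWithBuiltInRNNOp, this variant carries one (c, h) pair per layer.
// Printing non-ASCII generated characters through wcout typically needs a locale.
int main() {
  std::wcout.imbue(std::locale(""));  // requires <locale>
  wstring seed = L"The ";
  predict(&seed, 600, "model-24.params", "model.dictionary");
  wcout << seed << endl;  // the seed plus the generated continuation
  return 0;
}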