 // Validates the input for this convolution stage and, when no output tensor
 // exists yet, allocates one of the expected size.
 //
 // input: must report TorchDataType::TENSOR_DATA; it is then treated as a
 //        Tensor<float> whose dim()[2] must equal feats_in_.
 // Throws std::runtime_error if either of those checks fails.
 //
 // NOTE(review): this view of the function has no closing braces and the
 // "Input dimension has changed!" branch has no visible statements, so some
 // lines appear to be missing here. The comments describe the visible code
 // only.
 void SpatialConvolution::init(TorchData& input)  {
   if (input.type() != TorchDataType::TENSOR_DATA) {
     throw std::runtime_error("SpatialConvolution::init() - "
       "FloatTensor expected!");
   Tensor<float>& in = (Tensor<float>&)input;
   // The third input dimension must match the configured input feature count.
   if (in.dim()[2] != feats_in_) {
     throw std::runtime_error("SpatialConvolution::init() - ERROR: "
       "incorrect number of input features!");
   if (output != NULL) {
     // Expected output size for the current input: the first two dimensions
     // shrink by (filter extent - 1) and the third becomes feats_out_.
     Int3 out_dim(in.dim());
     out_dim[0] = out_dim[0] - filt_width_ + 1;
     out_dim[1] = out_dim[1] - filt_height_ + 1;
     out_dim[2] = feats_out_;
     // Compare against the size of the existing output tensor.
     if (!Int3::equal(out_dim, ((Tensor<float>*)output)->dim())) {
       // Input dimension has changed!
       // NOTE(review): the handling for this case is not visible here.
   if (output == NULL) {
     // Same size computation as above, used to allocate a fresh output.
     Int3 out_dim(in.dim());
     out_dim[0] = out_dim[0] - filt_width_ + 1;
     out_dim[1] = out_dim[1] - filt_height_ + 1;
     out_dim[2] = feats_out_;
     output = new Tensor<float>(out_dim);
     // Tail of a commented-out call; its first line is not visible here.
     //  ((Tensor<float>*)output)->dim(), local_worgroup_size);
 // Forward-propagates a Table input through the parallel stages.
 //
 // input: must report TorchDataType::TABLE_DATA, and its tableSize() must
 //        equal network_->size().
 // Throws std::runtime_error if either of those checks fails.
 //
 // NOTE(review): closing braces and the body of the per-stage loop are not
 // visible in this view; presumably table entry i is forwarded to stage i --
 // confirm against the full file.
 void Parallel::forwardProp(TorchData& input) {
   if (input.type() != TorchDataType::TABLE_DATA) {
     throw std::runtime_error("Parallel::forwardProp() - "
       "Table expected!");
   Table& in = (Table&)input;
   // One table entry is required per parallel stage.
   if (in.tableSize() != network_->size()) {
     throw std::runtime_error("Parallel::forwardProp() - ERROR: "
       "Table size does not match number of parallel stages!");
   // Visit every stage in order (loop body not visible here).
   for (uint32_t i = 0; i < network_->size(); i++) {
   initOutput();  // Init output just copies the pointers from the output
                  // of all the parallel stages and fills up a table with them
  // Validates the input and lazily allocates the buffers used by this stage:
  // the output tensor, two intermediate mean tensors, the per-pixel
  // normalization coefficients (mean_coef_) and the mean tensor (mean_).
  //
  // input: must report TorchDataType::TENSOR_DATA and have in.dim() == 3.
  // Throws std::runtime_error if either of those checks fails.
  //
  // NOTE(review): this view of the function has no closing braces and some
  // statements appear to be missing (the "Input dimension has changed!"
  // branch is empty; kernel_cpu is read without a visible copy from kernel_;
  // mean_coef_cpu is never visibly copied into mean_coef_). Confirm against
  // the full file. The visible structure is kept as-is.
  void SpatialSubtractiveNormalization::init(TorchData& input)  {
    if (input.type() != TorchDataType::TENSOR_DATA) {
      throw std::runtime_error("SpatialSubtractiveNormalization::init() - "
        "FloatTensor expected!");
    Tensor<float>& in = (Tensor<float>&)input;

    if (in.dim() != 3) {
      // The message names this class (it previously named
      // SpatialDivisiveNormalization, which misattributed the failure).
      throw std::runtime_error("SpatialSubtractiveNormalization::init() - "
        "3D input is expected!");

    if (output != NULL) {
      if (!in.isSameSizeAs(*(Tensor<float>*)output)) {
        // Input dimension has changed!
        // NOTE(review): the handling for this case is not visible here.

    if (output == NULL) {
      // Output and both intermediate mean buffers share the input's size.
      output = new Tensor<float>(in.dim(), in.size());
      mean_pass1_ = new Tensor<float>(in.dim(), in.size());
      mean_pass2_ = new Tensor<float>(in.dim(), in.size());

    if (mean_coef_ == NULL) {
      // The coefficient map is 2D, sized by the first two output dimensions.
      uint32_t mean_coeff_size[2];
      mean_coeff_size[0] = TO_TENSOR_PTR(output)->size()[0];
      mean_coeff_size[1] = TO_TENSOR_PTR(output)->size()[1];
      mean_coef_ = new Tensor<float>(2, mean_coeff_size);

      // Host-side scratch buffers; both are released at the end of this branch.
      float* mean_coef_cpu = new float[mean_coef_->nelems()];
      float* kernel_cpu = new float[kernel_->nelems()];
      bool onedim_kernel = kernel_->dim() == 1;

      // Filter an image of all 1 values to create the normalization constants
      // See norm_test.lua for proof that this works as well as:
      // https://github.com/andresy/torch/blob/master/extra/nn/SpatialSubtractiveNormalization.lua
      int32_t n_feats = TO_TENSOR_PTR(output)->size()[2];
      int32_t height = TO_TENSOR_PTR(output)->size()[1];
      int32_t width = TO_TENSOR_PTR(output)->size()[0];
      if (onedim_kernel) {
        // 1D case - The filter is separable, but we'll just do the dumb 2D
        // version since we only do this once on startup.  --> O(n * m)
        uint32_t kernel_size = kernel_->size()[0];
        int32_t filt_rad = (kernel_size - 1) / 2;
        for (int32_t v = 0; v < height; v++) {
          for (int32_t u = 0; u < width; u++) {
            // Sum of the (outer-product) kernel weights that land inside the
            // image when the kernel is centred on (u, v).
            float tmp = 0.0f;
            for (int32_t v_filt = -filt_rad; v_filt <= filt_rad; v_filt++) {
              for (int32_t u_filt = -filt_rad; u_filt <= filt_rad; u_filt++) {
                int32_t u_in = u + u_filt;
                int32_t v_in = v + v_filt;
                if (u_in >= 0 && u_in < width && v_in >= 0 && v_in < height) {
                  // Pixel is inside --> We'll effectively clamp zeros elsewhere.
                  tmp +=
                    (kernel_cpu[v_filt + filt_rad] * kernel_cpu[u_filt + filt_rad]);
            // Row-major (v * width + u) layout, scaled by the feature count.
            mean_coef_cpu[v * width + u] = tmp / n_feats;
      } else {
        // 2D case
        int32_t kernel_size_u = kernel_->size()[0];
        int32_t kernel_size_v = kernel_->size()[1];
        int32_t filt_rad_u = (kernel_size_u - 1) / 2;
        int32_t filt_rad_v = (kernel_size_v - 1) / 2;
        for (int32_t v = 0; v < height; v++) {
          for (int32_t u = 0; u < width; u++) {
            // Sum of the kernel weights that land inside the image when the
            // kernel is centred on (u, v).
            float tmp = 0.0f;
            for (int32_t v_filt = -filt_rad_v; v_filt <= filt_rad_v; v_filt++) {
              for (int32_t u_filt = -filt_rad_u; u_filt <= filt_rad_u; u_filt++) {
                int32_t u_in = u + u_filt;
                int32_t v_in = v + v_filt;
                if (u_in >= 0 && u_in < width && v_in >= 0 && v_in < height) {
                  // Pixel is inside --> We'll effectively clamp zeros elsewhere.
                  tmp +=
                    kernel_cpu[(v_filt + filt_rad_v) * kernel_size_u + (u_filt + filt_rad_u)];
            // Row-major (v * width + u) layout, scaled by the feature count.
            mean_coef_cpu[v * width + u] = tmp / n_feats;
      delete[] mean_coef_cpu;
      delete[] kernel_cpu;
    if (mean_ == NULL) {
      // mean_ has the same 2D size as the coefficient map.
      uint32_t mean_coeff_size[2];
      mean_coeff_size[0] = TO_TENSOR_PTR(output)->size()[0];
      mean_coeff_size[1] = TO_TENSOR_PTR(output)->size()[1];
      mean_ = new Tensor<float>(2, mean_coeff_size);
  // Validates a Table of tensors for joining and, when no output tensor
  // exists yet, allocates one whose join dimension is the sum of the inputs'
  // extents along that dimension.
  //
  // input: must report TorchDataType::TABLE_DATA, be non-empty, and contain
  //        only TENSOR_DATA entries. The first tensor must have more than
  //        dimension_ dimensions, and all tensors must agree on every
  //        dimension other than the join dimension.
  // Throws std::runtime_error if any of those checks fails.
  //
  // NOTE(review): this view of the function has no closing braces and the
  // two "output no longer matches" branches have no visible statements, so
  // some lines appear to be missing here. The visible structure is kept as-is.
  // NOTE(review): the error messages are prefixed "JoinTable::forwardProp()"
  // although they are thrown from init(); left unchanged -- confirm intent.
  void JoinTable::init(TorchData& input) {
    if (input.type() != TorchDataType::TABLE_DATA) {
      throw std::runtime_error("JoinTable::forwardProp() - "
        "Table expected!");
    Table& in = (Table&)input;

    if (in.tableSize() == 0) {
      throw std::runtime_error("JoinTable::forwardProp() - "
        "Empty input Table!");

    // Check that it is a table of FloatTensors
    for (uint32_t i = 0; i < in.tableSize(); i++) {
      if (in(i)->type() != TENSOR_DATA) {
        throw std::runtime_error("JoinTable::forwardProp() - "
          "Table of float tensors expected!");

    uint32_t dim = TO_TENSOR_PTR(in(0))->dim();
    if (dim <= dimension_) {
      throw std::runtime_error("JoinTable::forwardProp() - "
        "Input is smaller than join dimension!");
    // jdim is the index into size() that corresponds to dimension_.
    uint32_t jdim = dim - dimension_ - 1;  // dimension_=0 is the top dim

    // Make sure the dimensions OTHER than the join dimension are all the same
    for (uint32_t d = 0; d < dim; d++) {
      if (d != jdim) {
        // Entry 0 is the reference, so the comparison starts at entry 1.
        for (uint32_t j = 1; j < in.tableSize(); j++) {
          if (TO_TENSOR_PTR(in(j))->size()[d] != TO_TENSOR_PTR(in(0))->size()[d]) {
            throw std::runtime_error("JoinTable::forwardProp() - "
              "Size mismatch!");
        // An existing output whose non-join extent differs from the inputs.
        // NOTE(review): the handling for this case is not visible here.
        if (output != NULL && TO_TENSOR_PTR(output)->size()[d] !=
          TO_TENSOR_PTR(in(0))->size()[d]) {

    // Total extent along the join dimension. The sum must include EVERY
    // input, entry 0 as well; starting at 1 would undersize the output.
    uint32_t nelems_jdim = 0;
    for (uint32_t j = 0; j < in.tableSize(); j++) {
      nelems_jdim += TO_TENSOR_PTR(in(j))->size()[jdim];

    // An existing output whose join extent no longer matches the inputs.
    // NOTE(review): the handling for this case is not visible here.
    if (output != NULL &&
      TO_TENSOR_PTR(output)->size()[jdim] != nelems_jdim) {

    if (output == NULL) {
      // Start from the first input's size and widen only the join dimension.
      uint32_t* size = new uint32_t[dim];
      memcpy(size, TO_TENSOR_PTR(in(0))->size(), sizeof(size[0]) * dim);
      // Index with jdim (not dimension_) so this matches every other size()
      // access in this function.
      size[jdim] = nelems_jdim;
      output = new Tensor<float>(dim, size);
      // NOTE(review): no release of `size` is visible in this view; confirm
      // that it is freed (or that Tensor takes ownership) in the full file.