void DeConvolutionLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs) { Blob &wghtBlob = blobs[0]; for (size_t ii = 0; ii < outputs.size(); ii++) { Blob &convBlob = *inputs[ii]; Blob &decnBlob = outputs[ii]; for (int n = 0; n < convBlob.num(); n++) { for (int g = 0; g < group; g++) { Mat dstMat(inpGroupCn, inpH*inpW, decnBlob.type(), decnBlob.ptr(n, g*inpGroupCn)); if (is1x1()) colMat = dstMat; Mat convMat(outGroupCn, outH*outW, convBlob.type(), convBlob.ptr(n, g*outGroupCn)); Mat wghtMat(outGroupCn, ksize, wghtBlob.type(), wghtBlob.ptr(g*outGroupCn)); gemmCPU(wghtMat, convMat, 1, colMat, 0, GEMM_1_T); col2im(dstMat); if (bias) { float *biasPtr = blobs[1].ptrf() + g*inpGroupCn; Mat biasMat(inpGroupCn, 1, CV_32F, biasPtr); gemmCPU(biasMat, biasOnesMat, 1, dstMat, 1); //TODO: gemv } } } } }
/**
 * Double-precision entry point for col2im.
 *
 * Forwards every argument, unchanged and in the same order, to whichever
 * `col2im` overload or template instantiation accepts `double` pointers.
 */
void col2im_double(const double *col, double *img,
                   int width, int height, int channels,
                   int kernel_w, int kernel_h,
                   int pad_w, int pad_h,
                   int stride_w, int stride_h)
{
    col2im(col, img,
           width, height, channels,
           kernel_w, kernel_h,
           pad_w, pad_h,
           stride_w, stride_h);
}
/**
 * Single-precision entry point for col2im.
 *
 * Forwards every argument, unchanged and in the same order, to whichever
 * `col2im` overload or template instantiation accepts `float` pointers.
 */
void col2im_float(const float *col, float *img,
                  int width, int height, int channels,
                  int kernel_w, int kernel_h,
                  int pad_w, int pad_h,
                  int stride_w, int stride_h)
{
    col2im(col, img,
           width, height, channels,
           kernel_w, kernel_h,
           pad_w, pad_h,
           stride_w, stride_h);
}
/**
 * Executes a float deconvolution for `node`.
 *
 * Node inputs 0, 1 and 2 are read as input, weight and bias; output 0 is
 * written. The whole output is zeroed first. Then, per batch item:
 *   1. cblas_sgemm multiplies the transposed weight matrix (stored row-major
 *      as c_in x (ksize*ksize*c_out)) by the input (c_in x h_in*w_in) into the
 *      scratch "buffer" attribute of the node,
 *   2. col2im scatters that buffer into the output item,
 *   3. add_bias applies the bias.
 *
 * Dimensions are indexed as [0]=batch, [1]=channels, [2]=height, [3]=width.
 *
 * NOTE(review): col2im and add_bias are defined elsewhere; their semantics are
 * assumed from their names. Input 2 (bias) is read unconditionally - confirm
 * that nodes without a bias tensor are handled by the callees.
 *
 * @param node  graph node to execute.
 * @return true on completion; false if the node's op is not a Deconvolution.
 */
bool Run(Node *node)
{
    // input
    const Tensor *input_tensor = node->GetInputTensor(0);
    float *input = (float *)get_tensor_mem(input_tensor);
    const TShape &in_shape = input_tensor->GetShape();
    const std::vector<int> &in_dims = in_shape.GetDim();

    // output
    Tensor *output_tensor = node->GetOutputTensor(0);
    float *output = (float *)get_tensor_mem(output_tensor);
    const TShape &out_shape = output_tensor->GetShape();
    const std::vector<int> &out_dims = out_shape.GetDim();

    // weight
    const Tensor *weight_tensor = node->GetInputTensor(1);
    float *weight = (float *)get_tensor_mem(weight_tensor);

    // bias
    const Tensor *bias_tensor = node->GetInputTensor(2);
    float *bias = (float *)get_tensor_mem(bias_tensor);

    // param
    Deconvolution *deconv_op = dynamic_cast<Deconvolution *>(node->GetOp());
    if (deconv_op == nullptr)
    {
        // dynamic_cast yields null on a type mismatch; fail instead of dereferencing it.
        return false;
    }
    DeconvParam *param_ = deconv_op->GetParam();
    int pad = param_->pad;
    int stride = param_->stride;
    int ksize = param_->kernel_size;
    int dilation = param_->dilation;

    // buffer: scratch area receiving the GEMM result before col2im
    float *buffer = any_cast<float *>(node->GetAttr("buffer"));

    // shape
    int batch = in_dims[0];
    int chw_in = in_dims[1] * in_dims[2] * in_dims[3];
    int c_in = in_dims[1];
    int h_in = in_dims[2];
    int w_in = in_dims[3];
    int c_out = out_dims[1];
    int h_out = out_dims[2];
    int w_out = out_dims[3];
    int chw_out = c_out * h_out * w_out;
    int hw_out = out_dims[2] * out_dims[3];
    int out_size = out_dims[0] * chw_out;

    // The output is cleared once up front, before any col2im call writes to it.
    memset(output, 0, out_size * sizeof(float));

    // GEMM dimensions: (m x k) * (k x n) -> (m x n)
    int m = ksize * ksize * c_out;
    int n = h_in * w_in;
    int k = c_in;

    for (int b = 0; b < batch; ++b)
    {
        float *inp = input + b * chw_in;
        float *out_ptr = output + b * chw_out;

        // buffer = weight^T * inp (beta = 0, so buffer is overwritten)
        cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
                    m, n, k,
                    1, weight, m,
                    inp, n,
                    0, buffer, n);

        col2im(buffer, out_ptr, c_out, h_out, w_out, ksize, stride, pad, dilation, h_in, w_in);

        add_bias(out_ptr, bias, c_out, hw_out);
    }

    return true;
}
// Theano op code // Authors: Arjun Jain, Frederic Bastien, Jan Schluter // Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu // and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom, PyGpuArrayObject *const weight, PyGpuArrayObject *const top, const size_t direction, const size_t dH = 1, const size_t dW = 1, const size_t dilH = 1, const size_t dilW = 1, const size_t padH = 0, const size_t padW = 0) { if (PyGpuArray_NDIM(bottom) != 4) { PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires bottom of 4D"); return NULL; } if (!GpuArray_IS_C_CONTIGUOUS(&bottom->ga)) { PyErr_Format(PyExc_ValueError, "GpuCorrMM requires bottom to be C-contiguous, " "but strides are: %ld %ld %ld %ld\n", PyGpuArray_STRIDES(bottom)[0], PyGpuArray_STRIDES(bottom)[1], PyGpuArray_STRIDES(bottom)[2], PyGpuArray_STRIDES(bottom)[3]); return NULL; } if (PyGpuArray_NDIM(weight) != 4) { PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires weight of 4D"); return NULL; } if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga)) { PyErr_Format(PyExc_ValueError, "GpuCorrMM requires weight to be C-contiguous, " "but strides are: %ld %ld %ld %ld\n", PyGpuArray_STRIDES(weight)[0], PyGpuArray_STRIDES(weight)[1], PyGpuArray_STRIDES(weight)[2], PyGpuArray_STRIDES(weight)[3]); return NULL; } if (PyGpuArray_NDIM(top) != 4) { PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires top of 4D"); return NULL; } if (!GpuArray_IS_C_CONTIGUOUS(&top->ga)) { PyErr_Format(PyExc_ValueError, "GpuCorrMM requires top to be C-contiguous, " "but strides are: %ld %ld %ld %ld\n", PyGpuArray_STRIDES(top)[0], PyGpuArray_STRIDES(top)[1], PyGpuArray_STRIDES(top)[2], PyGpuArray_STRIDES(top)[3]); return NULL; } // Extract some shape information for later and check shape consistency // bottom: (batchSize, nChannels, bottomHeight, bottomWidth) const size_t batchSize = PyGpuArray_DIMS(bottom)[0]; const size_t nChannels = 
PyGpuArray_DIMS(bottom)[1]; const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2]; const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3]; // weights: (nFilters, nChannels, rows, columns) const size_t nFilters = PyGpuArray_DIMS(weight)[0]; const size_t kH = PyGpuArray_DIMS(weight)[2]; const size_t kW = PyGpuArray_DIMS(weight)[3]; if (nChannels != PyGpuArray_DIMS(weight)[1]) { PyErr_SetString(PyExc_ValueError, "GpuCorrMM images and kernel must have the same stack size\n"); return NULL; } // implicit dilated filter const size_t dil_kH = (kH - 1) * dilH + 1; const size_t dil_kW = (kW - 1) * dilW + 1; // top: (batchSize, nFilters, topHeight, topWidth) const size_t topHeightNoDH = (bottomHeight + 2*padH - dil_kH); const size_t topWidthNoDW = (bottomWidth + 2*padW - dil_kW); // the above values might be negative so we need to use Python-like // flooring integer division to be compatible with get_conv_output. // note: this macro implements Python's // for negative x only #define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 
0 : 1)) : (x / y)) const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1; const size_t topWidth = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1; #undef _CONV_FLOORDIV if (batchSize != PyGpuArray_DIMS(top)[0] || nFilters != PyGpuArray_DIMS(top)[1] || topHeight != PyGpuArray_DIMS(top)[2] || topWidth != PyGpuArray_DIMS(top)[3]) { PyErr_Format(PyExc_ValueError, "GpuCorrMM shape inconsistency:\n" " bottom shape: %ld %ld %ld %ld\n" " weight shape: %ld %ld %ld %ld\n" " top shape: %ld %ld %ld %ld (expected %ld %ld %ld %ld)\n", batchSize, nChannels, bottomHeight, bottomWidth, nFilters, nChannels, kH, kW, PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1], PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3], batchSize, nFilters, topHeight, topWidth); return NULL; } int err = gpublas_setup(bottom->context->ctx); if (err != GA_NO_ERROR) { PyErr_SetString(PyExc_RuntimeError, "Can't setup blas"); return NULL; } // Create temporary columns size_t col_dim[2]; col_dim[0] = nChannels * kW * kH; col_dim[1] = topHeight * topWidth; PyGpuArrayObject* col = (PyGpuArrayObject*)pygpu_empty(2, col_dim, bottom->ga.typecode, GA_C_ORDER, bottom->context, Py_None); if (NULL == col) { PyErr_Format(PyExc_RuntimeError, "GpuCorrMM failed to allocate working memory of %ld x %ld\n", col_dim[0], col_dim[1]); return NULL; } // Define some useful variables const size_t bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode); const size_t top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode); const size_t K_ = col_dim[0]; const size_t N_ = col_dim[1]; const size_t M_ = nFilters; PyGpuArrayObject *output; if (direction == 0) { // forward pass output = top; if (batchSize == 0 || nChannels == 0 || nFilters == 0) { err = GpuArray_memset(&output->ga, 0); if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuCorrMM could not fill the output with zeros: %d", err); Py_DECREF(col); return NULL; } Py_DECREF(col); return output; } // valid 
correlation: im2col, then gemm // Iterate over batch for (size_t n = 0; n < batchSize; n++) { // First, im2col err = im2col(bottom->ga.data, n * bottom_stride, nChannels, bottomHeight, bottomWidth, kH, kW, dilH, dilW, padH, padW, dH, dW, col->ga.data); if (err != GA_NO_ERROR) { Py_DECREF(col); return NULL; } // Second, gemm switch (col->ga.typecode) { case GA_FLOAT: err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans, N_, M_, K_, 1, col->ga.data, 0, N_, weight->ga.data, 0, K_, 0, top->ga.data, n * top_stride, N_); break; case GA_DOUBLE: err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_no_trans, N_, M_, K_, 1, col->ga.data, 0, N_, weight->ga.data, 0, K_, 0, top->ga.data, n * top_stride, N_); break; case GA_HALF: err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_no_trans, N_, M_, K_, 1, col->ga.data, 0, N_, weight->ga.data, 0, K_, 0, top->ga.data, n * top_stride, N_); break; default: err = GA_UNSUPPORTED_ERROR; } if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuCorrMM forward encountered an error running gemm: %d", err); Py_DECREF(col); return NULL; } } } else if (direction == 1) { // backprop wrt. weights output = weight; if (batchSize == 0 || nChannels == 0 || nFilters == 0) { err = GpuArray_memset(&output->ga, 0); if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad wrt. weights could not fill the output with zeros: %d", err); Py_DECREF(col); return NULL; } Py_DECREF(col); return output; } // valid convolution: im2col, then gemm // Iterate over batch for (size_t n = 0; n < batchSize; n++) { // First, im2col err = im2col(bottom->ga.data, n * bottom_stride, nChannels, bottomHeight, bottomWidth, kH, kW, dilH, dilW, padH, padW, dH, dW, col->ga.data); if (err != GA_NO_ERROR) { Py_DECREF(col); return NULL; } // Second, gemm // Note that we accumulate into weight. We do so by setting beta = 0 // for the first iteration and beta = 1 for subsequent ones. (This // is faster than setting weight to all zeros before the loop.) 
switch (col->ga.typecode) { case GA_FLOAT: err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans, K_, M_, N_, 1, col->ga.data, 0, N_, top->ga.data, n * top_stride, N_, (n == 0) ? 0 : 1, weight->ga.data, 0, K_); break; case GA_DOUBLE: err = gpublas_dgemm(cb_fortran, cb_trans, cb_no_trans, K_, M_, N_, 1, col->ga.data, 0, N_, top->ga.data, n * top_stride, N_, (n == 0) ? 0 : 1, weight->ga.data, 0, K_); break; case GA_HALF: err = gpublas_hgemm(cb_fortran, cb_trans, cb_no_trans, K_, M_, N_, 1, col->ga.data, 0, N_, top->ga.data, n * top_stride, N_, (n == 0) ? 0 : 1, weight->ga.data, 0, K_); break; default: err = GA_UNSUPPORTED_ERROR; } if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad weights encountered an error running gemm: %d", err); Py_DECREF(col); return NULL; } } } else if (direction == 2) { // backprop wrt. inputs output = bottom; if (batchSize == 0 || nChannels == 0 || nFilters == 0) { err = GpuArray_memset(&output->ga, 0); if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad wrt. 
inputs could not fill the output with zeros: %d", err); Py_DECREF(col); return NULL; } Py_DECREF(col); return output; } // full convolution: gemm, then col2im // Iterate over batch for (size_t n = 0; n < batchSize; n++) { // gemm into columns switch (top->ga.typecode) { case GA_FLOAT: err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans, N_, K_, M_, 1, top->ga.data, n * top_stride, N_, weight->ga.data, 0, K_, 0, col->ga.data, 0, N_); break; case GA_DOUBLE: err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_trans, N_, K_, M_, 1, top->ga.data, n * top_stride, N_, weight->ga.data, 0, K_, 0, col->ga.data, 0, N_); break; case GA_HALF: err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_trans, N_, K_, M_, 1, top->ga.data, n * top_stride, N_, weight->ga.data, 0, K_, 0, col->ga.data, 0, N_); break; default: err = GA_UNSUPPORTED_ERROR; } if (err != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuCorrMM grad inputs encountered an error running gemm: %d", err); Py_DECREF(col); return NULL; } // col2im back to the data err = col2im(col->ga.data, nChannels, bottomHeight, bottomWidth, kH, kW, dilH, dilW, padH, padW, dH, dW, bottom->ga.data, n * bottom_stride); if (err != GA_NO_ERROR) { Py_DECREF(col); return NULL; } } } // Free temporary columns Py_DECREF(col); // Note that we don't change the refcount of the output matrix here. Output // (re)allocation and refcounting is done in BaseGpuCorrMM.c_code_helper(); // in here output is just aliased to one of bottom, weights, or top. return output; }