void ConvolutionLayer::forward(std::vector<Blob*> &inputs, std::vector<Blob> &outputs)
{
    Blob &wgtBlob = blobs[0];

    for (size_t ii = 0; ii < outputs.size(); ii++)
    {
        Blob &inpBlob = *inputs[ii];
        Blob &outBlob = outputs[ii];

        for (int n = 0; n < inpBlob.num(); n++)
        {
            for (int g = 0; g < group; g++)
            {
                // Unroll sample n's g-th input group into the member column
                // buffer colMat (ksize rows, one column per output position).
                im2col(inpBlob, n, g);

                // View this group's filters as outGroupCn x ksize and the
                // matching output slice as outGroupCn x (outH*outW).
                Mat kerMat(outGroupCn, ksize, wgtBlob.type(), wgtBlob.ptr(g*outGroupCn));
                Mat dstMat(outGroupCn, outH*outW, outBlob.type(), outBlob.ptr(n, g*outGroupCn));

                // The whole convolution for this group is now a single GEMM:
                // dstMat = kerMat * colMat.
                gemmCPU(kerMat, colMat, 1, dstMat, 0);

                if (bias)
                {
                    // Broadcast the per-filter bias across all output positions
                    // via a rank-1 update against a row vector of ones.
                    float *biasPtr = blobs[1].ptrf() + g*outGroupCn;
                    Mat biasMat(outGroupCn, 1, CV_32F, biasPtr);
                    gemmCPU(biasMat, biasOnesMat, 1, dstMat, 1); //TODO: gemv
                }
            }
        }
    }
}
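For reference, the column buffer consumed by the GEMM above has one row per (channel, kernel-offset) pair and one column per output position. Below is a minimal sketch of that unrolling, not OpenCV's actual implementation: single channel, zero padding, and unit stride are assumptions, and the names are hypothetical.

// Hypothetical single-channel im2col sketch (no padding, stride 1).
// Produces a (kH*kW) x (outH*outW) matrix: row = kernel offset,
// column = output pixel, so convolution reduces to filters * col-matrix.
void im2col_sketch(const float *img, int H, int W, int kH, int kW, float *col)
{
    const int outH = H - kH + 1, outW = W - kW + 1;
    for (int ky = 0; ky < kH; ky++)
        for (int kx = 0; kx < kW; kx++)
            for (int oy = 0; oy < outH; oy++)
                for (int ox = 0; ox < outW; ox++)
                    col[((ky*kW + kx)*outH + oy)*outW + ox] =
                        img[(oy + ky)*W + (ox + kx)];
}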
Example #2
static cv::Mat patchImage(const cv::Mat &image, int patchSize, bool reduceMean=false)
{
    // Collect every patchSize x patchSize block of the image, stepping by
    // one pixel in both directions; im2col flattens the blocks into a matrix.
    vector<int> blockSize(2, patchSize);
    vector<int> stepSize(2, 1);
    cv::Mat temp = im2col(image, blockSize, stepSize);
    if (!reduceMean)
        return temp;

    // Optionally remove each patch's mean (its DC component): reduce along
    // rows to get the per-column mean, then subtract it from every row.
    cv::Mat mean;
    cv::reduce(temp, mean, 0, cv::REDUCE_AVG);
    cv::Mat res;
    for (int i = 0; i < temp.rows; i++)
    {
        cv::Mat temp2 = temp.row(i) - mean.row(0);
        res.push_back(temp2.row(0));
    }
    return res;
}
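A hedged usage sketch for the function above; the file name, the 8x8 patch size, and the float conversion are illustrative assumptions, not part of the original:

#include <iostream>
#include <opencv2/opencv.hpp>

int main()
{
    // Hypothetical input; any single-channel float matrix works.
    cv::Mat gray = cv::imread("input.png", cv::IMREAD_GRAYSCALE);
    gray.convertTo(gray, CV_32F);

    // Flatten all 8x8 patches and remove each patch's mean.
    cv::Mat patches = patchImage(gray, 8, /*reduceMean=*/true);
    std::cout << patches.size() << std::endl;
    return 0;
}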
Example #3
// Thin type-specific wrapper forwarding to im2col for double data.
void im2col_double(const double *img, double *col, int width, int height, int channels,
                   int kernel_w, int kernel_h, int pad_w, int pad_h, int stride_w, int stride_h) {
  im2col(img, col, width, height, channels, kernel_w, kernel_h, pad_w, pad_h, stride_w, stride_h);
}
Example #4
// Thin type-specific wrapper forwarding to im2col for float data.
void im2col_float(const float *img, float *col, int width, int height, int channels,
                  int kernel_w, int kernel_h, int pad_w, int pad_h, int stride_w, int stride_h) {
  im2col(img, col, width, height, channels, kernel_w, kernel_h, pad_w, pad_h, stride_w, stride_h);
}
Example #5
// Theano op code
// Authors: Arjun Jain, Frederic Bastien, Jan Schluter
// Reference code: https://github.com/BVLC/caffe/blob/master/src/caffe/layers/conv_layer.cu
//   and https://github.com/torch/cunn/blob/master/SpatialConvolutionMM.cu
PyGpuArrayObject* corrMM(PyGpuArrayObject *const bottom,
                         PyGpuArrayObject *const weight,
                         PyGpuArrayObject *const top,
                         const size_t direction,
                         const size_t dH = 1,
                         const size_t dW = 1,
                         const size_t dilH = 1,
                         const size_t dilW = 1,
                         const size_t padH = 0,
                         const size_t padW = 0)
{
    if (PyGpuArray_NDIM(bottom) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires bottom of 4D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&bottom->ga))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM requires bottom to be C-contiguous, "
                "but strides are: %ld %ld %ld %ld\n",
                PyGpuArray_STRIDES(bottom)[0],
                PyGpuArray_STRIDES(bottom)[1],
                PyGpuArray_STRIDES(bottom)[2],
                PyGpuArray_STRIDES(bottom)[3]);
        return NULL;
    }

    if (PyGpuArray_NDIM(weight) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires weight of 4D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&weight->ga))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM requires weight to be C-contiguous, "
                "but strides are: %ld %ld %ld %ld\n",
                PyGpuArray_STRIDES(weight)[0],
                PyGpuArray_STRIDES(weight)[1],
                PyGpuArray_STRIDES(weight)[2],
                PyGpuArray_STRIDES(weight)[3]);
        return NULL;
    }

    if (PyGpuArray_NDIM(top) != 4)
    {
        PyErr_SetString(PyExc_ValueError, "GpuCorrMM requires top of 4D");
        return NULL;
    }
    if (!GpuArray_IS_C_CONTIGUOUS(&top->ga))
    {
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM requires top to be C-contiguous, "
                "but strides are: %ld %ld %ld %ld\n",
                PyGpuArray_STRIDES(top)[0],
                PyGpuArray_STRIDES(top)[1],
                PyGpuArray_STRIDES(top)[2],
                PyGpuArray_STRIDES(top)[3]);
        return NULL;
    }

    // Extract some shape information for later and check shape consistency
    // bottom: (batchSize, nChannels, bottomHeight, bottomWidth)
    const size_t batchSize = PyGpuArray_DIMS(bottom)[0];
    const size_t nChannels = PyGpuArray_DIMS(bottom)[1];
    const size_t bottomHeight = PyGpuArray_DIMS(bottom)[2];
    const size_t bottomWidth = PyGpuArray_DIMS(bottom)[3];
    // weights: (nFilters, nChannels, rows, columns)
    const size_t nFilters = PyGpuArray_DIMS(weight)[0];
    const size_t kH = PyGpuArray_DIMS(weight)[2];
    const size_t kW = PyGpuArray_DIMS(weight)[3];
    if (nChannels != PyGpuArray_DIMS(weight)[1]) {
        PyErr_SetString(PyExc_ValueError,
                "GpuCorrMM images and kernel must have the same stack size\n");
        return NULL;
    }
    // implicit dilated filter
    const size_t dil_kH = (kH - 1) * dilH + 1;
    const size_t dil_kW = (kW - 1) * dilW + 1;
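    // e.g. a 3x3 kernel with dilH == dilW == 2 spans a 5x5 window: (3-1)*2 + 1 == 5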
    // top: (batchSize, nFilters, topHeight, topWidth)
    const ssize_t topHeightNoDH = (bottomHeight + 2*padH - dil_kH);
    const ssize_t topWidthNoDW  = (bottomWidth + 2*padW - dil_kW);
    // the above values might be negative so we need to use Python-like
    // flooring integer division to be compatible with get_conv_output.
    // note: this macro implements Python's // for negative x only
#define _CONV_FLOORDIV_X(x,y) ((x < 0) ? (- ((-x) / y) - (((-x) % y) == 0 ? 0 : 1)) : (x / y))
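    // e.g. _CONV_FLOORDIV_X(-3, 2) == -2, matching Python's -3 // 2
    // (plain truncating C division would give -1)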
    const size_t topHeight = _CONV_FLOORDIV_X(topHeightNoDH, dH) + 1;
    const size_t topWidth  = _CONV_FLOORDIV_X(topWidthNoDW, dW) + 1;
#undef _CONV_FLOORDIV_X
    if (batchSize != PyGpuArray_DIMS(top)[0] ||
            nFilters != PyGpuArray_DIMS(top)[1] ||
            topHeight != PyGpuArray_DIMS(top)[2] ||
            topWidth != PyGpuArray_DIMS(top)[3]) {
        PyErr_Format(PyExc_ValueError,
                "GpuCorrMM shape inconsistency:\n"
                "  bottom shape: %ld %ld %ld %ld\n"
                "  weight shape: %ld %ld %ld %ld\n"
                "  top shape: %ld %ld %ld %ld (expected %ld %ld %ld %ld)\n",
                batchSize, nChannels, bottomHeight, bottomWidth,
                nFilters, nChannels, kH, kW,
                PyGpuArray_DIMS(top)[0], PyGpuArray_DIMS(top)[1],
                PyGpuArray_DIMS(top)[2], PyGpuArray_DIMS(top)[3],
                batchSize, nFilters, topHeight, topWidth);
        return NULL;
    }

    int err = gpublas_setup(bottom->context->ctx);
    if (err != GA_NO_ERROR) {
        PyErr_SetString(PyExc_RuntimeError, "Can't setup blas");
        return NULL;
    }

    // Create temporary columns
    size_t col_dim[2];
    col_dim[0] = nChannels * kW * kH;
    col_dim[1] = topHeight * topWidth;
    PyGpuArrayObject* col = (PyGpuArrayObject*)pygpu_empty(2, col_dim,
                                                           bottom->ga.typecode,
                                                           GA_C_ORDER,
                                                           bottom->context,
                                                           Py_None);
    if (NULL == col) {
        PyErr_Format(PyExc_RuntimeError,
                "GpuCorrMM failed to allocate working memory of %ld x %ld\n",
                col_dim[0], col_dim[1]);
        return NULL;
    }

    // Define some useful variables
    const size_t bottom_stride = PyGpuArray_STRIDES(bottom)[0] / gpuarray_get_elsize(bottom->ga.typecode);
    const size_t top_stride = PyGpuArray_STRIDES(top)[0] / gpuarray_get_elsize(top->ga.typecode);
    const size_t K_ = col_dim[0];
    const size_t N_ = col_dim[1];
    const size_t M_ = nFilters;
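    // gemm dimensions: M_ = filters, K_ = nChannels*kH*kW (weights per filter),
    // N_ = topHeight*topWidth (output positions per sample).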

    PyGpuArrayObject *output;
    if (direction == 0) {  // forward pass
        output = top;
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            err = GpuArray_memset(&output->ga, 0);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM could not fill the output with zeros: %d", err);
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // valid correlation: im2col, then gemm
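        // In row-major terms each gemm below computes
        //   top_n (M_ x N_) = weight (M_ x K_) * col (K_ x N_);
        // cb_fortran with the operands swapped expresses this row-major
        // product to the column-major BLAS.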
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im2col
            err = im2col(bottom->ga.data, n * bottom_stride,
                         nChannels, bottomHeight,
                         bottomWidth, kH, kW, dilH, dilW,
                         padH, padW, dH, dW, col->ga.data);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            switch (col->ga.typecode) {
            case GA_FLOAT:
              err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_no_trans,
                                  N_, M_, K_, 1,
                                  col->ga.data, 0, N_,
                                  weight->ga.data, 0, K_,
                                  0,
                                  top->ga.data, n * top_stride, N_);
              break;
            case GA_DOUBLE:
              err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_no_trans,
                                  N_, M_, K_, 1,
                                  col->ga.data, 0, N_,
                                  weight->ga.data, 0, K_,
                                  0,
                                  top->ga.data, n * top_stride, N_);
              break;
            case GA_HALF:
              err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_no_trans,
                                  N_, M_, K_, 1,
                                  col->ga.data, 0, N_,
                                  weight->ga.data, 0, K_,
                                  0,
                                  top->ga.data, n * top_stride, N_);
              break;
            default:
              err = GA_UNSUPPORTED_ERROR;
            }
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM forward encountered an error running gemm: %d", err);
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else if (direction == 1) {  // backprop wrt. weights
        output = weight;
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            err = GpuArray_memset(&output->ga, 0);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM grad wrt. weights could not fill the output with zeros: %d", err);
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // valid convolution: im2col, then gemm
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // First, im2col
            err = im2col(bottom->ga.data, n * bottom_stride,
                         nChannels, bottomHeight,
                         bottomWidth, kH, kW, dilH, dilW,
                         padH, padW, dH, dW, col->ga.data);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
            // Second, gemm
            // Note that we accumulate into weight. We do so by setting beta = 0
            // for the first iteration and beta = 1 for subsequent ones. (This
            // is faster than setting weight to all zeros before the loop.)
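            // In row-major terms each gemm computes
            //   weight (M_ x K_) += top_n (M_ x N_) * col^T (N_ x K_).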
            switch (col->ga.typecode) {
            case GA_FLOAT:
              err = gpublas_sgemm(cb_fortran, cb_trans, cb_no_trans,
                                  K_, M_, N_, 1,
                                  col->ga.data, 0, N_,
                                  top->ga.data, n * top_stride, N_,
                                  (n == 0) ? 0 : 1,
                                  weight->ga.data, 0, K_);
              break;
            case GA_DOUBLE:
              err = gpublas_dgemm(cb_fortran, cb_trans, cb_no_trans,
                                  K_, M_, N_, 1,
                                  col->ga.data, 0, N_,
                                  top->ga.data, n * top_stride, N_,
                                  (n == 0) ? 0 : 1,
                                  weight->ga.data, 0, K_);
              break;
            case GA_HALF:
              err = gpublas_hgemm(cb_fortran, cb_trans, cb_no_trans,
                                  K_, M_, N_, 1,
                                  col->ga.data, 0, N_,
                                  top->ga.data, n * top_stride, N_,
                                  (n == 0) ? 0 : 1,
                                  weight->ga.data, 0, K_);
              break;
            default:
                err = GA_UNSUPPORTED_ERROR;
            }
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM grad weights encountered an error running gemm: %d", err);
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    else if (direction == 2) {  // backprop wrt. inputs
        output = bottom;
        if (batchSize == 0 || nChannels == 0 || nFilters == 0) {
            err = GpuArray_memset(&output->ga, 0);
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM grad wrt. inputs could not fill the output with zeros: %d", err);
                Py_DECREF(col);
                return NULL;
            }
            Py_DECREF(col);
            return output;
        }
        // full convolution: gemm, then col2im
        // Iterate over batch
        for (size_t n = 0; n < batchSize; n++) {
            // gemm into columns
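            // In row-major terms: col (K_ x N_) = weight^T (K_ x M_) * top_n (M_ x N_);
            // col2im below then scatters the columns back into bottom.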
            switch (top->ga.typecode) {
            case GA_FLOAT:
              err = gpublas_sgemm(cb_fortran, cb_no_trans, cb_trans,
                                  N_, K_, M_, 1,
                                  top->ga.data, n * top_stride, N_,
                                  weight->ga.data, 0, K_,
                                  0,
                                  col->ga.data, 0, N_);
              break;
            case GA_DOUBLE:
              err = gpublas_dgemm(cb_fortran, cb_no_trans, cb_trans,
                                  N_, K_, M_, 1,
                                  top->ga.data, n * top_stride, N_,
                                  weight->ga.data, 0, K_,
                                  0,
                                  col->ga.data, 0, N_);
              break;
            case GA_HALF:
              err = gpublas_hgemm(cb_fortran, cb_no_trans, cb_trans,
                                  N_, K_, M_, 1,
                                  top->ga.data, n * top_stride, N_,
                                  weight->ga.data, 0, K_,
                                  0,
                                  col->ga.data, 0, N_);
              break;
            default:
              err = GA_UNSUPPORTED_ERROR;
            }
            if (err != GA_NO_ERROR) {
                PyErr_Format(PyExc_RuntimeError,
                             "GpuCorrMM grad inputs encountered an error running gemm: %d", err);
                Py_DECREF(col);
                return NULL;
            }
            // col2im back to the data
            err = col2im(col->ga.data, nChannels, bottomHeight, bottomWidth,
                         kH, kW, dilH, dilW, padH, padW,
                         dH, dW, bottom->ga.data, n * bottom_stride);
            if (err != GA_NO_ERROR) {
                Py_DECREF(col);
                return NULL;
            }
        }
    }
    // Free temporary columns
    Py_DECREF(col);

    // Note that we don't change the refcount of the output matrix here. Output
    // (re)allocation and refcounting is done in BaseGpuCorrMM.c_code_helper();
    // in here output is just aliased to one of bottom, weights, or top.
    return output;
}
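The output-shape arithmetic in corrMM (the `_CONV_FLOORDIV_X` section) can be sanity-checked on the host with a small standalone program. This is an illustrative sketch, not part of the original op; the name `convOutSize` and the `long long` types are assumptions:

#include <cstdio>

// Illustrative restatement of the corrMM output-size formula:
// out = floor((in + 2*pad - ((k - 1)*dil + 1)) / stride) + 1, using
// Python-style flooring so a negative numerator rounds toward -inf.
static long long convOutSize(long long in, long long k, long long stride,
                             long long pad, long long dil)
{
    const long long dilK = (k - 1) * dil + 1;
    const long long num = in + 2 * pad - dilK;
    const long long q = (num < 0) ? -((-num + stride - 1) / stride)
                                  : num / stride;
    return q + 1;
}

int main()
{
    // e.g. a 32x32 input, 3x3 kernel, stride 2, pad 1, no dilation -> 16x16
    std::printf("%lld\n", convOutSize(32, 3, 2, 1, 1)); // prints 16
    return 0;
}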