// this runs an entire kernel to get one value. Clearly this is going to be pretty slow, but
// at least it's more or less compatible, and comparable, to how cutorch does it
// lbfgs expects a working implementation of this method
float THClStorage_get(THClState *state, const THClStorage *self, long index)
{
////  printf("THClStorage_get\n");
  THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds");
  THArgCheck(self->wrapper != 0, 1, "storage argument not initialized, is empty");
//  if( self->wrapper->isDeviceDirty() ) {
//    if(state->trace) cout << "wrapper->copyToHost()" << endl;
//    self->wrapper->copyToHost();
//  }
//  return self->data[index];
  const char *uniqueName = __FILE__ ":get";
  EasyCL *cl = self->cl;  // can't remember if this is a good idea or not :-P
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel(
      uniqueName, __FILE__, getGetKernelSource(), "THClStorageGet");
  }
  float res;
  kernel->out(1, &res);
  kernel->in(self->wrapper);
  kernel->in((int64_t)index);
  kernel->run_1d(1, 1);
  if(state->addFinish) cl->finish();
  return res;
}
// this runs an entire kernel to set one value. Clearly this is going to be pretty slow, but
// at least it's more or less compatible, and comparable, to how cutorch does it
void THClStorage_set(THClState *state, THClStorage *self, long index, float value)
{
////  cout << "set size=" << self->size << " index=" << index << " value=" << value << endl;
  THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds");
  THArgCheck(self->wrapper != 0, 1, "storage argument not initialized, is empty");
//  if( self->wrapper->isDeviceDirty() ) { // we have to do this, since we're going to copy it all back again
//    // although I suppose we could set via a kernel perhaps
//    // either way, this function is pretty inefficient right now :-P
//    if(state->trace) cout << "wrapper->copyToHost() size " << self->size << endl;
//    self->wrapper->copyToHost();
//  }
//  self->data[index] = value;
//  if(state->trace) cout << "wrapper->copyToDevice() size " << self->size << endl;
//  self->wrapper->copyToDevice();
  const char *uniqueName = __FILE__ ":set";
  EasyCL *cl = self->cl;  // can't remember if this is a good idea or not :-P
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel(
      uniqueName, __FILE__, getSetKernelSource(), "THClStorageSet");
  }
  kernel->inout(self->wrapper);
  kernel->in((int64_t)index);
  kernel->in(value);
  kernel->run_1d(1, 1);
  if(state->addFinish) cl->finish();
}
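Because each accessor above compiles into a one-work-item kernel launch, this style of access is only reasonable for occasional scalar reads and writes (for example an optimizer pulling back a final loss value), never for bulk element-wise work. Below is a hedged usage sketch, not part of cltorch itself; the include path and the surrounding state/storage setup are assumed to follow cltorch's conventions and are not shown here.

// Hypothetical helper for illustration only: swaps the first two elements of a
// storage through the kernel-backed accessors above. Note that this single
// call costs four separate kernel launches.
#include "THClStorage.h"  // assumed cltorch header location

float swapFirstTwo(THClState *state, THClStorage *storage) {
  float a = THClStorage_get(state, storage, 0);  // kernel launch #1
  float b = THClStorage_get(state, storage, 1);  // kernel launch #2
  THClStorage_set(state, storage, 0, b);         // kernel launch #3
  THClStorage_set(state, storage, 1, a);         // kernel launch #4
  return a;
}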
void col2im(THClState *state, THClTensor* col, const int channels,
    const int height, const int width, const int patch_h, const int patch_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    THClTensor* im) {
  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
  int num_kernels = channels * height * width;
  // To avoid involving atomic operations, we will launch one kernel per
  // bottom dimension, and then in the kernel add up the top dimensions.

  EasyCL *cl = im->storage->cl;
  std::string uniqueName = "SpatialConvolutionMM::col2im";
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel(uniqueName, "SpatialConvolutionMM.cl",
      SpatialConvolutionMM_getKernelTemplate(), "col2im_kernel");
  }

  THClKernels k(state, kernel);
  k.in(num_kernels);
  k.in(col);
  k.in(height);
  k.in(width);
  k.in(channels);
  k.in(patch_h);
  k.in(patch_w);
  k.in(pad_h);
  k.in(pad_w);
  k.in(stride_h);
  k.in(stride_w);
  k.in(height_col);
  k.in(width_col);
  k.out(im);
  k.run(GET_BLOCKS(state, num_kernels), getNumThreads(state));

  // col2im_kernel <<<GET_BLOCKS(num_kernels), CL_NUM_THREADS, 0, stream>>> (
  //   num_kernels, data_col, height, width, channels,
  //   patch_h, patch_w, pad_h, pad_w, stride_h, stride_w,
  //   height_col, width_col, data_im
  // );
  // THError("Not implemented");
}
void im2col(THClState *state, THClTensor* im, const int channels,
    const int height, const int width, const int ksize_h, const int ksize_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    THClTensor* col) {
  // We are going to launch channels * height_col * width_col kernels, each
  // kernel responsible for copying a single-channel grid.
  int height_col = (height + 2 * pad_h - ksize_h) / stride_h + 1;
  int width_col = (width + 2 * pad_w - ksize_w) / stride_w + 1;
  int num_kernels = channels * height_col * width_col;

  std::string uniqueName = "SpatialConvolutionMM::im2col";
  EasyCL *cl = im->storage->cl;
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel(uniqueName, "SpatialConvolutionMM.cl",
      SpatialConvolutionMM_getKernelTemplate(), "im2col_kernel");
  }

  THClKernels k(state, kernel);
  k.in(num_kernels);
  k.in(im);
  k.in(height);
  k.in(width);
  k.in(ksize_h);
  k.in(ksize_w);
  k.in(pad_h);
  k.in(pad_w);
  k.in(stride_h);
  k.in(stride_w);
  k.in(height_col);
  k.in(width_col);
  k.out(col);
  k.run(GET_BLOCKS(state, num_kernels), getNumThreads(state));

  // Launch
  // im2col_kernel <<<GET_BLOCKS(num_kernels), CL_NUM_THREADS, 0, stream>>> (
  //   num_kernels, data_im, height, width, ksize_h, ksize_w,
  //   pad_h, pad_w, stride_h, stride_w,
  //   height_col, width_col, data_col
  // );
}
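im2col launches one work-item per element of the column buffer's spatial grid (channels * height_col * width_col), so num_kernels is just that product. The following is a standalone sketch with no cltorch dependencies, values chosen purely for illustration, reproducing the output-geometry arithmetic used above.

#include <cstdio>

// Reproduces the im2col/col2im output-size arithmetic for one concrete case.
int main() {
  const int height = 32, width = 32;     // input plane size
  const int ksize_h = 3, ksize_w = 3;    // kernel size
  const int pad_h = 1, pad_w = 1;        // zero padding
  const int stride_h = 1, stride_w = 1;  // stride
  const int channels = 16;

  int height_col = (height + 2 * pad_h - ksize_h) / stride_h + 1;  // 32
  int width_col  = (width  + 2 * pad_w - ksize_w) / stride_w + 1;  // 32
  int num_kernels = channels * height_col * width_col;             // 16384

  printf("height_col=%d width_col=%d num_kernels=%d\n",
         height_col, width_col, num_kernels);
  return 0;
}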
TEST(testkernelstore, main) {
  if(!EasyCL::isOpenCLAvailable()) {
    cout << "opencl library not found" << endl;
    exit(-1);
  }
  cout << "found opencl library" << endl;

  EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
  CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", "");
  EXPECT_FALSE(cl->kernelExists("kernela"));
  EXPECT_FALSE(cl->kernelExists("kernelb"));
  cl->storeKernel("kernela", kernel);
  EXPECT_TRUE(cl->kernelExists("kernela"));
  EXPECT_FALSE(cl->kernelExists("kernelb"));
  EXPECT_EQ(kernel, cl->getKernel("kernela"));
  delete kernel;
  delete cl;
}
void kernelLaunch_THClTensor_reduceNoncontigDim(
    THClState *state,
    dim3 &grid, dim3 &block,
    int ADims, int BDims,
    TensorInfo<IndexType> out,
    TensorInfo<IndexType> in,
    IndexType reductionStride,
    IndexType reductionSize,
    IndexType totalSlices,
    float init,
    HasOperator2 const*modifyOp,
    HasOperator3 const*reduceOp) {
//  cl->finish();
  StatefulTimer::timeCheck("Reduce-noncontig START");
  std::string uniqueName = "THClTensor_reduceNoncontigDim_" + easycl::toString(ADims) + "_" + easycl::toString(BDims) + "_" +
      TypeParseTraits<IndexType>::name + "_" + modifyOp->operator2() + "_" + reduceOp->operator3();
  EasyCL *cl = in.wrapper->getCl();
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
    StatefulTimer::timeCheck("Apply3 1aa");
  } else {
    // build the kernel here....
    TemplatedKernel kernelBuilder(cl);

    std::set<int> dims_set;
    if(ADims >= 0) {
      dims_set.insert(ADims);
    }
    if(BDims >= 0) {
      dims_set.insert(BDims);
    }
    std::vector<int> dims;
    for( std::set<int>::iterator it = dims_set.begin(); it != dims_set.end(); it++ ) {
      dims.push_back(*it);
    }

    std::string indexType = TypeParseTraits<IndexType>::name;
    kernelBuilder
      .set("include_THClReduceApplyUtils", THClReduceApplyUtils_getKernelTemplate())
      .set("dims", dims)
      .set("dim1", ADims)
      .set("dim2", BDims)
      .set("defreduceblock", 1)
      .set("WarpSize", 32) // probably can do like 'if nvidia 32 else 64' ?
      .set("MAX_CLTORCH_DIMS", MAX_CLTORCH_DIMS)
      .set("IndexType", indexType)
      .set("modify_operation", modifyOp->operator2())
      .set("reduce_operation", reduceOp->operator3())
    ;
    kernel = kernelBuilder.buildKernel(uniqueName, "THClReduce.cl",
      THClReduce_getKernelSource(), "THClTensor_reduceNoncontigDim");
  }

  THClKernels k(state, kernel);
  k.out(out);
  k.in(in);
  k.in((int)reductionStride);
  k.in((int)reductionSize);
  k.in((int)totalSlices);
  k.in(init);
  k.run(grid, block);

  if(state->addFinish) cl->finish();
  StatefulTimer::timeCheck("Reduce-noncontig END");
}
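The build-or-reuse logic above keys the compiled kernel on a uniqueName that encodes every specialization parameter (the dims, the index type, and the modify and reduce operator strings), so each distinct combination is compiled once and then fetched from EasyCL's kernel store on later calls. Below is a minimal standalone sketch of that idiom, using a plain std::map in place of EasyCL's kernelExists/getKernel/storeKernel; the specialization strings are placeholders, not the real operator source.

#include <iostream>
#include <map>
#include <string>

// Stand-in for a compiled CLKernel; building one is the expensive step
// we want to perform at most once per specialization.
struct FakeKernel { std::string source; };

// Entries live for the process lifetime, mirroring the real kernel store.
static std::map<std::string, FakeKernel*> kernelCache;

FakeKernel *getOrBuild(const std::string &uniqueName) {
  std::map<std::string, FakeKernel*>::iterator it = kernelCache.find(uniqueName);
  if(it != kernelCache.end()) {
    return it->second;                    // cache hit: reuse the compiled kernel
  }
  FakeKernel *kernel = new FakeKernel();  // cache miss: "compile" and remember it
  kernel->source = "specialized for " + uniqueName;
  kernelCache[uniqueName] = kernel;
  return kernel;
}

int main() {
  // two calls with the same specialization share one kernel ...
  FakeKernel *a = getOrBuild("reduceNoncontigDim_1_2_uint_modifyOpA_reduceOpA");
  FakeKernel *b = getOrBuild("reduceNoncontigDim_1_2_uint_modifyOpA_reduceOpA");
  // ... while a different operator combination triggers a fresh build
  FakeKernel *c = getOrBuild("reduceNoncontigDim_1_2_uint_modifyOpB_reduceOpB");
  std::cout << (a == b) << " " << (a != c) << std::endl;  // prints "1 1"
  return 0;
}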
static int clnn_SpatialMaxPooling_updateOutput(lua_State *L)
{
  THClState *state = getCltorchState(L);
  THClTensor *input = (THClTensor *)luaT_checkudata(L, 2, "torch.ClTensor");
  int kW = luaT_getfieldcheckint(L, 1, "kW");
  int kH = luaT_getfieldcheckint(L, 1, "kH");
  int dW = luaT_getfieldcheckint(L, 1, "dW");
  int dH = luaT_getfieldcheckint(L, 1, "dH");
  int padW = luaT_getfieldcheckint(L, 1, "padW");
  int padH = luaT_getfieldcheckint(L, 1, "padH");
  bool ceil_mode = luaT_getfieldcheckboolean(L, 1, "ceil_mode");

  THClTensor *output = (THClTensor *)luaT_getfieldcheckudata(L, 1, "output", "torch.ClTensor");
  THClTensor *indices = (THClTensor *)luaT_getfieldcheckudata(L, 1, "indices", "torch.ClTensor");

  THAssert(THClTensor_checkGPU(state, 3, input, output, indices));
  luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");

  long nInputCols, nInputRows, nInputPlane, batchSize;
  long nOutputCols, nOutputRows;

  if (input->nDimension == 3) {
    nInputCols = input->size[2];
    nInputRows = input->size[1];
    nInputPlane = input->size[0];
    batchSize = 1;
  } else {
    nInputCols = input->size[3];
    nInputRows = input->size[2];
    nInputPlane = input->size[1];
    batchSize = input->size[0];
  }

  luaL_argcheck(L, nInputCols >= kW - padW && nInputRows >= kH - padH, 2, "input image smaller than kernel size");
  luaL_argcheck(L, kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");

  if(ceil_mode) {
    nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
    nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
  } else {
    nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
    nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
  }

  input = THClTensor_newContiguous(state, input);
  //float* input_data = THClTensor_data(state, input);

  THClTensor_resize4d(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols);
  THClTensor_resizeAs(state, indices, output);
  //float* indices_data = THClTensor_data(state, indices);
  // float* output_data = THClTensor_data(state, output);

  int count = THClTensor_nElement(state, output);

  EasyCL *cl = input->storage->cl;
  std::string uniqueName = __FILE__ "MaxPoolForward";
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernelBuilder.set("forward", 1);
    kernelBuilder.set("Dtype", "float");
    kernel = kernelBuilder.buildKernel(uniqueName, __FILE__,
      SpatialMaxPooling_getKernelTemplate(), "MaxPoolForward");
  }

  THClKernels k(state, kernel);
  k.in((int)count);
  k.in(input);
  k.in((int)batchSize);
  k.in((int)nInputPlane);
  k.in((int)nInputRows);
  k.in((int)nInputCols);
  k.in((int)nOutputRows);
  k.in((int)nOutputCols);
  k.in((int)kH);
  k.in((int)kW);
  k.in((int)dH);
  k.in((int)dW);
  k.in((int)padH);
  k.in((int)padW);
  k.out(output);
  k.out(indices);

  dim3 blocks(GET_BLOCKS(state, count));
  dim3 threads(GET_CL_NUM_THREADS(state));
  k.run(blocks, threads);

  // maxpool <<<blocks, threads, 0, THClState_getCurrentStream(state)>>> (
  //   input_data, output_data,
  //   indices_data+nbatch*nInputPlane*nOutputCols*nOutputRows, indices_data,
  //   nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
  // THError("not implemented");

  if(input->nDimension == 3)
    THClTensor_resize3d(state, output, nInputPlane, nOutputRows, nOutputCols);

  THClTensor_free(state, input);
  return 1;
}
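The forward and backward pooling paths share the same output-size arithmetic; ceil_mode only decides whether the division rounds up or down. A standalone sketch (no Torch dependencies, sizes chosen for illustration) that makes the difference concrete:

#include <cmath>
#include <cstdio>

// Compares floor- and ceil-mode pooling output sizes for an 8x8 input,
// 3x3 kernel, stride 2, no padding.
int main() {
  const long nInputRows = 8, nInputCols = 8;
  const int kH = 3, kW = 3, dH = 2, dW = 2, padH = 0, padW = 0;

  long floorRows = (long)floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1;  // 3
  long ceilRows  = (long)ceil (float(nInputRows - kH + 2*padH) / float(dH)) + 1;  // 4
  long floorCols = (long)floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;  // 3
  long ceilCols  = (long)ceil (float(nInputCols - kW + 2*padW) / float(dW)) + 1;  // 4

  printf("floor mode: %ldx%ld, ceil mode: %ldx%ld\n",
         floorRows, floorCols, ceilRows, ceilCols);
  return 0;
}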
static int clnn_SpatialMaxPooling_updateGradInput(lua_State *L)
{
  THClState *state = getCltorchState(L);
  THClTensor *input = (THClTensor *)luaT_checkudata(L, 2, "torch.ClTensor");
  THClTensor *gradOutput = (THClTensor *)luaT_checkudata(L, 3, "torch.ClTensor");
  int kW = luaT_getfieldcheckint(L, 1, "kW");
  int kH = luaT_getfieldcheckint(L, 1, "kH");
  int dW = luaT_getfieldcheckint(L, 1, "dW");
  int dH = luaT_getfieldcheckint(L, 1, "dH");
  int padW = luaT_getfieldcheckint(L, 1, "padW");
  int padH = luaT_getfieldcheckint(L, 1, "padH");
  bool ceil_mode = luaT_getfieldcheckboolean(L, 1, "ceil_mode");

  THClTensor *gradInput = (THClTensor *)luaT_getfieldcheckudata(L, 1, "gradInput", "torch.ClTensor");
  THClTensor *indices = (THClTensor *)luaT_getfieldcheckudata(L, 1, "indices", "torch.ClTensor");

  THAssert(THClTensor_checkGPU(state, 4, input, gradOutput, indices, gradInput));

  input = THClTensor_newContiguous(state, input);
  gradOutput = THClTensor_newContiguous(state, gradOutput);

  long nInputCols, nInputRows, nInputPlane, batchSize;
  long nOutputCols, nOutputRows;

  if (input->nDimension == 3) {
    nInputCols = input->size[2];
    nInputRows = input->size[1];
    nInputPlane = input->size[0];
    batchSize = 1;
  } else {
    nInputCols = input->size[3];
    nInputRows = input->size[2];
    nInputPlane = input->size[1];
    batchSize = input->size[0];
  }

  if(ceil_mode) {
    nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
    nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
  } else {
    nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
    nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
  }

  THClTensor_resizeAs(state, gradInput, input);

  int count = THClTensor_nElement(state, input);

  EasyCL *cl = input->storage->cl;
  std::string uniqueName = __FILE__ "MaxPoolBackward";
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernelBuilder.set("backward", 1);
    kernelBuilder.set("Dtype", "float");
    kernel = kernelBuilder.buildKernel(uniqueName, __FILE__,
      SpatialMaxPooling_getKernelTemplate(), "MaxPoolBackward");
  }

  THClKernels k(state, kernel);
  k.in((int)count);
  k.in(gradOutput);
  k.in(indices);
  k.in((int)batchSize);
  k.in((int)nInputPlane);
  k.in((int)nInputRows);
  k.in((int)nInputCols);
  k.in((int)nOutputRows);
  k.in((int)nOutputCols);
  k.in((int)kH);
  k.in((int)kW);
  k.in((int)dH);
  k.in((int)dW);
  k.in((int)padH);
  k.in((int)padW);
  k.out(gradInput);

  dim3 blocks(GET_BLOCKS(state, count));
  dim3 threads(GET_CL_NUM_THREADS(state));
  k.run(blocks, threads);

  // MaxPoolBackward <<< GET_BLOCKS(count), CL_NUM_THREADS, 0, THClState_getCurrentStream(state) >>>
  //   (count,
  //   THClTensor_data(state, gradOutput),
  //   THClTensor_data(state, indices),
  //   batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
  //   kH, kW, dH, dW, padH, padW,
  //   THClTensor_data(state, gradInput));

  // clean
  THClTensor_free(state, input);
  THClTensor_free(state, gradOutput);
  return 1;
}
void THNN_ClSpatialAveragePooling_updateOutput(THClState *state, THClTensor *input, THClTensor *output,
    int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode, bool count_include_pad)
{
  THAssert(THClTensor_checkGPU(state, 2, input, output));
  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");

  long nInputCols, nInputRows, nInputPlane, batchSize;
  long nOutputCols, nOutputRows;

  if (input->nDimension == 3) {
    nInputCols = input->size[2];
    nInputRows = input->size[1];
    nInputPlane = input->size[0];
    batchSize = 1;
  } else {
    nInputCols = input->size[3];
    nInputRows = input->size[2];
    nInputPlane = input->size[1];
    batchSize = input->size[0];
  }

  THArgCheck(nInputCols >= kW - 2*padW && nInputRows >= kH - 2*padH, 2, "input image smaller than kernel size");
  THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");

  if(ceil_mode) {
    nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
    nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
  } else {
    nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
    nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
  }
  if (padW || padH) {
    // ensure that the last pooling starts inside the image
    // needed to avoid problems in ceil mode
    if ((nOutputRows - 1)*dH >= nInputRows + padH)
      --nOutputRows;
    if ((nOutputCols - 1)*dW >= nInputCols + padW)
      --nOutputCols;
  }

  input = THClTensor_newContiguous(state, input);
  // input_data = THClTensor_data(state, input);

  THClTensor_resize4d(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols);
  // float* output_data = THClTensor_data(state, output);

  int count = THClTensor_nElement(state, output);

  EasyCL *cl = input->storage->cl;
  const char *uniqueName = 0;
  if(count_include_pad) {
    uniqueName = __FILE__ "forward_includepad";
  } else {
    uniqueName = __FILE__ "forward_not_includepad";
  }
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernelBuilder.set("forward", 1);
    kernelBuilder.set("Dtype", "float");
    kernelBuilder.set("COUNT_INCLUDE_PAD", count_include_pad);
    kernel = kernelBuilder.buildKernel(uniqueName, __FILE__,
      getKernelTemplate(), "AvePoolForward");
  }

  THClKernels k(state, kernel);
  k.in((int)count);
  k.in(input);
  k.in((int)batchSize);
  k.in((int)nInputPlane);
  k.in((int)nInputRows);
  k.in((int)nInputCols);
  k.in((int)nOutputRows);
  k.in((int)nOutputCols);
  k.in((int)kH);
  k.in((int)kW);
  k.in((int)dH);
  k.in((int)dW);
  k.in((int)padH);
  k.in((int)padW);
  k.out(output);

  dim3 blocks(GET_BLOCKS(state, count));
  dim3 threads(GET_CL_NUM_THREADS(state));
  k.run(blocks, threads);

  // AvePoolForward<float, true>
  //   <<<GET_BLOCKS(count), CL_NUM_THREADS, 0, THClState_getCurrentStream(state) >>>(
  //     count, input_data,
  //     batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
  //     kH, kW, dH, dW, padH, padW, output_data);

  if(input->nDimension == 3)
    THClTensor_resize3d(state, output, nInputPlane, nOutputRows, nOutputCols);

  THClTensor_free(state, input);
}
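When padding is combined with ceil_mode, the rounding-up can produce a final pooling window that starts entirely in the padding; the `if (padW || padH)` adjustment above drops that window. A standalone sketch with concrete numbers (chosen for illustration) where the clamp actually fires:

#include <cmath>
#include <cstdio>

// Shows the "last pooling window must start inside the image" clamp for a
// 5-row input, 2x2 kernel, stride 2, padding 1, in ceil mode.
int main() {
  const long nInputRows = 5;
  const int kH = 2, dH = 2, padH = 1;

  long nOutputRows = (long)ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1;  // 4
  printf("before clamp: %ld output rows\n", nOutputRows);

  // the 4th window would start at row 6 of the padded input, i.e. past the image
  if ((nOutputRows - 1) * dH >= nInputRows + padH)
    --nOutputRows;                                                                 // 3
  printf("after clamp:  %ld output rows\n", nOutputRows);
  return 0;
}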