Пример #1
0
// this runs an entire kernel to get one value.  Clearly this is going to be pretty slow, but
// at least it's more or less compatible, and comparable, to how cutorch does it
// lgfgs expects a working implementation of this method
float THClStorage_get(THClState *state, const THClStorage *self, long index)
{
////  printf("THClStorage_get\n");
  THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds");
  THArgCheck(self->wrapper != 0, 1, "storage argument not initialized, is empty");

//  if( self->wrapper->isDeviceDirty() ) {
//    if(state->trace) cout << "wrapper->copyToHost()" << endl;
//    self->wrapper->copyToHost();
//  }
//  return self->data[index];

  const char *uniqueName = __FILE__ ":get";
  EasyCL *cl = self->cl; // cant remember if this is a good idea or not :-P
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel( uniqueName, __FILE__, getGetKernelSource(), "THClStorageGet" );
  }

  float res;
  kernel->out(1, &res);
  kernel->in(self->wrapper);
  kernel->in((int64_t)index);
  kernel->run_1d(1, 1);

  if(state->addFinish) cl->finish();
  return res;
}
Пример #2
0
// this runs an entire kernel to get one value.  Clearly this is going to be pretty slow, but
// at least it's more or less compatible, and comparable, to how cutorch does it
void THClStorage_set(THClState *state, THClStorage *self, long index, float value)
{
////  cout << "set size=" << self->size << " index=" << index << " value=" << value << endl;
  THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds");
  THArgCheck(self->wrapper != 0, 1, "storage argument not initialized, is empty");
//  if( self->wrapper->isDeviceDirty() ) { // we have to do this, since we're going to copy it all back again
//                                         // although I suppose we could set via a kernel perhaps
//                                         // either way, this function is pretty inefficient right now :-P
//    if(state->trace) cout << "wrapper->copyToHost() size " << self->size << endl;
//    self->wrapper->copyToHost();
//  }
//  self->data[index] = value;
//  if(state->trace) cout << "wrapper->copyToDevice() size " << self->size << endl;
//  self->wrapper->copyToDevice();

  const char *uniqueName = __FILE__ ":set";
  EasyCL *cl = self->cl; // cant remember if this is a good idea or not :-P
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel( uniqueName, __FILE__, getSetKernelSource(), "THClStorageSet" );
  }

  kernel->inout(self->wrapper);
  kernel->in((int64_t)index);
  kernel->in(value);
  kernel->run_1d(1, 1);

  if(state->addFinish) cl->finish();
}
Пример #3
0
TEST(testkernelstore, main) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", "");
    EXPECT_FALSE(cl->kernelExists("kernela"));
    EXPECT_FALSE(cl->kernelExists("kernelb"));

    cl->storeKernel("kernela", kernel);
    EXPECT_TRUE(cl->kernelExists("kernela"));
    EXPECT_FALSE(cl->kernelExists("kernelb"));

    EXPECT_EQ(kernel, cl->getKernel("kernela"));

    delete kernel;
    delete cl;
}
Пример #4
0
void col2im(THClState *state, THClTensor* col, const int channels,
    const int height, const int width, const int patch_h, const int patch_w, const int pad_h,
    const int pad_w, const int stride_h, const int stride_w, THClTensor* im) {
  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
  int num_kernels = channels * height * width;
  // To avoid involving atomic operations, we will launch one kernel per
  // bottom dimension, and then in the kernel add up the top dimensions.

  EasyCL *cl = im->storage->cl;
  std::string uniqueName = "SpatialConvolutionMM::col2im";
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel(uniqueName, "SpatialConvolutionMM.cl",
      SpatialConvolutionMM_getKernelTemplate(), "col2im_kernel");
  }

  THClKernels k(state, kernel);
  k.in(num_kernels);
  k.in(col);
  k.in(height);
  k.in(width);
  k.in(channels);

  k.in(patch_h);
  k.in(patch_w);
  k.in(pad_h);
  k.in(pad_w);
  k.in(stride_h);
  k.in(stride_w);

  k.in(height_col);
  k.in(width_col);
  k.out(im);

  k.run(GET_BLOCKS(state, num_kernels), getNumThreads(state));

//  col2im_kernel <<<GET_BLOCKS(num_kernels), CL_NUM_THREADS, 0, stream>>> (
//      num_kernels, data_col, height, width, channels,
//      patch_h, patch_w, pad_h, pad_w, stride_h, stride_w,
//      height_col, width_col, data_im
//  );
//  THError("Not implemented");
}
Пример #5
0
void im2col(THClState *state, THClTensor* im, const int channels,
    const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h,
    const int pad_w, const int stride_h, const int stride_w, THClTensor* col) {
  // We are going to launch channels * height_col * width_col kernels, each
  // kernel responsible for copying a single-channel grid.
  int height_col = (height + 2 * pad_h - ksize_h) / stride_h + 1;
  int width_col = (width + 2 * pad_w - ksize_w) / stride_w + 1;
  int num_kernels = channels * height_col * width_col;

  std::string uniqueName = "SpatialConvolutionMM::im2col";
  EasyCL *cl = im->storage->cl;
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel(uniqueName, "SpatialConvolutionMM.cl",
      SpatialConvolutionMM_getKernelTemplate(), "im2col_kernel");
  }

  THClKernels k(state, kernel);
  k.in(num_kernels);
  k.in(im);
  k.in(height);
  k.in(width);
  k.in(ksize_h);
  k.in(ksize_w);
  k.in(pad_h);
  k.in(pad_w);
  k.in(stride_h);
  k.in(stride_w);
  k.in(height_col);
  k.in(width_col);
  k.out(col);

  k.run(GET_BLOCKS(state, num_kernels), getNumThreads(state));

  // Launch
//  im2col_kernel <<<GET_BLOCKS(num_kernels), CL_NUM_THREADS, 0, stream>>> (
//      num_kernels, data_im, height, width, ksize_h, ksize_w,
//      pad_h, pad_w, stride_h, stride_w,
//      height_col, width_col, data_col
//  );
}
Пример #6
0
void kernelLaunch_THClTensor_reduceNoncontigDim(
  THClState *state,
  dim3 &grid,
  dim3 &block,
  int ADims,
  int BDims,
  TensorInfo<IndexType> out,
  TensorInfo<IndexType> in,
  IndexType reductionStride,
  IndexType reductionSize,
  IndexType totalSlices,
  float init,
  HasOperator2 const*modifyOp,
  HasOperator3 const*reduceOp) {

//  cl->finish();
  StatefulTimer::timeCheck("Reduce-noncontig START");
  std::string uniqueName = "THClTensor_reduceNoncontigDim_" + easycl::toString(ADims) + "_" + easycl::toString(BDims) + "_" +
    TypeParseTraits<IndexType>::name + "_" + modifyOp->operator2() + "_" + reduceOp->operator3();
  EasyCL *cl = in.wrapper->getCl();
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
    StatefulTimer::timeCheck("Apply3 1aa");
  } else {
    // launch kernel here....
    TemplatedKernel kernelBuilder(cl);

    std::set<int> dims_set;
    if(ADims >= 0) {
      dims_set.insert(ADims);
    }
    if(BDims >= 0) {
      dims_set.insert(BDims);
    }
    std::vector<int> dims;
    for( std::set<int>::iterator it = dims_set.begin(); it != dims_set.end(); it++ ) {
      dims.push_back(*it);
    }

    std::string indexType = TypeParseTraits<IndexType>::name;
    kernelBuilder
      .set("include_THClReduceApplyUtils", THClReduceApplyUtils_getKernelTemplate())
      .set("dims", dims)
      .set("dim1", ADims)
      .set("dim2", BDims)
      .set("defreduceblock", 1)
      .set("WarpSize", 32) // probably can do like 'if nvidia 32 else 64' ?
      .set("MAX_CLTORCH_DIMS", MAX_CLTORCH_DIMS)
      .set("IndexType", indexType)
      .set("modify_operation", modifyOp->operator2())
      .set("reduce_operation", reduceOp->operator3())
    ;

    kernel = kernelBuilder.buildKernel(uniqueName, "THClReduce.cl", THClReduce_getKernelSource(), "THClTensor_reduceNoncontigDim");
  }

  THClKernels k(state, kernel);
  k.out(out);
  k.in(in);
  k.in((int)reductionStride);
  k.in((int)reductionSize);
  k.in((int)totalSlices);
  k.in(init);

  k.run(grid, block);

  if(state->addFinish) cl->finish();  
  StatefulTimer::timeCheck("Reduce-noncontig END");
}
Пример #7
0
static int clnn_SpatialMaxPooling_updateOutput(lua_State *L)
{
  THClState *state = getCltorchState(L);
  THClTensor *input = (THClTensor *)luaT_checkudata(L, 2, "torch.ClTensor");
  int kW = luaT_getfieldcheckint(L, 1, "kW");
  int kH = luaT_getfieldcheckint(L, 1, "kH");
  int dW = luaT_getfieldcheckint(L, 1, "dW");
  int dH = luaT_getfieldcheckint(L, 1, "dH");
  int padW = luaT_getfieldcheckint(L, 1, "padW");
  int padH = luaT_getfieldcheckint(L, 1, "padH");
  bool ceil_mode = luaT_getfieldcheckboolean(L, 1, "ceil_mode");

  THClTensor *output = (THClTensor *)luaT_getfieldcheckudata(L, 1, "output", "torch.ClTensor");
  THClTensor *indices = (THClTensor *)luaT_getfieldcheckudata(L, 1, "indices", "torch.ClTensor");

  THAssert(THClTensor_checkGPU(state, 3, input, output, indices));
  luaL_argcheck(L, input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");

  long nInputCols, nInputRows, nInputPlane, batchSize;
  long nOutputCols, nOutputRows;

  if (input->nDimension == 3) {
    nInputCols = input->size[2];
    nInputRows = input->size[1];
    nInputPlane = input->size[0];
    batchSize = 1;
  }
  else
  {
    nInputCols = input->size[3];
    nInputRows = input->size[2];
    nInputPlane = input->size[1];
    batchSize = input->size[0];
  }

  luaL_argcheck(L, nInputCols >= kW - padW && nInputRows >= kH - padH, 2, "input image smaller than kernel size");
  luaL_argcheck(L, kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");

  if(ceil_mode) {
    nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
    nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
  }
  else {
    nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
    nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
  }

  input = THClTensor_newContiguous(state, input);
  //float* input_data = THClTensor_data(state, input);

  THClTensor_resize4d(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols);
  THClTensor_resizeAs(state, indices, output);
  
  //float* indices_data = THClTensor_data(state, indices);
 // float* output_data = THClTensor_data(state, output);

  int count = THClTensor_nElement(state, output);

  EasyCL *cl = input->storage->cl;
  std::string uniqueName = __FILE__ "MaxPoolForward";
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernelBuilder.set("forward", 1);
    kernelBuilder.set("Dtype", "float");
    kernel = kernelBuilder.buildKernel(uniqueName, __FILE__,
      SpatialMaxPooling_getKernelTemplate(), "MaxPoolForward");
  }

  THClKernels k(state, kernel);
  k.in((int)count);
  k.in(input);
  k.in((int)batchSize);
  k.in((int)nInputPlane);
  k.in((int)nInputRows);
  k.in((int)nInputCols);
  k.in((int)nOutputRows);
  k.in((int)nOutputCols);
  k.in((int)kH);
  k.in((int)kW);
  k.in((int)dH);
  k.in((int)dW);
  k.in((int)padH);
  k.in((int)padW);
  k.out(output);
  k.out(indices);

  dim3 blocks(GET_BLOCKS(state, count));
  dim3 threads(GET_CL_NUM_THREADS(state));
  k.run(blocks, threads);

//    maxpool <<<blocks, threads, 0, THClState_getCurrentStream(state)>>> (
//      input_data, output_data,
//      indices_data+nbatch*nInputPlane*nOutputCols*nOutputRows, indices_data,
//      nInputPlane, nInputRows, nInputCols, kH, kW, dH, dW);
//    THError("not implemented");

  if(input->nDimension == 3)
    THClTensor_resize3d(state, output, nInputPlane, nOutputRows, nOutputCols);

  THClTensor_free(state, input);

  return 1;
}
Пример #8
0
static int clnn_SpatialMaxPooling_updateGradInput(lua_State *L)
{
  THClState *state = getCltorchState(L);
  THClTensor *input = (THClTensor *)luaT_checkudata(L, 2, "torch.ClTensor");
  THClTensor *gradOutput = (THClTensor *)luaT_checkudata(L, 3, "torch.ClTensor");
  int kW = luaT_getfieldcheckint(L, 1, "kW");
  int kH = luaT_getfieldcheckint(L, 1, "kH");
  int dW = luaT_getfieldcheckint(L, 1, "dW");
  int dH = luaT_getfieldcheckint(L, 1, "dH");
  int padW = luaT_getfieldcheckint(L, 1, "padW");
  int padH = luaT_getfieldcheckint(L, 1, "padH");
  bool ceil_mode = luaT_getfieldcheckboolean(L, 1, "ceil_mode");

  THClTensor *gradInput = (THClTensor *)luaT_getfieldcheckudata(L, 1, "gradInput", "torch.ClTensor");
  THClTensor *indices = (THClTensor *)luaT_getfieldcheckudata(L, 1, "indices", "torch.ClTensor");

  THAssert(THClTensor_checkGPU(state, 4, input, gradOutput, indices, gradInput));

  input = THClTensor_newContiguous(state, input);
  gradOutput = THClTensor_newContiguous(state, gradOutput);

  long nInputCols, nInputRows, nInputPlane, batchSize;
  long nOutputCols, nOutputRows;

  if (input->nDimension == 3) {
    nInputCols = input->size[2];
    nInputRows = input->size[1];
    nInputPlane = input->size[0];
    batchSize = 1;
  }
  else
  {
    nInputCols = input->size[3];
    nInputRows = input->size[2];
    nInputPlane = input->size[1];
    batchSize = input->size[0];
  }

  if(ceil_mode) {
    nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
    nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
  }
  else {
    nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
    nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
  }


  gradOutput = THClTensor_newContiguous(state, gradOutput);
  THClTensor_resizeAs(state, gradInput, input);
  
  int count = THClTensor_nElement(state, input);

  EasyCL *cl = input->storage->cl;
  std::string uniqueName = __FILE__ "MaxPoolBackward";
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernelBuilder.set("backward", 1);
    kernelBuilder.set("Dtype", "float");
    kernel = kernelBuilder.buildKernel(uniqueName, __FILE__,
      SpatialMaxPooling_getKernelTemplate(), "MaxPoolBackward");
  }

  THClKernels k(state, kernel);
  k.in((int)count);
  k.in(gradOutput);
  k.in(indices);
  k.in((int)batchSize);
  k.in((int)nInputPlane);
  k.in((int)nInputRows);
  k.in((int)nInputCols);
  k.in((int)nOutputRows);
  k.in((int)nOutputCols);
  k.in((int)kH);
  k.in((int)kW);
  k.in((int)dH);
  k.in((int)dW);
  k.in((int)padH);
  k.in((int)padW);
  k.out(gradInput);

  dim3 blocks(GET_BLOCKS(state, count));
  dim3 threads(GET_CL_NUM_THREADS(state));
  k.run(blocks, threads);

//  MaxPoolBackward <<< GET_BLOCKS(count), CL_NUM_THREADS, 0, THClState_getCurrentStream(state) >>> 
//      (count,
//      THClTensor_data(state, gradOutput),
//      THClTensor_data(state, indices),
//      batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
//      kH, kW, dH, dW, padH, padW,
//      THClTensor_data(state, gradInput));

  // clean
  THClTensor_free(state, input);
  THClTensor_free(state, gradOutput);

  return 1;
}
Пример #9
0
void THNN_ClSpatialAveragePooling_updateOutput(THClState *state, THClTensor *input, THClTensor *output, int kW, int kH, int dW, int dH, int padW, int padH, bool ceil_mode, bool count_include_pad)
{
  THAssert(THClTensor_checkGPU(state, 2, input, output));
  THArgCheck(input->nDimension == 3 || input->nDimension == 4, 2, "3D or 4D (batch) tensor expected");

  long nInputCols, nInputRows, nInputPlane, batchSize;
  long nOutputCols, nOutputRows;

  if (input->nDimension == 3) {
    nInputCols = input->size[2];
    nInputRows = input->size[1];
    nInputPlane = input->size[0];
    batchSize = 1;
  }
  else
  {
    nInputCols = input->size[3];
    nInputRows = input->size[2];
    nInputPlane = input->size[1];
    batchSize = input->size[0];
  }

  THArgCheck(nInputCols >= kW - 2*padW && nInputRows >= kH - 2*padH, 2, "input image smaller than kernel size");
  THArgCheck(kW/2 >= padW && kH/2 >= padH, 2, "pad should be smaller than half of kernel size");

  if(ceil_mode) {
    nOutputCols = ceil(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
    nOutputRows = ceil(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
  }
  else {
    nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
    nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
  }
  if (padW || padH)
  {
    // ensure that the last pooling starts inside the image
    // needed to avoid problems in ceil mode
    if ((nOutputRows - 1)*dH >= nInputRows + padH)
      --nOutputRows;
    if ((nOutputCols  - 1)*dW >= nInputCols  + padW)
      --nOutputCols;
  }

  input = THClTensor_newContiguous(state, input);
  // input_data = THClTensor_data(state, input);

  THClTensor_resize4d(state, output, batchSize, nInputPlane, nOutputRows, nOutputCols);
  
  // float* output_data = THClTensor_data(state, output);

  int count = THClTensor_nElement(state, output);

  EasyCL *cl = input->storage->cl;
  const char *uniqueName = 0;
  if(count_include_pad) {
    uniqueName = __FILE__ "forward_includepad";
  } else {
    uniqueName = __FILE__ "forward_not_includepad";
  }
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernelBuilder.set("forward", 1);
    kernelBuilder.set("Dtype", "float");
    kernelBuilder.set("COUNT_INCLUDE_PAD", count_include_pad);
    kernel = kernelBuilder.buildKernel(uniqueName, __FILE__,
      getKernelTemplate(), "AvePoolForward");
  }

  THClKernels k(state, kernel);
  k.in((int)count);
  k.in(input);
  k.in((int)batchSize);
  k.in((int)nInputPlane);
  k.in((int)nInputRows);
  k.in((int)nInputCols);
  k.in((int)nOutputRows);
  k.in((int)nOutputCols);
  k.in((int)kH);
  k.in((int)kW);
  k.in((int)dH);
  k.in((int)dW);
  k.in((int)padH);
  k.in((int)padW);
  k.out(output);

  dim3 blocks(GET_BLOCKS(state, count));
  dim3 threads(GET_CL_NUM_THREADS(state));
  k.run(blocks, threads);

//    AvePoolForward<float, true>
//      <<<GET_BLOCKS(count), CL_NUM_THREADS, 0, THClState_getCurrentStream(state) >>>(
//        count, input_data,
//        batchSize, nInputPlane, nInputRows, nInputCols, nOutputRows, nOutputCols,
//        kH, kW, dH, dW, padH, padW, output_data);

  if(input->nDimension == 3)
    THClTensor_resize3d(state, output, nInputPlane, nOutputRows, nOutputCols);

  THClTensor_free(state, input);
}