Example #1
0
TEST( SLOW_testintwrapper_huge, testreadwrite ) {

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test_stress");
    const int N = 1000000;
    int *in = new int[N];
    for( int i = 0; i < N; i++ ) {
        in[i] = i * 3;
    }
    int *out = new int[N];
    CLWrapper *inwrapper = cl->wrap(N, in);
    CLWrapper *outwrapper = cl->wrap(N, out);
    inwrapper->copyToDevice();
    outwrapper->createOnDevice();
    kernel->input( inwrapper );
    kernel->output( outwrapper );
    int globalSize = N;
    int workgroupsize = cl->getMaxWorkgroupSize();
    globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize;
    cout << "globalsize: " << globalSize << " workgroupsize " << workgroupsize << endl;
    kernel->run_1d( globalSize, workgroupsize );
    outwrapper->copyToHost();
    for( int i = 0; i < N; i++ ) {
       if( out[i] != 689514 ) {
           cout << "out[" << i << "] != 689514: " << out[i] << endl;
           exit(-1);
       }
    }

    delete outwrapper;
    delete inwrapper;
    delete kernel;
    delete cl;
}
Example #2
0
VIRTUAL void Adagrad::updateWeights(CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper,
        AdagradState *trainerState) {

    int numWeights = trainerState->numWeights;
    float *working = new float[ numWeights ];
    CLWrapper *workingWrapper = cl->wrap(numWeights, working);
    workingWrapper->createOnDevice();

    CLMathWrapper clWeights(weightsWrapper);
    CLMathWrapper clGradWeights(gradWeightsWrapper);
    CLMathWrapper clSumSquares(trainerState->sumSquaresWrapper);
    CLMathWrapper clWorking(workingWrapper);

    // following all happens on gpu, via clmathwrapper:
    clWorking = clGradWeights;
    clWorking.squared();
    clSumSquares += clWorking;

    clWorking = clSumSquares;
    clWorking.sqrt();
    clWorking.inv();
    clWorking *= clGradWeights;
    clWorking *= - learningRate;
    clWeights += clWorking;

    delete workingWrapper;
    delete[] working;
}
Example #3
0
void CopyBuffer::copy( EasyCL *cl, CLWrapper *sourceWrapper, int *target ) {
    // first we will copy it to another buffer, so we can copy it out
    int bufferSize = sourceWrapper->size();
//    float *copiedBuffer = new float[ bufferSize ];
    CLWrapper *targetWrapper = cl->wrap( bufferSize, target );
    targetWrapper->createOnDevice();

    // now copy it, via a kernel
    const string kernelSource = "\n"
        "kernel void copy( int N, global int const *source, global int *dest ) {\n"
          "  #define globalId ( get_global_id(0) )\n"
          "  if( (int)globalId < N ) {\n"
          "      dest[globalId] = source[globalId];\n"
          "  }\n"
       " }\n";
    CLKernel *kernel = cl->buildKernelFromString( kernelSource, "copy", "" );
    kernel->in( bufferSize )->in( sourceWrapper )->out( targetWrapper );
    int workgroupSize = 32;
    int numWorkgroups = ( bufferSize + workgroupSize - 1 ) / workgroupSize;
    kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize );
    cl->finish();
    targetWrapper->copyToHost();

    delete targetWrapper;
    delete kernel;
//    delete[] copiedBuffer;
}
Example #4
0
THClStorage* THClStorage_newWithSize(THClState *state, int device, long size)
{
  THArgCheck(size >= 0, 2, "invalid size");

  if(size > 0)
  {
    StatefulTimer::timeCheck("THClStorage_newWithSize START");
    THClStorage *storage = (THClStorage*)THAlloc(sizeof(THClStorage));
    float *data = new float[size];
    storage->device = device;
    storage->cl = THClState_getClv2(state, storage->device);
    CLWrapper *wrapper = storage->cl->wrap( size, data );
    if(state->trace) cout << "new wrapper, size " << size << endl;
    if(state->trace) cout << "wrapper->createOnDevice()" << endl;
    wrapper->createOnDevice();
    storage->data = data;
    storage->wrapper = wrapper;

    storage->size = size;
    storage->refcount = 1;
    storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM;
    StatefulTimer::timeCheck("THClStorage_newWithSize END");
    return storage;
  }
  else
  {
    return THClStorage_newv2(state, device);
  }
}
Example #5
0
void measurePerf(int instance, int batchSize, LayerDimensions dim) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();

    int inputNumElements = dim.inputCubeSize * batchSize;
    int errorsSize = dim.outputCubeSize * batchSize;
    int weightsSize = dim.filtersSize;
    int errorsForUpstreamSize = dim.inputCubeSize * batchSize;
    float *input = new float[inputNumElements];
    float *errors = new float[errorsSize];
    float *weights = new float[weightsSize];

    WeightRandomizer::randomize(input, inputNumElements, -0.1f, 0.1f);
    WeightRandomizer::randomize(errors, errorsSize, -0.1f, 0.1f);
    WeightRandomizer::randomize(weights, weightsSize, -0.1f, 0.1f);

    float *errorsForUpstream = new float[errorsForUpstreamSize];
    CLWrapper *inputWrapper = cl->wrap(inputNumElements, input);
    CLWrapper *errorsWrapper = cl->wrap(errorsSize, errors);
    CLWrapper *weightsWrapper = cl->wrap(weightsSize, weights);
    CLWrapper *errorsForUpstreamWrapper = cl->wrap(errorsForUpstreamSize, errorsForUpstream);
    inputWrapper->copyToDevice();
    errorsWrapper->copyToDevice();
    weightsWrapper->copyToDevice();
    errorsForUpstreamWrapper->createOnDevice();

    StatefulTimer::timeCheck("after init");
    Backward *backwardImpl = Backward::instanceSpecific(instance, cl, dim);
    for(int it = 0; it < 40; it++) {
        backwardImpl->backward(batchSize, 
            inputWrapper, errorsWrapper, weightsWrapper,
            errorsForUpstreamWrapper);
    }
    StatefulTimer::timeCheck("after backprop");
    StatefulTimer::dump(true);

    delete errorsForUpstreamWrapper;
    delete weightsWrapper;
    delete inputWrapper;
    delete errorsWrapper;

    delete[] errors;
    delete[] weights;
    delete[] input;
    delete[] errorsForUpstream;

    delete backwardImpl;
    delete cl;
}
Example #6
0
TEST( testMemset, basic ) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();

    CLKernel *kMemset = 0;
    // [[[cog
    // import stringify
    // stringify.write_kernel2( "kMemset", "cl/memset.cl", "cl_memset", '""' )
    // ]]]
    // generated using cog, from cl/memset.cl:
    const char * kMemsetSource =  
    "// Copyright Hugh Perkins 2015 hughperkins at gmail\n"
    "//\n"
    "// This Source Code Form is subject to the terms of the Mozilla Public License,\n"
    "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n"
    "// obtain one at http://mozilla.org/MPL/2.0/.\n"
    "\n"
    "kernel void cl_memset(global float *target, const float value, const int N) {\n"
    "    #define globalId get_global_id(0)\n"
    "    if ((int)globalId < N) {\n"
    "        target[globalId] = value;\n"
    "    }\n"
    "}\n"
    "\n"
    "";
    kMemset = cl->buildKernelFromString(kMemsetSource, "cl_memset", "", "cl/memset.cl");
    // [[[end]]]

    int N = 10000;
    float *myArray = new float[N];
    CLWrapper *myArrayWrapper = cl->wrap( N, myArray );
    myArrayWrapper->createOnDevice();
    kMemset->out( myArrayWrapper )->in( 99.0f )->in( N );
    int workgroupSize = 64;
    kMemset->run_1d( ( N + workgroupSize - 1 ) / workgroupSize * workgroupSize, workgroupSize );
    cl->finish();
    myArrayWrapper->copyToHost();
    for( int i = 0; i < 10; i++ ) {
//        cout << "myArray[" << i << "]=" << myArray[i] << endl;
    }
    for( int i = 0; i < N; i++ ) {
        EXPECT_EQ( 99.0f, myArray[i] );
    }

    delete kMemset;

    delete cl;
}
Example #7
0
TEST(testClBlas, basic) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();

    float A[] = {1, 3,
                 2, 7,
                 9, 5};
    float B[] = {3,
                 -1};

    float C[3];
    ClBlasInstance clblasInstance;
    CLWrapper *AWrap = cl->wrap(6, A);
    CLWrapper *BWrap = cl->wrap(2, B);
    CLWrapper *CWrap = cl->wrap(3, C);
    AWrap->copyToDevice();
    BWrap->copyToDevice();
    CWrap->createOnDevice();
    ClBlasHelper::Gemm(
        cl,
        clblasRowMajor,
        clblasNoTrans, clblasNoTrans,
        3, 2, 1,
        1,
        AWrap, 0,
        BWrap, 0,
        0,
        CWrap, 0
    );
    cl->finish();
    CWrap->copyToHost();
    EXPECT_EQ(0, C[0]);
    EXPECT_EQ(-1, C[1]);
    EXPECT_EQ(22, C[2]);

    cl->finish();

    delete CWrap;
    delete BWrap;
    delete AWrap;

    cl->finish();

    delete cl;
    clblasTeardown();
}
Example #8
0
VIRTUAL void Adadelta::updateWeights( CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper,
        AdadeltaState *trainerState ) {
    // need to calculate
    // sumGradSquared = decay * sumGradSquared + (1 - decay ) * grad.square()
    // update = - sumUpdateSquared.sqrt() / sumGradSquared.sqrt() * grad
    // sumUpdateSquared = decay * sumUpdateSquared + ( 1 - decay ) * update.squared()
    // weights += update

    int numWeights = trainerState->numWeights;
    float *working = new float[ numWeights ];
    CLWrapper *workingWrapper = cl->wrap( numWeights, working );
    workingWrapper->createOnDevice();

    CLMathWrapper clWeights( weightsWrapper );
    CLMathWrapper clGradWeights( gradWeightsWrapper );
    CLMathWrapper clSumGradSquared( trainerState->sumGradSquaredWrapper );
    CLMathWrapper clSumUpdateSquared( trainerState->sumUpdateSquaredWrapper );
    CLMathWrapper clWorking( workingWrapper );

    // following all happens on gpu, via clmathwrapper:
    clWorking = clGradWeights;
    clWorking.squared();
    clWorking *= ( 1 - decay );
    clSumGradSquared *= decay;
    clSumGradSquared += clWorking;

    clWorking = clSumGradSquared;
    clWorking.inv();
    clWorking *= clSumUpdateSquared;
    clWorking.sqrt();
    clWorking *= clGradWeights;
    clWorking *= - 1;

    clWeights += clWorking;

    clSumUpdateSquared *= decay;
    clWorking.squared();
    clWorking *= ( 1 - decay );
    clSumUpdateSquared += clWorking;

    delete workingWrapper;
    delete[] working;
}
Example #9
0
float THClBlas_dot(THClState *state, long n, 
    CLWrapper *xwrapper, long xoffset, long incx, 
    CLWrapper *ywrapper, long yoffset, long incy)
{
  if(n == 1)
  {
    incx = 1;
    incy = 1;
  }

  if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
  {
    int i_n = (int)n;
    int i_incx = (int)incx;
    int i_incy = (int)incy;
    float result;

    cl_int err;

    err = clblasSetup();
    if (err != CL_SUCCESS) {
        THError("clblasSetup() failed with %d", err);
    }
    
    CLWrapper *resultWrapper = state->cl->wrap( 1, &result );
    float *scratch = new float[i_n];
    CLWrapper *scratchWrapper = state->cl->wrap(i_n, scratch);
    scratchWrapper->createOnDevice();
    resultWrapper->createOnDevice();

    cl_event event = NULL;
    err = clblasSdot( i_n, resultWrapper->getBuffer(), 0, 
          xwrapper->getBuffer(), xoffset, i_incx, 
          ywrapper->getBuffer(), yoffset, i_incy, 
          scratchWrapper->getBuffer(),
          1, state->cl->queue, 0, NULL, &event);
//    THCublasCheck(cublasSdot(*state->blasState->current_handle, i_n, x, i_incx, y, i_incy, &result));
//    ClDeviceSynchronize();
    if (err != CL_SUCCESS) {
        THError("clblasSdot() failed with %d", err);
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);
    }
    resultWrapper->copyToHost();

    /* Finalize work with clblas. */
    clblasTeardown();


    delete resultWrapper;
    delete scratchWrapper;
    delete[] scratch;
    return result;
  }
  THError("Cublas_dot only supports n, incx and incy "
          "upto signed integer limits: %d", INT_MAX);
    THError("Not implemented");
  return -1;
}
Example #10
0
void THClBlas_gemm2(THClState *state, char orderchar, char transa, char transb, long m, long n, long k, float alpha, 
  THClTensor *a, long lda, THClTensor *b, long ldb, float beta, 
  THClTensor *c, long ldc)
{
  StatefulTimer::timeCheck("THClBlas_gemm START");
  CLWrapper *aWrapper = THClTensor_wrapper(state, a);
  CLWrapper *bWrapper = THClTensor_wrapper(state, b);
  CLWrapper *cWrapper = THClTensor_wrapper(state, c);
  long offseta = THClTensor_storageOffset(state, a);
  long offsetb = THClTensor_storageOffset(state, b);
  long offsetc = THClTensor_storageOffset(state, c);

//  adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc);
  clblasTranspose opa = convertTransToClblasOperation(transa);
  clblasTranspose opb = convertTransToClblasOperation(transb);

  clblasOrder order = orderchar == 'c' ? clblasColumnMajor : clblasRowMajor;

  if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX)  && (ldb <= INT_MAX) && (ldc <= INT_MAX) )
  {
    int i_m = (int)m;
    int i_n = (int)n;
    int i_k = (int)k;
    int i_lda = (int)lda;
    int i_ldb = (int)ldb;
    int i_ldc = (int)ldc;

    cl_int err;

    if( !aWrapper->isOnDevice() ) {
      aWrapper->createOnDevice();
    }
    if( !bWrapper->isOnDevice() ) {
      bWrapper->createOnDevice();
    }
    if( !cWrapper->isOnDevice() ) {
      cWrapper->createOnDevice();
    }

    EasyCL *cl = cWrapper->getCl();
    cl_event *event = 0;
    if(state->addFinish) {
      event = new cl_event();
    }
    err = clblasSgemm(order, opa, opb, i_m, i_n, i_k,
                         alpha, aWrapper->getBuffer(), offseta, i_lda,
                         bWrapper->getBuffer(), offsetb, i_ldb, beta,
                         cWrapper->getBuffer(), offsetc, i_ldc,
                         1, cl->queue, 0, NULL, event);
    if (err != CL_SUCCESS) {
        THError("clblasSgemm() failed with %d", err);
    }
    else {
      if(state->addFinish) {
        err = clWaitForEvents(1, event);
        if (err != CL_SUCCESS) {
//              throw runtime_error("clblasSger() failed with " + easycl::toString(err));
          THError("clblasSger: wait for event failed with %d", err);
        }
        clReleaseEvent(*event);
        delete event;
      }
    }
    cWrapper->markDeviceDirty();

    StatefulTimer::timeCheck("THClBlas_gemm END");
    return;
  }
  THError("Clblas_gemm only supports m, n, k, lda, ldb, ldc"
          "with the bound [val] <= %d", INT_MAX);
}
Example #11
0
void THClBlas_ger(THClState *state, long m, long n, float alpha, 
    THClTensor *x, long incx,
    THClTensor *y, long incy,
    THClTensor *a, long lda)
{
  StatefulTimer::timeCheck("THClBlas_ger START");

  if(n == 1)
    lda = m;

  if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX)  && (incx <= INT_MAX) && (incy <= INT_MAX) )
    {
      int i_m = (int)m;
      int i_n = (int)n;
      int i_lda = (int)lda;
      int i_incx = (int)incx;
      int i_incy = (int)incy;

      cl_int err;

      CLWrapper *xwrap = THClTensor_wrapper(state, x);
      CLWrapper *ywrap = THClTensor_wrapper(state, y);
      CLWrapper *awrap = THClTensor_wrapper(state, a);
      long x_offset = THClTensor_storageOffset(state, x);
      long y_offset = THClTensor_storageOffset(state, y);
      long a_offset = THClTensor_storageOffset(state, a);

      if(!awrap->isOnDevice()) {
        awrap->createOnDevice();
      }

      EasyCL *cl = ywrap->getCl();
      cl_event *event = 0;
      if(state->addFinish) {
        event = new cl_event();
      }
      err = clblasSger(clblasColumnMajor, i_m, i_n, alpha,
                       xwrap->getBuffer(), x_offset, i_incx,
                       ywrap->getBuffer(), y_offset, i_incy,
                       awrap->getBuffer(), a_offset, i_lda,
                       1, (cl->queue), 0, NULL, event);
      if (err != CL_SUCCESS) {
//        throw runtime_error("clblasSger() failed with " + easycl::toString(err));
        THError("clblasSger() failed with %d", err);
      }
      else {
        if(state->addFinish) {
          err = clWaitForEvents(1, event);
          if (err != CL_SUCCESS) {
//              throw runtime_error("clblasSger() failed with " + easycl::toString(err));
            THError("clblasSger: wait for event failed with %d", err);
          }
          clReleaseEvent(*event);
          delete event;
        }
      }
      awrap->markDeviceDirty();

      StatefulTimer::timeCheck("THClBlas_ger END");
      return;
    }
  THError("Cublas_ger only supports m, n, lda, incx, incy"
          "with the bound [val] <= %d", INT_MAX);
}
Example #12
0
float THClBlas_dot(THClState *state, long n, 
    CLWrapper *xwrapper, long xoffset, long incx, 
    CLWrapper *ywrapper, long yoffset, long incy)
{
  StatefulTimer::timeCheck("THClBlas_dot START");
  if(n == 1)
  {
    incx = 1;
    incy = 1;
  }

  if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
  {
    int i_n = (int)n;
    int i_incx = (int)incx;
    int i_incy = (int)incy;
    float result;

    cl_int err;

    EasyCL *cl = ywrapper->getCl();
    CLWrapper *resultWrapper = cl->wrap( 1, &result );
    float *scratch = new float[i_n];
    CLWrapper *scratchWrapper = cl->wrap(i_n, scratch);
    scratchWrapper->createOnDevice();
    resultWrapper->createOnDevice();

    cl_event *event = 0;
//    if(state->addFinish) {
      event = new cl_event();
//    }
    err = clblasSdot( i_n, resultWrapper->getBuffer(), 0, 
          xwrapper->getBuffer(), xoffset, i_incx, 
          ywrapper->getBuffer(), yoffset, i_incy, 
          scratchWrapper->getBuffer(),
          1, cl->queue, 0, NULL, event);
    if (err != CL_SUCCESS) {
        THError("clblasSdot() failed with %d", err);
    }
    else {
//      if(state->addFinish) {
        err = clWaitForEvents(1, event);
        if (err != CL_SUCCESS) {
//              throw runtime_error("clblasSger() failed with " + easycl::toString(err));
          THError("clblasSger: wait for event failed with %d", err);
        }
        clReleaseEvent(*event);
        delete event;
//      }
    }
    // TODO: NOT copy to host here, use pointtensor
    resultWrapper->copyToHost();
    //resultWrapper->markDeviceDirty();

    delete resultWrapper;
    delete scratchWrapper;
    delete[] scratch;
    StatefulTimer::timeCheck("THClBlas_dot END");
    return result;
  }
  THError("Cublas_dot only supports n, incx and incy "
          "upto signed integer limits: %d", INT_MAX);
    THError("Not implemented");
  return -1;
}