Example #1
TEST( testCopyBlock, basic ) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();

    float a[] = { 1,2,3,4,
                  5,6,7,8,
                  9,10,11,12 };
    float b[10];
    memset(b, 0, sizeof(float)*10);

    CLKernel *kernel = makeBasicKernel( cl );
    // the three int args appear to pack (row << 10) | col pairs:
    // source dimensions 3x4, block offset (0,1), block size 2x3
    kernel->in( 12, a )->out( 6, b )->in( (3 << 10) | 4 )->in( (0 << 10) | 1 )->in( (2 << 10) | 3 );
    kernel->localFloats( 2 * 3 );
    kernel->run_1d( 12, 4 );
//    kernel->run_1d(12,12);
    cl->finish();
    float expected[] = { 2,3,4,
                         6,7,8 }; 
    for( int i = 0; i < 2; i++ ) {
        for( int j = 0; j < 3; j++ ) {
            cout << b[i*3+j] << " ";
            EXPECT_EQ( expected[i*3+j], b[i*3+j] );
        }
        cout << endl;
    }
    cout << endl;
    for( int i = 6; i < 10; i++ ) {
        cout << b[i] << " ";
        EXPECT_EQ( 0, b[i] );
    }
    cout << endl;
    cout << endl;

    // second run: same source and block size, block offset (1,0) this time
    kernel->in( 12, a )->out( 6, b )->in( (3 << 10) | 4 )->in( (1 << 10) | 0 )->in( (2 << 10) | 3 );
    kernel->localFloats( 2 * 3 );
    kernel->run_1d( 12, 4 );
    cl->finish();
    float expected2[] = { 5,6,7,
                         9,10,11 }; 
    for( int i = 0; i < 2; i++ ) {
        for( int j = 0; j < 3; j++ ) {
            cout << b[i*3+j] << " ";
            EXPECT_EQ( expected2[i*3+j], b[i*3+j] );
        }
        cout << endl;
    }
    cout << endl;
    for( int i = 6; i < 10; i++ ) {
        cout << b[i] << " ";
        EXPECT_EQ( 0, b[i] );
    }
    cout << endl;
    cout << endl;

    delete kernel;
    delete cl;
}
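The packed integer arguments above appear to encode a (row, column) pair as (row << 10) | col. A hypothetical helper, not part of the kernel under test, to unpack such a value:

// unpack a value packed as (row << 10) | col, as used by the test above
inline void unpackPos( int packed, int *row, int *col ) {
    *row = packed >> 10;               // high bits hold the row
    *col = packed & ( (1 << 10) - 1 ); // low 10 bits hold the column
}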
Example #2
TEST(SLOW_testlocal, selfdot_3levels_withoutscratch) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "selfdot_ints_withoutscratch");
    int workgroupSize = min(512, cl->getMaxWorkgroupSize());
    const int numWorkgroups = workgroupSize;
    const int level3size = numWorkgroups / 4;
    const int N = workgroupSize * numWorkgroups * level3size;
    cout << "numworkgroups " << numWorkgroups << " workgroupsize " << workgroupSize << " N " << N << endl;
    int *myarray = new int[N];
    for(int i = 0; i < N; i++) {
        myarray[i] = ((i + 7) * 3) % 5;
    }

//    Timer timer;

    CLWrapper *a1wrapper = cl->wrap(N, myarray);
    a1wrapper->copyToDevice();
//    timer.timeCheck("copied array to device");
    int *second = new int[N];
    CLWrapper *secondwrapper = cl->wrap(N, second);
    int *a2 = new int[numWorkgroups*level3size];
    CLWrapper *a2wrapper = cl->wrap(numWorkgroups * level3size, a2);
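    // first pass: each workgroup reduces workgroupSize values, leaving numWorkgroups * level3size partial results in a2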
    kernel->in(a1wrapper);
    kernel->out(secondwrapper);
    kernel->out(a2wrapper);
    kernel->run_1d(N, workgroupSize);
    cl->finish();

    int *a3 = new int[numWorkgroups];
    CLWrapper *a3wrapper = cl->wrap(level3size, a3);
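    // second pass: reduce the workgroupSize * level3size partials in a2 down to level3size values in a3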
    kernel->in(a2wrapper);
    kernel->out(secondwrapper);
    kernel->out(a3wrapper);
    kernel->run_1d(workgroupSize * level3size, workgroupSize);
    cl->finish();

    int finalSum;
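    // third pass: a single workgroup reduces the level3size partials in a3 to one final sum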
    kernel->in(a3wrapper);
    kernel->out(secondwrapper);
    kernel->out(1, &finalSum);
    kernel->run_1d(level3size, level3size);
//    timer.timeCheck("finished 3-level reduce");

    EXPECT_EQ(-1306309159, finalSum);

    delete a1wrapper;
    delete a2wrapper;
    delete a3wrapper;
    delete secondwrapper;
    delete[] a3;
    delete[] second;
    delete[] a2;
    delete[] myarray;
    delete kernel;
    delete cl;
}
Example #3
TEST(testqueues, defaultqueue) {
    cout << "start" << endl;
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();

    CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", "");

    float in[5];
    float out[5];
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3;
        out[i] = 0;
    }
    kernel->input(5, in);
    kernel->output(5, out);
    size_t global = 5;
    size_t local = 5;
    kernel->run(1, &global, &local);
    cl->finish();
    assertEquals(out[0], 7);
    assertEquals(out[1], 10);
    assertEquals(out[2], 13);
    assertEquals(out[3], 16);
    assertEquals(out[4], 19);

    for(int i = 0; i < 5; i++) {
        in[i] = i * 3;
        out[i] = 0;
    }

    cout << "cl->queue " << (void *)cl->queue << endl;
    cout << "*cl->queue " << (void *)*cl->queue << endl;
    cout << "cl->default_queue->queue " << (void *)cl->default_queue->queue << endl;
    kernel->input(5, in);
    kernel->output(5, out);
    kernel->run(cl->default_queue, 1, &global, &local);
    cl->finish();
    assertEquals(out[0], 7);
    assertEquals(out[1], 10);
    assertEquals(out[2], 13);
    assertEquals(out[3], 16);
    assertEquals(out[4], 19);

    cout << "tests completed ok" << endl;
}
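getKernel() is not shown in this excerpt; a minimal kernel consistent with the expected outputs above (out[i] == in[i] + 7) would look roughly like this sketch:

// sketch only -- the real source comes from getKernel() in the test file
kernel void test( global float *in, global float *out ) {
    const int gid = get_global_id(0);
    out[gid] = in[gid] + 7.0f;
}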
Example #4
TEST( testCopyLocal, basic ) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();

    float a[] = { 1,2,3,4,
                  5,6,7,8,
                  9,10,11,12 };
    float b[16];
    memset(b, 0, sizeof(float)*16);

    CLKernel *kernel = makeKernel( cl );
    kernel->in( 12, a )->out( 16, b )->in( 12 );
    kernel->localFloats( 12 );
    kernel->run_1d(12,12);
    cl->finish();
    for( int i = 0; i < 3; i++ ) {
        for( int j = 0; j < 4; j++ ) {
            cout << b[i*4+j] << " ";
            EXPECT_EQ( i * 4 + j + 1, b[i*4+j] );
        }
        cout << endl;
    }
    cout << endl;
    for( int i = 12; i < 16; i++ ) {
        cout << b[i] << " ";
        EXPECT_EQ( 0, b[i] );
    }
    cout << endl;

    delete kernel;
    delete cl;
}
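makeKernel() is not shown here; a kernel that stages the copy through local memory, consistent with the localFloats( 12 ) argument and the expectations above, might look like this illustrative sketch:

// sketch only -- the real kernel comes from makeKernel() in the test file
kernel void copyLocal( global const float *src, global float *dst, const int N,
                       local float *scratch ) {
    const int gid = get_global_id(0);
    const int lid = get_local_id(0);
    if( gid < N ) {
        scratch[lid] = src[gid];   // stage the value in local memory
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if( gid < N ) {
        dst[gid] = scratch[lid];   // write it back out to global memory
    }
}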
Example #5
// this runs an entire kernel to get one value.  Clearly this is going to be pretty slow, but
// at least it's more or less compatible, and comparable, to how cutorch does it
// lbfgs expects a working implementation of this method
float THClStorage_get(THClState *state, const THClStorage *self, long index)
{
////  printf("THClStorage_get\n");
  THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds");
  THArgCheck(self->wrapper != 0, 1, "storage argument not initialized, is empty");

//  if( self->wrapper->isDeviceDirty() ) {
//    if(state->trace) cout << "wrapper->copyToHost()" << endl;
//    self->wrapper->copyToHost();
//  }
//  return self->data[index];

  const char *uniqueName = __FILE__ ":get";
  EasyCL *cl = self->cl; // can't remember if this is a good idea or not :-P
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel( uniqueName, __FILE__, getGetKernelSource(), "THClStorageGet" );
  }

  float res;
  kernel->out(1, &res);
  kernel->in(self->wrapper);
  kernel->in((int64_t)index);
  kernel->run_1d(1, 1);

  if(state->addFinish) cl->finish();
  return res;
}
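getGetKernelSource() is not reproduced here; the kernel it builds essentially copies one element of the storage into a one-float output buffer. An illustrative sketch, not necessarily the exact cltorch source:

kernel void THClStorageGet( global float *res, global const float *data, long index ) {
    if( get_global_id(0) == 0 ) {
        res[0] = data[index];
    }
}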
Example #6
// this runs an entire kernel to get one value.  Clearly this is going to be pretty slow, but
// at least it's more or less compatible, and comparable, to how cutorch does it
void THClStorage_set(THClState *state, THClStorage *self, long index, float value)
{
////  cout << "set size=" << self->size << " index=" << index << " value=" << value << endl;
  THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds");
  THArgCheck(self->wrapper != 0, 1, "storage argument not initialized, is empty");
//  if( self->wrapper->isDeviceDirty() ) { // we have to do this, since we're going to copy it all back again
//                                         // although I suppose we could set via a kernel perhaps
//                                         // either way, this function is pretty inefficient right now :-P
//    if(state->trace) cout << "wrapper->copyToHost() size " << self->size << endl;
//    self->wrapper->copyToHost();
//  }
//  self->data[index] = value;
//  if(state->trace) cout << "wrapper->copyToDevice() size " << self->size << endl;
//  self->wrapper->copyToDevice();

  const char *uniqueName = __FILE__ ":set";
  EasyCL *cl = self->cl; // can't remember if this is a good idea or not :-P
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel( uniqueName, __FILE__, getSetKernelSource(), "THClStorageSet" );
  }

  kernel->inout(self->wrapper);
  kernel->in((int64_t)index);
  kernel->in(value);
  kernel->run_1d(1, 1);

  if(state->addFinish) cl->finish();
}
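Correspondingly, the kernel built from getSetKernelSource() just writes a single element; an illustrative sketch only:

kernel void THClStorageSet( global float *data, long index, float value ) {
    if( get_global_id(0) == 0 ) {
        data[index] = value;
    }
}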
Example #7
TEST(testClBlas, basic) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();

    float A[] = {1, 3,
                 2, 7,
                 9, 5};
    float B[] = {3,
                 -1};

    float C[3];
    ClBlasInstance clblasInstance;
    CLWrapper *AWrap = cl->wrap(6, A);
    CLWrapper *BWrap = cl->wrap(2, B);
    CLWrapper *CWrap = cl->wrap(3, C);
    AWrap->copyToDevice();
    BWrap->copyToDevice();
    CWrap->createOnDevice();
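    // C (3x1) = A (3x2) * B (2x1): {1*3 - 3, 2*3 - 7, 9*3 - 5} = {0, -1, 22}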
    ClBlasHelper::Gemm(
        cl,
        clblasRowMajor,
        clblasNoTrans, clblasNoTrans,
        3, 2, 1,
        1,
        AWrap, 0,
        BWrap, 0,
        0,
        CWrap, 0
    );
    cl->finish();
    CWrap->copyToHost();
    EXPECT_EQ(0, C[0]);
    EXPECT_EQ(-1, C[1]);
    EXPECT_EQ(22, C[2]);

    cl->finish();

    delete CWrap;
    delete BWrap;
    delete AWrap;

    cl->finish();

    delete cl;
    clblasTeardown();
}
Example #8
TEST( testMemset, basic ) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();

    CLKernel *kMemset = 0;
    // [[[cog
    // import stringify
    // stringify.write_kernel2( "kMemset", "cl/memset.cl", "cl_memset", '""' )
    // ]]]
    // generated using cog, from cl/memset.cl:
    const char * kMemsetSource =  
    "// Copyright Hugh Perkins 2015 hughperkins at gmail\n"
    "//\n"
    "// This Source Code Form is subject to the terms of the Mozilla Public License,\n"
    "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n"
    "// obtain one at http://mozilla.org/MPL/2.0/.\n"
    "\n"
    "kernel void cl_memset(global float *target, const float value, const int N) {\n"
    "    #define globalId get_global_id(0)\n"
    "    if ((int)globalId < N) {\n"
    "        target[globalId] = value;\n"
    "    }\n"
    "}\n"
    "\n"
    "";
    kMemset = cl->buildKernelFromString(kMemsetSource, "cl_memset", "", "cl/memset.cl");
    // [[[end]]]

    int N = 10000;
    float *myArray = new float[N];
    CLWrapper *myArrayWrapper = cl->wrap( N, myArray );
    myArrayWrapper->createOnDevice();
    kMemset->out( myArrayWrapper )->in( 99.0f )->in( N );
    int workgroupSize = 64;
    // round the global size up to a multiple of workgroupSize so every element gets written
    kMemset->run_1d( ( N + workgroupSize - 1 ) / workgroupSize * workgroupSize, workgroupSize );
    cl->finish();
    myArrayWrapper->copyToHost();
    for( int i = 0; i < 10; i++ ) {
//        cout << "myArray[" << i << "]=" << myArray[i] << endl;
    }
    for( int i = 0; i < N; i++ ) {
        EXPECT_EQ( 99.0f, myArray[i] );
    }

    delete kMemset;

    delete cl;
}
Example #9
TEST( testCopyBlock, testPos ) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();

    float in[12];
    float res[12];
    memset(in, 0, sizeof(in));   // only the first three slots are meaningful; zero the rest
    memset(res, 0, sizeof(res));
    in[0] = ( 3 << 10 ) | 4;     // packed (row << 10) | col value
    in[1] = 8;
    in[2] = 14;

    for( int i = 0; i < 3; i++ ) {
        cout << "in[" << i << "]=" << in[i] << endl;
    }
    
    CLKernel *kernel = makeTestPosKernel( cl );
    kernel->in( 12, in )->out( 12, res );
    kernel->run_1d(1,1); 
    cl->finish();

    for( int i = 0; i < 5; i++ ) {
        cout << "res[" << i << "]=" << res[i] << endl;
    }
    delete kernel;
    delete cl;
}
Example #10
void kernelLaunch_THClTensor_reduceNoncontigDim(
  THClState *state,
  dim3 &grid,
  dim3 &block,
  int ADims,
  int BDims,
  TensorInfo<IndexType> out,
  TensorInfo<IndexType> in,
  IndexType reductionStride,
  IndexType reductionSize,
  IndexType totalSlices,
  float init,
  HasOperator2 const*modifyOp,
  HasOperator3 const*reduceOp) {

//  cl->finish();
  StatefulTimer::timeCheck("Reduce-noncontig START");
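  // kernels are cached by uniqueName, so each (dims, IndexType, modify/reduce operator) combination is only built and compiled once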
  std::string uniqueName = "THClTensor_reduceNoncontigDim_" + easycl::toString(ADims) + "_" + easycl::toString(BDims) + "_" +
    TypeParseTraits<IndexType>::name + "_" + modifyOp->operator2() + "_" + reduceOp->operator3();
  EasyCL *cl = in.wrapper->getCl();
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
    StatefulTimer::timeCheck("Apply3 1aa");
  } else {
    // launch kernel here....
    TemplatedKernel kernelBuilder(cl);

    std::set<int> dims_set;
    if(ADims >= 0) {
      dims_set.insert(ADims);
    }
    if(BDims >= 0) {
      dims_set.insert(BDims);
    }
    std::vector<int> dims;
    for( std::set<int>::iterator it = dims_set.begin(); it != dims_set.end(); it++ ) {
      dims.push_back(*it);
    }

    std::string indexType = TypeParseTraits<IndexType>::name;
    kernelBuilder
      .set("include_THClReduceApplyUtils", THClReduceApplyUtils_getKernelTemplate())
      .set("dims", dims)
      .set("dim1", ADims)
      .set("dim2", BDims)
      .set("defreduceblock", 1)
      .set("WarpSize", 32) // probably should be 32 on NVIDIA, 64 on AMD
      .set("MAX_CLTORCH_DIMS", MAX_CLTORCH_DIMS)
      .set("IndexType", indexType)
      .set("modify_operation", modifyOp->operator2())
      .set("reduce_operation", reduceOp->operator3())
    ;

    kernel = kernelBuilder.buildKernel(uniqueName, "THClReduce.cl", THClReduce_getKernelSource(), "THClTensor_reduceNoncontigDim");
  }

  THClKernels k(state, kernel);
  k.out(out);
  k.in(in);
  k.in((int)reductionStride);
  k.in((int)reductionSize);
  k.in((int)totalSlices);
  k.in(init);

  k.run(grid, block);

  if(state->addFinish) cl->finish();  
  StatefulTimer::timeCheck("Reduce-noncontig END");
}