Example #1
0
// this runs an entire kernel to get one value.  Clearly this is going to be pretty slow, but
// at least it's more or less compatible, and comparable, to how cutorch does it
void THClStorage_set(THClState *state, THClStorage *self, long index, float value)
{
////  cout << "set size=" << self->size << " index=" << index << " value=" << value << endl;
  THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds");
  THArgCheck(self->wrapper != 0, 1, "storage argument not initialized, is empty");
//  if( self->wrapper->isDeviceDirty() ) { // we have to do this, since we're going to copy it all back again
//                                         // although I suppose we could set via a kernel perhaps
//                                         // either way, this function is pretty inefficient right now :-P
//    if(state->trace) cout << "wrapper->copyToHost() size " << self->size << endl;
//    self->wrapper->copyToHost();
//  }
//  self->data[index] = value;
//  if(state->trace) cout << "wrapper->copyToDevice() size " << self->size << endl;
//  self->wrapper->copyToDevice();

  const char *uniqueName = __FILE__ ":set";
  EasyCL *cl = self->cl; // cant remember if this is a good idea or not :-P
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel( uniqueName, __FILE__, getSetKernelSource(), "THClStorageSet" );
  }

  kernel->inout(self->wrapper);
  kernel->in((int64_t)index);
  kernel->in(value);
  kernel->run_1d(1, 1);

  if(state->addFinish) cl->finish();
}
Example #2
0
TEST( SLOW_testintwrapper_huge, testread ) {
    Timer timer;
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test_read");
//    const int N = 4500000;
//    const int N = (4500000/512)*512;
    int N = 100000;
    int *out = new int[N];
    CLWrapper *outwrapper = cl->wrap(N, out);
    kernel->in(3)->in(7);
    kernel->output( outwrapper );
    int globalSize = N;
    int workgroupsize = cl->getMaxWorkgroupSize();
    globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize;
    cout << "globalsize: " << globalSize << " workgroupsize " << workgroupsize << endl;
    timer.timeCheck("before kernel");
    kernel->run_1d( globalSize, workgroupsize );
    timer.timeCheck("after kernel");
    outwrapper->copyToHost();
    timer.timeCheck("after copy to host");
    for( int i = 0; i < N; i++ ) {
       if( out[i] != 4228 ) {
           cout << "out[" << i << "] != 4228: " << out[i] << endl;
           exit(-1);
       }
    }

    delete outwrapper;
    delete kernel;
    delete cl;
}
TEST(test_scenario_te42kyfo, main) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", "");
    CLArrayFloat *out = cl->arrayFloat(5);
    CLArrayFloat *in = cl->arrayFloat(5);
    for(int i = 0; i < 5; i++) {
       (*out)[i] = 0;
    }
    for(int i = 0; i < 100; i++) {
      for(int n = 0; n < 5; n++) {
        (*in)[n] = i*n;
      }
      kernel->in(in);
      kernel->out(out);
      kernel->run_1d(5, 5);
      assertEquals(i*2 + 5, (*out)[2]);
      assertEquals(i*4 + 5, (*out)[4]);
    }
    cout << "finished" << endl;

    delete in;
    delete out;
    delete kernel;
    delete cl;
}
Example #4
0
TEST(testinout, main) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", "");
    float inout[5];
    for(int i = 0; i < 5; i++) {
        inout[i] = i * 3;
    }
    kernel->inout(5, inout);
    size_t global = 5;
    size_t local = 5;
    kernel->run(1, &global, &local);
    assertEquals(inout[0] , 7);
    assertEquals(inout[1] , 10);
    assertEquals(inout[2] , 13);
    assertEquals(inout[3] , 16);
    assertEquals(inout[4] , 19);
    cout << "tests completed ok" << endl;

    delete kernel;
    delete cl;
}
Example #5
0
TEST( SLOW_testintwrapper_huge, testreadwrite ) {

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test_stress");
    const int N = 1000000;
    int *in = new int[N];
    for( int i = 0; i < N; i++ ) {
        in[i] = i * 3;
    }
    int *out = new int[N];
    CLWrapper *inwrapper = cl->wrap(N, in);
    CLWrapper *outwrapper = cl->wrap(N, out);
    inwrapper->copyToDevice();
    outwrapper->createOnDevice();
    kernel->input( inwrapper );
    kernel->output( outwrapper );
    int globalSize = N;
    int workgroupsize = cl->getMaxWorkgroupSize();
    globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize;
    cout << "globalsize: " << globalSize << " workgroupsize " << workgroupsize << endl;
    kernel->run_1d( globalSize, workgroupsize );
    outwrapper->copyToHost();
    for( int i = 0; i < N; i++ ) {
       if( out[i] != 689514 ) {
           cout << "out[" << i << "] != 689514: " << out[i] << endl;
           exit(-1);
       }
    }

    delete outwrapper;
    delete inwrapper;
    delete kernel;
    delete cl;
}
Example #6
0
// this runs an entire kernel to get one value.  Clearly this is going to be pretty slow, but
// at least it's more or less compatible, and comparable, to how cutorch does it
// lgfgs expects a working implementation of this method
float THClStorage_get(THClState *state, const THClStorage *self, long index)
{
////  printf("THClStorage_get\n");
  THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds");
  THArgCheck(self->wrapper != 0, 1, "storage argument not initialized, is empty");

//  if( self->wrapper->isDeviceDirty() ) {
//    if(state->trace) cout << "wrapper->copyToHost()" << endl;
//    self->wrapper->copyToHost();
//  }
//  return self->data[index];

  const char *uniqueName = __FILE__ ":get";
  EasyCL *cl = self->cl; // cant remember if this is a good idea or not :-P
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel( uniqueName, __FILE__, getGetKernelSource(), "THClStorageGet" );
  }

  float res;
  kernel->out(1, &res);
  kernel->in(self->wrapper);
  kernel->in((int64_t)index);
  kernel->run_1d(1, 1);

  if(state->addFinish) cl->finish();
  return res;
}
Example #7
0
TEST(testCLMathWrapper, multiplyinplace) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    float adat[] = { 1,3,9,12.5f,2.5f };
//    float bdat[] = { 4,2.1f, 5,3,9.2f };
    CLWrapper *a_ = cl->wrap(5,adat);
//    CLWrapper *b_ = cl->wrap(5,bdat);
    a_->copyToDevice();
//    b_->copyToDevice();

    CLMathWrapper a(a_);
//    CLMathWrapper b(b_);
//    a += b;
    a *= 1.5f;
    a_->copyToHost();

    for(int i = 0; i < 5; i++) {
        cout << "a[" << i << "]=" << adat[i] << endl;
    }
    EXPECT_FLOAT_NEAR(1.5f, adat[0]);
    EXPECT_FLOAT_NEAR(4.5f, adat[1]);
    EXPECT_FLOAT_NEAR(3.75f, adat[4]);

//    delete a;
//    delete b;
    delete a_;
//    delete b_;
    delete cl;
}
TEST(testfloatwrapperconst, main) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", "");
    float in[5];
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3;
    }
    float out[5];
    CLWrapper *inwrapper = cl->wrap(5, (float const *)in);
    CLWrapper *outwrapper = cl->wrap(5, out);
    inwrapper->copyToDevice();
    kernel->input(inwrapper);
    kernel->output(outwrapper);
    kernel->run_1d(5, 5);
    outwrapper->copyToHost();
    assertEquals(out[0] , 7);
    assertEquals(out[1] , 10);
    assertEquals(out[2] , 13);
    assertEquals(out[3] , 16);
    assertEquals(out[4] , 19);
    cout << "tests completed ok" << endl;

    delete inwrapper;
    delete outwrapper;
    delete kernel;
    delete cl;
}
Example #9
0
TEST( testforward, compare_1_n_biased_pad ) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    int maxWorkgroupSize = cl->getMaxWorkgroupSize();
    delete cl;

    LayerDimensions dim;
    int batchSize = 4;
    int N = 4;
    string activationName = "tanh";
    dim.setInputPlanes( 8 ).setInputSize(19).setNumFilters( 8 )
        .setFilterSize( 5 )
        .setPadZeros( true ).setBiased( true );
    for( int instance = 2; instance <= 7; instance++ ) {
        if( instance == 5 ) {
            continue; // forwardfc, cant use for inputimagesize != filtersize
        }
        dim.setInputSize(19);
        if(instance == 2 && maxWorkgroupSize < 19 * 19) {
            dim.setInputSize(15);
        }
        if(instance == 3 && maxWorkgroupSize < 19 * 19) {
            dim.setInputSize(15);
        }
        cout << "instance: " << instance << endl;
        compareSpecific( false, N, batchSize, dim, 1, instance );
    }
}
Example #10
0
TEST(testcopybuffer, throwsifnotondevice) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    //CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test");
    const int bufferSize = 100 * 1024 / 4;
    float *in = new float[bufferSize];
    float *in2 = new float[bufferSize];
    for(int i = 0; i < bufferSize; i++) {
        in[i] = i * 3;
        in2[i] = 23 + i;
    }
    CLWrapper *inwrapper = cl->wrap(bufferSize, in);
    CLWrapper *in2wrapper = cl->wrap(bufferSize, in2);
    inwrapper->copyToDevice();
//    in2wrapper->copyToDevice();

    bool threw = false;
    try {
        inwrapper->copyTo(in2wrapper);
    } catch(runtime_error &e) {
        threw = true;
    }
    EXPECT_TRUE(threw);
    delete cl;
}
Example #11
0
TEST( testCopyLocal, basic ) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();

    float a[] = { 1,2,3,4,
                  5,6,7,8,
                  9,10,11,12 };
    float b[16];
    memset(b, 0, sizeof(float)*16);

    CLKernel *kernel = makeKernel( cl );
    kernel->in( 12, a )->out( 16, b )->in( 12 );
    kernel->localFloats( 12 );
    kernel->run_1d(12,12);
    cl->finish();
    for( int i = 0; i < 3; i++ ) {
        for( int j = 0; j < 4; j++ ) {
            cout << b[i*4+j] << " ";
            EXPECT_EQ( i * 4 + j + 1, b[i*4+j] );
        }
        cout << endl;
    }
    cout << endl;
    for( int i = 12; i < 16; i++ ) {
        cout << b[i] << " ";
        EXPECT_EQ( 0, b[i] );
    }
    cout << endl;

    delete kernel;
    delete cl;
}
Example #12
0
TEST(testcopybuffer, withoffset) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    //CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test");
    float in[10];
    float in2[10];
    for(int i = 0; i < 10; i++) {
        in[i] = i * 3;
        in2[i] = 23 + i;
    }
    float out[10];
    CLWrapper *inwrapper = cl->wrap(10, in);
    CLWrapper *in2wrapper = cl->wrap(10, in2);
    CLWrapper *outwrapper = cl->wrap(10, out);
    inwrapper->copyToDevice();
    in2wrapper->copyToDevice();

    EXPECT_FALSE(in2wrapper->isDeviceDirty());
    inwrapper->copyTo(in2wrapper, 2, 5, 4);
    EXPECT_TRUE(in2wrapper->isDeviceDirty());
  //  cl->finish();
    // check that in2 host-side unchanged:
    for(int i = 0; i < 10; i++) {
        in[i] = i * 3;
        EXPECT_EQ(23 + i, in2[i]);
    }

    in2wrapper->copyToHost();
    // check that in2 is now a partial copy of in:
    for(int i = 0; i < 10; i++) {
//        in[i] = i * 3;
        if(i >= 5 && i < 9) {
          EXPECT_EQ((i-3) * 3, in2[i]);
        } else {
          EXPECT_EQ(23 + i, in2[i]);
        }
    }

    // check that modifying in2 doesnt modfiy in:
    in2[1] = 27;
    in2wrapper->copyToDevice();
    inwrapper->copyToHost();
    EXPECT_EQ(1 * 3, in[1]);

    in2wrapper->copyToHost();
    EXPECT_EQ(1 * 3, in[1]);
    EXPECT_EQ(27, in2[1]);
    
    delete inwrapper;
    delete in2wrapper;
    delete outwrapper;
    delete cl;
}
Example #13
0
TEST( testCopyBlock, basic ) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();

    float a[] = { 1,2,3,4,
                  5,6,7,8,
                  9,10,11,12 };
    float b[10];
    memset(b, 0, sizeof(float)*10);

    CLKernel *kernel = makeBasicKernel( cl );
    kernel->in( 12, a )->out( 6, b )->in( ( 3<<10)|4)->in( (0<<10)|1)->in((2<<10)|3);
    kernel->localFloats( 2 * 3 );
    kernel->run_1d(12,4);
//    kernel->run_1d(12,12);
    cl->finish();
    float expected[] = { 2,3,4,
                         6,7,8 }; 
    for( int i = 0; i < 2; i++ ) {
        for( int j = 0; j < 3; j++ ) {
            cout << b[i*3+j] << " ";
            EXPECT_EQ( expected[i*3+j], b[i*3+j] );
        }
        cout << endl;
    }
    cout << endl;
    for( int i = 6; i < 10; i++ ) {
        cout << b[i] << " ";
        EXPECT_EQ( 0, b[i] );
    }
    cout << endl;
        cout << endl;

    kernel->in( 12, a )->out( 6, b )->in( ( 3<<10)|4)->in( (1<<10)|0)->in((2<<10)|3);
    kernel->localFloats( 2 * 3 );
//    kernel->run_1d(12,4);
    kernel->run_1d(12,4);
    cl->finish();
    float expected2[] = { 5,6,7,
                         9,10,11 }; 
    for( int i = 0; i < 2; i++ ) {
        for( int j = 0; j < 3; j++ ) {
            cout << b[i*3+j] << " ";
            EXPECT_EQ( expected2[i*3+j], b[i*3+j] );
        }
        cout << endl;
    }
    cout << endl;
    for( int i = 6; i < 10; i++ ) {
        cout << b[i] << " ";
        EXPECT_EQ( 0, b[i] );
    }
    cout << endl;
        cout << endl;

    delete kernel;
    delete cl;
}
Example #14
0
TEST(testlocal, reduceviascratch_multipleworkgroups_ints) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "reduceViaScratch_multipleworkgroups_ints");
    int workgroupSize = min(512, cl->getMaxWorkgroupSize());
    const int numWorkgroups = workgroupSize;
    const int N = workgroupSize * numWorkgroups;
    cout << "numworkgroups " << numWorkgroups << " workgroupsize " << workgroupSize << " N " << N << endl;
    int *myarray = new int[N];
    int sumViaCpu = 0;
    int localSumViaCpu = 0;
    int localSumViaCpu2 = 0;
    int *localSumsViaCpu = new int[numWorkgroups];
    memset(localSumsViaCpu, 0, sizeof(int)*numWorkgroups);
    for(int i = 0; i < N; i++) {
        myarray[i] = ((i + 7) * 3) % 50;
        sumViaCpu += myarray[i];
        if(i < workgroupSize) {
            localSumViaCpu += myarray[i];
        }
        if(i >= workgroupSize && i < workgroupSize * 2) {
            localSumViaCpu2 += myarray[i];
        }
        int workgroupId = i / workgroupSize;
        localSumsViaCpu[workgroupId] += myarray[i];
    }
    ASSERT_EQ(localSumViaCpu, localSumsViaCpu[0]);
    ASSERT_EQ(localSumViaCpu2, localSumsViaCpu[1]);
    ASSERT_NE(myarray[0], sumViaCpu);

//    Timer timer;

    CLWrapper *a1wrapper = cl->wrap(N, myarray);
    a1wrapper->copyToDevice();
    int *a2 = new int[numWorkgroups];
    CLWrapper *a2wrapper = cl->wrap(numWorkgroups, a2);
    kernel->in(a1wrapper);
    kernel->out(a2wrapper);
    kernel->localInts(workgroupSize);
    kernel->run_1d(N, workgroupSize);

    int finalSum;
    kernel->in(a2wrapper);
    kernel->out(1, &finalSum);
    kernel->localInts(workgroupSize);
    kernel->run_1d(numWorkgroups, workgroupSize);
//    timer.timeCheck("finished 2-way reduce");

    EXPECT_EQ(sumViaCpu, finalSum);

    delete a1wrapper;
    delete a2wrapper;
    delete[] a2;
    delete[]myarray;
    delete kernel;
    delete cl;
}
Example #15
0
TEST(testcopybuffer, main) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    //CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test");
    float in[5];
    float in2[5];
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3.0f;
        in2[i] = 23.0f + i;
    }
    float out[5];
    CLWrapper *inwrapper = cl->wrap(5, in);
    CLWrapper *in2wrapper = cl->wrap(5, in2);
    CLWrapper *outwrapper = cl->wrap(5, out);
    inwrapper->copyToDevice();
    in2wrapper->copyToDevice();

    EXPECT_FALSE(in2wrapper->isDeviceDirty());
    inwrapper->copyTo(in2wrapper);
    EXPECT_TRUE(in2wrapper->isDeviceDirty());
  //  cl->finish();
    // check that in2 host-side unchanged:
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3.0f;
        EXPECT_EQ(23.0f + i, in2[i]);
    }

    in2wrapper->copyToHost();
    // check that in2 is now a copy of in:
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3.0f;
        EXPECT_EQ(i * 3.0f, in2[i]);
    }

    // check that modifying in2 doesnt modfiy in:
    in2[1] = 27;
    in2wrapper->copyToDevice();
    inwrapper->copyToHost();
    EXPECT_EQ(1 * 3.0f, in[1]);

    in2wrapper->copyToHost();
    EXPECT_EQ(1 * 3.0f, in[1]);
    EXPECT_EQ(27.0f, in2[1]);
    
    delete inwrapper;
    delete in2wrapper;
    delete outwrapper;
    delete cl;
}
Example #16
0
TEST( testdefines, simple ) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testdefines.cl", "testDefines", 
        "-D DOUBLE -D SOME_VALUE=5" );
    float out[32];
    kernel->out( 32, out );
    kernel->run_1d( 32, 32 );
    EXPECT_EQ( 10, out[3] );

    delete kernel;
    delete cl;
}
Example #17
0
TEST(SLOW_testcopybuffer, larger) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    //CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test");
    const int bufferSize = 100 * 1024 * 1024 / 4; // 100MB (of floats)
    float *in = new float[bufferSize];
    float *in2 = new float[bufferSize];
    for(int i = 0; i < bufferSize; i++) {
        in[i] = i * 3;
        in2[i] = 23 + i;
    }
    CLWrapper *inwrapper = cl->wrap(bufferSize, in);
    CLWrapper *in2wrapper = cl->wrap(bufferSize, in2);
    inwrapper->copyToDevice();
    in2wrapper->copyToDevice();

    inwrapper->copyTo(in2wrapper);
//    cl->finish();
    // check that in2 host-side unchanged:
    for(int i = 0; i < bufferSize; i++) {
        in[i] = i * 3;
        EXPECT_EQ(23 + i, in2[i]);
    }

    in2wrapper->copyToHost();
    // check that in2 is now a copy of in:
    for(int i = 0; i < bufferSize; i++) {
        in[i] = i * 3;
        EXPECT_EQ(i * 3, in2[i]);
    }

    // check that modifying in2 doesnt modfiy in:
    in2[1] = 27;
    in2wrapper->copyToDevice();
    inwrapper->copyToHost();
    EXPECT_EQ(1 * 3, in[1]);

    in2wrapper->copyToHost();
    EXPECT_EQ(1 * 3, in[1]);
    EXPECT_EQ(27, in2[1]);
    
    delete inwrapper;
    delete in2wrapper;
    delete[] in;
    delete[] in2;
    delete cl;
}
Example #18
0
TEST(testClBlas, colMajor2) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();

    float A[] = {1, 3,
                 2, 7,
                 9, 5,
                 0, -2};
    float B[] = {3,2,8,
                 -1,0,4};

    float C[4*3];
    transpose(A, 4, 2);
    transpose(B, 2, 3);
//    for(int row=0; row < 2; row++) {
//        for(int col=0; col < 1; col++) {
//            cout << B[row*1 + col] << " ";
//        }
//        cout << endl;
//    }
    ClBlasInstance clblasInstance;
//    ClBlasInstance::initializeIfNecessary();
    CLWrapper *AWrap = cl->wrap(4*2, A);
    CLWrapper *BWrap = cl->wrap(2*3, B);
    CLWrapper *CWrap = cl->wrap(4*3, C);
    AWrap->copyToDevice();
    BWrap->copyToDevice();
    ClBlasHelper::Gemm(
        cl,
        clblasColumnMajor,
        clblasNoTrans, clblasNoTrans,
        4, 2, 3,
        1,
        AWrap, 0,
        BWrap, 0,
        0,
        CWrap, 0
    );
//    cl->finish();
    CWrap->copyToHost();
    transpose(C, 3, 4);
    EXPECT_EQ(1*3-1*3, C[0]);
    EXPECT_EQ(1*2+3*0, C[1]);
    EXPECT_EQ(1*8+4*3, C[2]);
    EXPECT_EQ(-8, C[11]);

    delete CWrap;
    delete BWrap;
    delete AWrap;

    delete cl;
}
Example #19
0
TEST(SLOW_testlocal, selfdot_3levels_withoutscratch) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "selfdot_ints_withoutscratch");
    int workgroupSize = min(512, cl->getMaxWorkgroupSize());
    const int numWorkgroups = workgroupSize;
    const int level3size = numWorkgroups / 4;
    const int N = workgroupSize * numWorkgroups * level3size;
    cout << "numworkgroups " << numWorkgroups << " workgroupsize " << workgroupSize << " N " << N << endl;
    int *myarray = new int[N];
    for(int i = 0; i < N; i++) {
        myarray[i] = ((i + 7) * 3) % 5;
    }

//    Timer timer;

    CLWrapper *a1wrapper = cl->wrap(N, myarray);
    a1wrapper->copyToDevice();
//    timer.timeCheck("copied array to device");
    int *second = new int[N];
    CLWrapper *secondwrapper = cl->wrap(N, second);
    int *a2 = new int[numWorkgroups*level3size];
    CLWrapper *a2wrapper = cl->wrap(numWorkgroups * level3size, a2);
    kernel->in(a1wrapper);
    kernel->out(secondwrapper);
    kernel->out(a2wrapper);
    kernel->run_1d(N, workgroupSize);
    cl->finish();

    int *a3 = new int[numWorkgroups];
    CLWrapper *a3wrapper = cl->wrap(level3size, a3);
    kernel->in(a2wrapper);
    kernel->out(secondwrapper);
    kernel->out(a3wrapper);
    kernel->run_1d(workgroupSize * level3size, workgroupSize);
    cl->finish();

    int finalSum;
    kernel->in(a3wrapper);
    kernel->out(secondwrapper);
    kernel->out(1, &finalSum);
    kernel->run_1d(level3size, level3size);
//    timer.timeCheck("finished 3-level reduce");

    EXPECT_EQ(-1306309159, finalSum);

    delete a1wrapper;
    delete a2wrapper;
    delete a3wrapper;
    delete secondwrapper;
    delete[] a3;
    delete[] second;
    delete[] a2;
    delete[]myarray;
    delete kernel;
    delete cl;
}
Example #20
0
TEST(testlocal, notUselocal) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "notUseLocal");
    int workgroupSize = 64;
    float *myarray = new float[workgroupSize];

    kernel->in(workgroupSize);
    kernel->inout(workgroupSize, myarray);
    kernel->run_1d(workgroupSize, workgroupSize);

    delete[]myarray;
    delete kernel;
    delete cl;
}
Example #21
0
TEST(testqueues, defaultqueue) {
    cout << "start" << endl;
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();

    CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", "");

    float in[5];
    float out[5];
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3;
        out[i] = 0;
    }
    kernel->input(5, in);
    kernel->output(5, out);
    size_t global = 5;
    size_t local = 5;
    kernel->run(1, &global, &local);
    cl->finish();
    assertEquals(out[0] , 7);
    assertEquals(out[1] , 10);
    assertEquals(out[2] , 13);
    assertEquals(out[3] , 16);
    assertEquals(out[4] , 19);

    for(int i = 0; i < 5; i++) {
        in[i] = i * 3;
        out[i] = 0;
    }

    cout << "cl->queue " << (void *)cl->queue << endl;
    cout << "*cl->queue " << (void *)*cl->queue << endl;
    cout << "cl->default_queue->queue" << (void *)cl->default_queue->queue << endl;
    kernel->input(5, in);
    kernel->output(5, out);
    kernel->run(cl->default_queue, 1, &global, &local);
    cl->finish();
    assertEquals(out[0] , 7);
    assertEquals(out[1] , 10);
    assertEquals(out[2] , 13);
    assertEquals(out[3] , 16);
    assertEquals(out[4] , 19);

    cout << "tests completed ok" << endl;
}
Example #22
0
TEST(testClBlas, colMajorTransB) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();

    float A[] = {1, 3,
                 2, 7,
                 9, 5};
    float B[] = {3,
                 -1};

    float C[3];
    transpose(A, 3, 2);
//    transpose(B, 2, 1);
//    for(int row=0; row < 2; row++) {
//        for(int col=0; col < 1; col++) {
//            cout << B[row*1 + col] << " ";
//        }
//        cout << endl;
//    }
    ClBlasInstance clblasInstance;
//    ClBlasInstance::initializeIfNecessary();
    CLWrapper *AWrap = cl->wrap(6, A);
    CLWrapper *BWrap = cl->wrap(2, B);
    CLWrapper *CWrap = cl->wrap(3, C);
    AWrap->copyToDevice();
    BWrap->copyToDevice();
    ClBlasHelper::Gemm(
        cl,
        clblasColumnMajor,
        clblasNoTrans, clblasTrans,
        3, 2, 1,
        1,
        AWrap, 0,
        BWrap, 0,
        0,
        CWrap, 0
    );
//    cl->finish();
    CWrap->copyToHost();
    transpose(C, 1, 3);
    EXPECT_EQ(0, C[0]);
    EXPECT_EQ(-1, C[1]);
    EXPECT_EQ(22, C[2]);

    delete CWrap;
    delete BWrap;
    delete AWrap;

    delete cl;
}
Example #23
0
void measurePerf(int instance, int batchSize, LayerDimensions dim) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();

    int inputNumElements = dim.inputCubeSize * batchSize;
    int errorsSize = dim.outputCubeSize * batchSize;
    int weightsSize = dim.filtersSize;
    int errorsForUpstreamSize = dim.inputCubeSize * batchSize;
    float *input = new float[inputNumElements];
    float *errors = new float[errorsSize];
    float *weights = new float[weightsSize];

    WeightRandomizer::randomize(input, inputNumElements, -0.1f, 0.1f);
    WeightRandomizer::randomize(errors, errorsSize, -0.1f, 0.1f);
    WeightRandomizer::randomize(weights, weightsSize, -0.1f, 0.1f);

    float *errorsForUpstream = new float[errorsForUpstreamSize];
    CLWrapper *inputWrapper = cl->wrap(inputNumElements, input);
    CLWrapper *errorsWrapper = cl->wrap(errorsSize, errors);
    CLWrapper *weightsWrapper = cl->wrap(weightsSize, weights);
    CLWrapper *errorsForUpstreamWrapper = cl->wrap(errorsForUpstreamSize, errorsForUpstream);
    inputWrapper->copyToDevice();
    errorsWrapper->copyToDevice();
    weightsWrapper->copyToDevice();
    errorsForUpstreamWrapper->createOnDevice();

    StatefulTimer::timeCheck("after init");
    Backward *backwardImpl = Backward::instanceSpecific(instance, cl, dim);
    for(int it = 0; it < 40; it++) {
        backwardImpl->backward(batchSize, 
            inputWrapper, errorsWrapper, weightsWrapper,
            errorsForUpstreamWrapper);
    }
    StatefulTimer::timeCheck("after backprop");
    StatefulTimer::dump(true);

    delete errorsForUpstreamWrapper;
    delete weightsWrapper;
    delete inputWrapper;
    delete errorsWrapper;

    delete[] errors;
    delete[] weights;
    delete[] input;
    delete[] errorsForUpstream;

    delete backwardImpl;
    delete cl;
}
Example #24
0
TEST( testMemset, basic ) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();

    CLKernel *kMemset = 0;
    // [[[cog
    // import stringify
    // stringify.write_kernel2( "kMemset", "cl/memset.cl", "cl_memset", '""' )
    // ]]]
    // generated using cog, from cl/memset.cl:
    const char * kMemsetSource =  
    "// Copyright Hugh Perkins 2015 hughperkins at gmail\n"
    "//\n"
    "// This Source Code Form is subject to the terms of the Mozilla Public License,\n"
    "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n"
    "// obtain one at http://mozilla.org/MPL/2.0/.\n"
    "\n"
    "kernel void cl_memset(global float *target, const float value, const int N) {\n"
    "    #define globalId get_global_id(0)\n"
    "    if ((int)globalId < N) {\n"
    "        target[globalId] = value;\n"
    "    }\n"
    "}\n"
    "\n"
    "";
    kMemset = cl->buildKernelFromString(kMemsetSource, "cl_memset", "", "cl/memset.cl");
    // [[[end]]]

    int N = 10000;
    float *myArray = new float[N];
    CLWrapper *myArrayWrapper = cl->wrap( N, myArray );
    myArrayWrapper->createOnDevice();
    kMemset->out( myArrayWrapper )->in( 99.0f )->in( N );
    int workgroupSize = 64;
    kMemset->run_1d( ( N + workgroupSize - 1 ) / workgroupSize * workgroupSize, workgroupSize );
    cl->finish();
    myArrayWrapper->copyToHost();
    for( int i = 0; i < 10; i++ ) {
//        cout << "myArray[" << i << "]=" << myArray[i] << endl;
    }
    for( int i = 0; i < N; i++ ) {
        EXPECT_EQ( 99.0f, myArray[i] );
    }

    delete kMemset;

    delete cl;
}
Example #25
0
void col2im(THClState *state, THClTensor* col, const int channels,
    const int height, const int width, const int patch_h, const int patch_w, const int pad_h,
    const int pad_w, const int stride_h, const int stride_w, THClTensor* im) {
  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
  int num_kernels = channels * height * width;
  // To avoid involving atomic operations, we will launch one kernel per
  // bottom dimension, and then in the kernel add up the top dimensions.

  EasyCL *cl = im->storage->cl;
  std::string uniqueName = "SpatialConvolutionMM::col2im";
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel(uniqueName, "SpatialConvolutionMM.cl",
      SpatialConvolutionMM_getKernelTemplate(), "col2im_kernel");
  }

  THClKernels k(state, kernel);
  k.in(num_kernels);
  k.in(col);
  k.in(height);
  k.in(width);
  k.in(channels);

  k.in(patch_h);
  k.in(patch_w);
  k.in(pad_h);
  k.in(pad_w);
  k.in(stride_h);
  k.in(stride_w);

  k.in(height_col);
  k.in(width_col);
  k.out(im);

  k.run(GET_BLOCKS(state, num_kernels), getNumThreads(state));

//  col2im_kernel <<<GET_BLOCKS(num_kernels), CL_NUM_THREADS, 0, stream>>> (
//      num_kernels, data_col, height, width, channels,
//      patch_h, patch_w, pad_h, pad_w, stride_h, stride_w,
//      height_col, width_col, data_im
//  );
//  THError("Not implemented");
}
Example #26
0
TEST( testpoolingforward, fromwrappers ) {
    int batchSize = 1;
    int numPlanes = 1;
    int imageSize = 4;
    int poolingSize = 2;
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    PoolingForward *poolingForward = PoolingForward::instanceSpecific( 1, cl, false, numPlanes, imageSize, poolingSize );
    float input[] = { 1, 2, 5, 3,
                     3, 8, 4, 1,
                     3, 33, 14,23,
                     -1, -3.5f,37.4f,5
    };
    int outputSize = poolingForward->getOutputSize( batchSize );
    int *selectors = new int[outputSize];
    float *output = new float[outputSize];

    const int inputSize = batchSize * numPlanes * imageSize * imageSize;
    CLWrapper *inputWrapper = cl->wrap( inputSize, input );
    CLWrapper *selectorsWrapper = cl->wrap( outputSize, selectors );
    CLWrapper *outputWrapper = cl->wrap( outputSize, output );

    inputWrapper->copyToDevice();

    poolingForward->forward( batchSize, inputWrapper, selectorsWrapper, outputWrapper );

    selectorsWrapper->copyToHost();
    outputWrapper->copyToHost();

    EXPECT_EQ( selectors[0], 3 );
    EXPECT_EQ( selectors[1], 0 );
    EXPECT_EQ( selectors[2], 1 );
    EXPECT_EQ( selectors[3], 2 );

    EXPECT_EQ( output[0], 8 );
    EXPECT_EQ( output[1], 5 );
    EXPECT_EQ( output[2], 33 );
    EXPECT_EQ( output[3], 37.4f );

    delete inputWrapper;
    delete selectorsWrapper;
    delete outputWrapper;
    delete poolingForward;
    delete[] selectors;
    delete[] output;
    delete cl;
}
Example #27
0
void THClStorage_resize(THClState *state, THClStorage *self, long size)
{
  StatefulTimer::timeCheck("THClStorage_resize START");
  if( size <= self->size ) {
    return;
  }
  delete self->wrapper;
  if(state->trace && self->size > 0) cout << "delete wrapper" << endl;
  delete[] self->data;
  self->data = new float[size];
  EasyCL *cl = self->cl;
  self->wrapper = cl->wrap( size, self->data );
  self->wrapper->createOnDevice();
    if(state->trace) cout << "new wrapper, size " << size << endl;
  self->size = size;
  StatefulTimer::timeCheck("THClStorage_resize END");
}
Example #28
0
TEST(testlocal, reduceviascratch_multipleworkgroups) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "reduceViaScratch_multipleworkgroups");
    int workgroupSize = min(512, cl->getMaxWorkgroupSize());
    const int numWorkgroups = workgroupSize;
    const int N = workgroupSize * numWorkgroups;
    float *myarray = new float[N];
    float sumViaCpu = 0;
    float localSumViaCpu = 0;
    for(int i = 0; i < N; i++) {
        myarray[i] = ((i + 7) * 3) % 10;
        sumViaCpu += myarray[i];
        if(i < workgroupSize) {
            localSumViaCpu += myarray[i];
        }
    }
    cout << "expected sum, calc'd via cpu, : " << sumViaCpu << endl;
    EXPECT_NE(myarray[0], sumViaCpu);

//    Timer timer;

    CLWrapper *a1wrapper = cl->wrap(N, myarray);
    a1wrapper->copyToDevice();
    float *a2 = new float[numWorkgroups];
    CLWrapper *a2wrapper = cl->wrap(numWorkgroups, a2);
    kernel->in(a1wrapper);
    kernel->out(a2wrapper);
    kernel->localFloats(workgroupSize);
    kernel->run_1d(N, workgroupSize);

    float finalSum;
    kernel->in(a2wrapper);
    kernel->out(1, &finalSum);
    kernel->localFloats(workgroupSize);
    kernel->run_1d(numWorkgroups, workgroupSize);

    EXPECT_EQ(sumViaCpu, finalSum);

    delete a1wrapper;
    delete a2wrapper;
    delete[] a2;
    delete[]myarray;
    delete kernel;
    delete cl;
}
Example #29
0
TEST(testfloatarray, main) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", "");
    float in[5];
    float inout[5];
    float out[5];
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3;
        inout[i] = i * 3;
    }
    kernel->in(5, in);
    kernel->out(5, out);
    kernel->inout(5, inout);
    kernel->run_1d(5, 5);

    for(int i = 0; i < 5; i++) {
        cout << out[i] << " ";
    }
    cout << endl;
    for(int i = 0; i < 5; i++) {
        cout << inout[i] << " ";
    }
    cout << endl;

    assertEquals(inout[0], 7);
    assertEquals(inout[1] , 10);
    assertEquals(inout[2] , 34);
    assertEquals(inout[3] , 16);
    assertEquals(inout[4], 19);
    assertEquals(out[0] , 5);
    assertEquals(out[1] , 8);
    assertEquals(out[2] , 26);
    assertEquals(out[3] , 14);
    assertEquals(out[4] , 17);
    cout << "tests completed ok" << endl;

    delete kernel;
    delete cl;
}
Example #30
0
void im2col(THClState *state, THClTensor* im, const int channels,
    const int height, const int width, const int ksize_h, const int ksize_w, const int pad_h,
    const int pad_w, const int stride_h, const int stride_w, THClTensor* col) {
  // We are going to launch channels * height_col * width_col kernels, each
  // kernel responsible for copying a single-channel grid.
  int height_col = (height + 2 * pad_h - ksize_h) / stride_h + 1;
  int width_col = (width + 2 * pad_w - ksize_w) / stride_w + 1;
  int num_kernels = channels * height_col * width_col;

  std::string uniqueName = "SpatialConvolutionMM::im2col";
  EasyCL *cl = im->storage->cl;
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel(uniqueName, "SpatialConvolutionMM.cl",
      SpatialConvolutionMM_getKernelTemplate(), "im2col_kernel");
  }

  THClKernels k(state, kernel);
  k.in(num_kernels);
  k.in(im);
  k.in(height);
  k.in(width);
  k.in(ksize_h);
  k.in(ksize_w);
  k.in(pad_h);
  k.in(pad_w);
  k.in(stride_h);
  k.in(stride_w);
  k.in(height_col);
  k.in(width_col);
  k.out(col);

  k.run(GET_BLOCKS(state, num_kernels), getNumThreads(state));

  // Launch
//  im2col_kernel <<<GET_BLOCKS(num_kernels), CL_NUM_THREADS, 0, stream>>> (
//      num_kernels, data_im, height, width, ksize_h, ksize_w,
//      pad_h, pad_w, stride_h, stride_w,
//      height_col, width_col, data_col
//  );
}