Beispiel #1
0
// this runs an entire kernel to get one value.  Clearly this is going to be pretty slow, but
// at least it's more or less compatible, and comparable, to how cutorch does it
// lgfgs expects a working implementation of this method
float THClStorage_get(THClState *state, const THClStorage *self, long index)
{
////  printf("THClStorage_get\n");
  THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds");
  THArgCheck(self->wrapper != 0, 1, "storage argument not initialized, is empty");

//  if( self->wrapper->isDeviceDirty() ) {
//    if(state->trace) cout << "wrapper->copyToHost()" << endl;
//    self->wrapper->copyToHost();
//  }
//  return self->data[index];

  const char *uniqueName = __FILE__ ":get";
  EasyCL *cl = self->cl; // cant remember if this is a good idea or not :-P
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel( uniqueName, __FILE__, getGetKernelSource(), "THClStorageGet" );
  }

  float res;
  kernel->out(1, &res);
  kernel->in(self->wrapper);
  kernel->in((int64_t)index);
  kernel->run_1d(1, 1);

  if(state->addFinish) cl->finish();
  return res;
}
Beispiel #2
0
// this runs an entire kernel to get one value.  Clearly this is going to be pretty slow, but
// at least it's more or less compatible, and comparable, to how cutorch does it
void THClStorage_set(THClState *state, THClStorage *self, long index, float value)
{
////  cout << "set size=" << self->size << " index=" << index << " value=" << value << endl;
  THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds");
  THArgCheck(self->wrapper != 0, 1, "storage argument not initialized, is empty");
//  if( self->wrapper->isDeviceDirty() ) { // we have to do this, since we're going to copy it all back again
//                                         // although I suppose we could set via a kernel perhaps
//                                         // either way, this function is pretty inefficient right now :-P
//    if(state->trace) cout << "wrapper->copyToHost() size " << self->size << endl;
//    self->wrapper->copyToHost();
//  }
//  self->data[index] = value;
//  if(state->trace) cout << "wrapper->copyToDevice() size " << self->size << endl;
//  self->wrapper->copyToDevice();

  const char *uniqueName = __FILE__ ":set";
  EasyCL *cl = self->cl; // cant remember if this is a good idea or not :-P
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel( uniqueName, __FILE__, getSetKernelSource(), "THClStorageSet" );
  }

  kernel->inout(self->wrapper);
  kernel->in((int64_t)index);
  kernel->in(value);
  kernel->run_1d(1, 1);

  if(state->addFinish) cl->finish();
}
Beispiel #3
0
TEST( testCopyBlock, basic ) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();

    float a[] = { 1,2,3,4,
                  5,6,7,8,
                  9,10,11,12 };
    float b[10];
    memset(b, 0, sizeof(float)*10);

    CLKernel *kernel = makeBasicKernel( cl );
    kernel->in( 12, a )->out( 6, b )->in( ( 3<<10)|4)->in( (0<<10)|1)->in((2<<10)|3);
    kernel->localFloats( 2 * 3 );
    kernel->run_1d(12,4);
//    kernel->run_1d(12,12);
    cl->finish();
    float expected[] = { 2,3,4,
                         6,7,8 }; 
    for( int i = 0; i < 2; i++ ) {
        for( int j = 0; j < 3; j++ ) {
            cout << b[i*3+j] << " ";
            EXPECT_EQ( expected[i*3+j], b[i*3+j] );
        }
        cout << endl;
    }
    cout << endl;
    for( int i = 6; i < 10; i++ ) {
        cout << b[i] << " ";
        EXPECT_EQ( 0, b[i] );
    }
    cout << endl;
        cout << endl;

    kernel->in( 12, a )->out( 6, b )->in( ( 3<<10)|4)->in( (1<<10)|0)->in((2<<10)|3);
    kernel->localFloats( 2 * 3 );
//    kernel->run_1d(12,4);
    kernel->run_1d(12,4);
    cl->finish();
    float expected2[] = { 5,6,7,
                         9,10,11 }; 
    for( int i = 0; i < 2; i++ ) {
        for( int j = 0; j < 3; j++ ) {
            cout << b[i*3+j] << " ";
            EXPECT_EQ( expected2[i*3+j], b[i*3+j] );
        }
        cout << endl;
    }
    cout << endl;
    for( int i = 6; i < 10; i++ ) {
        cout << b[i] << " ";
        EXPECT_EQ( 0, b[i] );
    }
    cout << endl;
        cout << endl;

    delete kernel;
    delete cl;
}
Beispiel #4
0
TEST(SLOW_testlocal, selfdot_3levels_withoutscratch) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "selfdot_ints_withoutscratch");
    int workgroupSize = min(512, cl->getMaxWorkgroupSize());
    const int numWorkgroups = workgroupSize;
    const int level3size = numWorkgroups / 4;
    const int N = workgroupSize * numWorkgroups * level3size;
    cout << "numworkgroups " << numWorkgroups << " workgroupsize " << workgroupSize << " N " << N << endl;
    int *myarray = new int[N];
    for(int i = 0; i < N; i++) {
        myarray[i] = ((i + 7) * 3) % 5;
    }

//    Timer timer;

    CLWrapper *a1wrapper = cl->wrap(N, myarray);
    a1wrapper->copyToDevice();
//    timer.timeCheck("copied array to device");
    int *second = new int[N];
    CLWrapper *secondwrapper = cl->wrap(N, second);
    int *a2 = new int[numWorkgroups*level3size];
    CLWrapper *a2wrapper = cl->wrap(numWorkgroups * level3size, a2);
    kernel->in(a1wrapper);
    kernel->out(secondwrapper);
    kernel->out(a2wrapper);
    kernel->run_1d(N, workgroupSize);
    cl->finish();

    int *a3 = new int[numWorkgroups];
    CLWrapper *a3wrapper = cl->wrap(level3size, a3);
    kernel->in(a2wrapper);
    kernel->out(secondwrapper);
    kernel->out(a3wrapper);
    kernel->run_1d(workgroupSize * level3size, workgroupSize);
    cl->finish();

    int finalSum;
    kernel->in(a3wrapper);
    kernel->out(secondwrapper);
    kernel->out(1, &finalSum);
    kernel->run_1d(level3size, level3size);
//    timer.timeCheck("finished 3-level reduce");

    EXPECT_EQ(-1306309159, finalSum);

    delete a1wrapper;
    delete a2wrapper;
    delete a3wrapper;
    delete secondwrapper;
    delete[] a3;
    delete[] second;
    delete[] a2;
    delete[]myarray;
    delete kernel;
    delete cl;
}
Beispiel #5
0
TEST(testlocal, reduceviascratch_multipleworkgroups_ints) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "reduceViaScratch_multipleworkgroups_ints");
    int workgroupSize = min(512, cl->getMaxWorkgroupSize());
    const int numWorkgroups = workgroupSize;
    const int N = workgroupSize * numWorkgroups;
    cout << "numworkgroups " << numWorkgroups << " workgroupsize " << workgroupSize << " N " << N << endl;
    int *myarray = new int[N];
    int sumViaCpu = 0;
    int localSumViaCpu = 0;
    int localSumViaCpu2 = 0;
    int *localSumsViaCpu = new int[numWorkgroups];
    memset(localSumsViaCpu, 0, sizeof(int)*numWorkgroups);
    for(int i = 0; i < N; i++) {
        myarray[i] = ((i + 7) * 3) % 50;
        sumViaCpu += myarray[i];
        if(i < workgroupSize) {
            localSumViaCpu += myarray[i];
        }
        if(i >= workgroupSize && i < workgroupSize * 2) {
            localSumViaCpu2 += myarray[i];
        }
        int workgroupId = i / workgroupSize;
        localSumsViaCpu[workgroupId] += myarray[i];
    }
    ASSERT_EQ(localSumViaCpu, localSumsViaCpu[0]);
    ASSERT_EQ(localSumViaCpu2, localSumsViaCpu[1]);
    ASSERT_NE(myarray[0], sumViaCpu);

//    Timer timer;

    CLWrapper *a1wrapper = cl->wrap(N, myarray);
    a1wrapper->copyToDevice();
    int *a2 = new int[numWorkgroups];
    CLWrapper *a2wrapper = cl->wrap(numWorkgroups, a2);
    kernel->in(a1wrapper);
    kernel->out(a2wrapper);
    kernel->localInts(workgroupSize);
    kernel->run_1d(N, workgroupSize);

    int finalSum;
    kernel->in(a2wrapper);
    kernel->out(1, &finalSum);
    kernel->localInts(workgroupSize);
    kernel->run_1d(numWorkgroups, workgroupSize);
//    timer.timeCheck("finished 2-way reduce");

    EXPECT_EQ(sumViaCpu, finalSum);

    delete a1wrapper;
    delete a2wrapper;
    delete[] a2;
    delete[]myarray;
    delete kernel;
    delete cl;
}
TEST(test_scenario_te42kyfo, main) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", "");
    CLArrayFloat *out = cl->arrayFloat(5);
    CLArrayFloat *in = cl->arrayFloat(5);
    for(int i = 0; i < 5; i++) {
       (*out)[i] = 0;
    }
    for(int i = 0; i < 100; i++) {
      for(int n = 0; n < 5; n++) {
        (*in)[n] = i*n;
      }
      kernel->in(in);
      kernel->out(out);
      kernel->run_1d(5, 5);
      assertEquals(i*2 + 5, (*out)[2]);
      assertEquals(i*4 + 5, (*out)[4]);
    }
    cout << "finished" << endl;

    delete in;
    delete out;
    delete kernel;
    delete cl;
}
Beispiel #7
0
TEST( SLOW_testintwrapper_huge, testread ) {
    Timer timer;
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test_read");
//    const int N = 4500000;
//    const int N = (4500000/512)*512;
    int N = 100000;
    int *out = new int[N];
    CLWrapper *outwrapper = cl->wrap(N, out);
    kernel->in(3)->in(7);
    kernel->output( outwrapper );
    int globalSize = N;
    int workgroupsize = cl->getMaxWorkgroupSize();
    globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize;
    cout << "globalsize: " << globalSize << " workgroupsize " << workgroupsize << endl;
    timer.timeCheck("before kernel");
    kernel->run_1d( globalSize, workgroupsize );
    timer.timeCheck("after kernel");
    outwrapper->copyToHost();
    timer.timeCheck("after copy to host");
    for( int i = 0; i < N; i++ ) {
       if( out[i] != 4228 ) {
           cout << "out[" << i << "] != 4228: " << out[i] << endl;
           exit(-1);
       }
    }

    delete outwrapper;
    delete kernel;
    delete cl;
}
Beispiel #8
0
TEST( testCopyLocal, basic ) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();

    float a[] = { 1,2,3,4,
                  5,6,7,8,
                  9,10,11,12 };
    float b[16];
    memset(b, 0, sizeof(float)*16);

    CLKernel *kernel = makeKernel( cl );
    kernel->in( 12, a )->out( 16, b )->in( 12 );
    kernel->localFloats( 12 );
    kernel->run_1d(12,12);
    cl->finish();
    for( int i = 0; i < 3; i++ ) {
        for( int j = 0; j < 4; j++ ) {
            cout << b[i*4+j] << " ";
            EXPECT_EQ( i * 4 + j + 1, b[i*4+j] );
        }
        cout << endl;
    }
    cout << endl;
    for( int i = 12; i < 16; i++ ) {
        cout << b[i] << " ";
        EXPECT_EQ( 0, b[i] );
    }
    cout << endl;

    delete kernel;
    delete cl;
}
Beispiel #9
0
void CopyBuffer::copy( EasyCL *cl, CLWrapper *sourceWrapper, int *target ) {
    // first we will copy it to another buffer, so we can copy it out
    int bufferSize = sourceWrapper->size();
//    float *copiedBuffer = new float[ bufferSize ];
    CLWrapper *targetWrapper = cl->wrap( bufferSize, target );
    targetWrapper->createOnDevice();

    // now copy it, via a kernel
    const string kernelSource = "\n"
        "kernel void copy( int N, global int const *source, global int *dest ) {\n"
          "  #define globalId ( get_global_id(0) )\n"
          "  if( (int)globalId < N ) {\n"
          "      dest[globalId] = source[globalId];\n"
          "  }\n"
       " }\n";
    CLKernel *kernel = cl->buildKernelFromString( kernelSource, "copy", "" );
    kernel->in( bufferSize )->in( sourceWrapper )->out( targetWrapper );
    int workgroupSize = 32;
    int numWorkgroups = ( bufferSize + workgroupSize - 1 ) / workgroupSize;
    kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize );
    cl->finish();
    targetWrapper->copyToHost();

    delete targetWrapper;
    delete kernel;
//    delete[] copiedBuffer;
}
Beispiel #10
0
TEST(testlocal, reduceviascratch_multipleworkgroups) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "reduceViaScratch_multipleworkgroups");
    int workgroupSize = min(512, cl->getMaxWorkgroupSize());
    const int numWorkgroups = workgroupSize;
    const int N = workgroupSize * numWorkgroups;
    float *myarray = new float[N];
    float sumViaCpu = 0;
    float localSumViaCpu = 0;
    for(int i = 0; i < N; i++) {
        myarray[i] = ((i + 7) * 3) % 10;
        sumViaCpu += myarray[i];
        if(i < workgroupSize) {
            localSumViaCpu += myarray[i];
        }
    }
    cout << "expected sum, calc'd via cpu, : " << sumViaCpu << endl;
    EXPECT_NE(myarray[0], sumViaCpu);

//    Timer timer;

    CLWrapper *a1wrapper = cl->wrap(N, myarray);
    a1wrapper->copyToDevice();
    float *a2 = new float[numWorkgroups];
    CLWrapper *a2wrapper = cl->wrap(numWorkgroups, a2);
    kernel->in(a1wrapper);
    kernel->out(a2wrapper);
    kernel->localFloats(workgroupSize);
    kernel->run_1d(N, workgroupSize);

    float finalSum;
    kernel->in(a2wrapper);
    kernel->out(1, &finalSum);
    kernel->localFloats(workgroupSize);
    kernel->run_1d(numWorkgroups, workgroupSize);

    EXPECT_EQ(sumViaCpu, finalSum);

    delete a1wrapper;
    delete a2wrapper;
    delete[] a2;
    delete[]myarray;
    delete kernel;
    delete cl;
}
Beispiel #11
0
VIRTUAL void GpuOp::apply2_inplace( int N, CLWrapper*destinationWrapper, CLWrapper *deltaWrapper, Op2 *op ) {
    StatefulTimer::instance()->timeCheck("GpuOp::apply inplace start" );

    string kernelName = "GpuOp::" + op->getName() + "_inplace";
    if( !cl->kernelExists( kernelName ) ) {
        buildKernel( kernelName, op, true );
    }
    CLKernel *kernel = cl->getKernel( kernelName );

    kernel->in( N );
    kernel->inout( destinationWrapper );
    kernel->in( deltaWrapper );
    int globalSize = N;
    int workgroupSize = 64;
    int numWorkgroups = ( globalSize + workgroupSize - 1 ) / workgroupSize;
    kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize );
    cl->finish();

    StatefulTimer::instance()->timeCheck("GpuOp::apply inplace end" );
}
Beispiel #12
0
TEST(testlocal, notUselocal) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "notUseLocal");
    int workgroupSize = 64;
    float *myarray = new float[workgroupSize];

    kernel->in(workgroupSize);
    kernel->inout(workgroupSize, myarray);
    kernel->run_1d(workgroupSize, workgroupSize);

    delete[]myarray;
    delete kernel;
    delete cl;
}
Beispiel #13
0
TEST(testfloatarray, main) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", "");
    float in[5];
    float inout[5];
    float out[5];
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3;
        inout[i] = i * 3;
    }
    kernel->in(5, in);
    kernel->out(5, out);
    kernel->inout(5, inout);
    kernel->run_1d(5, 5);

    for(int i = 0; i < 5; i++) {
        cout << out[i] << " ";
    }
    cout << endl;
    for(int i = 0; i < 5; i++) {
        cout << inout[i] << " ";
    }
    cout << endl;

    assertEquals(inout[0], 7);
    assertEquals(inout[1] , 10);
    assertEquals(inout[2] , 34);
    assertEquals(inout[3] , 16);
    assertEquals(inout[4], 19);
    assertEquals(out[0] , 5);
    assertEquals(out[1] , 8);
    assertEquals(out[2] , 26);
    assertEquals(out[3] , 14);
    assertEquals(out[4] , 17);
    cout << "tests completed ok" << endl;

    delete kernel;
    delete cl;
}
Beispiel #14
0
TEST( testCopyBlock, testPos ) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();

    float in[12];
    float res[12];
    in[0] = ( 3 << 10 ) | 4;
    in[1] = 8;
    in[2] = 14;

    for( int i = 0; i < 3; i++ ) {
        cout << "in[" << i << "]=" << in[i] << endl;
    }
    
    CLKernel *kernel = makeTestPosKernel( cl );
    kernel->in( 12, in )->out( 12, res );
    kernel->run_1d(1,1); 
    cl->finish();

    for( int i = 0; i < 5; i++ ) {
        cout << "res[" << i << "]=" << res[i] << endl;
    }
}
Beispiel #15
0
int main(int argc, char *argv[]) 
{
	const int test_size = 128;

	std::random_device rd;
	std::seed_seq s{ rd(), rd(), rd(), rd(), rd(), rd(), rd(), rd() };
	std::mt19937 mt(s);

	EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();

	CLKernel *kern = cl->buildKernel("test.cl", "test");

	std::vector<uint32_t> inbuf(test_size*4), outbuf(test_size), compare(test_size);
	std::generate(inbuf.begin(), inbuf.end(), mt);

	std::cout << "Running CL implementation" << std::endl;
	kern->in(inbuf.size(), &inbuf[0]);
	kern->out(outbuf.size(), &outbuf[0]);
	size_t global_size[] = { test_size };
	kern->run(1, global_size, nullptr);
	delete kern;

	std::cout << "Running local implementation" << std::endl;
	for (int i = 0; i < compare.size(); ++i) {
		compare[i] = inbuf[i] ^ inbuf[i + 1] ^ inbuf[i + 2] ^ inbuf[i + 3];
	}

	std::cout << "Comparing CL test with local implementation" << std::endl;
	for (int i = 0; i < compare.size(); ++i) {
		if (outbuf[i] != compare[i]) {
			std::cout << "Error in index " << i << " " << outbuf[i] << " != " << compare[i] << std::endl;
		}
	}

	return 0;
}