TEST( testCopyBlock, basic ) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); float a[] = { 1,2,3,4, 5,6,7,8, 9,10,11,12 }; float b[10]; memset(b, 0, sizeof(float)*10); CLKernel *kernel = makeBasicKernel( cl ); kernel->in( 12, a )->out( 6, b )->in( ( 3<<10)|4)->in( (0<<10)|1)->in((2<<10)|3); kernel->localFloats( 2 * 3 ); kernel->run_1d(12,4); // kernel->run_1d(12,12); cl->finish(); float expected[] = { 2,3,4, 6,7,8 }; for( int i = 0; i < 2; i++ ) { for( int j = 0; j < 3; j++ ) { cout << b[i*3+j] << " "; EXPECT_EQ( expected[i*3+j], b[i*3+j] ); } cout << endl; } cout << endl; for( int i = 6; i < 10; i++ ) { cout << b[i] << " "; EXPECT_EQ( 0, b[i] ); } cout << endl; cout << endl; kernel->in( 12, a )->out( 6, b )->in( ( 3<<10)|4)->in( (1<<10)|0)->in((2<<10)|3); kernel->localFloats( 2 * 3 ); // kernel->run_1d(12,4); kernel->run_1d(12,4); cl->finish(); float expected2[] = { 5,6,7, 9,10,11 }; for( int i = 0; i < 2; i++ ) { for( int j = 0; j < 3; j++ ) { cout << b[i*3+j] << " "; EXPECT_EQ( expected2[i*3+j], b[i*3+j] ); } cout << endl; } cout << endl; for( int i = 6; i < 10; i++ ) { cout << b[i] << " "; EXPECT_EQ( 0, b[i] ); } cout << endl; cout << endl; delete kernel; delete cl; }
TEST(SLOW_testlocal, selfdot_3levels_withoutscratch) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); CLKernel *kernel = cl->buildKernel("testlocal.cl", "selfdot_ints_withoutscratch"); int workgroupSize = min(512, cl->getMaxWorkgroupSize()); const int numWorkgroups = workgroupSize; const int level3size = numWorkgroups / 4; const int N = workgroupSize * numWorkgroups * level3size; cout << "numworkgroups " << numWorkgroups << " workgroupsize " << workgroupSize << " N " << N << endl; int *myarray = new int[N]; for(int i = 0; i < N; i++) { myarray[i] = ((i + 7) * 3) % 5; } // Timer timer; CLWrapper *a1wrapper = cl->wrap(N, myarray); a1wrapper->copyToDevice(); // timer.timeCheck("copied array to device"); int *second = new int[N]; CLWrapper *secondwrapper = cl->wrap(N, second); int *a2 = new int[numWorkgroups*level3size]; CLWrapper *a2wrapper = cl->wrap(numWorkgroups * level3size, a2); kernel->in(a1wrapper); kernel->out(secondwrapper); kernel->out(a2wrapper); kernel->run_1d(N, workgroupSize); cl->finish(); int *a3 = new int[numWorkgroups]; CLWrapper *a3wrapper = cl->wrap(level3size, a3); kernel->in(a2wrapper); kernel->out(secondwrapper); kernel->out(a3wrapper); kernel->run_1d(workgroupSize * level3size, workgroupSize); cl->finish(); int finalSum; kernel->in(a3wrapper); kernel->out(secondwrapper); kernel->out(1, &finalSum); kernel->run_1d(level3size, level3size); // timer.timeCheck("finished 3-level reduce"); EXPECT_EQ(-1306309159, finalSum); delete a1wrapper; delete a2wrapper; delete a3wrapper; delete secondwrapper; delete[] a3; delete[] second; delete[] a2; delete[]myarray; delete kernel; delete cl; }
TEST(testlocal, reduceviascratch_multipleworkgroups_ints) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); CLKernel *kernel = cl->buildKernel("testlocal.cl", "reduceViaScratch_multipleworkgroups_ints"); int workgroupSize = min(512, cl->getMaxWorkgroupSize()); const int numWorkgroups = workgroupSize; const int N = workgroupSize * numWorkgroups; cout << "numworkgroups " << numWorkgroups << " workgroupsize " << workgroupSize << " N " << N << endl; int *myarray = new int[N]; int sumViaCpu = 0; int localSumViaCpu = 0; int localSumViaCpu2 = 0; int *localSumsViaCpu = new int[numWorkgroups]; memset(localSumsViaCpu, 0, sizeof(int)*numWorkgroups); for(int i = 0; i < N; i++) { myarray[i] = ((i + 7) * 3) % 50; sumViaCpu += myarray[i]; if(i < workgroupSize) { localSumViaCpu += myarray[i]; } if(i >= workgroupSize && i < workgroupSize * 2) { localSumViaCpu2 += myarray[i]; } int workgroupId = i / workgroupSize; localSumsViaCpu[workgroupId] += myarray[i]; } ASSERT_EQ(localSumViaCpu, localSumsViaCpu[0]); ASSERT_EQ(localSumViaCpu2, localSumsViaCpu[1]); ASSERT_NE(myarray[0], sumViaCpu); // Timer timer; CLWrapper *a1wrapper = cl->wrap(N, myarray); a1wrapper->copyToDevice(); int *a2 = new int[numWorkgroups]; CLWrapper *a2wrapper = cl->wrap(numWorkgroups, a2); kernel->in(a1wrapper); kernel->out(a2wrapper); kernel->localInts(workgroupSize); kernel->run_1d(N, workgroupSize); int finalSum; kernel->in(a2wrapper); kernel->out(1, &finalSum); kernel->localInts(workgroupSize); kernel->run_1d(numWorkgroups, workgroupSize); // timer.timeCheck("finished 2-way reduce"); EXPECT_EQ(sumViaCpu, finalSum); delete a1wrapper; delete a2wrapper; delete[] a2; delete[]myarray; delete kernel; delete cl; }
TEST( testCopyLocal, basic ) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); float a[] = { 1,2,3,4, 5,6,7,8, 9,10,11,12 }; float b[16]; memset(b, 0, sizeof(float)*16); CLKernel *kernel = makeKernel( cl ); kernel->in( 12, a )->out( 16, b )->in( 12 ); kernel->localFloats( 12 ); kernel->run_1d(12,12); cl->finish(); for( int i = 0; i < 3; i++ ) { for( int j = 0; j < 4; j++ ) { cout << b[i*4+j] << " "; EXPECT_EQ( i * 4 + j + 1, b[i*4+j] ); } cout << endl; } cout << endl; for( int i = 12; i < 16; i++ ) { cout << b[i] << " "; EXPECT_EQ( 0, b[i] ); } cout << endl; delete kernel; delete cl; }
void CopyBuffer::copy( EasyCL *cl, CLWrapper *sourceWrapper, int *target ) { // first we will copy it to another buffer, so we can copy it out int bufferSize = sourceWrapper->size(); // float *copiedBuffer = new float[ bufferSize ]; CLWrapper *targetWrapper = cl->wrap( bufferSize, target ); targetWrapper->createOnDevice(); // now copy it, via a kernel const string kernelSource = "\n" "kernel void copy( int N, global int const *source, global int *dest ) {\n" " #define globalId ( get_global_id(0) )\n" " if( (int)globalId < N ) {\n" " dest[globalId] = source[globalId];\n" " }\n" " }\n"; CLKernel *kernel = cl->buildKernelFromString( kernelSource, "copy", "" ); kernel->in( bufferSize )->in( sourceWrapper )->out( targetWrapper ); int workgroupSize = 32; int numWorkgroups = ( bufferSize + workgroupSize - 1 ) / workgroupSize; kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize ); cl->finish(); targetWrapper->copyToHost(); delete targetWrapper; delete kernel; // delete[] copiedBuffer; }
// this runs an entire kernel to get one value. Clearly this is going to be pretty slow, but // at least it's more or less compatible, and comparable, to how cutorch does it // lgfgs expects a working implementation of this method float THClStorage_get(THClState *state, const THClStorage *self, long index) { //// printf("THClStorage_get\n"); THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds"); THArgCheck(self->wrapper != 0, 1, "storage argument not initialized, is empty"); // if( self->wrapper->isDeviceDirty() ) { // if(state->trace) cout << "wrapper->copyToHost()" << endl; // self->wrapper->copyToHost(); // } // return self->data[index]; const char *uniqueName = __FILE__ ":get"; EasyCL *cl = self->cl; // cant remember if this is a good idea or not :-P CLKernel *kernel = 0; if(cl->kernelExists(uniqueName)) { kernel = cl->getKernel(uniqueName); } else { TemplatedKernel kernelBuilder(cl); kernel = kernelBuilder.buildKernel( uniqueName, __FILE__, getGetKernelSource(), "THClStorageGet" ); } float res; kernel->out(1, &res); kernel->in(self->wrapper); kernel->in((int64_t)index); kernel->run_1d(1, 1); if(state->addFinish) cl->finish(); return res; }
// this runs an entire kernel to get one value. Clearly this is going to be pretty slow, but // at least it's more or less compatible, and comparable, to how cutorch does it void THClStorage_set(THClState *state, THClStorage *self, long index, float value) { //// cout << "set size=" << self->size << " index=" << index << " value=" << value << endl; THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds"); THArgCheck(self->wrapper != 0, 1, "storage argument not initialized, is empty"); // if( self->wrapper->isDeviceDirty() ) { // we have to do this, since we're going to copy it all back again // // although I suppose we could set via a kernel perhaps // // either way, this function is pretty inefficient right now :-P // if(state->trace) cout << "wrapper->copyToHost() size " << self->size << endl; // self->wrapper->copyToHost(); // } // self->data[index] = value; // if(state->trace) cout << "wrapper->copyToDevice() size " << self->size << endl; // self->wrapper->copyToDevice(); const char *uniqueName = __FILE__ ":set"; EasyCL *cl = self->cl; // cant remember if this is a good idea or not :-P CLKernel *kernel = 0; if(cl->kernelExists(uniqueName)) { kernel = cl->getKernel(uniqueName); } else { TemplatedKernel kernelBuilder(cl); kernel = kernelBuilder.buildKernel( uniqueName, __FILE__, getSetKernelSource(), "THClStorageSet" ); } kernel->inout(self->wrapper); kernel->in((int64_t)index); kernel->in(value); kernel->run_1d(1, 1); if(state->addFinish) cl->finish(); }
TEST( SLOW_testintwrapper_huge, testread ) { Timer timer; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test_read"); // const int N = 4500000; // const int N = (4500000/512)*512; int N = 100000; int *out = new int[N]; CLWrapper *outwrapper = cl->wrap(N, out); kernel->in(3)->in(7); kernel->output( outwrapper ); int globalSize = N; int workgroupsize = cl->getMaxWorkgroupSize(); globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize; cout << "globalsize: " << globalSize << " workgroupsize " << workgroupsize << endl; timer.timeCheck("before kernel"); kernel->run_1d( globalSize, workgroupsize ); timer.timeCheck("after kernel"); outwrapper->copyToHost(); timer.timeCheck("after copy to host"); for( int i = 0; i < N; i++ ) { if( out[i] != 4228 ) { cout << "out[" << i << "] != 4228: " << out[i] << endl; exit(-1); } } delete outwrapper; delete kernel; delete cl; }
TEST(test_scenario_te42kyfo, main) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", ""); CLArrayFloat *out = cl->arrayFloat(5); CLArrayFloat *in = cl->arrayFloat(5); for(int i = 0; i < 5; i++) { (*out)[i] = 0; } for(int i = 0; i < 100; i++) { for(int n = 0; n < 5; n++) { (*in)[n] = i*n; } kernel->in(in); kernel->out(out); kernel->run_1d(5, 5); assertEquals(i*2 + 5, (*out)[2]); assertEquals(i*4 + 5, (*out)[4]); } cout << "finished" << endl; delete in; delete out; delete kernel; delete cl; }
TEST(testfloatwrapperconst, main) { if(!EasyCL::isOpenCLAvailable()) { cout << "opencl library not found" << endl; exit(-1); } cout << "found opencl library" << endl; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", ""); float in[5]; for(int i = 0; i < 5; i++) { in[i] = i * 3; } float out[5]; CLWrapper *inwrapper = cl->wrap(5, (float const *)in); CLWrapper *outwrapper = cl->wrap(5, out); inwrapper->copyToDevice(); kernel->input(inwrapper); kernel->output(outwrapper); kernel->run_1d(5, 5); outwrapper->copyToHost(); assertEquals(out[0] , 7); assertEquals(out[1] , 10); assertEquals(out[2] , 13); assertEquals(out[3] , 16); assertEquals(out[4] , 19); cout << "tests completed ok" << endl; delete inwrapper; delete outwrapper; delete kernel; delete cl; }
TEST( SLOW_testintwrapper_huge, testreadwrite ) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test_stress"); const int N = 1000000; int *in = new int[N]; for( int i = 0; i < N; i++ ) { in[i] = i * 3; } int *out = new int[N]; CLWrapper *inwrapper = cl->wrap(N, in); CLWrapper *outwrapper = cl->wrap(N, out); inwrapper->copyToDevice(); outwrapper->createOnDevice(); kernel->input( inwrapper ); kernel->output( outwrapper ); int globalSize = N; int workgroupsize = cl->getMaxWorkgroupSize(); globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize; cout << "globalsize: " << globalSize << " workgroupsize " << workgroupsize << endl; kernel->run_1d( globalSize, workgroupsize ); outwrapper->copyToHost(); for( int i = 0; i < N; i++ ) { if( out[i] != 689514 ) { cout << "out[" << i << "] != 689514: " << out[i] << endl; exit(-1); } } delete outwrapper; delete inwrapper; delete kernel; delete cl; }
TEST(testlocal, reduceviascratch_multipleworkgroups) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); CLKernel *kernel = cl->buildKernel("testlocal.cl", "reduceViaScratch_multipleworkgroups"); int workgroupSize = min(512, cl->getMaxWorkgroupSize()); const int numWorkgroups = workgroupSize; const int N = workgroupSize * numWorkgroups; float *myarray = new float[N]; float sumViaCpu = 0; float localSumViaCpu = 0; for(int i = 0; i < N; i++) { myarray[i] = ((i + 7) * 3) % 10; sumViaCpu += myarray[i]; if(i < workgroupSize) { localSumViaCpu += myarray[i]; } } cout << "expected sum, calc'd via cpu, : " << sumViaCpu << endl; EXPECT_NE(myarray[0], sumViaCpu); // Timer timer; CLWrapper *a1wrapper = cl->wrap(N, myarray); a1wrapper->copyToDevice(); float *a2 = new float[numWorkgroups]; CLWrapper *a2wrapper = cl->wrap(numWorkgroups, a2); kernel->in(a1wrapper); kernel->out(a2wrapper); kernel->localFloats(workgroupSize); kernel->run_1d(N, workgroupSize); float finalSum; kernel->in(a2wrapper); kernel->out(1, &finalSum); kernel->localFloats(workgroupSize); kernel->run_1d(numWorkgroups, workgroupSize); EXPECT_EQ(sumViaCpu, finalSum); delete a1wrapper; delete a2wrapper; delete[] a2; delete[]myarray; delete kernel; delete cl; }
TEST( testdefines, simple ) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); CLKernel *kernel = cl->buildKernel("testdefines.cl", "testDefines", "-D DOUBLE -D SOME_VALUE=5" ); float out[32]; kernel->out( 32, out ); kernel->run_1d( 32, 32 ); EXPECT_EQ( 10, out[3] ); delete kernel; delete cl; }
TEST(testlocal, notUselocal) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); CLKernel *kernel = cl->buildKernel("testlocal.cl", "notUseLocal"); int workgroupSize = 64; float *myarray = new float[workgroupSize]; kernel->in(workgroupSize); kernel->inout(workgroupSize, myarray); kernel->run_1d(workgroupSize, workgroupSize); delete[]myarray; delete kernel; delete cl; }
TEST( testMemset, basic ) { EasyCL *cl = DeepCLGtestGlobals_createEasyCL(); CLKernel *kMemset = 0; // [[[cog // import stringify // stringify.write_kernel2( "kMemset", "cl/memset.cl", "cl_memset", '""' ) // ]]] // generated using cog, from cl/memset.cl: const char * kMemsetSource = "// Copyright Hugh Perkins 2015 hughperkins at gmail\n" "//\n" "// This Source Code Form is subject to the terms of the Mozilla Public License,\n" "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" "kernel void cl_memset(global float *target, const float value, const int N) {\n" " #define globalId get_global_id(0)\n" " if ((int)globalId < N) {\n" " target[globalId] = value;\n" " }\n" "}\n" "\n" ""; kMemset = cl->buildKernelFromString(kMemsetSource, "cl_memset", "", "cl/memset.cl"); // [[[end]]] int N = 10000; float *myArray = new float[N]; CLWrapper *myArrayWrapper = cl->wrap( N, myArray ); myArrayWrapper->createOnDevice(); kMemset->out( myArrayWrapper )->in( 99.0f )->in( N ); int workgroupSize = 64; kMemset->run_1d( ( N + workgroupSize - 1 ) / workgroupSize * workgroupSize, workgroupSize ); cl->finish(); myArrayWrapper->copyToHost(); for( int i = 0; i < 10; i++ ) { // cout << "myArray[" << i << "]=" << myArray[i] << endl; } for( int i = 0; i < N; i++ ) { EXPECT_EQ( 99.0f, myArray[i] ); } delete kMemset; delete cl; }
TEST(testfloatarray, main) { if(!EasyCL::isOpenCLAvailable()) { cout << "opencl library not found" << endl; exit(-1); } cout << "found opencl library" << endl; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", ""); float in[5]; float inout[5]; float out[5]; for(int i = 0; i < 5; i++) { in[i] = i * 3; inout[i] = i * 3; } kernel->in(5, in); kernel->out(5, out); kernel->inout(5, inout); kernel->run_1d(5, 5); for(int i = 0; i < 5; i++) { cout << out[i] << " "; } cout << endl; for(int i = 0; i < 5; i++) { cout << inout[i] << " "; } cout << endl; assertEquals(inout[0], 7); assertEquals(inout[1] , 10); assertEquals(inout[2] , 34); assertEquals(inout[3] , 16); assertEquals(inout[4], 19); assertEquals(out[0] , 5); assertEquals(out[1] , 8); assertEquals(out[2] , 26); assertEquals(out[3] , 14); assertEquals(out[4] , 17); cout << "tests completed ok" << endl; delete kernel; delete cl; }
VIRTUAL void GpuOp::apply1_inplace( int N, CLWrapper*destinationWrapper, Op1 *op ) { StatefulTimer::instance()->timeCheck("GpuOp::apply inplace start" ); string kernelName = "GpuOp::" + op->getName() + "_inplace"; if( !cl->kernelExists( kernelName ) ) { buildKernel( kernelName, op, true ); } CLKernel *kernel = cl->getKernel( kernelName ); kernel->in( N ); kernel->inout( destinationWrapper ); int globalSize = N; int workgroupSize = 64; int numWorkgroups = ( globalSize + workgroupSize - 1 ) / workgroupSize; kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize ); cl->finish(); StatefulTimer::instance()->timeCheck("GpuOp::apply inplace end" ); }
TEST( testCopyBlock, testPos ) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); float in[12]; float res[12]; in[0] = ( 3 << 10 ) | 4; in[1] = 8; in[2] = 14; for( int i = 0; i < 3; i++ ) { cout << "in[" << i << "]=" << in[i] << endl; } CLKernel *kernel = makeTestPosKernel( cl ); kernel->in( 12, in )->out( 12, res ); kernel->run_1d(1,1); cl->finish(); for( int i = 0; i < 5; i++ ) { cout << "res[" << i << "]=" << res[i] << endl; } }
TEST(testlocal, localreduce) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); CLKernel *kernel = cl->buildKernel("testlocal.cl", "reduceViaScratch"); int workgroupSize = min(512, cl->getMaxWorkgroupSize()); float *myarray = new float[workgroupSize]; // Timer timer; for(int i = 0; i < 2000; i++) { float sumViaCpu = 0; for(int i = 0; i < workgroupSize; i++) { myarray[i] = (i + 7) * 3; sumViaCpu += myarray[i]; } EXPECT_NE(myarray[0], sumViaCpu); kernel->inout(workgroupSize, myarray)->localFloats(workgroupSize); kernel->run_1d(workgroupSize, workgroupSize); EXPECT_EQ(myarray[0], sumViaCpu); } // timer.timeCheck("after iterations"); delete[]myarray; delete kernel; delete cl; }