// This runs an entire kernel just to set one value. Clearly this is going to be pretty slow,
// but at least it's more or less compatible with, and comparable to, how cutorch does it.
void THClStorage_set(THClState *state, THClStorage *self, long index, float value)
{
//  cout << "set size=" << self->size << " index=" << index << " value=" << value << endl;
  THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds");
  THArgCheck(self->wrapper != 0, 1, "storage argument not initialized, is empty");
//  if( self->wrapper->isDeviceDirty() ) { // we have to do this, since we're going to copy it all back again
//    // although I suppose we could set via a kernel perhaps
//    // either way, this function is pretty inefficient right now :-P
//    if(state->trace) cout << "wrapper->copyToHost() size " << self->size << endl;
//    self->wrapper->copyToHost();
//  }
//  self->data[index] = value;
//  if(state->trace) cout << "wrapper->copyToDevice() size " << self->size << endl;
//  self->wrapper->copyToDevice();
  const char *uniqueName = __FILE__ ":set";
  EasyCL *cl = self->cl; // cache the built kernel by name, so we only compile it once
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel(uniqueName, __FILE__,
      getSetKernelSource(), "THClStorageSet");
  }
  kernel->inout(self->wrapper);
  kernel->in((int64_t)index);
  kernel->in(value);
  kernel->run_1d(1, 1);
  if(state->addFinish) cl->finish();
}
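The kernel source itself comes from getSetKernelSource(), which isn't reproduced in this section. As a rough sketch of what a kernel matching the argument order above (storage buffer, then index, then value) could look like, hypothetical rather than the actual template:

// Sketch only: a hypothetical reconstruction, not the template returned by
// getSetKernelSource(). Launched with global size 1; the OpenCL `long`
// matches the int64_t index the host passes in.
kernel void THClStorageSet(global float *data, const long index, const float value) {
  if(get_global_id(0) == 0) {
    data[index] = value;
  }
}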
TEST(SLOW_testintwrapper_huge, testread) {
    Timer timer;
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test_read");
//    const int N = 4500000;
//    const int N = (4500000/512)*512;
    int N = 100000;
    int *out = new int[N];
    CLWrapper *outwrapper = cl->wrap(N, out);
    kernel->in(3)->in(7);
    kernel->output(outwrapper);
    int globalSize = N;
    int workgroupsize = cl->getMaxWorkgroupSize();
    // round the global size up to a whole number of workgroups
    globalSize = ((globalSize + workgroupsize - 1) / workgroupsize) * workgroupsize;
    cout << "globalsize: " << globalSize << " workgroupsize " << workgroupsize << endl;
    timer.timeCheck("before kernel");
    kernel->run_1d(globalSize, workgroupsize);
    timer.timeCheck("after kernel");
    outwrapper->copyToHost();
    timer.timeCheck("after copy to host");
    for(int i = 0; i < N; i++) {
        if(out[i] != 4228) {
            cout << "out[" << i << "] != 4228: " << out[i] << endl;
            exit(-1);
        }
    }
    delete outwrapper;
    delete kernel;
    delete[] out;
    delete cl;
}
TEST(test_scenario_te42kyfo, main) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", "");
    CLArrayFloat *out = cl->arrayFloat(5);
    CLArrayFloat *in = cl->arrayFloat(5);
    for(int i = 0; i < 5; i++) {
        (*out)[i] = 0;
    }
    for(int i = 0; i < 100; i++) {
        for(int n = 0; n < 5; n++) {
            (*in)[n] = i * n;
        }
        kernel->in(in);
        kernel->out(out);
        kernel->run_1d(5, 5);
        assertEquals(i * 2 + 5, (*out)[2]);
        assertEquals(i * 4 + 5, (*out)[4]);
    }
    cout << "finished" << endl;
    delete in;
    delete out;
    delete kernel;
    delete cl;
}
TEST(testinout, main) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", "");
    float inout[5];
    for(int i = 0; i < 5; i++) {
        inout[i] = i * 3;
    }
    kernel->inout(5, inout);
    size_t global = 5;
    size_t local = 5;
    kernel->run(1, &global, &local);
    assertEquals(inout[0], 7);
    assertEquals(inout[1], 10);
    assertEquals(inout[2], 13);
    assertEquals(inout[3], 16);
    assertEquals(inout[4], 19);
    cout << "tests completed ok" << endl;
    delete kernel;
    delete cl;
}
TEST(SLOW_testintwrapper_huge, testreadwrite) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test_stress");
    const int N = 1000000;
    int *in = new int[N];
    for(int i = 0; i < N; i++) {
        in[i] = i * 3;
    }
    int *out = new int[N];
    CLWrapper *inwrapper = cl->wrap(N, in);
    CLWrapper *outwrapper = cl->wrap(N, out);
    inwrapper->copyToDevice();
    outwrapper->createOnDevice();
    kernel->input(inwrapper);
    kernel->output(outwrapper);
    int globalSize = N;
    int workgroupsize = cl->getMaxWorkgroupSize();
    // round the global size up to a whole number of workgroups
    globalSize = ((globalSize + workgroupsize - 1) / workgroupsize) * workgroupsize;
    cout << "globalsize: " << globalSize << " workgroupsize " << workgroupsize << endl;
    kernel->run_1d(globalSize, workgroupsize);
    outwrapper->copyToHost();
    for(int i = 0; i < N; i++) {
        if(out[i] != 689514) {
            cout << "out[" << i << "] != 689514: " << out[i] << endl;
            exit(-1);
        }
    }
    delete outwrapper;
    delete inwrapper;
    delete kernel;
    delete[] out;
    delete[] in;
    delete cl;
}
// This runs an entire kernel just to get one value. Clearly this is going to be pretty slow,
// but at least it's more or less compatible with, and comparable to, how cutorch does it.
// lbfgs expects a working implementation of this method.
float THClStorage_get(THClState *state, const THClStorage *self, long index)
{
//  printf("THClStorage_get\n");
  THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds");
  THArgCheck(self->wrapper != 0, 1, "storage argument not initialized, is empty");
//  if( self->wrapper->isDeviceDirty() ) {
//    if(state->trace) cout << "wrapper->copyToHost()" << endl;
//    self->wrapper->copyToHost();
//  }
//  return self->data[index];
  const char *uniqueName = __FILE__ ":get";
  EasyCL *cl = self->cl; // cache the built kernel by name, so we only compile it once
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel(uniqueName, __FILE__,
      getGetKernelSource(), "THClStorageGet");
  }
  float res;
  kernel->out(1, &res);
  kernel->in(self->wrapper);
  kernel->in((int64_t)index);
  kernel->run_1d(1, 1);
  if(state->addFinish) cl->finish();
  return res;
}
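Correspondingly, a hypothetical sketch of what getGetKernelSource() might produce, matching the argument order above (result buffer, storage buffer, index); not the actual template:

// Sketch only: a single work-item copies one element into the one-float
// `res` buffer, which the host then reads back.
kernel void THClStorageGet(global float *res, global const float *data, const long index) {
  if(get_global_id(0) == 0) {
    res[0] = data[index];
  }
}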
TEST(testCLMathWrapper, multiplyinplace) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    float adat[] = { 1, 3, 9, 12.5f, 2.5f };
    CLWrapper *a_ = cl->wrap(5, adat);
    a_->copyToDevice();
    CLMathWrapper a(a_);
    // scale the wrapped buffer in place, on the device
    a *= 1.5f;
    a_->copyToHost();
    for(int i = 0; i < 5; i++) {
        cout << "a[" << i << "]=" << adat[i] << endl;
    }
    EXPECT_FLOAT_NEAR(1.5f, adat[0]);
    EXPECT_FLOAT_NEAR(4.5f, adat[1]);
    EXPECT_FLOAT_NEAR(3.75f, adat[4]);
    delete a_;
    delete cl;
}
TEST(testfloatwrapperconst, main) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", "");
    float in[5];
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3;
    }
    float out[5];
    CLWrapper *inwrapper = cl->wrap(5, (float const *)in);
    CLWrapper *outwrapper = cl->wrap(5, out);
    inwrapper->copyToDevice();
    kernel->input(inwrapper);
    kernel->output(outwrapper);
    kernel->run_1d(5, 5);
    outwrapper->copyToHost();
    assertEquals(out[0], 7);
    assertEquals(out[1], 10);
    assertEquals(out[2], 13);
    assertEquals(out[3], 16);
    assertEquals(out[4], 19);
    cout << "tests completed ok" << endl;
    delete inwrapper;
    delete outwrapper;
    delete kernel;
    delete cl;
}
TEST(testforward, compare_1_n_biased_pad) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    int maxWorkgroupSize = cl->getMaxWorkgroupSize();
    delete cl;
    LayerDimensions dim;
    int batchSize = 4;
    int N = 4;
    string activationName = "tanh";
    dim.setInputPlanes(8).setInputSize(19).setNumFilters(8)
       .setFilterSize(5)
       .setPadZeros(true).setBiased(true);
    for(int instance = 2; instance <= 7; instance++) {
        if(instance == 5) {
            continue; // forwardfc: can't be used when the input image size != filter size
        }
        dim.setInputSize(19);
        // instances 2 and 3 apparently need the workgroup to cover a full image
        // plane, so shrink the input if the device can't fit 19x19 work-items
        if(instance == 2 && maxWorkgroupSize < 19 * 19) {
            dim.setInputSize(15);
        }
        if(instance == 3 && maxWorkgroupSize < 19 * 19) {
            dim.setInputSize(15);
        }
        cout << "instance: " << instance << endl;
        compareSpecific(false, N, batchSize, dim, 1, instance);
    }
}
TEST(testcopybuffer, throwsifnotondevice) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    const int bufferSize = 100 * 1024 / 4;
    float *in = new float[bufferSize];
    float *in2 = new float[bufferSize];
    for(int i = 0; i < bufferSize; i++) {
        in[i] = i * 3;
        in2[i] = 23 + i;
    }
    CLWrapper *inwrapper = cl->wrap(bufferSize, in);
    CLWrapper *in2wrapper = cl->wrap(bufferSize, in2);
    inwrapper->copyToDevice();
    // in2wrapper deliberately not copied to device: copyTo should throw
    bool threw = false;
    try {
        inwrapper->copyTo(in2wrapper);
    } catch(runtime_error &e) {
        threw = true;
    }
    EXPECT_TRUE(threw);
    delete in2wrapper;
    delete inwrapper;
    delete[] in2;
    delete[] in;
    delete cl;
}
TEST(testCopyLocal, basic) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    float a[] = { 1, 2, 3, 4,
                  5, 6, 7, 8,
                  9, 10, 11, 12 };
    float b[16];
    memset(b, 0, sizeof(float) * 16);
    CLKernel *kernel = makeKernel(cl);
    kernel->in(12, a)->out(16, b)->in(12);
    kernel->localFloats(12);
    kernel->run_1d(12, 12);
    cl->finish();
    for(int i = 0; i < 3; i++) {
        for(int j = 0; j < 4; j++) {
            cout << b[i * 4 + j] << " ";
            EXPECT_EQ(i * 4 + j + 1, b[i * 4 + j]);
        }
        cout << endl;
    }
    cout << endl;
    for(int i = 12; i < 16; i++) {
        cout << b[i] << " ";
        EXPECT_EQ(0, b[i]);
    }
    cout << endl;
    delete kernel;
    delete cl;
}
TEST(testcopybuffer, withoffset) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    float in[10];
    float in2[10];
    for(int i = 0; i < 10; i++) {
        in[i] = i * 3;
        in2[i] = 23 + i;
    }
    float out[10];
    CLWrapper *inwrapper = cl->wrap(10, in);
    CLWrapper *in2wrapper = cl->wrap(10, in2);
    CLWrapper *outwrapper = cl->wrap(10, out);
    inwrapper->copyToDevice();
    in2wrapper->copyToDevice();
    EXPECT_FALSE(in2wrapper->isDeviceDirty());
    // copy 4 elements, from offset 2 in `in` to offset 5 in `in2`
    inwrapper->copyTo(in2wrapper, 2, 5, 4);
    EXPECT_TRUE(in2wrapper->isDeviceDirty());
    // check that in2 host-side is unchanged:
    for(int i = 0; i < 10; i++) {
        EXPECT_EQ(23 + i, in2[i]);
    }
    in2wrapper->copyToHost();
    // check that in2 is now a partial copy of in:
    for(int i = 0; i < 10; i++) {
        if(i >= 5 && i < 9) {
            EXPECT_EQ((i - 3) * 3, in2[i]);
        } else {
            EXPECT_EQ(23 + i, in2[i]);
        }
    }
    // check that modifying in2 doesn't modify in:
    in2[1] = 27;
    in2wrapper->copyToDevice();
    inwrapper->copyToHost();
    EXPECT_EQ(1 * 3, in[1]);
    in2wrapper->copyToHost();
    EXPECT_EQ(1 * 3, in[1]);
    EXPECT_EQ(27, in2[1]);
    delete inwrapper;
    delete in2wrapper;
    delete outwrapper;
    delete cl;
}
TEST(testCopyBlock, basic) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    float a[] = { 1, 2, 3, 4,
                  5, 6, 7, 8,
                  9, 10, 11, 12 };
    float b[10];
    memset(b, 0, sizeof(float) * 10);
    CLKernel *kernel = makeBasicKernel(cl);
    // the packed int arguments appear to encode (rows << 10) | cols pairs:
    // source matrix size 3x4, block offset (0,1), block size 2x3
    kernel->in(12, a)->out(6, b)->in((3 << 10) | 4)->in((0 << 10) | 1)->in((2 << 10) | 3);
    kernel->localFloats(2 * 3);
    kernel->run_1d(12, 4);
    cl->finish();
    float expected[] = { 2, 3, 4,
                         6, 7, 8 };
    for(int i = 0; i < 2; i++) {
        for(int j = 0; j < 3; j++) {
            cout << b[i * 3 + j] << " ";
            EXPECT_EQ(expected[i * 3 + j], b[i * 3 + j]);
        }
        cout << endl;
    }
    cout << endl;
    for(int i = 6; i < 10; i++) {
        cout << b[i] << " ";
        EXPECT_EQ(0, b[i]);
    }
    cout << endl;
    cout << endl;
    // again, this time with block offset (1,0)
    kernel->in(12, a)->out(6, b)->in((3 << 10) | 4)->in((1 << 10) | 0)->in((2 << 10) | 3);
    kernel->localFloats(2 * 3);
    kernel->run_1d(12, 4);
    cl->finish();
    float expected2[] = { 5, 6, 7,
                          9, 10, 11 };
    for(int i = 0; i < 2; i++) {
        for(int j = 0; j < 3; j++) {
            cout << b[i * 3 + j] << " ";
            EXPECT_EQ(expected2[i * 3 + j], b[i * 3 + j]);
        }
        cout << endl;
    }
    cout << endl;
    for(int i = 6; i < 10; i++) {
        cout << b[i] << " ";
        EXPECT_EQ(0, b[i]);
    }
    cout << endl;
    cout << endl;
    delete kernel;
    delete cl;
}
TEST(testlocal, reduceviascratch_multipleworkgroups_ints) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "reduceViaScratch_multipleworkgroups_ints");
    int workgroupSize = min(512, cl->getMaxWorkgroupSize());
    const int numWorkgroups = workgroupSize;
    const int N = workgroupSize * numWorkgroups;
    cout << "numworkgroups " << numWorkgroups << " workgroupsize " << workgroupSize << " N " << N << endl;
    int *myarray = new int[N];
    int sumViaCpu = 0;
    int localSumViaCpu = 0;
    int localSumViaCpu2 = 0;
    int *localSumsViaCpu = new int[numWorkgroups];
    memset(localSumsViaCpu, 0, sizeof(int) * numWorkgroups);
    for(int i = 0; i < N; i++) {
        myarray[i] = ((i + 7) * 3) % 50;
        sumViaCpu += myarray[i];
        if(i < workgroupSize) {
            localSumViaCpu += myarray[i];
        }
        if(i >= workgroupSize && i < workgroupSize * 2) {
            localSumViaCpu2 += myarray[i];
        }
        int workgroupId = i / workgroupSize;
        localSumsViaCpu[workgroupId] += myarray[i];
    }
    ASSERT_EQ(localSumViaCpu, localSumsViaCpu[0]);
    ASSERT_EQ(localSumViaCpu2, localSumsViaCpu[1]);
    ASSERT_NE(myarray[0], sumViaCpu);
    CLWrapper *a1wrapper = cl->wrap(N, myarray);
    a1wrapper->copyToDevice();
    int *a2 = new int[numWorkgroups];
    CLWrapper *a2wrapper = cl->wrap(numWorkgroups, a2);
    // first pass: one partial sum per workgroup
    kernel->in(a1wrapper);
    kernel->out(a2wrapper);
    kernel->localInts(workgroupSize);
    kernel->run_1d(N, workgroupSize);
    // second pass: reduce the partial sums to a single value
    int finalSum;
    kernel->in(a2wrapper);
    kernel->out(1, &finalSum);
    kernel->localInts(workgroupSize);
    kernel->run_1d(numWorkgroups, workgroupSize);
    EXPECT_EQ(sumViaCpu, finalSum);
    delete a1wrapper;
    delete a2wrapper;
    delete[] localSumsViaCpu;
    delete[] a2;
    delete[] myarray;
    delete kernel;
    delete cl;
}
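The reduceViaScratch kernels themselves live in testlocal.cl, which isn't part of this section. A sketch of the standard pattern that the two launches above exercise, assuming a power-of-two workgroup size (an illustrative reconstruction, not the shipped source):

// Sketch only: each workgroup stages its slice in local memory, tree-reduces
// it, and writes one partial sum; a second launch reduces the partial sums.
kernel void reduceViaScratch_sketch(global const int *in, global int *out,
        local int *scratch) {
    const int localId = get_local_id(0);
    scratch[localId] = in[get_global_id(0)];
    barrier(CLK_LOCAL_MEM_FENCE);
    // tree reduction within the workgroup
    for(int offset = get_local_size(0) >> 1; offset > 0; offset >>= 1) {
        if(localId < offset) {
            scratch[localId] += scratch[localId + offset];
        }
        barrier(CLK_LOCAL_MEM_FENCE);
    }
    if(localId == 0) {
        out[get_group_id(0)] = scratch[0];
    }
}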
TEST(testcopybuffer, main) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    float in[5];
    float in2[5];
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3.0f;
        in2[i] = 23.0f + i;
    }
    float out[5];
    CLWrapper *inwrapper = cl->wrap(5, in);
    CLWrapper *in2wrapper = cl->wrap(5, in2);
    CLWrapper *outwrapper = cl->wrap(5, out);
    inwrapper->copyToDevice();
    in2wrapper->copyToDevice();
    EXPECT_FALSE(in2wrapper->isDeviceDirty());
    inwrapper->copyTo(in2wrapper);
    EXPECT_TRUE(in2wrapper->isDeviceDirty());
    // check that in2 host-side is unchanged:
    for(int i = 0; i < 5; i++) {
        EXPECT_EQ(23.0f + i, in2[i]);
    }
    in2wrapper->copyToHost();
    // check that in2 is now a copy of in:
    for(int i = 0; i < 5; i++) {
        EXPECT_EQ(i * 3.0f, in2[i]);
    }
    // check that modifying in2 doesn't modify in:
    in2[1] = 27;
    in2wrapper->copyToDevice();
    inwrapper->copyToHost();
    EXPECT_EQ(1 * 3.0f, in[1]);
    in2wrapper->copyToHost();
    EXPECT_EQ(1 * 3.0f, in[1]);
    EXPECT_EQ(27.0f, in2[1]);
    delete inwrapper;
    delete in2wrapper;
    delete outwrapper;
    delete cl;
}
TEST(testdefines, simple) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testdefines.cl", "testDefines", "-D DOUBLE -D SOME_VALUE=5");
    float out[32];
    kernel->out(32, out);
    kernel->run_1d(32, 32);
    EXPECT_EQ(10, out[3]);
    delete kernel;
    delete cl;
}
TEST(SLOW_testcopybuffer, larger) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    const int bufferSize = 100 * 1024 * 1024 / 4; // 100MB of floats
    float *in = new float[bufferSize];
    float *in2 = new float[bufferSize];
    for(int i = 0; i < bufferSize; i++) {
        in[i] = i * 3;
        in2[i] = 23 + i;
    }
    CLWrapper *inwrapper = cl->wrap(bufferSize, in);
    CLWrapper *in2wrapper = cl->wrap(bufferSize, in2);
    inwrapper->copyToDevice();
    in2wrapper->copyToDevice();
    inwrapper->copyTo(in2wrapper);
    // check that in2 host-side is unchanged:
    for(int i = 0; i < bufferSize; i++) {
        EXPECT_EQ(23 + i, in2[i]);
    }
    in2wrapper->copyToHost();
    // check that in2 is now a copy of in:
    for(int i = 0; i < bufferSize; i++) {
        EXPECT_EQ(i * 3, in2[i]);
    }
    // check that modifying in2 doesn't modify in:
    in2[1] = 27;
    in2wrapper->copyToDevice();
    inwrapper->copyToHost();
    EXPECT_EQ(1 * 3, in[1]);
    in2wrapper->copyToHost();
    EXPECT_EQ(1 * 3, in[1]);
    EXPECT_EQ(27, in2[1]);
    delete inwrapper;
    delete in2wrapper;
    delete[] in;
    delete[] in2;
    delete cl;
}
TEST(testClBlas, colMajor2) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();
    float A[] = {1, 3,
                 2, 7,
                 9, 5,
                 0, -2};
    float B[] = {3, 2, 8,
                 -1, 0, 4};
    float C[4 * 3];
    // the gemm call below is column-major, so transpose the row-major inputs first
    transpose(A, 4, 2);
    transpose(B, 2, 3);
    ClBlasInstance clblasInstance;
    CLWrapper *AWrap = cl->wrap(4 * 2, A);
    CLWrapper *BWrap = cl->wrap(2 * 3, B);
    CLWrapper *CWrap = cl->wrap(4 * 3, C);
    AWrap->copyToDevice();
    BWrap->copyToDevice();
    ClBlasHelper::Gemm(
        cl, clblasColumnMajor, clblasNoTrans, clblasNoTrans,
        4, 2, 3,
        1,
        AWrap, 0,
        BWrap, 0,
        0,
        CWrap, 0
    );
    CWrap->copyToHost();
    // ... and transpose the column-major result back to row-major
    transpose(C, 3, 4);
    EXPECT_EQ(1 * 3 - 1 * 3, C[0]);
    EXPECT_EQ(1 * 2 + 3 * 0, C[1]);
    EXPECT_EQ(1 * 8 + 4 * 3, C[2]);
    EXPECT_EQ(-8, C[11]);
    delete CWrap;
    delete BWrap;
    delete AWrap;
    delete cl;
}
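The transpose helper these clBLAS tests call isn't shown in this section. A minimal sketch consistent with the call sites, transpose(m, rows, cols), treating m as a rows x cols row-major matrix and transposing it in place through a temporary buffer (uses memcpy from <cstring>):

// Sketch only: a hypothetical reconstruction of the test helper.
void transpose(float *matrix, int rows, int cols) {
    float *tmp = new float[rows * cols];
    for(int row = 0; row < rows; row++) {
        for(int col = 0; col < cols; col++) {
            // element (row, col) moves to (col, row) in the transposed layout
            tmp[col * rows + row] = matrix[row * cols + col];
        }
    }
    memcpy(matrix, tmp, sizeof(float) * rows * cols);
    delete[] tmp;
}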
TEST(SLOW_testlocal, selfdot_3levels_withoutscratch) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "selfdot_ints_withoutscratch");
    int workgroupSize = min(512, cl->getMaxWorkgroupSize());
    const int numWorkgroups = workgroupSize;
    const int level3size = numWorkgroups / 4;
    const int N = workgroupSize * numWorkgroups * level3size;
    cout << "numworkgroups " << numWorkgroups << " workgroupsize " << workgroupSize << " N " << N << endl;
    int *myarray = new int[N];
    for(int i = 0; i < N; i++) {
        myarray[i] = ((i + 7) * 3) % 5;
    }
    CLWrapper *a1wrapper = cl->wrap(N, myarray);
    a1wrapper->copyToDevice();
    int *second = new int[N];
    CLWrapper *secondwrapper = cl->wrap(N, second);
    int *a2 = new int[numWorkgroups * level3size];
    CLWrapper *a2wrapper = cl->wrap(numWorkgroups * level3size, a2);
    // level 1: N values down to numWorkgroups * level3size partial results
    kernel->in(a1wrapper);
    kernel->out(secondwrapper);
    kernel->out(a2wrapper);
    kernel->run_1d(N, workgroupSize);
    cl->finish();
    int *a3 = new int[numWorkgroups];
    CLWrapper *a3wrapper = cl->wrap(level3size, a3);
    // level 2: down to level3size partial results
    kernel->in(a2wrapper);
    kernel->out(secondwrapper);
    kernel->out(a3wrapper);
    kernel->run_1d(workgroupSize * level3size, workgroupSize);
    cl->finish();
    // level 3: down to a single result
    int finalSum;
    kernel->in(a3wrapper);
    kernel->out(secondwrapper);
    kernel->out(1, &finalSum);
    kernel->run_1d(level3size, level3size);
    // the true sum exceeds 32 bits, so the expected value reflects int wraparound
    EXPECT_EQ(-1306309159, finalSum);
    delete a1wrapper;
    delete a2wrapper;
    delete a3wrapper;
    delete secondwrapper;
    delete[] a3;
    delete[] second;
    delete[] a2;
    delete[] myarray;
    delete kernel;
    delete cl;
}
TEST(testlocal, notUselocal) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "notUseLocal");
    int workgroupSize = 64;
    float *myarray = new float[workgroupSize];
    kernel->in(workgroupSize);
    kernel->inout(workgroupSize, myarray);
    kernel->run_1d(workgroupSize, workgroupSize);
    delete[] myarray;
    delete kernel;
    delete cl;
}
TEST(testqueues, defaultqueue) {
    cout << "start" << endl;
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", "");
    float in[5];
    float out[5];
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3;
        out[i] = 0;
    }
    // first, run on the implicit default queue
    kernel->input(5, in);
    kernel->output(5, out);
    size_t global = 5;
    size_t local = 5;
    kernel->run(1, &global, &local);
    cl->finish();
    assertEquals(out[0], 7);
    assertEquals(out[1], 10);
    assertEquals(out[2], 13);
    assertEquals(out[3], 16);
    assertEquals(out[4], 19);
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3;
        out[i] = 0;
    }
    cout << "cl->queue " << (void *)cl->queue << endl;
    cout << "*cl->queue " << (void *)*cl->queue << endl;
    cout << "cl->default_queue->queue " << (void *)cl->default_queue->queue << endl;
    // then, run with the default queue passed explicitly
    kernel->input(5, in);
    kernel->output(5, out);
    kernel->run(cl->default_queue, 1, &global, &local);
    cl->finish();
    assertEquals(out[0], 7);
    assertEquals(out[1], 10);
    assertEquals(out[2], 13);
    assertEquals(out[3], 16);
    assertEquals(out[4], 19);
    cout << "tests completed ok" << endl;
    delete kernel;
    delete cl;
}
TEST(testClBlas, colMajorTransB) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();
    float A[] = {1, 3,
                 2, 7,
                 9, 5};
    float B[] = {3, -1};
    float C[3];
    // A is row-major 3x2, so transpose it to column-major; B is handled by
    // passing clblasTrans to the gemm call instead
    transpose(A, 3, 2);
    ClBlasInstance clblasInstance;
    CLWrapper *AWrap = cl->wrap(6, A);
    CLWrapper *BWrap = cl->wrap(2, B);
    CLWrapper *CWrap = cl->wrap(3, C);
    AWrap->copyToDevice();
    BWrap->copyToDevice();
    ClBlasHelper::Gemm(
        cl, clblasColumnMajor, clblasNoTrans, clblasTrans,
        3, 2, 1,
        1,
        AWrap, 0,
        BWrap, 0,
        0,
        CWrap, 0
    );
    CWrap->copyToHost();
    transpose(C, 1, 3);
    EXPECT_EQ(0, C[0]);
    EXPECT_EQ(-1, C[1]);
    EXPECT_EQ(22, C[2]);
    delete CWrap;
    delete BWrap;
    delete AWrap;
    delete cl;
}
void measurePerf(int instance, int batchSize, LayerDimensions dim) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    int inputNumElements = dim.inputCubeSize * batchSize;
    int errorsSize = dim.outputCubeSize * batchSize;
    int weightsSize = dim.filtersSize;
    int errorsForUpstreamSize = dim.inputCubeSize * batchSize;
    float *input = new float[inputNumElements];
    float *errors = new float[errorsSize];
    float *weights = new float[weightsSize];
    WeightRandomizer::randomize(input, inputNumElements, -0.1f, 0.1f);
    WeightRandomizer::randomize(errors, errorsSize, -0.1f, 0.1f);
    WeightRandomizer::randomize(weights, weightsSize, -0.1f, 0.1f);
    float *errorsForUpstream = new float[errorsForUpstreamSize];
    CLWrapper *inputWrapper = cl->wrap(inputNumElements, input);
    CLWrapper *errorsWrapper = cl->wrap(errorsSize, errors);
    CLWrapper *weightsWrapper = cl->wrap(weightsSize, weights);
    CLWrapper *errorsForUpstreamWrapper = cl->wrap(errorsForUpstreamSize, errorsForUpstream);
    inputWrapper->copyToDevice();
    errorsWrapper->copyToDevice();
    weightsWrapper->copyToDevice();
    errorsForUpstreamWrapper->createOnDevice();
    StatefulTimer::timeCheck("after init");
    Backward *backwardImpl = Backward::instanceSpecific(instance, cl, dim);
    for(int it = 0; it < 40; it++) {
        backwardImpl->backward(batchSize, inputWrapper, errorsWrapper,
            weightsWrapper, errorsForUpstreamWrapper);
    }
    StatefulTimer::timeCheck("after backprop");
    StatefulTimer::dump(true);
    delete errorsForUpstreamWrapper;
    delete weightsWrapper;
    delete inputWrapper;
    delete errorsWrapper;
    delete[] errors;
    delete[] weights;
    delete[] input;
    delete[] errorsForUpstream;
    delete backwardImpl;
    delete cl;
}
TEST(testMemset, basic) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();
    CLKernel *kMemset = 0;
    // [[[cog
    // import stringify
    // stringify.write_kernel2( "kMemset", "cl/memset.cl", "cl_memset", '""' )
    // ]]]
    // generated using cog, from cl/memset.cl:
    const char * kMemsetSource =
    "// Copyright Hugh Perkins 2015 hughperkins at gmail\n"
    "//\n"
    "// This Source Code Form is subject to the terms of the Mozilla Public License,\n"
    "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n"
    "// obtain one at http://mozilla.org/MPL/2.0/.\n"
    "\n"
    "kernel void cl_memset(global float *target, const float value, const int N) {\n"
    "    #define globalId get_global_id(0)\n"
    "    if ((int)globalId < N) {\n"
    "        target[globalId] = value;\n"
    "    }\n"
    "}\n"
    "\n"
    "";
    kMemset = cl->buildKernelFromString(kMemsetSource, "cl_memset", "", "cl/memset.cl");
    // [[[end]]]
    int N = 10000;
    float *myArray = new float[N];
    CLWrapper *myArrayWrapper = cl->wrap(N, myArray);
    myArrayWrapper->createOnDevice();
    kMemset->out(myArrayWrapper)->in(99.0f)->in(N);
    int workgroupSize = 64;
    // round the global size up to a whole number of workgroups
    kMemset->run_1d((N + workgroupSize - 1) / workgroupSize * workgroupSize, workgroupSize);
    cl->finish();
    myArrayWrapper->copyToHost();
    for(int i = 0; i < N; i++) {
        EXPECT_EQ(99.0f, myArray[i]);
    }
    delete myArrayWrapper;
    delete[] myArray;
    delete kMemset;
    delete cl;
}
void col2im(THClState *state, THClTensor* col, const int channels,
    const int height, const int width, const int patch_h, const int patch_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    THClTensor* im) {
  int height_col = (height + 2 * pad_h - patch_h) / stride_h + 1;
  int width_col = (width + 2 * pad_w - patch_w) / stride_w + 1;
  int num_kernels = channels * height * width;
  // To avoid involving atomic operations, we launch one work-item per
  // bottom (image) element, and each work-item sums over the top (column)
  // elements that map onto it.
  EasyCL *cl = im->storage->cl;
  std::string uniqueName = "SpatialConvolutionMM::col2im";
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel(uniqueName, "SpatialConvolutionMM.cl",
      SpatialConvolutionMM_getKernelTemplate(), "col2im_kernel");
  }
  THClKernels k(state, kernel);
  k.in(num_kernels);
  k.in(col);
  k.in(height);
  k.in(width);
  k.in(channels);
  k.in(patch_h);
  k.in(patch_w);
  k.in(pad_h);
  k.in(pad_w);
  k.in(stride_h);
  k.in(stride_w);
  k.in(height_col);
  k.in(width_col);
  k.out(im);
  k.run(GET_BLOCKS(state, num_kernels), getNumThreads(state));
}
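The col2im_kernel template itself lives in SpatialConvolutionMM.cl. For orientation, a sketch of the standard Caffe-style col2im indexing that matches the argument list above; the actual templated source may differ:

// Sketch only: one work-item per image element (c, h, w); it sums every
// column entry that the im2col expansion copied this element into.
kernel void col2im_sketch(const int n, global const float *data_col,
        const int height, const int width, const int channels,
        const int patch_h, const int patch_w, const int pad_h, const int pad_w,
        const int stride_h, const int stride_w,
        const int height_col, const int width_col, global float *data_im) {
    int index = get_global_id(0);
    if(index >= n) return;
    float val = 0;
    int w = index % width + pad_w;
    int h = (index / width) % height + pad_h;
    int c = index / (width * height);
    // range of output positions whose patch covers (h, w)
    int w_col_start = (w < patch_w) ? 0 : (w - patch_w) / stride_w + 1;
    int w_col_end = min(w / stride_w + 1, width_col);
    int h_col_start = (h < patch_h) ? 0 : (h - patch_h) / stride_h + 1;
    int h_col_end = min(h / stride_h + 1, height_col);
    for(int h_col = h_col_start; h_col < h_col_end; h_col++) {
        for(int w_col = w_col_start; w_col < w_col_end; w_col++) {
            int c_col = c * patch_h * patch_w
                + (h - h_col * stride_h) * patch_w + (w - w_col * stride_w);
            val += data_col[(c_col * height_col + h_col) * width_col + w_col];
        }
    }
    data_im[index] = val;
}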
TEST(testpoolingforward, fromwrappers) {
    int batchSize = 1;
    int numPlanes = 1;
    int imageSize = 4;
    int poolingSize = 2;
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    PoolingForward *poolingForward = PoolingForward::instanceSpecific(1, cl, false,
        numPlanes, imageSize, poolingSize);
    float input[] = { 1, 2, 5, 3,
                      3, 8, 4, 1,
                      3, 33, 14, 23,
                      -1, -3.5f, 37.4f, 5 };
    int outputSize = poolingForward->getOutputSize(batchSize);
    int *selectors = new int[outputSize];
    float *output = new float[outputSize];
    const int inputSize = batchSize * numPlanes * imageSize * imageSize;
    CLWrapper *inputWrapper = cl->wrap(inputSize, input);
    CLWrapper *selectorsWrapper = cl->wrap(outputSize, selectors);
    CLWrapper *outputWrapper = cl->wrap(outputSize, output);
    inputWrapper->copyToDevice();
    poolingForward->forward(batchSize, inputWrapper, selectorsWrapper, outputWrapper);
    selectorsWrapper->copyToHost();
    outputWrapper->copyToHost();
    EXPECT_EQ(selectors[0], 3);
    EXPECT_EQ(selectors[1], 0);
    EXPECT_EQ(selectors[2], 1);
    EXPECT_EQ(selectors[3], 2);
    EXPECT_EQ(output[0], 8);
    EXPECT_EQ(output[1], 5);
    EXPECT_EQ(output[2], 33);
    EXPECT_EQ(output[3], 37.4f);
    delete inputWrapper;
    delete selectorsWrapper;
    delete outputWrapper;
    delete poolingForward;
    delete[] selectors;
    delete[] output;
    delete cl;
}
void THClStorage_resize(THClState *state, THClStorage *self, long size)
{
  StatefulTimer::timeCheck("THClStorage_resize START");
  // only ever grows; note that the old contents are discarded, not copied over
  if( size <= self->size ) {
    return;
  }
  if(state->trace && self->size > 0) cout << "delete wrapper" << endl;
  delete self->wrapper;
  delete[] self->data;
  self->data = new float[size];
  EasyCL *cl = self->cl;
  self->wrapper = cl->wrap( size, self->data );
  self->wrapper->createOnDevice();
  if(state->trace) cout << "new wrapper, size " << size << endl;
  self->size = size;
  StatefulTimer::timeCheck("THClStorage_resize END");
}
TEST(testlocal, reduceviascratch_multipleworkgroups) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "reduceViaScratch_multipleworkgroups");
    int workgroupSize = min(512, cl->getMaxWorkgroupSize());
    const int numWorkgroups = workgroupSize;
    const int N = workgroupSize * numWorkgroups;
    float *myarray = new float[N];
    float sumViaCpu = 0;
    float localSumViaCpu = 0;
    for(int i = 0; i < N; i++) {
        myarray[i] = ((i + 7) * 3) % 10;
        sumViaCpu += myarray[i];
        if(i < workgroupSize) {
            localSumViaCpu += myarray[i];
        }
    }
    cout << "expected sum, calculated via cpu: " << sumViaCpu << endl;
    EXPECT_NE(myarray[0], sumViaCpu);
    CLWrapper *a1wrapper = cl->wrap(N, myarray);
    a1wrapper->copyToDevice();
    float *a2 = new float[numWorkgroups];
    CLWrapper *a2wrapper = cl->wrap(numWorkgroups, a2);
    // first pass: one partial sum per workgroup
    kernel->in(a1wrapper);
    kernel->out(a2wrapper);
    kernel->localFloats(workgroupSize);
    kernel->run_1d(N, workgroupSize);
    // second pass: reduce the partial sums to a single value
    float finalSum;
    kernel->in(a2wrapper);
    kernel->out(1, &finalSum);
    kernel->localFloats(workgroupSize);
    kernel->run_1d(numWorkgroups, workgroupSize);
    EXPECT_EQ(sumViaCpu, finalSum);
    delete a1wrapper;
    delete a2wrapper;
    delete[] a2;
    delete[] myarray;
    delete kernel;
    delete cl;
}
TEST(testfloatarray, main) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", "");
    float in[5];
    float inout[5];
    float out[5];
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3;
        inout[i] = i * 3;
    }
    kernel->in(5, in);
    kernel->out(5, out);
    kernel->inout(5, inout);
    kernel->run_1d(5, 5);
    for(int i = 0; i < 5; i++) {
        cout << out[i] << " ";
    }
    cout << endl;
    for(int i = 0; i < 5; i++) {
        cout << inout[i] << " ";
    }
    cout << endl;
    assertEquals(inout[0], 7);
    assertEquals(inout[1], 10);
    assertEquals(inout[2], 34);
    assertEquals(inout[3], 16);
    assertEquals(inout[4], 19);
    assertEquals(out[0], 5);
    assertEquals(out[1], 8);
    assertEquals(out[2], 26);
    assertEquals(out[3], 14);
    assertEquals(out[4], 17);
    cout << "tests completed ok" << endl;
    delete kernel;
    delete cl;
}
void im2col(THClState *state, THClTensor* im, const int channels,
    const int height, const int width, const int ksize_h, const int ksize_w,
    const int pad_h, const int pad_w, const int stride_h, const int stride_w,
    THClTensor* col) {
  // We launch channels * height_col * width_col work-items; each one copies the
  // ksize_h x ksize_w patch for a single (channel, output position) pair.
  int height_col = (height + 2 * pad_h - ksize_h) / stride_h + 1;
  int width_col = (width + 2 * pad_w - ksize_w) / stride_w + 1;
  int num_kernels = channels * height_col * width_col;
  std::string uniqueName = "SpatialConvolutionMM::im2col";
  EasyCL *cl = im->storage->cl;
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel(uniqueName, "SpatialConvolutionMM.cl",
      SpatialConvolutionMM_getKernelTemplate(), "im2col_kernel");
  }
  THClKernels k(state, kernel);
  k.in(num_kernels);
  k.in(im);
  k.in(height);
  k.in(width);
  k.in(ksize_h);
  k.in(ksize_w);
  k.in(pad_h);
  k.in(pad_w);
  k.in(stride_h);
  k.in(stride_w);
  k.in(height_col);
  k.in(width_col);
  k.out(col);
  k.run(GET_BLOCKS(state, num_kernels), getNumThreads(state));
}
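Again for orientation, a sketch of the standard Caffe-style im2col indexing matching the argument list above; the actual template in SpatialConvolutionMM.cl may differ:

// Sketch only: one work-item per (channel, output position); it copies the
// ksize_h x ksize_w window into the corresponding column entries, writing
// zeros where the window falls in the padding.
kernel void im2col_sketch(const int n, global const float *data_im,
        const int height, const int width, const int ksize_h, const int ksize_w,
        const int pad_h, const int pad_w, const int stride_h, const int stride_w,
        const int height_col, const int width_col, global float *data_col) {
    int index = get_global_id(0);
    if(index >= n) return;
    int w_out = index % width_col;
    int h_out = (index / width_col) % height_col;
    int channel_in = index / (width_col * height_col);
    int channel_out = channel_in * ksize_h * ksize_w;
    int h_in = h_out * stride_h - pad_h;
    int w_in = w_out * stride_w - pad_w;
    global float *col_ptr = data_col
        + (channel_out * height_col + h_out) * width_col + w_out;
    global const float *im_ptr = data_im
        + (channel_in * height + h_in) * width + w_in;
    for(int i = 0; i < ksize_h; i++) {
        for(int j = 0; j < ksize_w; j++) {
            int h = h_in + i;
            int w = w_in + j;
            *col_ptr = (h >= 0 && w >= 0 && h < height && w < width)
                ? im_ptr[i * width + j] : 0;
            col_ptr += height_col * width_col;
        }
    }
}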