TEST( testCopyBlock, basic ) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    // source matrix a is 3 rows x 4 cols
    float a[] = { 1,2,3,4,
                  5,6,7,8,
                  9,10,11,12 };
    float b[10];
    memset(b, 0, sizeof(float)*10);
    CLKernel *kernel = makeBasicKernel( cl );
    // copy the 2x3 block at (row 0, col 1) out of the 3x4 matrix a into b
    kernel->in( 12, a )->out( 6, b )->in( ( 3<<10)|4 )->in( (0<<10)|1 )->in( (2<<10)|3 );
    kernel->localFloats( 2 * 3 );
    kernel->run_1d(12,4);
//    kernel->run_1d(12,12);
    cl->finish();
    float expected[] = { 2,3,4,
                         6,7,8 };
    for( int i = 0; i < 2; i++ ) {
        for( int j = 0; j < 3; j++ ) {
            cout << b[i*3+j] << " ";
            EXPECT_EQ( expected[i*3+j], b[i*3+j] );
        }
        cout << endl;
    }
    cout << endl;
    // the tail of b, beyond the copied 2x3 block, should be untouched
    for( int i = 6; i < 10; i++ ) {
        cout << b[i] << " ";
        EXPECT_EQ( 0, b[i] );
    }
    cout << endl;
    cout << endl;

    // now copy the 2x3 block at (row 1, col 0)
    kernel->in( 12, a )->out( 6, b )->in( ( 3<<10)|4 )->in( (1<<10)|0 )->in( (2<<10)|3 );
    kernel->localFloats( 2 * 3 );
    kernel->run_1d(12,4);
    cl->finish();
    float expected2[] = { 5,6,7,
                          9,10,11 };
    for( int i = 0; i < 2; i++ ) {
        for( int j = 0; j < 3; j++ ) {
            cout << b[i*3+j] << " ";
            EXPECT_EQ( expected2[i*3+j], b[i*3+j] );
        }
        cout << endl;
    }
    cout << endl;
    for( int i = 6; i < 10; i++ ) {
        cout << b[i] << " ";
        EXPECT_EQ( 0, b[i] );
    }
    cout << endl;
    cout << endl;

    delete kernel;
    delete cl;
}
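// For readability, the packed int arguments above follow a (row << 10) | col
// convention: source matrix size, then block offset, then block size. A small
// host-side sketch of that packing (hypothetical helpers, shown for
// illustration only; the kernel built by makeBasicKernel() is assumed to
// unpack its int arguments the same way):
static inline int packRowCol( int row, int col ) {
    return ( row << 10 ) | col;           // row in the high bits, col in the low 10 bits
}
static inline int unpackRow( int packed ) {
    return packed >> 10;                  // recover the row
}
static inline int unpackCol( int packed ) {
    return packed & ( ( 1 << 10 ) - 1 );  // recover the col (low 10 bits)
}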
TEST(SLOW_testlocal, selfdot_3levels_withoutscratch) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "selfdot_ints_withoutscratch");
    int workgroupSize = min(512, cl->getMaxWorkgroupSize());
    const int numWorkgroups = workgroupSize;
    const int level3size = numWorkgroups / 4;
    const int N = workgroupSize * numWorkgroups * level3size;
    cout << "numworkgroups " << numWorkgroups << " workgroupsize " << workgroupSize << " N " << N << endl;
    int *myarray = new int[N];
    for(int i = 0; i < N; i++) {
        myarray[i] = ((i + 7) * 3) % 5;
    }
//    Timer timer;
    CLWrapper *a1wrapper = cl->wrap(N, myarray);
    a1wrapper->copyToDevice();
//    timer.timeCheck("copied array to device");
    int *second = new int[N];
    CLWrapper *secondwrapper = cl->wrap(N, second);
    int *a2 = new int[numWorkgroups * level3size];
    CLWrapper *a2wrapper = cl->wrap(numWorkgroups * level3size, a2);
    // level 1: reduce N values down to numWorkgroups * level3size partial results
    kernel->in(a1wrapper);
    kernel->out(secondwrapper);
    kernel->out(a2wrapper);
    kernel->run_1d(N, workgroupSize);
    cl->finish();
    int *a3 = new int[numWorkgroups];
    CLWrapper *a3wrapper = cl->wrap(level3size, a3);
    // level 2: reduce those down to level3size partial results
    kernel->in(a2wrapper);
    kernel->out(secondwrapper);
    kernel->out(a3wrapper);
    kernel->run_1d(workgroupSize * level3size, workgroupSize);
    cl->finish();
    int finalSum;
    // level 3: a single workgroup reduces the remaining level3size values to one
    kernel->in(a3wrapper);
    kernel->out(secondwrapper);
    kernel->out(1, &finalSum);
    kernel->run_1d(level3size, level3size);
//    timer.timeCheck("finished 3-level reduce");
    EXPECT_EQ(-1306309159, finalSum);
    delete a1wrapper;
    delete a2wrapper;
    delete a3wrapper;
    delete secondwrapper;
    delete[] a3;
    delete[] second;
    delete[] a2;
    delete[] myarray;
    delete kernel;
    delete cl;
}
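// A host-side reference for the hard-coded expectation above, as a sketch. The
// same "selfdot" kernel runs at every level, so each level is assumed here to
// replace each workgroup's slice with the sum of squares of its inputs; the
// re-squaring at levels 2 and 3 wraps around 32-bit int range, which would
// explain the negative expected total. (Hypothetical helper, illustration only.)
static void hostSelfDotLevel( const int *in, int *out, int numGroups, int groupSize ) {
    for( int g = 0; g < numGroups; g++ ) {
        int sum = 0;  // deliberately 32-bit, so wraparound matches the device
        for( int i = 0; i < groupSize; i++ ) {
            int v = in[g * groupSize + i];
            sum += v * v;
        }
        out[g] = sum;
    }
}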
TEST(testqueues, defaultqueue) {
    cout << "start" << endl;
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", "");
    float in[5];
    float out[5];
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3;
        out[i] = 0;
    }

    // first run on the implicit default queue
    kernel->input(5, in);
    kernel->output(5, out);
    size_t global = 5;
    size_t local = 5;
    kernel->run(1, &global, &local);
    cl->finish();
    assertEquals(out[0], 7);
    assertEquals(out[1], 10);
    assertEquals(out[2], 13);
    assertEquals(out[3], 16);
    assertEquals(out[4], 19);

    for(int i = 0; i < 5; i++) {
        in[i] = i * 3;
        out[i] = 0;
    }
    cout << "cl->queue " << (void *)cl->queue << endl;
    cout << "*cl->queue " << (void *)*cl->queue << endl;
    cout << "cl->default_queue->queue " << (void *)cl->default_queue->queue << endl;

    // now pass the default queue explicitly; the results should be identical
    kernel->input(5, in);
    kernel->output(5, out);
    kernel->run(cl->default_queue, 1, &global, &local);
    cl->finish();
    assertEquals(out[0], 7);
    assertEquals(out[1], 10);
    assertEquals(out[2], 13);
    assertEquals(out[3], 16);
    assertEquals(out[4], 19);

    delete kernel;
    delete cl;
    cout << "tests completed ok" << endl;
}
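// getKernel() is defined elsewhere in this file; from the expected outputs
// (out[i] == in[i] + 7) its source is presumably along the lines of this
// sketch (hypothetical name, shown for reference only):
static const char *getKernelSketch() {
    return
        "kernel void test(global float *in, global float *out) {\n"
        "    int gid = get_global_id(0);\n"
        "    out[gid] = in[gid] + 7.0f;\n"
        "}\n";
}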
TEST( testCopyLocal, basic ) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    float a[] = { 1,2,3,4,
                  5,6,7,8,
                  9,10,11,12 };
    float b[16];
    memset(b, 0, sizeof(float)*16);
    CLKernel *kernel = makeKernel( cl );
    kernel->in( 12, a )->out( 16, b )->in( 12 );
    kernel->localFloats( 12 );
    kernel->run_1d(12,12);
    cl->finish();
    for( int i = 0; i < 3; i++ ) {
        for( int j = 0; j < 4; j++ ) {
            cout << b[i*4+j] << " ";
            EXPECT_EQ( i * 4 + j + 1, b[i*4+j] );
        }
        cout << endl;
    }
    cout << endl;
    for( int i = 12; i < 16; i++ ) {
        cout << b[i] << " ";
        EXPECT_EQ( 0, b[i] );
    }
    cout << endl;
    delete kernel;
    delete cl;
}
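// makeKernel() is defined elsewhere in this file. The pattern it exercises is
// assumed to be: stage the 12 floats from global memory into the local buffer
// supplied via localFloats(), barrier, then write them back out. A sketch of
// such a kernel (names are illustrative, not the actual source):
static const char *copyLocalKernelSketch() {
    return
        "kernel void copyLocal(global const float *source, global float *dest,\n"
        "        const int N, local float *localBuf) {\n"
        "    int gid = get_global_id(0);\n"
        "    int lid = get_local_id(0);\n"
        "    if (gid < N) {\n"
        "        localBuf[lid] = source[gid];   // stage into local memory\n"
        "    }\n"
        "    barrier(CLK_LOCAL_MEM_FENCE);\n"
        "    if (gid < N) {\n"
        "        dest[gid] = localBuf[lid];     // write back out\n"
        "    }\n"
        "}\n";
}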
// this runs an entire kernel to get one value. Clearly this is going to be pretty slow, but
// at least it's more or less compatible, and comparable, to how cutorch does it
// lbfgs expects a working implementation of this method
float THClStorage_get(THClState *state, const THClStorage *self, long index)
{
////  printf("THClStorage_get\n");
  THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds");
  THArgCheck(self->wrapper != 0, 1, "storage argument not initialized, is empty");
//  if( self->wrapper->isDeviceDirty() ) {
//    if(state->trace) cout << "wrapper->copyToHost()" << endl;
//    self->wrapper->copyToHost();
//  }
//  return self->data[index];
  const char *uniqueName = __FILE__ ":get";
  EasyCL *cl = self->cl;  // can't remember if this is a good idea or not :-P
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel(uniqueName, __FILE__, getGetKernelSource(), "THClStorageGet");
  }
  float res;
  kernel->out(1, &res);
  kernel->in(self->wrapper);
  kernel->in((int64_t)index);
  kernel->run_1d(1, 1);
  if(state->addFinish) cl->finish();
  return res;
}
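// A sketch of what getGetKernelSource() presumably returns: a single-work-item
// kernel that copies one element of the storage buffer into the one-float
// output (illustrative only; the real source is defined elsewhere in this file):
//
// kernel void THClStorageGet(global float *res, global const float *data, long index) {
//     if (get_global_id(0) == 0) {
//         res[0] = data[index];
//     }
// }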
// this runs an entire kernel to set one value. Clearly this is going to be pretty slow, but
// at least it's more or less compatible, and comparable, to how cutorch does it
void THClStorage_set(THClState *state, THClStorage *self, long index, float value)
{
////  cout << "set size=" << self->size << " index=" << index << " value=" << value << endl;
  THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds");
  THArgCheck(self->wrapper != 0, 1, "storage argument not initialized, is empty");
//  if( self->wrapper->isDeviceDirty() ) {  // we have to do this, since we're going to copy it all back again
//    // although I suppose we could set via a kernel perhaps
//    // either way, this function is pretty inefficient right now :-P
//    if(state->trace) cout << "wrapper->copyToHost() size " << self->size << endl;
//    self->wrapper->copyToHost();
//  }
//  self->data[index] = value;
//  if(state->trace) cout << "wrapper->copyToDevice() size " << self->size << endl;
//  self->wrapper->copyToDevice();
  const char *uniqueName = __FILE__ ":set";
  EasyCL *cl = self->cl;  // can't remember if this is a good idea or not :-P
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
  } else {
    TemplatedKernel kernelBuilder(cl);
    kernel = kernelBuilder.buildKernel(uniqueName, __FILE__, getSetKernelSource(), "THClStorageSet");
  }
  kernel->inout(self->wrapper);
  kernel->in((int64_t)index);
  kernel->in(value);
  kernel->run_1d(1, 1);
  if(state->addFinish) cl->finish();
}
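// Correspondingly, getSetKernelSource() presumably returns something like this
// single-work-item kernel (a sketch, not the actual source):
//
// kernel void THClStorageSet(global float *data, long index, const float value) {
//     if (get_global_id(0) == 0) {
//         data[index] = value;
//     }
// }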
TEST(testClBlas, basic) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();
    float A[] = {1, 3,
                 2, 7,
                 9, 5};
    float B[] = {3,
                 -1};
    float C[3];
    ClBlasInstance clblasInstance;
    CLWrapper *AWrap = cl->wrap(6, A);
    CLWrapper *BWrap = cl->wrap(2, B);
    CLWrapper *CWrap = cl->wrap(3, C);
    AWrap->copyToDevice();
    BWrap->copyToDevice();
    CWrap->createOnDevice();
    ClBlasHelper::Gemm(
        cl, clblasRowMajor, clblasNoTrans, clblasNoTrans,
        3, 2, 1,
        1,
        AWrap, 0,
        BWrap, 0,
        0,
        CWrap, 0
    );
    cl->finish();
    CWrap->copyToHost();
    EXPECT_EQ(0, C[0]);
    EXPECT_EQ(-1, C[1]);
    EXPECT_EQ(22, C[2]);
    cl->finish();
    delete CWrap;
    delete BWrap;
    delete AWrap;
    cl->finish();
    delete cl;
    clblasTeardown();
}
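// For reference, the expected values follow from the plain row-major product
// C = 1.0f * A * B + 0.0f * C, with A 3x2, B 2x1, C 3x1:
//   C[0] = 1*3 + 3*(-1) =  0
//   C[1] = 2*3 + 7*(-1) = -1
//   C[2] = 9*3 + 5*(-1) = 22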
TEST( testMemset, basic ) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();
    CLKernel *kMemset = 0;
    // [[[cog
    // import stringify
    // stringify.write_kernel2( "kMemset", "cl/memset.cl", "cl_memset", '""' )
    // ]]]
    // generated using cog, from cl/memset.cl:
    const char * kMemsetSource =
    "// Copyright Hugh Perkins 2015 hughperkins at gmail\n"
    "//\n"
    "// This Source Code Form is subject to the terms of the Mozilla Public License,\n"
    "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n"
    "// obtain one at http://mozilla.org/MPL/2.0/.\n"
    "\n"
    "kernel void cl_memset(global float *target, const float value, const int N) {\n"
    "    #define globalId get_global_id(0)\n"
    "    if ((int)globalId < N) {\n"
    "        target[globalId] = value;\n"
    "    }\n"
    "}\n"
    "\n"
    "";
    kMemset = cl->buildKernelFromString(kMemsetSource, "cl_memset", "", "cl/memset.cl");
    // [[[end]]]
    int N = 10000;
    float *myArray = new float[N];
    CLWrapper *myArrayWrapper = cl->wrap( N, myArray );
    myArrayWrapper->createOnDevice();
    kMemset->out( myArrayWrapper )->in( 99.0f )->in( N );
    int workgroupSize = 64;
    kMemset->run_1d( ( N + workgroupSize - 1 ) / workgroupSize * workgroupSize, workgroupSize );
    cl->finish();
    myArrayWrapper->copyToHost();
//    for( int i = 0; i < 10; i++ ) {
//        cout << "myArray[" << i << "]=" << myArray[i] << endl;
//    }
    for( int i = 0; i < N; i++ ) {
        EXPECT_EQ( 99.0f, myArray[i] );
    }
    delete myArrayWrapper;
    delete[] myArray;
    delete kMemset;
    delete cl;
}
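// For reference, the launch size above rounds N up to the next multiple of
// workgroupSize; the guard `if ((int)globalId < N)` inside the kernel is what
// keeps the surplus work-items from writing past the end of the buffer.
// For N=10000 and workgroupSize=64:
//   (10000 + 63) / 64 * 64 == 10048, i.e. 157 workgroups, with 48 idle work-items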
TEST( testCopyBlock, testPos ) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    float in[12];
    float res[12];
    in[0] = ( 3 << 10 ) | 4;
    in[1] = 8;
    in[2] = 14;
    for( int i = 0; i < 3; i++ ) {
        cout << "in[" << i << "]=" << in[i] << endl;
    }
    CLKernel *kernel = makeTestPosKernel( cl );
    kernel->in( 12, in )->out( 12, res );
    kernel->run_1d(1,1);
    cl->finish();
    for( int i = 0; i < 5; i++ ) {
        cout << "res[" << i << "]=" << res[i] << endl;
    }
    delete kernel;
    delete cl;
}
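// Note: in[0] == (3 << 10) | 4 == 3076, i.e. row 3 and col 4 packed with the
// same (row << 10) | col convention as testCopyBlock.basic above; the testPos
// kernel is assumed to unpack and echo those fields into res for inspection.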
template< typename IndexType >
void kernelLaunch_THClTensor_reduceNoncontigDim(
    THClState *state,
    dim3 &grid, dim3 &block,
    int ADims, int BDims,
    TensorInfo<IndexType> out,
    TensorInfo<IndexType> in,
    IndexType reductionStride,
    IndexType reductionSize,
    IndexType totalSlices,
    float init,
    HasOperator2 const*modifyOp,
    HasOperator3 const*reduceOp) {
//  cl->finish();
  StatefulTimer::timeCheck("Reduce-noncontig START");
  std::string uniqueName = "THClTensor_reduceNoncontigDim_" + easycl::toString(ADims) + "_" + easycl::toString(BDims) + "_" +
      TypeParseTraits<IndexType>::name + "_" + modifyOp->operator2() + "_" + reduceOp->operator3();
  EasyCL *cl = in.wrapper->getCl();
  CLKernel *kernel = 0;
  if(cl->kernelExists(uniqueName)) {
    kernel = cl->getKernel(uniqueName);
    StatefulTimer::timeCheck("Apply3 1aa");
  } else {
    // build the kernel from its template here, then cache it under uniqueName
    TemplatedKernel kernelBuilder(cl);
    std::set<int> dims_set;
    if(ADims >= 0) {
      dims_set.insert(ADims);
    }
    if(BDims >= 0) {
      dims_set.insert(BDims);
    }
    std::vector<int> dims;
    for(std::set<int>::iterator it = dims_set.begin(); it != dims_set.end(); it++) {
      dims.push_back(*it);
    }
    std::string indexType = TypeParseTraits<IndexType>::name;
    kernelBuilder
      .set("include_THClReduceApplyUtils", THClReduceApplyUtils_getKernelTemplate())
      .set("dims", dims)
      .set("dim1", ADims)
      .set("dim2", BDims)
      .set("defreduceblock", 1)
      .set("WarpSize", 32)  // probably can do like 'if nvidia 32 else 64' ?
      .set("MAX_CLTORCH_DIMS", MAX_CLTORCH_DIMS)
      .set("IndexType", indexType)
      .set("modify_operation", modifyOp->operator2())
      .set("reduce_operation", reduceOp->operator3())
    ;
    kernel = kernelBuilder.buildKernel(uniqueName, "THClReduce.cl", THClReduce_getKernelSource(), "THClTensor_reduceNoncontigDim");
  }
  THClKernels k(state, kernel);
  k.out(out);
  k.in(in);
  k.in((int)reductionStride);
  k.in((int)reductionSize);
  k.in((int)totalSlices);
  k.in(init);
  k.run(grid, block);
  if(state->addFinish) cl->finish();
  StatefulTimer::timeCheck("Reduce-noncontig END");
}
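// Note on the caching above: kernelExists()/getKernel() key on uniqueName, so
// each distinct combination of dimensionality, index type, and operator strings
// is compiled once per EasyCL instance and reused on subsequent calls. The key
// has the shape:
//   THClTensor_reduceNoncontigDim_<ADims>_<BDims>_<IndexType>_<modifyOp>_<reduceOp>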