// Round-trips a small host array through the "test" kernel via inout() and
// checks the values read back. The inputs are 0, 3, 6, 9, 12 and each expected
// result is the corresponding input plus 7.
TEST(testinout, main) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", "");

    // Fill the buffer with multiples of three.
    const int numValues = 5;
    float inout[numValues];
    for(int idx = 0; idx < numValues; idx++) {
        inout[idx] = idx * 3;
    }
    kernel->inout(numValues, inout);

    // One dimension, with global and local work sizes both equal to the
    // number of elements.
    size_t globalSize = numValues;
    size_t localSize = numValues;
    kernel->run(1, &globalSize, &localSize);

    assertEquals(inout[0], 7);
    assertEquals(inout[1], 10);
    assertEquals(inout[2], 13);
    assertEquals(inout[3], 16);
    assertEquals(inout[4], 19);
    cout << "tests completed ok" << endl;

    delete kernel;
    delete cl;
}
// this runs an entire kernel to get one value. Clearly this is going to be pretty slow, but // at least it's more or less compatible, and comparable, to how cutorch does it void THClStorage_set(THClState *state, THClStorage *self, long index, float value) { //// cout << "set size=" << self->size << " index=" << index << " value=" << value << endl; THArgCheck((index >= 0) && (index < self->size), 2, "index out of bounds"); THArgCheck(self->wrapper != 0, 1, "storage argument not initialized, is empty"); // if( self->wrapper->isDeviceDirty() ) { // we have to do this, since we're going to copy it all back again // // although I suppose we could set via a kernel perhaps // // either way, this function is pretty inefficient right now :-P // if(state->trace) cout << "wrapper->copyToHost() size " << self->size << endl; // self->wrapper->copyToHost(); // } // self->data[index] = value; // if(state->trace) cout << "wrapper->copyToDevice() size " << self->size << endl; // self->wrapper->copyToDevice(); const char *uniqueName = __FILE__ ":set"; EasyCL *cl = self->cl; // cant remember if this is a good idea or not :-P CLKernel *kernel = 0; if(cl->kernelExists(uniqueName)) { kernel = cl->getKernel(uniqueName); } else { TemplatedKernel kernelBuilder(cl); kernel = kernelBuilder.buildKernel( uniqueName, __FILE__, getSetKernelSource(), "THClStorageSet" ); } kernel->inout(self->wrapper); kernel->in((int64_t)index); kernel->in(value); kernel->run_1d(1, 1); if(state->addFinish) cl->finish(); }
// Smoke test: builds the "notUseLocal" kernel from testlocal.cl and runs it
// once over a single workgroup of 64 work-items. No results are asserted; the
// test only checks that the kernel can be built and launched.
TEST(testlocal, notUselocal) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "notUseLocal");

    int workgroupSize = 64;
    // Value-initialise (zero) the buffer: it is passed with inout(), so
    // leaving it uninitialised would give the kernel indeterminate
    // input and trip memory checkers.
    float *myarray = new float[workgroupSize]();

    kernel->in(workgroupSize);
    kernel->inout(workgroupSize, myarray);
    kernel->run_1d(workgroupSize, workgroupSize);

    delete[] myarray;
    delete kernel;
    delete cl;
}
// Exercises the three float-array argument kinds of CLKernel together: an
// input array (in), an output array (out) and an in/out array (inout), all of
// length 5. After running the "test" kernel the contents of `out` and `inout`
// are printed and compared with fixed expected values.
TEST(testfloatarray, main) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", "");

    const int numValues = 5;
    float in[numValues];
    float inout[numValues];
    float out[numValues];
    // `in` and `inout` both start as 0, 3, 6, 9, 12; `out` is written by the kernel.
    for(int idx = 0; idx < numValues; idx++) {
        in[idx] = idx * 3;
        inout[idx] = idx * 3;
    }

    // Arguments are bound in the order the calls are made.
    kernel->in(numValues, in);
    kernel->out(numValues, out);
    kernel->inout(numValues, inout);
    kernel->run_1d(numValues, numValues);

    // Dump both result arrays, one per line.
    for(int idx = 0; idx < numValues; idx++) {
        cout << out[idx] << " ";
    }
    cout << endl;
    for(int idx = 0; idx < numValues; idx++) {
        cout << inout[idx] << " ";
    }
    cout << endl;

    assertEquals(inout[0], 7);
    assertEquals(inout[1], 10);
    assertEquals(inout[2], 34);
    assertEquals(inout[3], 16);
    assertEquals(inout[4], 19);

    assertEquals(out[0], 5);
    assertEquals(out[1], 8);
    assertEquals(out[2], 26);
    assertEquals(out[3], 14);
    assertEquals(out[4], 17);
    cout << "tests completed ok" << endl;

    delete kernel;
    delete cl;
}
// Applies the unary operation `op` in place to the buffer held by
// `destinationWrapper`.
//
//   N                  - passed to the kernel as its first argument and used
//                        as the number of work-items before rounding
//   destinationWrapper - buffer passed to the kernel as in/out
//   op                 - operation; its name selects (and, on first use,
//                        builds) the kernel "GpuOp::<name>_inplace"
//
// The launch is bracketed by StatefulTimer checkpoints, and cl->finish() is
// called before the closing checkpoint.
VIRTUAL void GpuOp::apply1_inplace(int N, CLWrapper*destinationWrapper, Op1 *op) {
    StatefulTimer::instance()->timeCheck("GpuOp::apply inplace start" );

    // Build the kernel on first use, then fetch it by name.
    string kernelName = "GpuOp::" + op->getName() + "_inplace";
    if(!cl->kernelExists(kernelName)) {
        buildKernel(kernelName, op, true);
    }
    CLKernel *kernel = cl->getKernel(kernelName);

    kernel->in(N);
    kernel->inout(destinationWrapper);

    // Round the global size up to a whole number of 64-wide workgroups.
    const int workgroupSize = 64;
    const int numWorkgroups = (N + workgroupSize - 1) / workgroupSize;
    const int paddedGlobalSize = numWorkgroups * workgroupSize;
    kernel->run_1d(paddedGlobalSize, workgroupSize);
    cl->finish();

    StatefulTimer::instance()->timeCheck("GpuOp::apply inplace end" );
}
// Runs the "reduceViaScratch" kernel from testlocal.cl 2000 times over a
// single workgroup (at most 512 work-items) and checks on each iteration that
// element 0 of the array, after the kernel has run, equals the sum computed on
// the CPU.
TEST(testlocal, localreduce) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "reduceViaScratch");

    int workgroupSize = min(512, cl->getMaxWorkgroupSize());
    float *myarray = new float[workgroupSize];

    for(int iteration = 0; iteration < 2000; iteration++) {
        // Refill the input and compute the reference sum on the host.
        float sumViaCpu = 0;
        for(int idx = 0; idx < workgroupSize; idx++) {
            myarray[idx] = (idx + 7) * 3;
            sumViaCpu += myarray[idx];
        }
        // Sanity check: element 0 must not already hold the answer.
        EXPECT_NE(myarray[0], sumViaCpu);

        // Bind the array plus one local float per work-item, then run.
        kernel
            ->inout(workgroupSize, myarray)
            ->localFloats(workgroupSize);
        kernel->run_1d(workgroupSize, workgroupSize);

        EXPECT_EQ(myarray[0], sumViaCpu);
    }

    delete[] myarray;
    delete kernel;
    delete cl;
}