// Stress test: builds the "test_stress" kernel from testeasycl.cl, feeds it
// N ints (in[i] = 3 * i) through a wrapped buffer, and requires every element
// of the output buffer to equal 689514.
TEST( SLOW_testintwrapper_huge, testreadwrite ) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test_stress");

    const int N = 1000000;
    int *in = new int[N];
    for( int i = 0; i < N; i++ ) {
        in[i] = i * 3;
    }
    int *out = new int[N];

    CLWrapper *inwrapper = cl->wrap(N, in);
    CLWrapper *outwrapper = cl->wrap(N, out);
    inwrapper->copyToDevice();
    outwrapper->createOnDevice();
    kernel->input( inwrapper );
    kernel->output( outwrapper );

    // Round the global size up to the next multiple of the workgroup size.
    int globalSize = N;
    int workgroupsize = cl->getMaxWorkgroupSize();
    globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize;
    cout << "globalsize: " << globalSize << " workgroupsize " << workgroupsize << endl;

    kernel->run_1d( globalSize, workgroupsize );
    outwrapper->copyToHost();

    for( int i = 0; i < N; i++ ) {
        if( out[i] != 689514 ) {
            cout << "out[" << i << "] != 689514: " << out[i] << endl;
            // Terminates the whole process on the first mismatch.
            exit(-1);
        }
    }

    delete outwrapper;
    delete inwrapper;
    // The host arrays were previously leaked; release them once the wrappers
    // that refer to them are gone.
    delete[] out;
    delete[] in;
    delete kernel;
    delete cl;
}
// Runs the "test_read" kernel from testeasycl.cl with the scalar inputs 3 and
// 7 and an N-element output buffer, prints timing checkpoints, and requires
// every output element to equal 4228.
TEST( SLOW_testintwrapper_huge, testread ) {
    Timer timer;
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test_read");

    // Larger sizes that were tried previously:
    // const int N = 4500000;
    // const int N = (4500000/512)*512;
    int N = 100000;
    int *out = new int[N];
    CLWrapper *outwrapper = cl->wrap(N, out);
    kernel->in(3)->in(7);
    kernel->output( outwrapper );

    // Round the global size up to the next multiple of the workgroup size.
    int globalSize = N;
    int workgroupsize = cl->getMaxWorkgroupSize();
    globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize;
    cout << "globalsize: " << globalSize << " workgroupsize " << workgroupsize << endl;

    timer.timeCheck("before kernel");
    kernel->run_1d( globalSize, workgroupsize );
    timer.timeCheck("after kernel");
    outwrapper->copyToHost();
    timer.timeCheck("after copy to host");

    for( int i = 0; i < N; i++ ) {
        if( out[i] != 4228 ) {
            cout << "out[" << i << "] != 4228: " << out[i] << endl;
            // Terminates the whole process on the first mismatch.
            exit(-1);
        }
    }

    delete outwrapper;
    // The host array was previously leaked; release it once the wrapper that
    // refers to it is gone.
    delete[] out;
    delete kernel;
    delete cl;
}
// Launches the "selfdot_ints_withoutscratch" kernel three times in a row.
// Each pass reads the buffer written by the previous pass:
//   pass 1: input array  -> level-2 buffer
//   pass 2: level-2      -> level-3 buffer
//   pass 3: level-3      -> single int (finalSum)
// The same `second` buffer is handed to the kernel as an extra output on every
// pass; its contents are never inspected here.
TEST(SLOW_testlocal, selfdot_3levels_withoutscratch) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "selfdot_ints_withoutscratch");

    int workgroupSize = min(512, cl->getMaxWorkgroupSize());
    const int numWorkgroups = workgroupSize;
    const int level3size = numWorkgroups / 4;
    const int N = workgroupSize * numWorkgroups * level3size;
    cout << "numworkgroups " << numWorkgroups << " workgroupsize " << workgroupSize << " N " << N << endl;

    // Host-side input: a repeating pattern of values in [0, 5).
    int *inputValues = new int[N];
    for(int idx = 0; idx < N; ++idx) {
        inputValues[idx] = ((idx + 7) * 3) % 5;
    }

    // Timer timer;
    CLWrapper *inputWrapper = cl->wrap(N, inputValues);
    inputWrapper->copyToDevice();
    // timer.timeCheck("copied array to device");

    int *second = new int[N];
    CLWrapper *secondWrapper = cl->wrap(N, second);

    // Pass 1: N work-items.
    int *level2 = new int[numWorkgroups * level3size];
    CLWrapper *level2Wrapper = cl->wrap(numWorkgroups * level3size, level2);
    kernel->in(inputWrapper);
    kernel->out(secondWrapper);
    kernel->out(level2Wrapper);
    kernel->run_1d(N, workgroupSize);
    cl->finish();

    // Pass 2: workgroupSize * level3size work-items.
    // The host array holds numWorkgroups ints, but only level3size are wrapped.
    int *level3 = new int[numWorkgroups];
    CLWrapper *level3Wrapper = cl->wrap(level3size, level3);
    kernel->in(level2Wrapper);
    kernel->out(secondWrapper);
    kernel->out(level3Wrapper);
    kernel->run_1d(workgroupSize * level3size, workgroupSize);
    cl->finish();

    // Pass 3: a single workgroup of level3size work-items produces the scalar.
    int finalSum;
    kernel->in(level3Wrapper);
    kernel->out(secondWrapper);
    kernel->out(1, &finalSum);
    kernel->run_1d(level3size, level3size);
    // timer.timeCheck("finished 3-level reduce");

    EXPECT_EQ(-1306309159, finalSum);

    delete inputWrapper;
    delete level2Wrapper;
    delete level3Wrapper;
    delete secondWrapper;
    delete[] level3;
    delete[] second;
    delete[] level2;
    delete[] inputValues;
    delete kernel;
    delete cl;
}
// Two-pass integer reduction using the
// "reduceViaScratch_multipleworkgroups_ints" kernel. The first pass turns the
// N-element input into numWorkgroups values; the second pass turns those into
// a single int, which must equal the sum computed on the CPU.
TEST(testlocal, reduceviascratch_multipleworkgroups_ints) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "reduceViaScratch_multipleworkgroups_ints");

    int workgroupSize = min(512, cl->getMaxWorkgroupSize());
    const int numWorkgroups = workgroupSize;
    const int N = workgroupSize * numWorkgroups;
    cout << "numworkgroups " << numWorkgroups << " workgroupsize " << workgroupSize << " N " << N << endl;

    // CPU reference: the overall sum, the sums of the first two
    // workgroup-sized slices, and one partial sum per workgroup-sized slice.
    int *myarray = new int[N];
    int sumViaCpu = 0;
    int localSumViaCpu = 0;
    int localSumViaCpu2 = 0;
    int *localSumsViaCpu = new int[numWorkgroups];
    memset(localSumsViaCpu, 0, sizeof(int)*numWorkgroups);
    for(int i = 0; i < N; i++) {
        myarray[i] = ((i + 7) * 3) % 50;
        sumViaCpu += myarray[i];
        if(i < workgroupSize) {
            localSumViaCpu += myarray[i];
        }
        if(i >= workgroupSize && i < workgroupSize * 2) {
            localSumViaCpu2 += myarray[i];
        }
        int workgroupId = i / workgroupSize;
        localSumsViaCpu[workgroupId] += myarray[i];
    }
    // Sanity checks on the CPU reference itself.
    ASSERT_EQ(localSumViaCpu, localSumsViaCpu[0]);
    ASSERT_EQ(localSumViaCpu2, localSumsViaCpu[1]);
    ASSERT_NE(myarray[0], sumViaCpu);

    // Timer timer;
    CLWrapper *a1wrapper = cl->wrap(N, myarray);
    a1wrapper->copyToDevice();
    int *a2 = new int[numWorkgroups];
    CLWrapper *a2wrapper = cl->wrap(numWorkgroups, a2);

    // Pass 1: N work-items, output buffer of numWorkgroups ints.
    kernel->in(a1wrapper);
    kernel->out(a2wrapper);
    kernel->localInts(workgroupSize);
    kernel->run_1d(N, workgroupSize);

    // Pass 2: numWorkgroups work-items, output is a single int.
    int finalSum;
    kernel->in(a2wrapper);
    kernel->out(1, &finalSum);
    kernel->localInts(workgroupSize);
    kernel->run_1d(numWorkgroups, workgroupSize);
    // timer.timeCheck("finished 2-way reduce");

    EXPECT_EQ(sumViaCpu, finalSum);

    delete a1wrapper;
    delete a2wrapper;
    delete[] a2;
    // Previously leaked: the per-workgroup CPU partial sums.
    delete[] localSumsViaCpu;
    delete[] myarray;
    delete kernel;
    delete cl;
}
// Builds the "testDefines" kernel with the compiler options
// "-D DOUBLE -D SOME_VALUE=5", runs it over 32 work-items and checks that
// element 3 of the output is 10.
TEST( testdefines, simple ) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel(
        "testdefines.cl",
        "testDefines",
        "-D DOUBLE -D SOME_VALUE=5" );

    const int numElements = 32;
    float out[numElements];

    kernel->out( numElements, out );
    kernel->run_1d( numElements, numElements );

    EXPECT_EQ( 10, out[3] );

    delete kernel;
    delete cl;
}
// Smoke test: builds the "notUseLocal" kernel from testlocal.cl and runs it
// once with a single workgroup of 64 work-items. Nothing is asserted; the test
// passes as long as building and running the kernel complete.
TEST(testlocal, notUselocal) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "notUseLocal");

    int workgroupSize = 64;
    // Value-initialise the buffer: it is passed as an in/out argument, so
    // (presumably) its contents are read before the kernel runs. Previously
    // the elements were left uninitialised.
    float *myarray = new float[workgroupSize]();

    kernel->in(workgroupSize);
    kernel->inout(workgroupSize, myarray);
    kernel->run_1d(workgroupSize, workgroupSize);

    delete[] myarray;
    delete kernel;
    delete cl;
}
// Tracks CLWrapper::isDeviceDirty() across the life of one kernel launch:
// neither wrapper is dirty after wrapping, after copyToDevice(), or after
// being bound to the kernel; the output wrapper becomes dirty once the kernel
// has run and is clean again after copyToHost(). Also checks the five values
// produced by the "test" kernel.
TEST(testdirtywrapper, main) {
    // Exit the process if no OpenCL library can be found.
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test");

    float in[5];
    for(int idx = 0; idx < 5; ++idx) {
        in[idx] = idx * 3;
    }
    float out[5];

    CLWrapper *inwrapper = cl->wrap(5, in);
    CLWrapper *outwrapper = cl->wrap(5, out);

    // Freshly wrapped.
    EXPECT_FALSE(inwrapper->isDeviceDirty());
    EXPECT_FALSE(outwrapper->isDeviceDirty());

    // After uploading the input.
    inwrapper->copyToDevice();
    EXPECT_FALSE(inwrapper->isDeviceDirty());
    EXPECT_FALSE(outwrapper->isDeviceDirty());

    // After binding both wrappers as kernel arguments.
    kernel->input(inwrapper);
    kernel->output(outwrapper);
    EXPECT_FALSE(inwrapper->isDeviceDirty());
    EXPECT_FALSE(outwrapper->isDeviceDirty());

    // After the launch only the output wrapper is expected to be dirty.
    kernel->run_1d(5, 5);
    EXPECT_FALSE(inwrapper->isDeviceDirty());
    EXPECT_TRUE(outwrapper->isDeviceDirty());

    // Downloading the output clears the flag.
    outwrapper->copyToHost();
    EXPECT_FALSE(outwrapper->isDeviceDirty());

    assertEquals(out[0] , 7);
    assertEquals(out[1] , 10);
    assertEquals(out[2] , 13);
    assertEquals(out[3] , 16);
    assertEquals(out[4] , 19);
    cout << "tests completed ok" << endl;

    delete inwrapper;
    delete outwrapper;
    delete kernel;
    delete cl;
}
// Two-pass float reduction using the "reduceViaScratch_multipleworkgroups"
// kernel. Pass 1 maps the N-element input to numWorkgroups values; pass 2 maps
// those to one float, which must equal the sum accumulated on the CPU.
TEST(testlocal, reduceviascratch_multipleworkgroups) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "reduceViaScratch_multipleworkgroups");

    int workgroupSize = min(512, cl->getMaxWorkgroupSize());
    const int numWorkgroups = workgroupSize;
    const int N = workgroupSize * numWorkgroups;

    // Fill the input with values in [0, 10) and accumulate the CPU reference.
    float *myarray = new float[N];
    float sumViaCpu = 0;
    float localSumViaCpu = 0;  // sum of the first workgroup-sized slice; not checked below
    for(int idx = 0; idx < N; ++idx) {
        myarray[idx] = ((idx + 7) * 3) % 10;
        sumViaCpu += myarray[idx];
        if(idx < workgroupSize) {
            localSumViaCpu += myarray[idx];
        }
    }
    cout << "expected sum, calc'd via cpu, : " << sumViaCpu << endl;
    EXPECT_NE(myarray[0], sumViaCpu);

    // Timer timer;
    CLWrapper *inputWrapper = cl->wrap(N, myarray);
    inputWrapper->copyToDevice();

    float *partialSums = new float[numWorkgroups];
    CLWrapper *partialWrapper = cl->wrap(numWorkgroups, partialSums);

    // Pass 1: N work-items.
    kernel->in(inputWrapper);
    kernel->out(partialWrapper);
    kernel->localFloats(workgroupSize);
    kernel->run_1d(N, workgroupSize);

    // Pass 2: numWorkgroups work-items, single float out.
    float finalSum;
    kernel->in(partialWrapper);
    kernel->out(1, &finalSum);
    kernel->localFloats(workgroupSize);
    kernel->run_1d(numWorkgroups, workgroupSize);

    EXPECT_EQ(sumViaCpu, finalSum);

    delete inputWrapper;
    delete partialWrapper;
    delete[] partialSums;
    delete[] myarray;
    delete kernel;
    delete cl;
}
// Runs the "reduceViaScratch" kernel 2000 times on a single workgroup. Before
// each launch the array is refilled and its sum computed on the CPU; after the
// launch element 0 of the array must hold that sum.
TEST(testlocal, localreduce) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "reduceViaScratch");

    int workgroupSize = min(512, cl->getMaxWorkgroupSize());
    float *myarray = new float[workgroupSize];

    // Timer timer;
    for(int iteration = 0; iteration < 2000; ++iteration) {
        // Refill the input and compute the reference sum.
        float sumViaCpu = 0;
        for(int idx = 0; idx < workgroupSize; ++idx) {
            myarray[idx] = (idx + 7) * 3;
            sumViaCpu += myarray[idx];
        }
        // Before the launch, element 0 must not already equal the sum.
        EXPECT_NE(myarray[0], sumViaCpu);

        kernel->inout(workgroupSize, myarray)->localFloats(workgroupSize);
        kernel->run_1d(workgroupSize, workgroupSize);

        EXPECT_EQ(myarray[0], sumViaCpu);
    }
    // timer.timeCheck("after iterations");

    delete[] myarray;
    delete kernel;
    delete cl;
}
// Standalone check of the "test" kernel in test.cl against a host reference.
//
// Fills an input buffer of test_size * 4 random 32-bit words, runs the kernel
// with test_size work-items, and compares each output word with the host
// reference inbuf[i] ^ inbuf[i+1] ^ inbuf[i+2] ^ inbuf[i+3].
//
// Returns 0 when every output word matches the reference, 1 otherwise.
int main(int argc, char *argv[]) {
    const int test_size = 128;

    // Seed the Mersenne Twister from the system entropy source.
    std::random_device rd;
    std::seed_seq s{ rd(), rd(), rd(), rd(), rd(), rd(), rd(), rd() };
    std::mt19937 mt(s);

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kern = cl->buildKernel("test.cl", "test");

    std::vector<uint32_t> inbuf(test_size*4), outbuf(test_size), compare(test_size);
    std::generate(inbuf.begin(), inbuf.end(), mt);

    std::cout << "Running CL implementation" << std::endl;
    kern->in(inbuf.size(), &inbuf[0]);
    kern->out(outbuf.size(), &outbuf[0]);
    size_t global_size[] = { test_size };
    kern->run(1, global_size, nullptr);
    delete kern;
    // The context was previously leaked; release it after the kernel.
    delete cl;

    std::cout << "Running local implementation" << std::endl;
    for (size_t i = 0; i < compare.size(); ++i) {
        // i + 3 < test_size * 4 always holds because compare has only
        // test_size elements.
        compare[i] = inbuf[i] ^ inbuf[i + 1] ^ inbuf[i + 2] ^ inbuf[i + 3];
    }

    std::cout << "Comparing CL test with local implementation" << std::endl;
    bool all_match = true;
    for (size_t i = 0; i < compare.size(); ++i) {
        if (outbuf[i] != compare[i]) {
            std::cout << "Error in index " << i << " " << outbuf[i] << " != " << compare[i] << std::endl;
            all_match = false;
        }
    }

    // Report mismatches through the exit status instead of always succeeding.
    return all_match ? 0 : 1;
}