TEST( testCopyBuffer, nits ) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); const int N = 10; int *a = new int[N]; for( int i = 0; i < N; i++ ) { a[i] = 3 + i; } CLWrapper *aWrapper = cl->wrap( N, a ); aWrapper->copyToDevice(); memset( a, 0, sizeof(int) * N ); int *b = new int[N]; CopyBuffer::copy( cl, aWrapper, b ); for( int i = 0; i < N; i++ ) { cout << b[i] << endl; EXPECT_EQ( 3 + i, b[i] ); } memset( b, 0, sizeof(int) * N ); CopyBuffer::copy( cl, aWrapper, b ); for( int i = 0; i < N; i++ ) { cout << b[i] << endl; EXPECT_EQ( 3 + i, b[i] ); } PrintBuffer::printInts( cl, aWrapper, 10, 1 ); delete[] b; delete aWrapper; delete[] a; delete cl; }
TEST(SLOW_testlocal, reduceviascratch_multipleworkgroups_ints_3levels) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); CLKernel *kernel = cl->buildKernel("testlocal.cl", "reduceViaScratch_multipleworkgroups_ints"); int workgroupSize = min(512, cl->getMaxWorkgroupSize()); const int numWorkgroups = workgroupSize; const int level3size = numWorkgroups / 4; const int N = workgroupSize * numWorkgroups * level3size; cout << "numworkgroups " << numWorkgroups << " workgroupsize " << workgroupSize << " N " << N << endl; int *myarray = new int[N]; int sumViaCpu = 0; int localSumViaCpu = 0; int localSumViaCpu2 = 0; int *localSumsViaCpu = new int[numWorkgroups * level3size]; memset(localSumsViaCpu, 0, sizeof(int)*numWorkgroups* level3size); for(int i = 0; i < N; i++) { myarray[i] = ((i + 7) * 3) % 50; sumViaCpu += myarray[i]; if(i < workgroupSize) { localSumViaCpu += myarray[i]; } if(i >= workgroupSize && i < workgroupSize * 2) { localSumViaCpu2 += myarray[i]; } int workgroupId = i / workgroupSize; localSumsViaCpu[workgroupId] += myarray[i]; } ASSERT_EQ(localSumViaCpu, localSumsViaCpu[0]); ASSERT_EQ(localSumViaCpu2, localSumsViaCpu[1]); ASSERT_NE(myarray[0], sumViaCpu); // Timer timer; CLWrapper *a1wrapper = cl->wrap(N, myarray); a1wrapper->copyToDevice(); // timer.timeCheck("copied array to device"); int *a2 = new int[numWorkgroups*level3size]; CLWrapper *a2wrapper = cl->wrap(numWorkgroups * level3size, a2); kernel->in(a1wrapper); kernel->out(a2wrapper); kernel->localInts(workgroupSize); kernel->run_1d(N, workgroupSize); int *a3 = new int[numWorkgroups]; CLWrapper *a3wrapper = cl->wrap(level3size, a3); kernel->in(a2wrapper); kernel->out(a3wrapper); kernel->localInts(workgroupSize); kernel->run_1d(workgroupSize * level3size, workgroupSize); int finalSum; kernel->in(a3wrapper); kernel->out(1, &finalSum); kernel->localInts(level3size); kernel->run_1d(level3size, level3size); // timer.timeCheck("finished 3-level reduce"); EXPECT_EQ(sumViaCpu, finalSum); delete a1wrapper; delete a2wrapper; delete a3wrapper; delete[] a3; delete[] a2; delete[]myarray; delete kernel; delete cl; }
void compareSpecific(int instance0, int instance1, int numIts, int batchSize, LayerDimensions dim) { cout << "batchsize=" << batchSize << " " << dim << endl; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); ClBlasInstance clblasInstance; int inputNumElements = dim.inputCubeSize * batchSize; int errorsSize = dim.outputCubeSize * batchSize; int weightsSize = dim.filtersSize; int errorsForUpstreamSize = dim.inputCubeSize * batchSize; float *input = new float[inputNumElements]; float *errors = new float[errorsSize]; float *weights = new float[weightsSize]; float *errorsForUpstream0 = new float[errorsForUpstreamSize]; float *errorsForUpstream1 = new float[errorsForUpstreamSize]; WeightRandomizer::randomize(0, input, inputNumElements, -0.1f, 0.1f); WeightRandomizer::randomize(1, errors, errorsSize, -0.1f, 0.1f); WeightRandomizer::randomize(2, weights, weightsSize, -0.1f, 0.1f); CLWrapper *inputWrapper = cl->wrap(inputNumElements, input); CLWrapper *errorsWrapper = cl->wrap(errorsSize, errors); CLWrapper *weightsWrapper = cl->wrap(weightsSize, weights); CLWrapper *errorsForUpstreamWrapper0 = cl->wrap(errorsForUpstreamSize, errorsForUpstream0); CLWrapper *errorsForUpstreamWrapper1 = cl->wrap(errorsForUpstreamSize, errorsForUpstream1); inputWrapper->copyToDevice(); errorsWrapper->copyToDevice(); weightsWrapper->copyToDevice(); errorsForUpstreamWrapper0->createOnDevice(); errorsForUpstreamWrapper1->createOnDevice(); Backward *bp0 = Backward::instanceSpecific(instance0, cl, dim); Backward *bp1 = Backward::instanceSpecific(instance1, cl, dim); for(int it=0; it < numIts; it++ ) { bp0->backward(batchSize, inputWrapper, errorsWrapper, weightsWrapper, errorsForUpstreamWrapper0); bp1->backward(batchSize, inputWrapper, errorsWrapper, weightsWrapper, errorsForUpstreamWrapper1); errorsForUpstreamWrapper0->copyToHost(); errorsForUpstreamWrapper1->copyToHost(); int outputNumElements = errorsForUpstreamSize; cout << dim << endl; bool same = true; for(int i = 0; i < max(20, outputNumElements); i++) { if(i < outputNumElements) { if(abs(errorsForUpstream0[i] - errorsForUpstream1[i]) < 0.000001 || abs(errorsForUpstream0[i] - errorsForUpstream1[i]) <= 0.001 * max(abs(errorsForUpstream0[i]), abs(errorsForUpstream1[i]))) { if(it == 0 && i < 20) { cout << "output[" << i << "]=" << errorsForUpstream0[i] << " " << errorsForUpstream1[i]; cout << " SAME"; } } else { cout << "output[" << i << "]=" << errorsForUpstream0[i] << " " << errorsForUpstream1[i]; cout << " DIFF"; same = false; } } else { if(it == 0 && i < 20) { cout << " "; } } if(it == 0 && i < 20) { cout << " || " << errorsForUpstream1[100+i] ; cout << " || " << errorsForUpstream1[200+i] ; cout << " || " << errorsForUpstream1[300+i] ; cout << " || " << errorsForUpstream1[400+i] ; cout << " || " << errorsForUpstream1[500+i] ; cout << " || " << errorsForUpstream1[600+i] ; cout << " || " << errorsForUpstream1[700+i] << endl; } } EXPECT_EQ(true, same); } delete inputWrapper; delete errorsWrapper; delete weightsWrapper; delete errorsForUpstreamWrapper0; delete errorsForUpstreamWrapper1; delete[] errorsForUpstream0; delete[] errorsForUpstream1; delete bp0; delete bp1; delete cl; delete[] input; delete[] errors; delete[] weights; }
void compareSpecific( CompareSpecificArgs args ) { cout << "instance0: " << args._instance0 << endl; cout << "instance1: " << args._instance1 << endl; int batchSize = args._batchSize; int numPlanes = args._numPlanes; int imageSize = args._imageSize; int poolingSize = args._poolingSize; EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); PoolingForward *poolingForward0 = PoolingForward::instanceSpecific( args._instance0, cl, args._padZeros, numPlanes, imageSize, poolingSize ); PoolingForward *poolingForward1 = PoolingForward::instanceSpecific( args._instance1, cl, args._padZeros, numPlanes, imageSize, poolingSize ); const int inputSize = batchSize * numPlanes * imageSize * imageSize; int outputSize = poolingForward0->getOutputSize( batchSize ); float *input = new float[ inputSize ]; int *selectors = new int[ outputSize ]; float *output = new float[ outputSize ]; CLWrapper *inputWrapper = cl->wrap( inputSize, input ); CLWrapper *selectorsWrapper = cl->wrap( outputSize, selectors ); CLWrapper *outputWrapper = cl->wrap( outputSize, output ); WeightRandomizer::randomize( input, inputSize, -0.1f, 0.1f ); memset( selectors, 99, sizeof(int) * outputSize ); memset( output, 99, sizeof(int) * outputSize ); inputWrapper->copyToDevice(); selectorsWrapper->copyToDevice(); outputWrapper->copyToDevice(); poolingForward0->forward( batchSize, inputWrapper, selectorsWrapper, outputWrapper ); selectorsWrapper->copyToHost(); outputWrapper->copyToHost(); int *selectors0 = new int[ outputSize ]; float *output0 = new float[ outputSize ]; memcpy( selectors0, selectors, sizeof(int) * outputSize ); memcpy( output0, output, sizeof(float) * outputSize ); memset( selectors, 99, sizeof(int) * outputSize ); memset( output, 99, sizeof(int) * outputSize ); inputWrapper->copyToDevice(); selectorsWrapper->copyToDevice(); outputWrapper->copyToDevice(); poolingForward1->forward( batchSize, inputWrapper, selectorsWrapper, outputWrapper ); selectorsWrapper->copyToHost(); outputWrapper->copyToHost(); int numErrors = 0; for( int i = 0; i < outputSize; i++ ) { if( selectors[i] != selectors0[i] ) { cout << "ERROR: selectors[" << i << "] instance0:" << selectors0[i] << " != instance1:" << selectors[i] << endl; numErrors++; } if( output[i] != output0[i] ) { cout << "ERROR: output[" << i << "] instance0:" << output0[i] << " != instance1:" << output[i] << endl; numErrors++; } if( numErrors >= 10 ) { cout << "More than 10 errors. Skipping the rest :-)" << endl; break; } } EXPECT_EQ( 0, numErrors ); if( numErrors > 0 ) { int num2dPlanes = inputSize / imageSize / imageSize; for( int plane = 0; plane < num2dPlanes; plane++ ) { cout << "2dplane " << plane << ":" << endl; for( int i = 0; i < imageSize; i++ ) { string line = ""; for( int j = 0; j < imageSize; j++ ) { line += toString( input[ plane * imageSize * imageSize + i * imageSize + j] ) + " "; } cout << line << endl; } cout << endl; } } delete inputWrapper; delete selectorsWrapper; delete outputWrapper; delete poolingForward0; delete poolingForward1; delete[] selectors0; delete[] output0; delete[] selectors; delete[] output; delete[] input; delete cl; }
float THClBlas_dot(THClState *state, long n, CLWrapper *xwrapper, long xoffset, long incx, CLWrapper *ywrapper, long yoffset, long incy) { StatefulTimer::timeCheck("THClBlas_dot START"); if(n == 1) { incx = 1; incy = 1; } if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) { int i_n = (int)n; int i_incx = (int)incx; int i_incy = (int)incy; float result; cl_int err; EasyCL *cl = ywrapper->getCl(); CLWrapper *resultWrapper = cl->wrap( 1, &result ); float *scratch = new float[i_n]; CLWrapper *scratchWrapper = cl->wrap(i_n, scratch); scratchWrapper->createOnDevice(); resultWrapper->createOnDevice(); cl_event *event = 0; // if(state->addFinish) { event = new cl_event(); // } err = clblasSdot( i_n, resultWrapper->getBuffer(), 0, xwrapper->getBuffer(), xoffset, i_incx, ywrapper->getBuffer(), yoffset, i_incy, scratchWrapper->getBuffer(), 1, cl->queue, 0, NULL, event); if (err != CL_SUCCESS) { THError("clblasSdot() failed with %d", err); } else { // if(state->addFinish) { err = clWaitForEvents(1, event); if (err != CL_SUCCESS) { // throw runtime_error("clblasSger() failed with " + easycl::toString(err)); THError("clblasSger: wait for event failed with %d", err); } clReleaseEvent(*event); delete event; // } } // TODO: NOT copy to host here, use pointtensor resultWrapper->copyToHost(); //resultWrapper->markDeviceDirty(); delete resultWrapper; delete scratchWrapper; delete[] scratch; StatefulTimer::timeCheck("THClBlas_dot END"); return result; } THError("Cublas_dot only supports n, incx and incy " "upto signed integer limits: %d", INT_MAX); THError("Not implemented"); return -1; }