VIRTUAL void PoolingForward::forward(int batchSize, float *input, int *selectors, float *output) {
//    cout << "PoolingForward::forward(float *)" << endl;
    CLWrapper *inputWrapper = cl->wrap(getInputNumElements(batchSize), input);
    CLWrapper *selectorsWrapper = cl->wrap(getOutputNumElements(batchSize), selectors);
    CLWrapper *outputWrapper = cl->wrap(getOutputNumElements(batchSize), output);

    inputWrapper->copyToDevice();
    forward(batchSize, inputWrapper, selectorsWrapper, outputWrapper);
    selectorsWrapper->copyToHost();
    outputWrapper->copyToHost();

    delete outputWrapper;
    delete selectorsWrapper;
    delete inputWrapper;
}
TEST( SLOW_testintwrapper_huge, testreadwrite ) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test_stress");
    const int N = 1000000;
    int *in = new int[N];
    for( int i = 0; i < N; i++ ) {
        in[i] = i * 3;
    }
    int *out = new int[N];
    CLWrapper *inwrapper = cl->wrap(N, in);
    CLWrapper *outwrapper = cl->wrap(N, out);
    inwrapper->copyToDevice();
    outwrapper->createOnDevice();
    kernel->input( inwrapper );
    kernel->output( outwrapper );
    int globalSize = N;
    int workgroupsize = cl->getMaxWorkgroupSize();
    // round the global size up to a whole number of workgroups
    globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize;
    cout << "globalsize: " << globalSize << " workgroupsize " << workgroupsize << endl;
    kernel->run_1d( globalSize, workgroupsize );
    outwrapper->copyToHost();
    for( int i = 0; i < N; i++ ) {
        if( out[i] != 689514 ) {
            cout << "out[" << i << "] != 689514: " << out[i] << endl;
            exit(-1);
        }
    }
    delete outwrapper;
    delete inwrapper;
    delete kernel;
    delete[] in;
    delete[] out;
    delete cl;
}
TEST( SLOW_testintwrapper_huge, testread ) {
    Timer timer;
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test_read");
//    const int N = 4500000;
//    const int N = (4500000/512)*512;
    int N = 100000;
    int *out = new int[N];
    CLWrapper *outwrapper = cl->wrap(N, out);
    kernel->in(3)->in(7);
    kernel->output( outwrapper );
    int globalSize = N;
    int workgroupsize = cl->getMaxWorkgroupSize();
    globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize;
    cout << "globalsize: " << globalSize << " workgroupsize " << workgroupsize << endl;
    timer.timeCheck("before kernel");
    kernel->run_1d( globalSize, workgroupsize );
    timer.timeCheck("after kernel");
    outwrapper->copyToHost();
    timer.timeCheck("after copy to host");
    for( int i = 0; i < N; i++ ) {
        if( out[i] != 4228 ) {
            cout << "out[" << i << "] != 4228: " << out[i] << endl;
            exit(-1);
        }
    }
    delete outwrapper;
    delete kernel;
    delete[] out;
    delete cl;
}
void CopyBuffer::copy( EasyCL *cl, CLWrapper *sourceWrapper, int *target ) {
    // copy the source into a second buffer, so we can read it out
    int bufferSize = sourceWrapper->size();
    CLWrapper *targetWrapper = cl->wrap( bufferSize, target );
    targetWrapper->createOnDevice();

    // now copy it, via a kernel
    const string kernelSource = "\n"
        "kernel void copy( int N, global int const *source, global int *dest ) {\n"
        "    #define globalId ( get_global_id(0) )\n"
        "    if( (int)globalId < N ) {\n"
        "        dest[globalId] = source[globalId];\n"
        "    }\n"
        "}\n";
    CLKernel *kernel = cl->buildKernelFromString( kernelSource, "copy", "" );
    kernel->in( bufferSize )->in( sourceWrapper )->out( targetWrapper );
    int workgroupSize = 32;
    int numWorkgroups = ( bufferSize + workgroupSize - 1 ) / workgroupSize;
    kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize );
    cl->finish();

    targetWrapper->copyToHost();
    delete targetWrapper;
    delete kernel;
}
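// A minimal usage sketch for CopyBuffer::copy above, assuming copy is callable
// as a static member and an EasyCL context like the ones created in the tests
// below; the function name demoCopyBuffer is hypothetical, for illustration only:
void demoCopyBuffer() {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    int source[4] = {1, 2, 3, 4};
    CLWrapper *sourceWrapper = cl->wrap( 4, source );
    sourceWrapper->copyToDevice();

    int target[4];
    CopyBuffer::copy( cl, sourceWrapper, target ); // target now holds {1, 2, 3, 4}

    delete sourceWrapper;
    delete cl;
}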
void forwardWithWipe( Forward *prop, int batchSize, LayerDimensions dim, float *inputData, float *filters, float *biases, float *output ) {
    int inputDataSize = batchSize * dim.inputCubeSize;
    CLWrapper *dataWrapper = prop->cl->wrap( inputDataSize, inputData );
    dataWrapper->copyToDevice();

    int weightsSize = dim.filtersSize;
    CLWrapper *weightsWrapper = prop->cl->wrap( weightsSize, filters );
    weightsWrapper->copyToDevice();

    CLWrapper *biasWrapper = 0;
    if( dim.biased ) {
        biasWrapper = prop->cl->wrap( dim.numFilters, biases );
        biasWrapper->copyToDevice();
    }

    CLWrapper *outputWrapper = prop->cl->wrap( batchSize * dim.outputCubeSize, output );
    memset( output, 99, sizeof(float) * batchSize * dim.outputCubeSize );
    outputWrapper->copyToDevice(); // so we can wipe it...

    StatefulTimer::timeCheck("testforward: after data wrapper processing");
    prop->forward( batchSize, dataWrapper, weightsWrapper, biasWrapper, outputWrapper );
//    StatefulTimer::timeCheck("Forward::forward after call forward");
    outputWrapper->copyToHost();
//    StatefulTimer::timeCheck("Forward::forward after copytohost");

    delete outputWrapper;
    delete dataWrapper;
    delete weightsWrapper;
    if( dim.biased ) {
        delete biasWrapper;
    }
}
void propagateWithWipe( Propagate *prop, int batchSize, LayerDimensions dim, float *inputData, float *filters, float *biases, float *results ) {
    int inputDataSize = batchSize * dim.inputCubeSize;
    CLWrapper *dataWrapper = prop->cl->wrap( inputDataSize, inputData );
    dataWrapper->copyToDevice();

    int weightsSize = dim.filtersSize;
    CLWrapper *weightsWrapper = prop->cl->wrap( weightsSize, filters );
    weightsWrapper->copyToDevice();

    CLWrapper *biasWeightsWrapper = 0;
    if( dim.biased ) {
        biasWeightsWrapper = prop->cl->wrap( dim.numFilters, biases );
        biasWeightsWrapper->copyToDevice();
    }

    CLWrapper *resultsWrapper = prop->cl->wrap( batchSize * dim.outputCubeSize, results );
    memset( results, 99, sizeof(float) * batchSize * dim.outputCubeSize );
    resultsWrapper->copyToDevice(); // so we can wipe it...

    StatefulTimer::timeCheck("testpropagate: after data wrapper processing");
    prop->propagate( batchSize, dataWrapper, weightsWrapper, biasWeightsWrapper, resultsWrapper );
//    StatefulTimer::timeCheck("Propagate::propagate after call propagate");
    resultsWrapper->copyToHost();
//    StatefulTimer::timeCheck("Propagate::propagate after copytohost");

    delete resultsWrapper;
    delete dataWrapper;
    delete weightsWrapper;
    if( dim.biased ) {
        delete biasWeightsWrapper;
    }
}
TEST(testfloatwrapperconst, main) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", "");
    float in[5];
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3;
    }
    float out[5];
    CLWrapper *inwrapper = cl->wrap(5, (float const *)in);
    CLWrapper *outwrapper = cl->wrap(5, out);
    inwrapper->copyToDevice();
    kernel->input(inwrapper);
    kernel->output(outwrapper);
    kernel->run_1d(5, 5);
    outwrapper->copyToHost();
    assertEquals(out[0], 7);
    assertEquals(out[1], 10);
    assertEquals(out[2], 13);
    assertEquals(out[3], 16);
    assertEquals(out[4], 19);
    cout << "tests completed ok" << endl;

    delete inwrapper;
    delete outwrapper;
    delete kernel;
    delete cl;
}
TEST( testpoolingforward, fromwrappers ) {
    int batchSize = 1;
    int numPlanes = 1;
    int imageSize = 4;
    int poolingSize = 2;
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    PoolingForward *poolingForward = PoolingForward::instanceSpecific( 1, cl, false, numPlanes, imageSize, poolingSize );
    float input[] = {  1,     2,     5,    3,
                       3,     8,     4,    1,
                       3,    33,    14,   23,
                      -1, -3.5f, 37.4f,    5 };
    int outputSize = poolingForward->getOutputSize( batchSize );
    int *selectors = new int[outputSize];
    float *output = new float[outputSize];

    const int inputSize = batchSize * numPlanes * imageSize * imageSize;
    CLWrapper *inputWrapper = cl->wrap( inputSize, input );
    CLWrapper *selectorsWrapper = cl->wrap( outputSize, selectors );
    CLWrapper *outputWrapper = cl->wrap( outputSize, output );
    inputWrapper->copyToDevice();
    poolingForward->forward( batchSize, inputWrapper, selectorsWrapper, outputWrapper );
    selectorsWrapper->copyToHost();
    outputWrapper->copyToHost();

    // 2x2 max pooling over the 4x4 input: each selector is the index of the
    // maximum within its 2x2 window, each output is the maximum value itself
    EXPECT_EQ( selectors[0], 3 );
    EXPECT_EQ( selectors[1], 0 );
    EXPECT_EQ( selectors[2], 1 );
    EXPECT_EQ( selectors[3], 2 );
    EXPECT_EQ( output[0], 8 );
    EXPECT_EQ( output[1], 5 );
    EXPECT_EQ( output[2], 33 );
    EXPECT_EQ( output[3], 37.4f );

    delete inputWrapper;
    delete selectorsWrapper;
    delete outputWrapper;
    delete poolingForward;
    delete[] selectors;
    delete[] output;
    delete cl;
}
TEST(testcopybuffer, withoffset) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    float in[10];
    float in2[10];
    for(int i = 0; i < 10; i++) {
        in[i] = i * 3;
        in2[i] = 23 + i;
    }
    float out[10];
    CLWrapper *inwrapper = cl->wrap(10, in);
    CLWrapper *in2wrapper = cl->wrap(10, in2);
    CLWrapper *outwrapper = cl->wrap(10, out);
    inwrapper->copyToDevice();
    in2wrapper->copyToDevice();
    EXPECT_FALSE(in2wrapper->isDeviceDirty());
    // copy 4 elements, from offset 2 in `in`, to offset 5 in `in2`
    inwrapper->copyTo(in2wrapper, 2, 5, 4);
    EXPECT_TRUE(in2wrapper->isDeviceDirty());

    // check that in2 host-side is unchanged:
    for(int i = 0; i < 10; i++) {
        EXPECT_EQ(23 + i, in2[i]);
    }

    in2wrapper->copyToHost();
    // check that in2 is now a partial copy of in:
    for(int i = 0; i < 10; i++) {
        if(i >= 5 && i < 9) {
            EXPECT_EQ((i - 3) * 3, in2[i]);
        } else {
            EXPECT_EQ(23 + i, in2[i]);
        }
    }

    // check that modifying in2 doesn't modify in:
    in2[1] = 27;
    in2wrapper->copyToDevice();
    inwrapper->copyToHost();
    EXPECT_EQ(1 * 3, in[1]);
    in2wrapper->copyToHost();
    EXPECT_EQ(1 * 3, in[1]);
    EXPECT_EQ(27, in2[1]);

    delete inwrapper;
    delete in2wrapper;
    delete outwrapper;
    delete cl;
}
VIRTUAL void BackpropWeights2::backpropWeights( int batchSize, float learningRate, float *derivLossBySum, float *inputData, float *filters, float *biasWeights ) {
    StatefulTimer::timeCheck("BackpropWeights2::backprop begin");
//    const float learningMultiplier = learningRate / batchSize / sqrt( dim.outputImageSize * dim.outputImageSize );

    int resultsSize = batchSize * dim.outputCubeSize;
    CLWrapper *derivLossBySumWrapper = cl->wrap( resultsSize, derivLossBySum );
    derivLossBySumWrapper->copyToDevice();

    int inputSize = batchSize * dim.inputCubeSize;
    CLWrapper *inputDataWrapper = cl->wrap( inputSize, inputData );
    inputDataWrapper->copyToDevice();

    int weightsSize = debug ? std::max( 10000, dim.filtersSize ) : dim.filtersSize;
    CLWrapper *weightsWrapper = cl->wrap( weightsSize, filters );
    weightsWrapper->copyToDevice();
//    cout << "backpropweights2::backpropweights resultsSize=" << resultsSize << " inputSize=" << inputSize <<
//        " weightSize=" << weightsSize << endl;

    CLWrapper *biasWeightsWrapper = 0;
    if( dim.biased ) {
        biasWeightsWrapper = cl->wrap( dim.numFilters, biasWeights );
        biasWeightsWrapper->copyToDevice();
    }

    StatefulTimer::timeCheck("BackpropWeights2::backprop after copied to device");
    backpropWeights( batchSize, learningRate, derivLossBySumWrapper, inputDataWrapper, weightsWrapper, biasWeightsWrapper );
    StatefulTimer::timeCheck("BackpropWeights2::backprop after call backprop");
    weightsWrapper->copyToHost();
    if( dim.biased ) {
        biasWeightsWrapper->copyToHost();
    }
    StatefulTimer::timeCheck("BackpropWeights2::backprop after copytohost");

    delete derivLossBySumWrapper;
    delete inputDataWrapper;
    delete weightsWrapper;
    if( dim.biased ) {
        delete biasWeightsWrapper;
    }
}
TEST(testcopybuffer, main) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    float in[5];
    float in2[5];
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3.0f;
        in2[i] = 23.0f + i;
    }
    float out[5];
    CLWrapper *inwrapper = cl->wrap(5, in);
    CLWrapper *in2wrapper = cl->wrap(5, in2);
    CLWrapper *outwrapper = cl->wrap(5, out);
    inwrapper->copyToDevice();
    in2wrapper->copyToDevice();
    EXPECT_FALSE(in2wrapper->isDeviceDirty());
    inwrapper->copyTo(in2wrapper);
    EXPECT_TRUE(in2wrapper->isDeviceDirty());

    // check that in2 host-side is unchanged:
    for(int i = 0; i < 5; i++) {
        EXPECT_EQ(23.0f + i, in2[i]);
    }

    in2wrapper->copyToHost();
    // check that in2 is now a copy of in:
    for(int i = 0; i < 5; i++) {
        EXPECT_EQ(i * 3.0f, in2[i]);
    }

    // check that modifying in2 doesn't modify in:
    in2[1] = 27;
    in2wrapper->copyToDevice();
    inwrapper->copyToHost();
    EXPECT_EQ(1 * 3.0f, in[1]);
    in2wrapper->copyToHost();
    EXPECT_EQ(1 * 3.0f, in[1]);
    EXPECT_EQ(27.0f, in2[1]);

    delete inwrapper;
    delete in2wrapper;
    delete outwrapper;
    delete cl;
}
VIRTUAL void PoolingBackprop::backpropErrors( int batchSize, float *errors, int *selectors, float *errorsForUpstream ) {
//    cout << "PoolingBackprop::backpropErrors( float * )" << endl;
    StatefulTimer::instance()->timeCheck("PoolingBackprop::backpropErrors float->wrapper start");
    CLWrapper *errorsWrapper = cl->wrap( getResultsSize(batchSize), errors );
    CLWrapper *selectorsWrapper = cl->wrap( getResultsSize(batchSize), selectors );
    CLWrapper *errorsForUpstreamWrapper = cl->wrap( getInputSize(batchSize), errorsForUpstream );

    errorsWrapper->copyToDevice();
    selectorsWrapper->copyToDevice();
    backpropErrors( batchSize, errorsWrapper, selectorsWrapper, errorsForUpstreamWrapper );
    selectorsWrapper->copyToHost();
    errorsForUpstreamWrapper->copyToHost();

    delete errorsWrapper;
    delete selectorsWrapper;
    delete errorsForUpstreamWrapper;
    StatefulTimer::instance()->timeCheck("PoolingBackprop::backpropErrors float->wrapper end");
}
VIRTUAL void ActivationPropagate::propagate( int batchSize, float *input, float *output ) {
//    cout << "ActivationPropagate::propagate( float * )" << endl;
    CLWrapper *inputWrapper = cl->wrap( getInputSize( batchSize ), input );
    CLWrapper *outputWrapper = cl->wrap( getResultsSize( batchSize ), output );

    inputWrapper->copyToDevice();
    propagate( batchSize, inputWrapper, outputWrapper );
    outputWrapper->copyToHost();

    delete outputWrapper;
    delete inputWrapper;
}
TEST(SLOW_testcopybuffer, larger) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    const int bufferSize = 100 * 1024 * 1024 / 4; // 100MB of floats
    float *in = new float[bufferSize];
    float *in2 = new float[bufferSize];
    for(int i = 0; i < bufferSize; i++) {
        in[i] = i * 3;
        in2[i] = 23 + i;
    }
    CLWrapper *inwrapper = cl->wrap(bufferSize, in);
    CLWrapper *in2wrapper = cl->wrap(bufferSize, in2);
    inwrapper->copyToDevice();
    in2wrapper->copyToDevice();
    inwrapper->copyTo(in2wrapper);

    // check that in2 host-side is unchanged:
    for(int i = 0; i < bufferSize; i++) {
        EXPECT_EQ(23 + i, in2[i]);
    }

    in2wrapper->copyToHost();
    // check that in2 is now a copy of in:
    for(int i = 0; i < bufferSize; i++) {
        EXPECT_EQ(i * 3, in2[i]);
    }

    // check that modifying in2 doesn't modify in:
    in2[1] = 27;
    in2wrapper->copyToDevice();
    inwrapper->copyToHost();
    EXPECT_EQ(1 * 3, in[1]);
    in2wrapper->copyToHost();
    EXPECT_EQ(1 * 3, in[1]);
    EXPECT_EQ(27, in2[1]);

    delete inwrapper;
    delete in2wrapper;
    delete[] in;
    delete[] in2;
    delete cl;
}
VIRTUAL void ConvolutionalLayer::backProp( float learningRate ) {
//    Timer timer;
    StatefulTimer::instance()->timeCheck("backprop(): start, layer " + toString( layerIndex ) );

    CLWrapper *biasWeightsWrapper = 0;
    if( dim.biased ) {
        biasWeightsWrapper = cl->wrap( getBiasWeightsSize(), biasWeights );
        biasWeightsWrapper->copyToDevice();
    }

    CLWrapper *imagesWrapper = 0;
    if( previousLayer->hasResultsWrapper() ) {
        imagesWrapper = previousLayer->getResultsWrapper();
    } else {
        imagesWrapper = cl->wrap( previousLayer->getResultsSize(), previousLayer->getResults() );
        imagesWrapper->copyToDevice();
    }

    CLWrapper *errorsWrapper = 0;
    bool weOwnErrorsWrapper = false;
    if( nextLayer->providesErrorsForUpstreamWrapper() ) {
        errorsWrapper = nextLayer->getErrorsForUpstreamWrapper();
    } else {
        errorsWrapper = cl->wrap( getResultsSize(), nextLayer->getErrorsForUpstream() );
        errorsWrapper->copyToDevice();
//        int resultsSize = getResultsSize();
//        for( int i = 0; i < resultsSize; i++ ) {
//            cout << "convolutional::backproperrors errorsfromupstream[" << i << "]=" << nextLayer->getErrorsForUpstream()[i] << endl;
//        }
        weOwnErrorsWrapper = true;
    }

    if( previousLayer->needsBackProp() ) {
        backpropErrorsImpl->backpropErrors( batchSize, imagesWrapper, errorsWrapper, weightsWrapper, errorsForUpstreamWrapper );
        StatefulTimer::instance()->timeCheck("backproperrors(): calced errors for upstream, layer " + ::toString( layerIndex ) );
    }

    backpropWeightsImpl->backpropWeights( batchSize, learningRate, errorsWrapper, imagesWrapper, weightsWrapper, biasWeightsWrapper );
    weightsCopiedToHost = false;
    StatefulTimer::instance()->timeCheck("backproperrors(): done weight backprop, layer " + ::toString( layerIndex ) );

    if( dim.biased ) {
        biasWeightsWrapper->copyToHost();
        delete biasWeightsWrapper;
    }
    if( !previousLayer->hasResultsWrapper() ) {
        delete imagesWrapper;
    }
    if( weOwnErrorsWrapper ) {
        delete errorsWrapper;
    }
    StatefulTimer::instance()->timeCheck("backproperrors(): updated weights, layer " + ::toString( layerIndex ) );
}
TEST(testClBlas, colMajor2) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();

    float A[] = { 1,  3,
                  2,  7,
                  9,  5,
                  0, -2 };
    float B[] = {  3, 2, 8,
                  -1, 0, 4 };
    float C[4 * 3];
    // convert A and B to column-major storage
    transpose(A, 4, 2);
    transpose(B, 2, 3);

    ClBlasInstance clblasInstance;
    CLWrapper *AWrap = cl->wrap(4 * 2, A);
    CLWrapper *BWrap = cl->wrap(2 * 3, B);
    CLWrapper *CWrap = cl->wrap(4 * 3, C);
    AWrap->copyToDevice();
    BWrap->copyToDevice();
    ClBlasHelper::Gemm(
        cl, clblasColumnMajor, clblasNoTrans, clblasNoTrans,
        4, 2, 3,
        1,
        AWrap, 0,
        BWrap, 0,
        0,
        CWrap, 0
    );
    CWrap->copyToHost();
    // convert C back to row-major storage
    transpose(C, 3, 4);

    EXPECT_EQ(1*3 + 3*(-1), C[0]);
    EXPECT_EQ(1*2 + 3*0, C[1]);
    EXPECT_EQ(1*8 + 3*4, C[2]);
    EXPECT_EQ(-8, C[11]);

    delete CWrap;
    delete BWrap;
    delete AWrap;
    delete cl;
}
TEST(testClBlas, colMajorTransB) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();

    float A[] = { 1, 3,
                  2, 7,
                  9, 5 };
    float B[] = { 3, -1 };
    float C[3];
    // convert A to column-major storage; B is left as-is, since the
    // clblasTrans flag below handles its orientation
    transpose(A, 3, 2);

    ClBlasInstance clblasInstance;
    CLWrapper *AWrap = cl->wrap(6, A);
    CLWrapper *BWrap = cl->wrap(2, B);
    CLWrapper *CWrap = cl->wrap(3, C);
    AWrap->copyToDevice();
    BWrap->copyToDevice();
    ClBlasHelper::Gemm(
        cl, clblasColumnMajor, clblasNoTrans, clblasTrans,
        3, 2, 1,
        1,
        AWrap, 0,
        BWrap, 0,
        0,
        CWrap, 0
    );
    CWrap->copyToHost();
    transpose(C, 1, 3);

    EXPECT_EQ(0, C[0]);
    EXPECT_EQ(-1, C[1]);
    EXPECT_EQ(22, C[2]);

    delete CWrap;
    delete BWrap;
    delete AWrap;
    delete cl;
}
TEST( testMemset, basic ) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();
    CLKernel *kMemset = 0;
    // [[[cog
    // import stringify
    // stringify.write_kernel2( "kMemset", "cl/memset.cl", "cl_memset", '""' )
    // ]]]
    // generated using cog, from cl/memset.cl:
    const char * kMemsetSource =
    "// Copyright Hugh Perkins 2015 hughperkins at gmail\n"
    "//\n"
    "// This Source Code Form is subject to the terms of the Mozilla Public License,\n"
    "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n"
    "// obtain one at http://mozilla.org/MPL/2.0/.\n"
    "\n"
    "kernel void cl_memset(global float *target, const float value, const int N) {\n"
    "    #define globalId get_global_id(0)\n"
    "    if ((int)globalId < N) {\n"
    "        target[globalId] = value;\n"
    "    }\n"
    "}\n"
    "\n"
    "";
    kMemset = cl->buildKernelFromString(kMemsetSource, "cl_memset", "", "cl/memset.cl");
    // [[[end]]]

    int N = 10000;
    float *myArray = new float[N];
    CLWrapper *myArrayWrapper = cl->wrap( N, myArray );
    myArrayWrapper->createOnDevice();
    kMemset->out( myArrayWrapper )->in( 99.0f )->in( N );
    int workgroupSize = 64;
    kMemset->run_1d( ( N + workgroupSize - 1 ) / workgroupSize * workgroupSize, workgroupSize );
    cl->finish();
    myArrayWrapper->copyToHost();
    for( int i = 0; i < N; i++ ) {
        EXPECT_EQ( 99.0f, myArray[i] );
    }

    delete myArrayWrapper;
    delete[] myArray;
    delete kMemset;
    delete cl;
}
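// The kernels above all round the global work size up to a whole number of
// workgroups with the expression ( N + workgroupSize - 1 ) / workgroupSize * workgroupSize.
// A minimal sketch of that arithmetic as a standalone helper; the name
// roundUpToMultiple is hypothetical, not part of EasyCL:
static int roundUpToMultiple( int n, int multiple ) {
    // integer division truncates, so adding (multiple - 1) first rounds up
    return ( n + multiple - 1 ) / multiple * multiple;
}
// e.g. N = 10000, workgroupSize = 64: (10000 + 63) / 64 = 157 workgroups, so the
// kernel launches with a global size of 157 * 64 = 10048 work-items; the
// (int)globalId < N guard inside the kernel masks off the extra 48.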
TEST(testClBlas, basic) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();

    float A[] = { 1, 3,
                  2, 7,
                  9, 5 };
    float B[] = { 3, -1 };
    float C[3];

    ClBlasInstance clblasInstance;
    CLWrapper *AWrap = cl->wrap(6, A);
    CLWrapper *BWrap = cl->wrap(2, B);
    CLWrapper *CWrap = cl->wrap(3, C);
    AWrap->copyToDevice();
    BWrap->copyToDevice();
    CWrap->createOnDevice();
    ClBlasHelper::Gemm(
        cl, clblasRowMajor, clblasNoTrans, clblasNoTrans,
        3, 2, 1,
        1,
        AWrap, 0,
        BWrap, 0,
        0,
        CWrap, 0
    );
    cl->finish();
    CWrap->copyToHost();

    EXPECT_EQ(0, C[0]);
    EXPECT_EQ(-1, C[1]);
    EXPECT_EQ(22, C[2]);

    cl->finish();
    delete CWrap;
    delete BWrap;
    delete AWrap;
    cl->finish();
    delete cl;
    clblasTeardown();
}
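// For reference, the expected values in the basic test above follow from a plain
// row-major GEMM computed on the host. A minimal sketch under that assumption
// (referenceGemmRowMajor is a hypothetical helper, not part of the test suite),
// computing C = A * B for M=3, K=2, N=1:
void referenceGemmRowMajor( int M, int K, int N, const float *A, const float *B, float *C ) {
    for( int m = 0; m < M; m++ ) {
        for( int n = 0; n < N; n++ ) {
            float sum = 0;
            for( int k = 0; k < K; k++ ) {
                sum += A[m * K + k] * B[k * N + n];
            }
            C[m * N + n] = sum;
        }
    }
}
// With A = {1,3, 2,7, 9,5} and B = {3,-1}:
//     C[0] = 1*3 + 3*(-1) = 0
//     C[1] = 2*3 + 7*(-1) = -1
//     C[2] = 9*3 + 5*(-1) = 22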
// must allocate output yourself before the call
VIRTUAL void Forward::forward( int batchSize, float *inputData, float *filters, float *biases, float *output ) {
    StatefulTimer::timeCheck("Forward::forward begin");
    int inputDataSize = batchSize * dim.inputCubeSize;
    CLWrapper *dataWrapper = cl->wrap( inputDataSize, inputData );
    dataWrapper->copyToDevice();

    int weightsSize = dim.filtersSize;
    CLWrapper *weightsWrapper = cl->wrap( weightsSize, filters );
    weightsWrapper->copyToDevice();

    CLWrapper *biasWrapper = 0;
    if( dim.biased ) {
        int biasWrapperSize = dim.numFilters;
        biasWrapper = cl->wrap( biasWrapperSize, biases );
        biasWrapper->copyToDevice();
    }

//    int outputDataSize = batchSize * dim.outputCubeSize;
//    cout << " batchsize " << batchSize << " " << dim << endl;
//    int allocatedOutputSize = std::max(5000, outputDataSize );
//    int allocatedOutputSize = outputDataSize;
//    float *output = new float[allocatedOutputSize];
    CLWrapper *outputWrapper = cl->wrap( batchSize * dim.outputCubeSize, output );

    cl->finish();
    StatefulTimer::timeCheck("Forward::forward after copied to device");
    forward( batchSize, dataWrapper, weightsWrapper, biasWrapper, outputWrapper );
    StatefulTimer::timeCheck("Forward::forward after call forward");
    outputWrapper->copyToHost();
    StatefulTimer::timeCheck("Forward::forward after copytohost");
//    for( int i = 0; i < 20; i++ ) {
//        cout << "output[" << i << "]=" << output[i] << endl;
//    }

    delete outputWrapper;
    delete dataWrapper;
    delete weightsWrapper;
    if( dim.biased ) {
        delete biasWrapper;
    }
}
void compareSpecific( CompareSpecificArgs args ) {
    cout << "instance0: " << args._instance0 << endl;
    cout << "instance1: " << args._instance1 << endl;

    int batchSize = args._batchSize;
    int numPlanes = args._numPlanes;
    int imageSize = args._imageSize;
    int poolingSize = args._poolingSize;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    PoolingForward *poolingForward0 = PoolingForward::instanceSpecific( args._instance0, cl, args._padZeros, numPlanes, imageSize, poolingSize );
    PoolingForward *poolingForward1 = PoolingForward::instanceSpecific( args._instance1, cl, args._padZeros, numPlanes, imageSize, poolingSize );

    const int inputSize = batchSize * numPlanes * imageSize * imageSize;
    int outputSize = poolingForward0->getOutputSize( batchSize );
    float *input = new float[ inputSize ];
    int *selectors = new int[ outputSize ];
    float *output = new float[ outputSize ];

    CLWrapper *inputWrapper = cl->wrap( inputSize, input );
    CLWrapper *selectorsWrapper = cl->wrap( outputSize, selectors );
    CLWrapper *outputWrapper = cl->wrap( outputSize, output );

    WeightRandomizer::randomize( input, inputSize, -0.1f, 0.1f );

    // run the first instance
    memset( selectors, 99, sizeof(int) * outputSize );
    memset( output, 99, sizeof(float) * outputSize );
    inputWrapper->copyToDevice();
    selectorsWrapper->copyToDevice();
    outputWrapper->copyToDevice();
    poolingForward0->forward( batchSize, inputWrapper, selectorsWrapper, outputWrapper );
    selectorsWrapper->copyToHost();
    outputWrapper->copyToHost();

    // save the first instance's results...
    int *selectors0 = new int[ outputSize ];
    float *output0 = new float[ outputSize ];
    memcpy( selectors0, selectors, sizeof(int) * outputSize );
    memcpy( output0, output, sizeof(float) * outputSize );

    // ... then wipe the buffers and run the second instance
    memset( selectors, 99, sizeof(int) * outputSize );
    memset( output, 99, sizeof(float) * outputSize );
    inputWrapper->copyToDevice();
    selectorsWrapper->copyToDevice();
    outputWrapper->copyToDevice();
    poolingForward1->forward( batchSize, inputWrapper, selectorsWrapper, outputWrapper );
    selectorsWrapper->copyToHost();
    outputWrapper->copyToHost();

    // compare the two runs
    int numErrors = 0;
    for( int i = 0; i < outputSize; i++ ) {
        if( selectors[i] != selectors0[i] ) {
            cout << "ERROR: selectors[" << i << "] instance0:" << selectors0[i] << " != instance1:" << selectors[i] << endl;
            numErrors++;
        }
        if( output[i] != output0[i] ) {
            cout << "ERROR: output[" << i << "] instance0:" << output0[i] << " != instance1:" << output[i] << endl;
            numErrors++;
        }
        if( numErrors >= 10 ) {
            cout << "More than 10 errors. Skipping the rest :-)" << endl;
            break;
        }
    }
    EXPECT_EQ( 0, numErrors );
    if( numErrors > 0 ) {
        int num2dPlanes = inputSize / imageSize / imageSize;
        for( int plane = 0; plane < num2dPlanes; plane++ ) {
            cout << "2dplane " << plane << ":" << endl;
            for( int i = 0; i < imageSize; i++ ) {
                string line = "";
                for( int j = 0; j < imageSize; j++ ) {
                    line += toString( input[ plane * imageSize * imageSize + i * imageSize + j ] ) + " ";
                }
                cout << line << endl;
            }
            cout << endl;
        }
    }

    delete inputWrapper;
    delete selectorsWrapper;
    delete outputWrapper;
    delete poolingForward0;
    delete poolingForward1;
    delete[] selectors0;
    delete[] output0;
    delete[] selectors;
    delete[] output;
    delete[] input;
    delete cl;
}
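// For orientation, the _-prefixed fields read in compareSpecific above suggest
// CompareSpecificArgs is a fluent argument builder. A plausible invocation
// sketch (the instance() factory and setter names are assumptions, for
// illustration only):
//
//     compareSpecific( CompareSpecificArgs::instance()
//         .batchSize( 10 ).numPlanes( 5 ).imageSize( 10 ).poolingSize( 2 )
//         .instance0( 0 ).instance1( 1 ) );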
float THClBlas_dot(THClState *state, long n,
        CLWrapper *xwrapper, long xoffset, long incx,
        CLWrapper *ywrapper, long yoffset, long incy) {
    StatefulTimer::timeCheck("THClBlas_dot START");
    if(n == 1) {
        incx = 1;
        incy = 1;
    }
    if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) {
        int i_n = (int)n;
        int i_incx = (int)incx;
        int i_incy = (int)incy;
        float result;
        cl_int err;

        EasyCL *cl = ywrapper->getCl();
        CLWrapper *resultWrapper = cl->wrap( 1, &result );
        float *scratch = new float[i_n];
        CLWrapper *scratchWrapper = cl->wrap(i_n, scratch);
        scratchWrapper->createOnDevice();
        resultWrapper->createOnDevice();

        cl_event *event = 0;
//        if(state->addFinish) {
            event = new cl_event();
//        }
        err = clblasSdot( i_n,
            resultWrapper->getBuffer(), 0,
            xwrapper->getBuffer(), xoffset, i_incx,
            ywrapper->getBuffer(), yoffset, i_incy,
            scratchWrapper->getBuffer(),
            1, cl->queue, 0, NULL, event);
        if (err != CL_SUCCESS) {
            THError("clblasSdot() failed with %d", err);
        } else {
//            if(state->addFinish) {
                err = clWaitForEvents(1, event);
                if (err != CL_SUCCESS) {
                    THError("clblasSdot: wait for event failed with %d", err);
                }
                clReleaseEvent(*event);
                delete event;
//            }
        }
        // TODO: do NOT copy to host here; use a point tensor instead
        resultWrapper->copyToHost();

        delete resultWrapper;
        delete scratchWrapper;
        delete[] scratch;
        StatefulTimer::timeCheck("THClBlas_dot END");
        return result;
    }
    THError("Cublas_dot only supports n, incx and incy "
        "up to signed integer limits: %d", INT_MAX);
    return -1;
}
float THClBlas_dot(THClState *state, long n,
        CLWrapper *xwrapper, long xoffset, long incx,
        CLWrapper *ywrapper, long yoffset, long incy) {
    if(n == 1) {
        incx = 1;
        incy = 1;
    }
    if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) {
        int i_n = (int)n;
        int i_incx = (int)incx;
        int i_incy = (int)incy;
        float result;
        cl_int err;

        err = clblasSetup();
        if (err != CL_SUCCESS) {
            THError("clblasSetup() failed with %d", err);
        }
        CLWrapper *resultWrapper = state->cl->wrap( 1, &result );
        float *scratch = new float[i_n];
        CLWrapper *scratchWrapper = state->cl->wrap(i_n, scratch);
        scratchWrapper->createOnDevice();
        resultWrapper->createOnDevice();

        cl_event event = NULL;
        err = clblasSdot( i_n,
            resultWrapper->getBuffer(), 0,
            xwrapper->getBuffer(), xoffset, i_incx,
            ywrapper->getBuffer(), yoffset, i_incy,
            scratchWrapper->getBuffer(),
            1, state->cl->queue, 0, NULL, &event);
        if (err != CL_SUCCESS) {
            THError("clblasSdot() failed with %d", err);
        } else {
            /* Wait for calculations to be finished. */
            err = clWaitForEvents(1, &event);
        }
        resultWrapper->copyToHost();

        /* Finalize work with clblas. */
        clblasTeardown();

        delete resultWrapper;
        delete scratchWrapper;
        delete[] scratch;
        return result;
    }
    THError("Cublas_dot only supports n, incx and incy "
        "up to signed integer limits: %d", INT_MAX);
    return -1;
}