TEST(testcopybuffer, throwsifnotondevice) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
//    CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test");
    const int bufferSize = 100 * 1024 / 4;
    float *in = new float[bufferSize];
    float *in2 = new float[bufferSize];
    for(int i = 0; i < bufferSize; i++) {
        in[i] = i * 3;
        in2[i] = 23 + i;
    }
    CLWrapper *inwrapper = cl->wrap(bufferSize, in);
    CLWrapper *in2wrapper = cl->wrap(bufferSize, in2);
    inwrapper->copyToDevice();
//    in2wrapper->copyToDevice();  // deliberately not copied to device: copyTo should throw
    bool threw = false;
    try {
        inwrapper->copyTo(in2wrapper);
    } catch(runtime_error &e) {
        threw = true;
    }
    EXPECT_TRUE(threw);
    delete in2wrapper;
    delete inwrapper;
    delete[] in2;
    delete[] in;
    delete cl;
}
void CopyBuffer::copy( EasyCL *cl, CLWrapper *sourceWrapper, int *target ) {
    // first we will copy it to another buffer, so we can copy it out
    int bufferSize = sourceWrapper->size();
//    float *copiedBuffer = new float[ bufferSize ];
    CLWrapper *targetWrapper = cl->wrap( bufferSize, target );
    targetWrapper->createOnDevice();
    // now copy it, via a kernel
    const string kernelSource = "\n"
        "kernel void copy( int N, global int const *source, global int *dest ) {\n"
        "    #define globalId ( get_global_id(0) )\n"
        "    if( (int)globalId < N ) {\n"
        "        dest[globalId] = source[globalId];\n"
        "    }\n"
        "}\n";
    CLKernel *kernel = cl->buildKernelFromString( kernelSource, "copy", "" );
    kernel->in( bufferSize )->in( sourceWrapper )->out( targetWrapper );
    int workgroupSize = 32;
    int numWorkgroups = ( bufferSize + workgroupSize - 1 ) / workgroupSize;
    kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize );
    cl->finish();
    targetWrapper->copyToHost();
    delete targetWrapper;
    delete kernel;
//    delete[] copiedBuffer;
}
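// A hedged usage sketch for CopyBuffer::copy (this mirrors the testCopyBuffer
// tests further below; `N`, `a` and `b` are illustrative names, not library
// symbols):
//
//     int *a = new int[N];                 // fill a...
//     CLWrapper *aWrapper = cl->wrap(N, a);
//     aWrapper->copyToDevice();
//     int *b = new int[N];
//     CopyBuffer::copy(cl, aWrapper, b);   // b now holds the device-side contents of aWrapper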
TEST( SLOW_testintwrapper_huge, testreadwrite ) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test_stress");
    const int N = 1000000;
    int *in = new int[N];
    for( int i = 0; i < N; i++ ) {
        in[i] = i * 3;
    }
    int *out = new int[N];
    CLWrapper *inwrapper = cl->wrap(N, in);
    CLWrapper *outwrapper = cl->wrap(N, out);
    inwrapper->copyToDevice();
    outwrapper->createOnDevice();
    kernel->input( inwrapper );
    kernel->output( outwrapper );
    int globalSize = N;
    int workgroupsize = cl->getMaxWorkgroupSize();
    // round globalSize up to the next multiple of workgroupsize
    globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize;
    cout << "globalsize: " << globalSize << " workgroupsize " << workgroupsize << endl;
    kernel->run_1d( globalSize, workgroupsize );
    outwrapper->copyToHost();
    for( int i = 0; i < N; i++ ) {
        if( out[i] != 689514 ) {
            cout << "out[" << i << "] != 689514: " << out[i] << endl;
            exit(-1);
        }
    }
    delete outwrapper;
    delete inwrapper;
    delete[] out;
    delete[] in;
    delete kernel;
    delete cl;
}
THClStorage* THClStorage_newWithSize(THClState *state, int device, long size)
{
  THArgCheck(size >= 0, 2, "invalid size");
  if(size > 0) {
    StatefulTimer::timeCheck("THClStorage_newWithSize START");
    THClStorage *storage = (THClStorage*)THAlloc(sizeof(THClStorage));
    float *data = new float[size];
    storage->device = device;
    storage->cl = THClState_getClv2(state, storage->device);
    CLWrapper *wrapper = storage->cl->wrap( size, data );
    if(state->trace) cout << "new wrapper, size " << size << endl;
    if(state->trace) cout << "wrapper->createOnDevice()" << endl;
    wrapper->createOnDevice();
    storage->data = data;
    storage->wrapper = wrapper;
    storage->size = size;
    storage->refcount = 1;
    storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM;
    StatefulTimer::timeCheck("THClStorage_newWithSize END");
    return storage;
  } else {
    return THClStorage_newv2(state, device);
  }
}
VIRTUAL void ActivationLayer::backward() {
    // have no weights to backprop to, just need to backprop the errors
//    CLWrapper *imagesWrapper = 0;
//    if( previousLayer->hasOutputWrapper() ) {
//        imagesWrapper = previousLayer->getOutputWrapper();
//    } else {
//        imagesWrapper = cl->wrap( previousLayer->getOutputSize(), previousLayer->getOutput() );
//        imagesWrapper->copyToDevice();
//    }

    CLWrapper *gradOutputWrapper = 0;
    bool weOwnGradOutputWrapper = false;
    if( nextLayer->providesGradInputWrapper() ) {
        gradOutputWrapper = nextLayer->getGradInputWrapper();
    } else {
        gradOutputWrapper = cl->wrap( getOutputSize(), nextLayer->getGradInput() );
        gradOutputWrapper->copyToDevice();
        weOwnGradOutputWrapper = true;
    }

    activationBackpropImpl->backward( batchSize, outputWrapper, gradOutputWrapper, gradInputWrapper );
//    gradInputCopiedToHost = false;

//    if( !previousLayer->hasOutputWrapper() ) {
//        delete imagesWrapper;
//    }
    if( weOwnGradOutputWrapper ) {
        delete gradOutputWrapper;
    }
}
TEST(testfloatwrapperconst, main) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", "");
    float in[5];
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3;
    }
    float out[5];
    CLWrapper *inwrapper = cl->wrap(5, (float const *)in);
    CLWrapper *outwrapper = cl->wrap(5, out);
    inwrapper->copyToDevice();
    kernel->input(inwrapper);
    kernel->output(outwrapper);
    kernel->run_1d(5, 5);
    outwrapper->copyToHost();
    assertEquals(out[0], 7);
    assertEquals(out[1], 10);
    assertEquals(out[2], 13);
    assertEquals(out[3], 16);
    assertEquals(out[4], 19);
    cout << "tests completed ok" << endl;
    delete inwrapper;
    delete outwrapper;
    delete kernel;
    delete cl;
}
VIRTUAL void Adagrad::updateWeights(CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper,
        AdagradState *trainerState) {
    int numWeights = trainerState->numWeights;
    float *working = new float[ numWeights ];
    CLWrapper *workingWrapper = cl->wrap(numWeights, working);
    workingWrapper->createOnDevice();

    CLMathWrapper clWeights(weightsWrapper);
    CLMathWrapper clGradWeights(gradWeightsWrapper);
    CLMathWrapper clSumSquares(trainerState->sumSquaresWrapper);
    CLMathWrapper clWorking(workingWrapper);

    // following all happens on gpu, via clmathwrapper:
    clWorking = clGradWeights;
    clWorking.squared();
    clSumSquares += clWorking;
    clWorking = clSumSquares;
    clWorking.sqrt();
    clWorking.inv();
    clWorking *= clGradWeights;
    clWorking *= -learningRate;
    clWeights += clWorking;

    delete workingWrapper;
    delete[] working;
}
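// For reference, a minimal host-side sketch of the Adagrad step that the
// CLMathWrapper sequence above performs on the GPU, assuming squared/sqrt/inv
// and the compound operators act elementwise, as used above;
// `adagradUpdateHost` is a hypothetical helper for illustration, not part of
// the library.
#include <cmath>

static void adagradUpdateHost(float *weights, const float *gradWeights,
        float *sumSquares, int numWeights, float learningRate) {
    for(int i = 0; i < numWeights; i++) {
        // clSumSquares += grad^2
        sumSquares[i] += gradWeights[i] * gradWeights[i];
        // clWeights += -learningRate * grad / sqrt(sumSquares)
        weights[i] -= learningRate * gradWeights[i] / std::sqrt(sumSquares[i]);
    }
}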
VIRTUAL void PoolingLayer::forward() {
    CLWrapper *upstreamOutputWrapper = 0;
    if( previousLayer->hasOutputWrapper() ) {
        upstreamOutputWrapper = previousLayer->getOutputWrapper();
    } else {
        float *upstreamOutput = previousLayer->getOutput();
        upstreamOutputWrapper = cl->wrap( previousLayer->getOutputSize(), upstreamOutput );
        upstreamOutputWrapper->copyToDevice();
    }
    poolingForwardImpl->forward( batchSize, upstreamOutputWrapper, selectorsWrapper, outputWrapper );
    if( !previousLayer->hasOutputWrapper() ) {
        delete upstreamOutputWrapper;
    }
//    cout << "PoolingLayer::forward() selectors after forward: " << endl;
//    for( int i = 0; i < outputImageSize; i++ ) {
//        for( int j = 0; j < outputImageSize; j++ ) {
//            cout << selectors[ i * outputImageSize + j ] << " ";
//        }
//        cout << endl;
//    }
//    cout << "PoolingLayer::forward() selectorsWrapper after forward: " << endl;
//    PrintBuffer::printInts( cl, selectorsWrapper, outputImageSize, outputImageSize );
}
TEST( SLOW_testintwrapper_huge, testread ) {
    Timer timer;
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test_read");
//    const int N = 4500000;
//    const int N = (4500000/512)*512;
    int N = 100000;
    int *out = new int[N];
    CLWrapper *outwrapper = cl->wrap(N, out);
    kernel->in(3)->in(7);
    kernel->output( outwrapper );
    int globalSize = N;
    int workgroupsize = cl->getMaxWorkgroupSize();
    globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize;
    cout << "globalsize: " << globalSize << " workgroupsize " << workgroupsize << endl;
    timer.timeCheck("before kernel");
    kernel->run_1d( globalSize, workgroupsize );
    timer.timeCheck("after kernel");
    outwrapper->copyToHost();
    timer.timeCheck("after copy to host");
    for( int i = 0; i < N; i++ ) {
        if( out[i] != 4228 ) {
            cout << "out[" << i << "] != 4228: " << out[i] << endl;
            exit(-1);
        }
    }
    delete outwrapper;
    delete[] out;
    delete kernel;
    delete cl;
}
TEST(testcopybuffer, withoffset) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
//    CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test");
    float in[10];
    float in2[10];
    for(int i = 0; i < 10; i++) {
        in[i] = i * 3;
        in2[i] = 23 + i;
    }
    float out[10];
    CLWrapper *inwrapper = cl->wrap(10, in);
    CLWrapper *in2wrapper = cl->wrap(10, in2);
    CLWrapper *outwrapper = cl->wrap(10, out);
    inwrapper->copyToDevice();
    in2wrapper->copyToDevice();
    EXPECT_FALSE(in2wrapper->isDeviceDirty());
    // copy 4 elements, from offset 2 in `in` to offset 5 in `in2`
    // (argument meaning inferred from the expectations below)
    inwrapper->copyTo(in2wrapper, 2, 5, 4);
    EXPECT_TRUE(in2wrapper->isDeviceDirty());
//    cl->finish();

    // check that in2 host-side unchanged:
    for(int i = 0; i < 10; i++) {
        in[i] = i * 3;
        EXPECT_EQ(23 + i, in2[i]);
    }
    in2wrapper->copyToHost();
    // check that in2 is now a partial copy of in:
    for(int i = 0; i < 10; i++) {
//        in[i] = i * 3;
        if(i >= 5 && i < 9) {
            EXPECT_EQ((i - 3) * 3, in2[i]);
        } else {
            EXPECT_EQ(23 + i, in2[i]);
        }
    }
    // check that modifying in2 doesn't modify in:
    in2[1] = 27;
    in2wrapper->copyToDevice();
    inwrapper->copyToHost();
    EXPECT_EQ(1 * 3, in[1]);
    in2wrapper->copyToHost();
    EXPECT_EQ(1 * 3, in[1]);
    EXPECT_EQ(27, in2[1]);

    delete inwrapper;
    delete in2wrapper;
    delete outwrapper;
    delete cl;
}
VIRTUAL void PoolingLayer::backward() {
    // have no weights to backprop to, just need to backprop the errors
    CLWrapper *gradOutputWrapper = 0;
    bool weOwnErrorsWrapper = false;
    if( nextLayer->providesGradInputWrapper() ) {
        gradOutputWrapper = nextLayer->getGradInputWrapper();
    } else {
        gradOutputWrapper = cl->wrap( getOutputSize(), nextLayer->getGradInput() );
        gradOutputWrapper->copyToDevice();
        weOwnErrorsWrapper = true;
    }

//    cout << "PoolingLayer::backward selectorsWrapper:" << endl;
//    PrintBuffer::printInts( cl, selectorsWrapper, outputImageSize, outputImageSize );
//    int *selectors = reinterpret_cast< int * >( selectorsWrapper->getHostArray() );
//    cout << "PoolingLayer::backward selectors before copy to host:" << endl;
//    for( int i = 0; i < outputImageSize; i++ ) {
//        for( int j = 0; j < outputImageSize; j++ ) {
//            cout << " " << selectors[i * outputImageSize + j];
//        }
//        cout << endl;
//    }
//    selectorsWrapper->copyToHost();
//    cout << "PoolingLayer::backward selectors after copy to host:" << endl;
//    for( int i = 0; i < outputImageSize; i++ ) {
//        for( int j = 0; j < outputImageSize; j++ ) {
//            cout << " " << selectors[i * outputImageSize + j];
//        }
//        cout << endl;
//    }
//    selectorsWrapper->copyToDevice();

//    selectorsWrapper->copyToHost();

    poolingBackpropImpl->backward( batchSize, gradOutputWrapper, selectorsWrapper, gradInputWrapper );

//    gradInputWrapper->copyToHost();
//    float *gradInput = reinterpret_cast< float * >( gradInputWrapper->getHostArray() );
//    cout << "gradInput:" << endl;
//    for( int i = 0; i < inputImageSize; i++ ) {
//        for( int j = 0; j < inputImageSize; j++ ) {
////            cout << " " << gradInput[i * inputImageSize + j];
//            if( gradInput[i * inputImageSize + j] != 0 ) {
//                cout << " *";
//            } else {
//                cout << " .";
//            }
//        }
//        cout << endl;
//    }

    if( weOwnErrorsWrapper ) {
        delete gradOutputWrapper;
    }
}
void propagateWithWipe( Propagate *prop, int batchSize, LayerDimensions dim,
        float *inputData, float *filters, float *biases, float *results ) {
    int inputDataSize = batchSize * dim.inputCubeSize;
    CLWrapper *dataWrapper = prop->cl->wrap( inputDataSize, inputData );
    dataWrapper->copyToDevice();

    int weightsSize = dim.filtersSize;
    CLWrapper *weightsWrapper = prop->cl->wrap( weightsSize, filters );
    weightsWrapper->copyToDevice();

    CLWrapper *biasWeightsWrapper = 0;
    if( dim.biased ) {
        biasWeightsWrapper = prop->cl->wrap( dim.numFilters, biases );
        biasWeightsWrapper->copyToDevice();
    }

    CLWrapper *resultsWrapper = prop->cl->wrap( batchSize * dim.outputCubeSize, results );
    memset( results, 99, sizeof(float) * batchSize * dim.outputCubeSize );
    resultsWrapper->copyToDevice(); // so we can wipe it...

    StatefulTimer::timeCheck("testpropagate: after data wrapper processing");
    prop->propagate( batchSize, dataWrapper, weightsWrapper, biasWeightsWrapper, resultsWrapper );
//    StatefulTimer::timeCheck("Propagate::propagate after call propagate");
    resultsWrapper->copyToHost();
//    StatefulTimer::timeCheck("Propagate::propagate after copytohost");

    delete resultsWrapper;
    delete dataWrapper;
    delete weightsWrapper;
    if( dim.biased ) {
        delete biasWeightsWrapper;
    }
}
void forwardWithWipe( Forward *prop, int batchSize, LayerDimensions dim,
        float *inputData, float *filters, float *biases, float *output ) {
    int inputDataSize = batchSize * dim.inputCubeSize;
    CLWrapper *dataWrapper = prop->cl->wrap( inputDataSize, inputData );
    dataWrapper->copyToDevice();

    int weightsSize = dim.filtersSize;
    CLWrapper *weightsWrapper = prop->cl->wrap( weightsSize, filters );
    weightsWrapper->copyToDevice();

    CLWrapper *biasWrapper = 0;
    if( dim.biased ) {
        biasWrapper = prop->cl->wrap( dim.numFilters, biases );
        biasWrapper->copyToDevice();
    }

    CLWrapper *outputWrapper = prop->cl->wrap( batchSize * dim.outputCubeSize, output );
    memset( output, 99, sizeof(float) * batchSize * dim.outputCubeSize );
    outputWrapper->copyToDevice(); // so we can wipe it...

    StatefulTimer::timeCheck("testforward: after data wrapper processing");
    prop->forward( batchSize, dataWrapper, weightsWrapper, biasWrapper, outputWrapper );
//    StatefulTimer::timeCheck("Forward::forward after call forward");
    outputWrapper->copyToHost();
//    StatefulTimer::timeCheck("Forward::forward after copytohost");

    delete outputWrapper;
    delete dataWrapper;
    delete weightsWrapper;
    if( dim.biased ) {
        delete biasWrapper;
    }
}
TEST(testcopybuffer, main) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
//    CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test");
    float in[5];
    float in2[5];
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3.0f;
        in2[i] = 23.0f + i;
    }
    float out[5];
    CLWrapper *inwrapper = cl->wrap(5, in);
    CLWrapper *in2wrapper = cl->wrap(5, in2);
    CLWrapper *outwrapper = cl->wrap(5, out);
    inwrapper->copyToDevice();
    in2wrapper->copyToDevice();
    EXPECT_FALSE(in2wrapper->isDeviceDirty());
    inwrapper->copyTo(in2wrapper);
    EXPECT_TRUE(in2wrapper->isDeviceDirty());
//    cl->finish();

    // check that in2 host-side unchanged:
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3.0f;
        EXPECT_EQ(23.0f + i, in2[i]);
    }
    in2wrapper->copyToHost();
    // check that in2 is now a copy of in:
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3.0f;
        EXPECT_EQ(i * 3.0f, in2[i]);
    }
    // check that modifying in2 doesn't modify in:
    in2[1] = 27;
    in2wrapper->copyToDevice();
    inwrapper->copyToHost();
    EXPECT_EQ(1 * 3.0f, in[1]);
    in2wrapper->copyToHost();
    EXPECT_EQ(1 * 3.0f, in[1]);
    EXPECT_EQ(27.0f, in2[1]);

    delete inwrapper;
    delete in2wrapper;
    delete outwrapper;
    delete cl;
}
VIRTUAL void ActivationPropagate::propagate( int batchSize, float *input, float *output ) {
//    cout << "ActivationPropagate::propagate( float * )" << endl;
    CLWrapper *inputWrapper = cl->wrap( getInputSize( batchSize ), input );
    CLWrapper *outputWrapper = cl->wrap( getResultsSize( batchSize ), output );
    inputWrapper->copyToDevice();
    propagate( batchSize, inputWrapper, outputWrapper );
    outputWrapper->copyToHost();
    delete outputWrapper;
    delete inputWrapper;
}
TEST(SLOW_testcopybuffer, larger) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
//    CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test");
    const int bufferSize = 100 * 1024 * 1024 / 4; // 100MB (of floats)
    float *in = new float[bufferSize];
    float *in2 = new float[bufferSize];
    for(int i = 0; i < bufferSize; i++) {
        in[i] = i * 3;
        in2[i] = 23 + i;
    }
    CLWrapper *inwrapper = cl->wrap(bufferSize, in);
    CLWrapper *in2wrapper = cl->wrap(bufferSize, in2);
    inwrapper->copyToDevice();
    in2wrapper->copyToDevice();
    inwrapper->copyTo(in2wrapper);
//    cl->finish();

    // check that in2 host-side unchanged:
    for(int i = 0; i < bufferSize; i++) {
        in[i] = i * 3;
        EXPECT_EQ(23 + i, in2[i]);
    }
    in2wrapper->copyToHost();
    // check that in2 is now a copy of in:
    for(int i = 0; i < bufferSize; i++) {
        in[i] = i * 3;
        EXPECT_EQ(i * 3, in2[i]);
    }
    // check that modifying in2 doesn't modify in:
    in2[1] = 27;
    in2wrapper->copyToDevice();
    inwrapper->copyToHost();
    EXPECT_EQ(1 * 3, in[1]);
    in2wrapper->copyToHost();
    EXPECT_EQ(1 * 3, in[1]);
    EXPECT_EQ(27, in2[1]);

    delete inwrapper;
    delete in2wrapper;
    delete[] in;
    delete[] in2;
    delete cl;
}
VIRTUAL void ActivationLayer::forward() {
    CLWrapper *inputWrapper = 0;
    if( previousLayer->hasOutputWrapper() ) {
        inputWrapper = previousLayer->getOutputWrapper();
    } else {
        float *input = previousLayer->getOutput();
        inputWrapper = cl->wrap( previousLayer->getOutputSize(), input );
        inputWrapper->copyToDevice();
    }
    activationForwardImpl->forward( batchSize, inputWrapper, outputWrapper );
//    outputCopiedToHost = false;
    if( !previousLayer->hasOutputWrapper() ) {
        delete inputWrapper;
    }
}
TEST( testMemset, basic ) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();
    CLKernel *kMemset = 0;
    // [[[cog
    // import stringify
    // stringify.write_kernel2( "kMemset", "cl/memset.cl", "cl_memset", '""' )
    // ]]]
    // generated using cog, from cl/memset.cl:
    const char * kMemsetSource =
    "// Copyright Hugh Perkins 2015 hughperkins at gmail\n"
    "//\n"
    "// This Source Code Form is subject to the terms of the Mozilla Public License,\n"
    "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n"
    "// obtain one at http://mozilla.org/MPL/2.0/.\n"
    "\n"
    "kernel void cl_memset(global float *target, const float value, const int N) {\n"
    "    #define globalId get_global_id(0)\n"
    "    if ((int)globalId < N) {\n"
    "        target[globalId] = value;\n"
    "    }\n"
    "}\n"
    "\n"
    "";
    kMemset = cl->buildKernelFromString(kMemsetSource, "cl_memset", "", "cl/memset.cl");
    // [[[end]]]
    int N = 10000;
    float *myArray = new float[N];
    CLWrapper *myArrayWrapper = cl->wrap( N, myArray );
    myArrayWrapper->createOnDevice();
    kMemset->out( myArrayWrapper )->in( 99.0f )->in( N );
    int workgroupSize = 64;
    kMemset->run_1d( ( N + workgroupSize - 1 ) / workgroupSize * workgroupSize, workgroupSize );
    cl->finish();
    myArrayWrapper->copyToHost();
//    for( int i = 0; i < 10; i++ ) {
//        cout << "myArray[" << i << "]=" << myArray[i] << endl;
//    }
    for( int i = 0; i < N; i++ ) {
        EXPECT_EQ( 99.0f, myArray[i] );
    }
    delete myArrayWrapper;
    delete[] myArray;
    delete kMemset;
    delete cl;
}
VIRTUAL void DropoutLayer::backward() {
    // have no weights to backprop to, just need to backprop the errors
    CLWrapper *gradOutputWrapper = 0;
    bool weOwnErrorsWrapper = false;
    if(nextLayer->providesGradInputWrapper()) {
        gradOutputWrapper = nextLayer->getGradInputWrapper();
    } else {
        gradOutputWrapper = cl->wrap(getOutputNumElements(), nextLayer->getGradInput());
        gradOutputWrapper->copyToDevice();
        weOwnErrorsWrapper = true;
    }

    maskWrapper->copyToDevice();
    dropoutBackwardImpl->backward(batchSize, maskWrapper, gradOutputWrapper, gradInputWrapper);

    if(weOwnErrorsWrapper) {
        delete gradOutputWrapper;
    }
}
VIRTUAL void BackpropWeights2::backpropWeights( int batchSize, float learningRate,
        float *derivLossBySum, float *inputData, float *filters, float *biasWeights ) {
    StatefulTimer::timeCheck("BackpropWeights2::backprop begin");

//    const float learningMultiplier = learningRate / batchSize / sqrt( dim.outputImageSize * dim.outputImageSize );

    int resultsSize = batchSize * dim.outputCubeSize;
    CLWrapper *derivLossBySumWrapper = cl->wrap( resultsSize, derivLossBySum );
    derivLossBySumWrapper->copyToDevice();

    int inputSize = batchSize * dim.inputCubeSize;
    CLWrapper *inputDataWrapper = cl->wrap( inputSize, inputData );
    inputDataWrapper->copyToDevice();

    CLWrapper *weightsWrapper = 0;
    int weightsSize = debug ? std::max(10000, dim.filtersSize) : dim.filtersSize;
    weightsWrapper = cl->wrap( weightsSize, filters );
    weightsWrapper->copyToDevice();

//    cout << "backpropweights2::backpropweights resultsSize=" << resultsSize << " inputSize=" << inputSize <<
//        " weightSize=" << weightsSize << endl;

    CLWrapper *biasWeightsWrapper = 0;
    if( dim.biased ) {
        biasWeightsWrapper = cl->wrap( dim.numFilters, biasWeights );
        biasWeightsWrapper->copyToDevice();
    }

    StatefulTimer::timeCheck("BackpropWeights2::backprop after copied to device");
    backpropWeights( batchSize, learningRate, derivLossBySumWrapper, inputDataWrapper, weightsWrapper, biasWeightsWrapper );
    StatefulTimer::timeCheck("BackpropWeights2::backprop after call backprop");
    weightsWrapper->copyToHost();
    if( dim.biased ) {
        biasWeightsWrapper->copyToHost();
    }
    StatefulTimer::timeCheck("BackpropWeights2::backprop after copytohost");

    delete derivLossBySumWrapper;
    delete inputDataWrapper;
    delete weightsWrapper;
    if( dim.biased ) {
        delete biasWeightsWrapper;
    }
}
VIRTUAL void Adadelta::updateWeights( CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper,
        AdadeltaState *trainerState ) {
    // need to calculate:
    //   sumGradSquared = decay * sumGradSquared + (1 - decay) * grad.square()
    //   update = - sumUpdateSquared.sqrt() / sumGradSquared.sqrt() * grad
    //   sumUpdateSquared = decay * sumUpdateSquared + (1 - decay) * update.squared()
    //   weights += update

    int numWeights = trainerState->numWeights;
    float *working = new float[ numWeights ];
    CLWrapper *workingWrapper = cl->wrap( numWeights, working );
    workingWrapper->createOnDevice();

    CLMathWrapper clWeights( weightsWrapper );
    CLMathWrapper clGradWeights( gradWeightsWrapper );
    CLMathWrapper clSumGradSquared( trainerState->sumGradSquaredWrapper );
    CLMathWrapper clSumUpdateSquared( trainerState->sumUpdateSquaredWrapper );
    CLMathWrapper clWorking( workingWrapper );

    // following all happens on gpu, via clmathwrapper:
    clWorking = clGradWeights;
    clWorking.squared();
    clWorking *= ( 1 - decay );
    clSumGradSquared *= decay;
    clSumGradSquared += clWorking;

    clWorking = clSumGradSquared;
    clWorking.inv();
    clWorking *= clSumUpdateSquared;
    clWorking.sqrt();
    clWorking *= clGradWeights;
    clWorking *= -1;
    clWeights += clWorking;

    clSumUpdateSquared *= decay;
    clWorking.squared();
    clWorking *= ( 1 - decay );
    clSumUpdateSquared += clWorking;

    delete workingWrapper;
    delete[] working;
}
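// Likewise, a minimal host-side sketch of the Adadelta step above, matching
// the update rule spelled out in the comment at the top of the method;
// `adadeltaUpdateHost` is a hypothetical helper for illustration, not part
// of the library.
#include <cmath>

static void adadeltaUpdateHost(float *weights, const float *grad,
        float *sumGradSquared, float *sumUpdateSquared,
        int numWeights, float decay) {
    for(int i = 0; i < numWeights; i++) {
        sumGradSquared[i] = decay * sumGradSquared[i]
            + (1 - decay) * grad[i] * grad[i];
        float update = -std::sqrt(sumUpdateSquared[i] / sumGradSquared[i]) * grad[i];
        weights[i] += update;
        sumUpdateSquared[i] = decay * sumUpdateSquared[i]
            + (1 - decay) * update * update;
    }
}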
Scan::Scan(CLWrapper &clw, size_t wx)
    : clw(clw), wx(wx), c0(0), c1(0), m0(0), m1(0), k0(0), k1(0), k2(0) {
    m = wx * 2;
#if EMBED_CL
#include "scan.cl.h"
    clw.create_all_kernels(clw.compile_from_string((char *)&scan_cl));
#else
    clw.create_all_kernels(clw.compile("scan.cl"));
#endif
    scan_pow2 = clw.kernel_of_name("scan_pow2_wrapper");
    scan_pad_to_pow2 = clw.kernel_of_name("scan_pad_to_pow2");
    scan_subarrays = clw.kernel_of_name("scan_subarrays");
    scan_inc_subarrays = clw.kernel_of_name("scan_inc_subarrays");
}
VIRTUAL void ConvolutionalLayer::propagate() {
    if( batchSize == 0 ) {
        throw runtime_error("Need to call setBatchSize(size) before calling propagate etc");
    }
//    if( imageSizeSquared <= cl->getMaxWorkgroupSize() ) {
////        propagate2();
//    } else {
////        propagate1();
//    }
//    propagate1();
    StatefulTimer::instance()->timeCheck(" propagate layer " + toString( layerIndex ) + ", START");

    CLWrapper *upstreamWrapper = 0;
    if( previousLayer->hasResultsWrapper() ) {
//        std::cout << "layer " << previousLayer->layerIndex << " has resultsWrapper" << std::endl;
        upstreamWrapper = previousLayer->getResultsWrapper();
    } else {
//        std::cout << "layer " << previousLayer->layerIndex << " has no resultsWrapper" << std::endl;
        upstreamWrapper = cl->wrap( previousLayer->getResultsSize(), (float *)previousLayer->getResults() );
        upstreamWrapper->copyToDevice();
    }
    CLFloatWrapper *biasWeightsWrapper = 0;
    if( dim.biased ) {
        biasWeightsWrapper = cl->wrap( getBiasWeightsSize(), biasWeights );
        biasWeightsWrapper->copyToDevice();
    }
    StatefulTimer::instance()->timeCheck(" propagate layer " + toString( layerIndex ) + ", copied to device");
    propagateimpl->propagate( batchSize, upstreamWrapper, weightsWrapper, biasWeightsWrapper, resultsWrapper );
    StatefulTimer::instance()->timeCheck(" propagate layer " + toString( layerIndex ) + ", after clFinish");

    if( !previousLayer->hasResultsWrapper() ) {
        delete upstreamWrapper;
    }
    if( dim.biased ) {
        delete biasWeightsWrapper;
    }
    resultsCopiedToHost = false;
}
void measurePerf(int instance, int batchSize, LayerDimensions dim) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();

    int inputNumElements = dim.inputCubeSize * batchSize;
    int errorsSize = dim.outputCubeSize * batchSize;
    int weightsSize = dim.filtersSize;
    int errorsForUpstreamSize = dim.inputCubeSize * batchSize;
    float *input = new float[inputNumElements];
    float *errors = new float[errorsSize];
    float *weights = new float[weightsSize];
    WeightRandomizer::randomize(input, inputNumElements, -0.1f, 0.1f);
    WeightRandomizer::randomize(errors, errorsSize, -0.1f, 0.1f);
    WeightRandomizer::randomize(weights, weightsSize, -0.1f, 0.1f);
    float *errorsForUpstream = new float[errorsForUpstreamSize];

    CLWrapper *inputWrapper = cl->wrap(inputNumElements, input);
    CLWrapper *errorsWrapper = cl->wrap(errorsSize, errors);
    CLWrapper *weightsWrapper = cl->wrap(weightsSize, weights);
    CLWrapper *errorsForUpstreamWrapper = cl->wrap(errorsForUpstreamSize, errorsForUpstream);
    inputWrapper->copyToDevice();
    errorsWrapper->copyToDevice();
    weightsWrapper->copyToDevice();
    errorsForUpstreamWrapper->createOnDevice();

    StatefulTimer::timeCheck("after init");
    Backward *backwardImpl = Backward::instanceSpecific(instance, cl, dim);
    for(int it = 0; it < 40; it++) {
        backwardImpl->backward(batchSize, inputWrapper, errorsWrapper, weightsWrapper, errorsForUpstreamWrapper);
    }
    StatefulTimer::timeCheck("after backprop");
    StatefulTimer::dump(true);

    delete errorsForUpstreamWrapper;
    delete weightsWrapper;
    delete inputWrapper;
    delete errorsWrapper;

    delete[] errors;
    delete[] weights;
    delete[] input;
    delete[] errorsForUpstream;

    delete backwardImpl;
    delete cl;
}
VIRTUAL void ConvolutionalLayer::backProp( float learningRate ) {
//    Timer timer;
    StatefulTimer::instance()->timeCheck("backprop(): start, layer " + toString( layerIndex ) );

    CLWrapper *biasWeightsWrapper = 0;
    if( dim.biased ) {
        biasWeightsWrapper = cl->wrap( getBiasWeightsSize(), biasWeights );
        biasWeightsWrapper->copyToDevice();
    }

    CLWrapper *imagesWrapper = 0;
    if( previousLayer->hasResultsWrapper() ) {
        imagesWrapper = previousLayer->getResultsWrapper();
    } else {
        imagesWrapper = cl->wrap( previousLayer->getResultsSize(), previousLayer->getResults() );
        imagesWrapper->copyToDevice();
    }

    CLWrapper *errorsWrapper = 0;
    bool weOwnErrorsWrapper = false;
    if( nextLayer->providesErrorsForUpstreamWrapper() ) {
        errorsWrapper = nextLayer->getErrorsForUpstreamWrapper();
    } else {
        errorsWrapper = cl->wrap( getResultsSize(), nextLayer->getErrorsForUpstream() );
        errorsWrapper->copyToDevice();
//        int resultsSize = getResultsSize();
//        for( int i = 0; i < resultsSize; i++ ) {
//            cout << "convolutional::backproperrors errorsfromupstream[" << i << "]=" << nextLayer->getErrorsForUpstream()[i] << endl;
//        }
        weOwnErrorsWrapper = true;
    }
    if( previousLayer->needsBackProp() ) {
        backpropErrorsImpl->backpropErrors( batchSize, imagesWrapper, errorsWrapper, weightsWrapper, errorsForUpstreamWrapper );
        StatefulTimer::instance()->timeCheck("backproperrors(): calced errors for upstream, layer " + ::toString( layerIndex ) );
    }

    backpropWeightsImpl->backpropWeights( batchSize, learningRate, errorsWrapper, imagesWrapper, weightsWrapper, biasWeightsWrapper );
    weightsCopiedToHost = false;
    StatefulTimer::instance()->timeCheck("backproperrors(): done weight backprop, layer " + ::toString( layerIndex ) );

    if( dim.biased ) {
        biasWeightsWrapper->copyToHost();
        delete biasWeightsWrapper;
    }
    if( !previousLayer->hasResultsWrapper() ) {
        delete imagesWrapper;
    }
    if( weOwnErrorsWrapper ) {
        delete errorsWrapper;
    }
    StatefulTimer::instance()->timeCheck("backproperrors(): updated weights, layer " + ::toString( layerIndex ) );
}
TEST(testCopyBuffer, ints) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();
    const int N = 10;
    int *a = new int[N];
    for(int i = 0; i < N; i++) {
        a[i] = 3 + i;
    }
    CLWrapper *aWrapper = cl->wrap(N, a);
    aWrapper->copyToDevice();
    memset(a, 0, sizeof(int) * N);

    int *b = new int[N];
    CopyBuffer::copy(cl, aWrapper, b);
    for(int i = 0; i < N; i++) {
//        cout << b[i] << endl;
        EXPECT_EQ(3 + i, b[i]);
    }

    memset(b, 0, sizeof(int) * N);
    CopyBuffer::copy(cl, aWrapper, b);
    for(int i = 0; i < N; i++) {
//        cout << b[i] << endl;
        EXPECT_EQ(3 + i, b[i]);
    }

    PrintBuffer::printInts(cl, aWrapper, 10, 1);

    delete[] b;
    delete aWrapper;
    delete[] a;
    delete cl;
}
VIRTUAL void DropoutLayer::forward() {
    CLWrapper *upstreamOutputWrapper = 0;
    if(previousLayer->hasOutputWrapper()) {
        upstreamOutputWrapper = previousLayer->getOutputWrapper();
    } else {
        float *upstreamOutput = previousLayer->getOutput();
        upstreamOutputWrapper = cl->wrap(previousLayer->getOutputNumElements(), upstreamOutput);
        upstreamOutputWrapper->copyToDevice();
    }
//    cout << "training: " << training << endl;
    if(training) {
        // create new masks...
        generateMasks();
        maskWrapper->copyToDevice();
        dropoutForwardImpl->forward(batchSize, maskWrapper, upstreamOutputWrapper, outputWrapper);
    } else {
        // if not training, then simply skip the dropout bit, copy the buffers directly
        multiplyBuffer->multiply(getOutputNumElements(), dropRatio, upstreamOutputWrapper, outputWrapper);
    }
    if(!previousLayer->hasOutputWrapper()) {
        delete upstreamOutputWrapper;
    }
}
TEST( testCopyBuffer, floats ) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    const int N = 10;
    float *a = new float[N];
    for( int i = 0; i < N; i++ ) {
        a[i] = 3 + i;
    }
    CLWrapper *aWrapper = cl->wrap( N, a );
    aWrapper->copyToDevice();
    memset( a, 0, sizeof(float) * N );

    float *b = new float[N];
    CopyBuffer::copy( cl, aWrapper, b );
    for( int i = 0; i < N; i++ ) {
        cout << b[i] << endl;
        EXPECT_EQ( 3 + i, b[i] );
    }

    memset( b, 0, sizeof(float) * N );
    CopyBuffer::copy( cl, aWrapper, b );
    for( int i = 0; i < N; i++ ) {
        cout << b[i] << endl;
        EXPECT_EQ( 3 + i, b[i] );
    }

    PrintBuffer::printFloats( cl, aWrapper, 10, 1 );

    delete[] b;
    delete aWrapper;
    delete[] a;
    delete cl;
}
// must allocate output yourself before the call
VIRTUAL void Forward::forward( int batchSize, float *inputData, float *filters,
        float *biases, float *output ) {
    StatefulTimer::timeCheck("Forward::forward begin");

    int inputDataSize = batchSize * dim.inputCubeSize;
    CLWrapper *dataWrapper = cl->wrap( inputDataSize, inputData );
    dataWrapper->copyToDevice();

    int weightsSize = dim.filtersSize;
    CLWrapper *weightsWrapper = cl->wrap( weightsSize, filters );
    weightsWrapper->copyToDevice();

    CLWrapper *biasWrapper = 0;
    if( dim.biased ) {
        int biasWrapperSize = dim.numFilters;
        biasWrapper = cl->wrap( biasWrapperSize, biases );
        biasWrapper->copyToDevice();
    }

//    int outputDataSize = batchSize * dim.outputCubeSize;
//    cout << " batchsize " << batchSize << " " << dim << endl;
//    int allocatedOutputSize = std::max(5000, outputDataSize );
//    int allocatedOutputSize = outputDataSize;
//    float *output = new float[allocatedOutputSize];
    CLWrapper *outputWrapper = cl->wrap( batchSize * dim.outputCubeSize, output );
    cl->finish();

    StatefulTimer::timeCheck("Forward::forward after copied to device");
    forward( batchSize, dataWrapper, weightsWrapper, biasWrapper, outputWrapper );
    StatefulTimer::timeCheck("Forward::forward after call forward");
    outputWrapper->copyToHost();
    StatefulTimer::timeCheck("Forward::forward after copytohost");

//    for( int i = 0; i < 20; i++ ) {
//        cout << "output[" << i << "]=" << output[i] << endl;
//    }
    delete outputWrapper;
    delete dataWrapper;
    delete weightsWrapper;
    if( dim.biased ) {
        delete biasWrapper;
    }
//    return output;
}
TEST(testClBlas, colMajor2) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();

    float A[] = { 1, 3,
                  2, 7,
                  9, 5,
                  0, -2 };
    float B[] = { 3, 2, 8,
                 -1, 0, 4 };
    float C[4 * 3];
    transpose(A, 4, 2);
    transpose(B, 2, 3);
//    for(int row = 0; row < 2; row++) {
//        for(int col = 0; col < 1; col++) {
//            cout << B[row * 1 + col] << " ";
//        }
//        cout << endl;
//    }
    ClBlasInstance clblasInstance;
//    ClBlasInstance::initializeIfNecessary();
    CLWrapper *AWrap = cl->wrap(4 * 2, A);
    CLWrapper *BWrap = cl->wrap(2 * 3, B);
    CLWrapper *CWrap = cl->wrap(4 * 3, C);
    AWrap->copyToDevice();
    BWrap->copyToDevice();
    ClBlasHelper::Gemm(
        cl, clblasColumnMajor, clblasNoTrans, clblasNoTrans,
        4, 2, 3,
        1,
        AWrap, 0,
        BWrap, 0,
        0,
        CWrap, 0
    );
//    cl->finish();
    CWrap->copyToHost();
    transpose(C, 3, 4);
    EXPECT_EQ(1*3 - 1*3, C[0]);
    EXPECT_EQ(1*2 + 3*0, C[1]);
    EXPECT_EQ(1*8 + 4*3, C[2]);
    EXPECT_EQ(-8, C[11]);

    delete CWrap;
    delete BWrap;
    delete AWrap;

    delete cl;
}