TEST( testforward, compare_1_n_biased_pad ) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    int maxWorkgroupSize = cl->getMaxWorkgroupSize();
    delete cl;

    LayerDimensions dim;
    int batchSize = 4;
    int N = 4;
    dim.setInputPlanes( 8 ).setInputSize(19).setNumFilters( 8 )
        .setFilterSize( 5 )
        .setPadZeros( true ).setBiased( true );
    for( int instance = 2; instance <= 7; instance++ ) {
        if( instance == 5 ) {
            continue; // forwardfc: can't be used when inputimagesize != filtersize
        }
        dim.setInputSize(19);
        if( instance == 2 && maxWorkgroupSize < 19 * 19 ) {
            dim.setInputSize(15);
        }
        if( instance == 3 && maxWorkgroupSize < 19 * 19 ) {
            dim.setInputSize(15);
        }
        cout << "instance: " << instance << endl;
        compareSpecific( false, N, batchSize, dim, 1, instance );
    }
}
TEST( SLOW_testpropagate, perf_kgsgo_fc500 ) {
    int batchSize = 128;
    LayerDimensions dim;
    dim.setInputPlanes( 32 ).setInputImageSize(19).setNumFilters( 500 ).setFilterSize( 19 )
        .setPadZeros( false ).setBiased( true );
    testPerf( -1, 128, batchSize, dim, new TanhActivation() );
}
TEST( SLOW_testforward, perf_mnist_intlayers_1024ex ) {
    int batchSize = 1024;
    LayerDimensions dim;
    dim.setInputPlanes( 32 ).setInputSize(28).setNumFilters( 32 ).setFilterSize( 5 )
        .setPadZeros( true ).setBiased( true );
    testPerf( -1, 128, batchSize, dim );
}
TEST( SLOW_testforward, perf_mnist_finallayer ) {
    int batchSize = 128;
    LayerDimensions dim;
    dim.setInputPlanes( 32 ).setInputSize(28).setNumFilters( 10 ).setFilterSize( 28 )
        .setPadZeros( false ).setBiased( true );
    testPerf( -1, 128, batchSize, dim );
}
TEST( SLOW_testpropagate, perf_mnist_finallayer ) {
    int batchSize = 128;
    LayerDimensions dim;
    dim.setInputPlanes( 32 ).setInputImageSize(28).setNumFilters( 10 ).setFilterSize( 28 )
        .setPadZeros( false ).setBiased( true );
    testPerf( -1, 128, batchSize, dim, new ReluActivation() );
}
TEST( SLOW_testpropagate, compare_args ) {
    LayerDimensions dim;
    int batchSize = 128;
//    int imageSize = 19;
//    int filterSize = 7;
//    int inputPlanes = 64;
//    int numFilters = 64;
    int instance0 = 1;
    int instance1 = 3;
    int N = 128;
    bool debug = false;
    string activationName = "tanh";
    dim.setInputPlanes( 64 ).setInputImageSize(19).setNumFilters( 64 )
        .setFilterSize( 7 )
        .setPadZeros( true ).setBiased( false );
    TestArgsParser::arg( "n", &N );
    DimFromArgs::arg( &dim );
    TestArgsParser::arg( "instance0", &instance0 );
    TestArgsParser::arg( "instance1", &instance1 );
    TestArgsParser::arg( "debug", &debug );
    TestArgsParser::arg( "batchsize", &batchSize );
    TestArgsParser::arg( "activation", &activationName );
    TestArgsParser::go();
    dim.deriveOthers();
    ActivationFunction *fn = ActivationFunction::fromName( activationName );
    compareSpecific( debug, N, batchSize, dim, fn, instance0, instance1 );
}
TEST( SLOW_testforward, perf_kgsgo_fc500 ) {
    int batchSize = 128;
    LayerDimensions dim;
    dim.setInputPlanes( 32 ).setInputSize(19).setNumFilters( 500 ).setFilterSize( 19 )
        .setPadZeros( false ).setBiased( true );
    testPerf( -1, 128, batchSize, dim );
}
TEST( testforward, compare_1_4_fcscenario ) { // only need to test nopad, since fc won't work with pad
    LayerDimensions dim;
    int batchSize = 4;
    int N = 4;
    dim.setInputPlanes( 10 ).setInputSize(24).setNumFilters( 10 )
        .setFilterSize( 24 )
        .setPadZeros( false ).setBiased( true );
    compareSpecific( false, N, batchSize, dim, 1, 4 );
}
TEST(SLOW_testbackward, perf_kgsgo_32c5) {
    int batchSize = 128;
    LayerDimensions dim;
    dim.setInputPlanes(32).setInputSize(19).setNumFilters(32).setFilterSize(5)
        .setPadZeros(true).setBiased(true);
    cout << dim.buildOptionsString() << endl;
//    ActivationFunction *fn = new ReluActivation();
    measurePerf(2, batchSize, dim);
}
TEST( testforward, compare_1_5_biased_nopad ) { // only need to test nopad, since fc won't work with pad
    LayerDimensions dim;
    int batchSize = 4;
//    int instance0 = 1;
//    int instance1 = 1;
    int N = 4;
    dim.setInputPlanes( 8 ).setInputSize(19).setNumFilters( 8 )
        .setFilterSize( 19 )
        .setPadZeros( false ).setBiased( true );
    compareSpecific( false, N, batchSize, dim, 1, 5 );
}
TEST(SLOW_testbackward, compare_kgsgo_32c5mini) {
    int batchSize = 4;
    LayerDimensions dim;
    dim.setInputPlanes(2).setInputSize(3).setNumFilters(2).setFilterSize(3)
        .setPadZeros(true).setBiased(true);
    cout << dim.buildOptionsString() << endl;
//    ActivationFunction *fn = new ReluActivation();
    compareSpecific(1, 2, 1, batchSize, dim);
}
TEST( testpropagate, compare_1_4_fcscenario ) { // only need to test nopad, since fc won't work with pad
    LayerDimensions dim;
    int batchSize = 4;
    int N = 4;
    string activationName = "tanh";
    dim.setInputPlanes( 10 ).setInputImageSize(24).setNumFilters( 10 )
        .setFilterSize( 24 )
        .setPadZeros( false ).setBiased( true );
    ActivationFunction *fn = ActivationFunction::fromName( activationName );
    compareSpecific( false, N, batchSize, dim, fn, 1, 4 );
}
TEST( testforward, compare_0_1_biased_pad ) {
    LayerDimensions dim;
    int batchSize = 4;
//    int instance0 = 1;
//    int instance1 = 1;
    int N = 4;
    dim.setInputPlanes( 8 ).setInputSize(19).setNumFilters( 8 )
        .setFilterSize( 5 )
        .setPadZeros( true ).setBiased( true );
    compareSpecific( false, N, batchSize, dim, 0, 1 );
}
TEST( SLOW_testforward, soumith2 ) {
    int batchSize = 128;
    LayerDimensions dim;
    int instance = 4;
    bool biased = true;
    TestArgsParser::arg( "instance", &instance );
    TestArgsParser::arg( "biased", &biased );
    TestArgsParser::go();
    dim.setInputPlanes( 64 ).setInputSize( 64 ).setNumFilters( 128 ).setFilterSize( 9 )
        .setPadZeros( false ).setBiased( biased );
    testPerf( instance, 128, batchSize, dim );
}
TEST( testpropagate, compare_1_5_biased_nopad ) { // only need to test nopad, since fc won't work with pad
    LayerDimensions dim;
    int batchSize = 4;
//    int instance0 = 1;
//    int instance1 = 1;
    int N = 4;
    string activationName = "tanh";
    dim.setInputPlanes( 8 ).setInputImageSize(19).setNumFilters( 8 )
        .setFilterSize( 19 )
        .setPadZeros( false ).setBiased( true );
    ActivationFunction *fn = ActivationFunction::fromName( activationName );
    compareSpecific( false, N, batchSize, dim, fn, 1, 5 );
}
TEST(testbackward, compare_1_n_kgsgo_32c5) {
    int batchSize = 8;
    LayerDimensions dim;
    dim.setInputPlanes(32).setInputSize(19).setNumFilters(32).setFilterSize(5)
        .setPadZeros(true).setBiased(true);
    cout << dim.buildOptionsString() << endl;
//    ActivationFunction *fn = new ReluActivation();
    compareSpecific(0, 1, 1, batchSize, dim);
    for(int i = 2; i < Backward::getNumImplementations(); i++) {
        compareSpecific(1, i, 1, batchSize, dim);
    }
}
TEST( testpropagate, compare_0_1_biased_pad ) {
    LayerDimensions dim;
    int batchSize = 4;
//    int instance0 = 1;
//    int instance1 = 1;
    int N = 4;
    string activationName = "tanh";
    dim.setInputPlanes( 8 ).setInputImageSize(19).setNumFilters( 8 )
        .setFilterSize( 5 )
        .setPadZeros( true ).setBiased( true );
    ActivationFunction *fn = ActivationFunction::fromName( activationName );
    compareSpecific( false, N, batchSize, dim, fn, 0, 1 );
}
TEST( SLOW_testforward, perf_kgsgo_64c7_args ) {
    int instance = 3;
    int batchSize = 128;
    int N = 1000;
    LayerDimensions dim;
    dim.setInputPlanes( 64 ).setInputSize(19).setNumFilters( 64 ).setFilterSize( 7 )
        .setPadZeros( true ).setBiased( true );
    DimFromArgs::arg( &dim );
    TestArgsParser::arg( "instance", &instance );
    TestArgsParser::arg( "n", &N );
    TestArgsParser::arg( "batchsize", &batchSize );
    TestArgsParser::go();
    testPerf( instance, N, batchSize, dim );
}
TEST( testforward, crash_from_jm ) {
    int instance = 1;
    int batchSize = 64;
    int N = 64;
    LayerDimensions dim;
    dim.setInputPlanes( 32 ).setInputSize(28).setNumFilters( 20 ).setFilterSize( 28 )
        .setPadZeros( false ).setBiased( false );
    DimFromArgs::arg( &dim );
    TestArgsParser::arg( "instance", &instance );
    TestArgsParser::arg( "n", &N );
    TestArgsParser::arg( "batchsize", &batchSize );
    TestArgsParser::go();
    testPerf( instance, N, batchSize, dim );
}
TEST( testforward, compare_1_n_biased_nopad ) {
    LayerDimensions dim;
    int batchSize = 4;
//    int instance0 = 1;
//    int instance1 = 1;
    int N = 4;
    dim.setInputPlanes( 8 ).setInputSize(19).setNumFilters( 8 )
        .setFilterSize( 5 )
        .setPadZeros( false ).setBiased( true );
    for( int instance = 2; instance <= 7; instance++ ) {
        if( instance == 5 ) {
            continue; // forwardfc: can't be used when inputimagesize != filtersize
        }
        cout << "instance: " << instance << endl;
        compareSpecific( false, N, batchSize, dim, 1, instance );
    }
}
TEST( testforward, test2 ) {
    int batchSize = 2;
    LayerDimensions dim;
    dim.setNumFilters(2).setNumInputPlanes(1).setInputSize(3).setFilterSize(3)
        .setPadZeros(false).setBiased(false);

    float data[] = { 0, 0, 0,
                     -0.5f, 0.5f, 0,
                     0, 0, 0,

                     0, 0, 0,
                     0.5f, -0.5f, 0,
                     0, 0, 0 };

    float filter1[] = { 0, 0, 0,
                        0.300809f, -0.11011f, 0,
                        0, 0, 0,

                        0, 0, 0,
                        0.0570846f, 0.347077f, 0,
                        0, 0, 0 };

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    float *biases = 0;

    Forward *forward = Forward::instanceSpecific( 1, cl, dim );
    float *output = new float[forward->getOutputTotalSize(batchSize)];
    forward->forward( batchSize, data, filter1, biases, output );

    EXPECT_FLOAT_NEAR( -0.5f * 0.300809f - 0.5f * 0.11011f, output[0] );
    EXPECT_FLOAT_NEAR( -0.5f * 0.0570846f + 0.5f * 0.347077f, output[1] );
    EXPECT_FLOAT_NEAR( 0.5f * 0.300809f + 0.5f * 0.11011f, output[2] );
    EXPECT_FLOAT_NEAR( 0.5f * 0.0570846f - 0.5f * 0.347077f, output[3] );

    delete[] output;
    delete forward;
    delete cl;
}
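// Worked check for the expectations in test2 above (a sketch of the arithmetic only, not part of
// the test): with a 3x3 image, a 3x3 filter and no zero padding, each output plane is a single
// value, the dot product of the image cube with the filter cube. Only the two centre-row entries
// of each array are non-zero, and output is laid out as [imageId][filterId], so for example
//   output[0] = image0 . filter0 = (-0.5f) * 0.300809f + (0.5f) * (-0.11011f)
//             = -0.5f * 0.300809f - 0.5f * 0.11011f
// which is exactly the first EXPECT_FLOAT_NEAR value; the other three entries follow the same pattern.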
BackpropWeightsScratchBias::BackpropWeightsScratchBias( OpenCLHelper *cl, LayerDimensions dim, ActivationFunction const *fn ) :
        BackpropWeights( cl, dim, fn ) {
    // [[[cog
    // import stringify
    // # stringify.write_kernel( "kernelSource", "ClConvolve.cl")
    // ]]]
    // [[[end]]]
    std::string options = dim.buildOptionsString();
    options += " -D " + fn->getDefineName();
    kernel = cl->buildKernel( "backpropweights.cl", "backprop_floats_withscratch_dobias", options );
//    kernel = cl->buildKernelFromString( kernelSource, "calcErrorsForUpstream", options );
}
TEST( testforward, comparespecific_break2 ) { // this breaks on v5.7.0 for example
    LayerDimensions dim;
    int batchSize = 4;
    int instance0 = 1;
    int instance1 = 5;
    int N = 4;
    bool debug = false;
    dim.setInputPlanes( 64 ).setInputSize(19).setNumFilters( 64 )
        .setFilterSize( 19 )
        .setPadZeros( false ).setBiased( false );
    TestArgsParser::arg( "n", &N );
    DimFromArgs::arg( &dim );
    TestArgsParser::arg( "instance0", &instance0 );
    TestArgsParser::arg( "instance1", &instance1 );
    TestArgsParser::arg( "debug", &debug );
    TestArgsParser::arg( "batchsize", &batchSize );
    TestArgsParser::go();
    dim.deriveOthers();
    compareSpecific( debug, N, batchSize, dim, instance0, instance1 );
}
TEST( SLOW_testforward, compare_args ) {
    LayerDimensions dim;
    int batchSize = 128;
    int instance0 = 1;
    int instance1 = 3;
    int N = 128;
    bool debug = false;
    dim.setInputPlanes( 64 ).setInputSize(19).setNumFilters( 64 )
        .setFilterSize( 7 )
        .setPadZeros( true ).setBiased( false );
    TestArgsParser::arg( "n", &N );
    DimFromArgs::arg( &dim );
    TestArgsParser::arg( "instance0", &instance0 );
    TestArgsParser::arg( "instance1", &instance1 );
    TestArgsParser::arg( "debug", &debug );
    TestArgsParser::arg( "batchsize", &batchSize );
    TestArgsParser::go();
    dim.deriveOthers();
    compareSpecific( debug, N, batchSize, dim, instance0, instance1 );
}
void testPerf( int instance, int N, int batchSize, LayerDimensions dim ) {
    cout << dim.buildOptionsString() << endl;

    int inputsSize = batchSize * dim.inputCubeSize;
    int filtersSize = dim.filtersSize;
    int biasSize = dim.numFilters;
    int inputsAllocated = std::max( inputsSize, 10000 );
    int filtersAllocated = std::max( filtersSize, 10000 );
    int biasFiltersAllocated = std::max( biasSize, 10000 );
    float *inputs = new float[ inputsAllocated ];
    float *filters = new float[ filtersAllocated ];
    float *biasFilters = new float[ biasFiltersAllocated ];
    memset( inputs, 0, sizeof(float) * inputsAllocated );
    memset( filters, 0, sizeof(float) * filtersAllocated );
    memset( biasFilters, 0, sizeof(float) * biasFiltersAllocated );
    WeightRandomizer::randomize( inputs, inputsAllocated, -0.1f, 0.1f );
    WeightRandomizer::randomize( filters, filtersAllocated, -0.1f, 0.1f );
    WeightRandomizer::randomize( biasFilters, biasFiltersAllocated, -0.1f, 0.1f );

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    Forward *p1 = Forward::instanceSpecific( instance, cl, dim );
    // run N examples through in batches of batchSize; the final batch takes whatever remainder is left
    int numBatches = ( N + batchSize - 1 ) / batchSize;
    for( int it = 0; it < numBatches; it++ ) {
        int thisBatchSize = it < numBatches - 1 ? batchSize : N - batchSize * it;
        float *output1 = new float[p1->getOutputTotalSize(thisBatchSize)];
        p1->forward( thisBatchSize, inputs, filters, biasFilters, output1 );
        delete[] output1;
    }
    StatefulTimer::dump(true);

    delete p1;
    delete cl;
    delete[] inputs;
    delete[] filters;
    delete[] biasFilters;
}
void testPerf( int instance, int N, int batchSize, LayerDimensions dim, ActivationFunction *fn ) {
    cout << dim.buildOptionsString() << endl;

    int inputsSize = batchSize * dim.inputCubeSize;
    int filtersSize = dim.filtersSize;
    int biasSize = dim.numFilters;
    int inputsAllocated = std::max( inputsSize, 10000 );
    int filtersAllocated = std::max( filtersSize, 10000 );
    int biasFiltersAllocated = std::max( biasSize, 10000 );
    float *inputs = new float[ inputsAllocated ];
    float *filters = new float[ filtersAllocated ];
    float *biasFilters = new float[ biasFiltersAllocated ];
    memset( inputs, 0, sizeof(float) * inputsAllocated );
    memset( filters, 0, sizeof(float) * filtersAllocated );
    memset( biasFilters, 0, sizeof(float) * biasFiltersAllocated );
    WeightRandomizer::randomize( inputs, inputsAllocated, -0.1f, 0.1f );
    WeightRandomizer::randomize( filters, filtersAllocated, -0.1f, 0.1f );
    WeightRandomizer::randomize( biasFilters, biasFiltersAllocated, -0.1f, 0.1f );

    OpenCLHelper *cl = OpenCLHelper::createForFirstGpuOtherwiseCpu();
    Propagate *p1 = Propagate::instanceSpecific( instance, cl, dim, fn );
    // run N examples through in batches of batchSize; the final batch takes whatever remainder is left
    int numBatches = ( N + batchSize - 1 ) / batchSize;
    for( int it = 0; it < numBatches; it++ ) {
        int thisBatchSize = it < numBatches - 1 ? batchSize : N - batchSize * it;
        float *results1 = p1->propagate( thisBatchSize, inputs, filters, biasFilters );
        delete[] results1;
    }
    StatefulTimer::dump(true);

    delete p1;
    delete cl;
    delete[] inputs;
    delete[] filters;
    delete[] biasFilters;
}
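// Worked example of the batch splitting in the two testPerf overloads above (the numbers are
// illustrative only, not taken from any specific test): with N = 1000 and batchSize = 128,
// numBatches = ceil(1000 / 128) = 8, so iterations 0..6 each push 128 examples through and the
// final iteration pushes 1000 - 128*7 = 104. The randomized input, filter and bias buffers are
// reused unchanged across batches, since these helpers only measure timing, not correctness.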
BackwardGpuNaive::BackwardGpuNaive( EasyCL *cl, LayerDimensions dim ) : Backward( cl, dim ) { std::string options = dim.buildOptionsString(); options += ""; // " -D " + upstreamFn->getDefineName(); // [[[cog // import stringify // stringify.write_kernel2( "kernel", "cl/backward.cl", "calcGradInput", 'options' ) // # stringify.write_kernel2( "broadcastMultiply", "cl/backproperrorsv2.cl", "broadcast_multiply", 'options' ) // # stringify.write_kernel2( "applyActivationDeriv", "cl/applyActivationDeriv.cl", "applyActivationDeriv", 'options' ) // # stringify.write_kernel( "kernelSource", "ClConvolve.cl") // ]]] // generated using cog, from cl/backward.cl: const char * kernelSource = "// Copyright Hugh Perkins 2014 hughperkins at gmail\n" "//\n" "// This Source Code Form is subject to the terms of the Mozilla Public License,\n" "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" "// expected defines:\n" "// - none\n" "\n" "// globalid as: [n][upstreamPlane][upstreamrow][upstreamcol]\n" "// inputdata: [n][upstreamPlane][upstreamrow][upstreamcol] 128 * 32 * 19 * 19 * 4 = 6MB\n" "// gradOutput: [n][outPlane][outRow][outCol] 128 * 32 * 19 * 19 * 4 = 6MB\n" "// weights: [filterId][inputPlane][filterRow][filterCol] 32 * 32 * 5 * 5 * 4 = 409KB\n" "void kernel calcGradInput(\n" " const int batchSize,\n" " global const float *gradOutput, global float *weights, global float *gradInput ) {\n" " int globalId = get_global_id(0);\n" "\n" " const int upstreamImage2dId = globalId / gInputImageSizeSquared;\n" "\n" " const int intraImageOffset = globalId % gInputImageSizeSquared;\n" " const int upstreamRow = intraImageOffset / gInputImageSize;\n" " const int upstreamCol = intraImageOffset % gInputImageSize;\n" "\n" " const int upstreamPlane = upstreamImage2dId % gInputPlanes;\n" " const int n = upstreamImage2dId / gInputPlanes;\n" "\n" " if( n >= batchSize ) {\n" " return;\n" " }\n" "\n" " const int minFilterRow = max( 0, upstreamRow + gMargin - (gOutputImageSize - 1) );\n" " const int maxFilterRow = min( gFilterSize - 1, upstreamRow + gMargin );\n" " const int minFilterCol = max( 0, upstreamCol + gMargin - (gOutputImageSize -1) );\n" " const int maxFilterCol = min( gFilterSize - 1, upstreamCol + gMargin );\n" "\n" " float sumWeightTimesOutError = 0;\n" " // aggregate over [outPlane][outRow][outCol]\n" " for( int outPlane = 0; outPlane < gNumFilters; outPlane++ ) {\n" " for( int filterRow = minFilterRow; filterRow <= maxFilterRow; filterRow++ ) {\n" " int outRow = upstreamRow + gMargin - filterRow;\n" " for( int filterCol = minFilterCol; filterCol <= maxFilterCol; filterCol++ ) {\n" " int outCol = upstreamCol + gMargin - filterCol;\n" " int resultIndex = ( ( n * gNumFilters\n" " + outPlane ) * gOutputImageSize\n" " + outRow ) * gOutputImageSize\n" " + outCol;\n" " float thisError = gradOutput[resultIndex];\n" " int thisWeightIndex = ( ( outPlane * gInputPlanes\n" " + upstreamPlane ) * gFilterSize\n" " + filterRow ) * gFilterSize\n" " + filterCol;\n" " float thisWeight = weights[thisWeightIndex];\n" " float thisWeightTimesError = thisWeight * thisError;\n" " sumWeightTimesOutError += thisWeightTimesError;\n" " }\n" " }\n" " }\n" " gradInput[globalId] = sumWeightTimesOutError;\n" "}\n" "\n" ""; kernel = cl->buildKernelFromString( kernelSource, "calcGradInput", options, "cl/backward.cl" ); // [[[end]]] // kernel = cl->buildKernel( "backproperrorsv2.cl", "calcGradInput", options ); // kernel = cl->buildKernelFromString( kernelSource, 
"calcGradInput", options ); }
Forward1::Forward1( EasyCL *cl, LayerDimensions dim ) : Forward( cl, dim ) { addBias = new AddBias( cl ); std::string options = ""; options += dim.buildOptionsString(); // [[[cog // import stringify // stringify.write_kernel2( "kernel", "cl/forward1.cl", "convolve_imagecubes_float2", 'options' ) // ]]] // generated using cog, from cl/forward1.cl: const char * kernelSource = "// Copyright Hugh Perkins 2014, 2015 hughperkins at gmail\n" "//\n" "// This Source Code Form is subject to the terms of the Mozilla Public License,\n" "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" "// notes on non-odd filtersizes:\n" "// for odd, imagesize and filtersize 3, padZeros = 0:\n" "// output is a single square\n" "// m and n should vary between -1,0,1\n" "// for even, imagesize and filtersize 2, padzeros = 0\n" "// output is a single square, which we can position at topleft or bottomrigth\n" "// lets position it in bottomright\n" "// then m and n should vary as -1,0\n" "//\n" "// for even, imagesize and filtersize 2, padzeros = 1\n" "// output is 2 by 2\n" "// well... if it is even:\n" "// - if we are not padding zeros, then we simply move our filter around the image somehow\n" "// - if we are padding zeros, then we conceptually pad the bottom and right edge of the image with zeros by 1\n" "// filtersize remains the same\n" "// m will vary as -1,0,1\n" "// outputrow is fixed by globalid\n" "// inputrow should be unchanged...\n" "// padzeros = 0:\n" "// x x . . . .\n" "// x x . . x x\n" "// . . . . x x\n" "// when filtersize even:\n" "// new imagesize = oldimagesize - filtersize + 1\n" "// when filtersize odd:\n" "// x x x .\n" "// x x x .\n" "// x x x .\n" "// . . . .\n" "// new imagesize = oldimagesize - filtersize + 1\n" "// padzeros = 1:\n" "// x x\n" "// x x . . x x . . . . . . .\n" "// . . . x x . . x x . . .\n" "// . . . . . . . x x . . x x\n" "// outrow=0 outrow=1 outrow=2 x x\n" "// outcol=0 outcol=1 outcol=2 outrow=3\n" "// outcol=3\n" "// when filtersize is even, and padzeros, imagesize grows by 1 each time...\n" "// imagesize = oldimagesize + 1\n" "// when filtersize is odd\n" "// x x x\n" "// x x x . x x x . . .\n" "// x x x . x x x . x x x\n" "// . . . x x x . 
x x x\n" "// x x x\n" "\n" "// images are organized like [imageId][plane][row][col]\n" "// filters are organized like [filterid][inplane][filterrow][filtercol]\n" "// output are organized like [imageid][filterid][row][col]\n" "// global id is organized like output, ie: [imageid][outplane][outrow][outcol]\n" "// - no local memory used currently\n" "// - each thread:\n" "// - loads a whole upstream cube\n" "// - loads a whole filter cube\n" "// - writes one output...\n" "void kernel convolve_imagecubes_float2(\n" " const int numExamples,\n" " global const float *inputs, global const float *filters,\n" " global float *output ) {\n" " int globalId = get_global_id(0);\n" "\n" " int outputImage2Id = globalId / gOutputImageSizeSquared;\n" " int exampleId = outputImage2Id / gNumFilters;\n" " int filterId = outputImage2Id % gNumFilters;\n" "\n" " // intraimage coords\n" " int localid = globalId % gOutputImageSizeSquared;\n" " int outputRow = localid / gOutputImageSize;\n" " int outputCol = localid % gOutputImageSize;\n" "\n" " global float const*inputCube = inputs + exampleId * gNumInputPlanes * gInputImageSizeSquared;\n" " global float const*filterCube = filters + filterId * gNumInputPlanes * gFilterSizeSquared;\n" "\n" " float sum = 0;\n" " if( exampleId < numExamples ) {\n" " for( int inputPlaneIdx = 0; inputPlaneIdx < gNumInputPlanes; inputPlaneIdx++ ) {\n" " global float const*inputPlane = inputCube + inputPlaneIdx * gInputImageSizeSquared;\n" " global float const*filterPlane = filterCube + inputPlaneIdx * gFilterSizeSquared;\n" " for( int u = -gHalfFilterSize; u <= gHalfFilterSize - gEven; u++ ) {\n" " // trying to reduce register pressure...\n" " #if gPadZeros == 1\n" " #define inputRowIdx ( outputRow + u )\n" " #else\n" " #define inputRowIdx ( outputRow + u + gHalfFilterSize )\n" " #endif\n" " global float const *inputRow = inputPlane + inputRowIdx * gInputImageSize;\n" " global float const *filterRow = filterPlane + (u+gHalfFilterSize) * gFilterSize + gHalfFilterSize;\n" " bool rowOk = inputRowIdx >= 0 && inputRowIdx < gInputImageSize;\n" " #pragma unroll\n" " for( int v = -gHalfFilterSize; v <= gHalfFilterSize - gEven; v++ ) {\n" " #if gPadZeros == 1\n" " #define inputColIdx ( outputCol + v )\n" " #else\n" " #define inputColIdx ( outputCol + v + gHalfFilterSize )\n" " #endif\n" " bool process = rowOk && inputColIdx >= 0 && inputColIdx < gInputImageSize;\n" " if( process ) {\n" " sum += inputRow[inputColIdx] * filterRow[v];\n" " }\n" " }\n" " }\n" " }\n" " }\n" "\n" " if( exampleId < numExamples ) {\n" " output[globalId] = sum;\n" " }\n" "}\n" "\n" ""; kernel = cl->buildKernelFromString( kernelSource, "convolve_imagecubes_float2", options, "cl/forward1.cl" ); // [[[end]]] }
BackpropWeightsScratchLarge::BackpropWeightsScratchLarge(EasyCL *cl, LayerDimensions dim) : BackpropWeights(cl, dim) { if(square(dim.filterSize) > cl->getMaxWorkgroupSize()) { throw runtime_error("cannot use BackpropWeightsScratchLarge, since filterSize * filterSize > maxworkgroupsize"); } // [[[cog // import stringify // # stringify.write_kernel("kernelSource", "ClConvolve.cl") // ]]] // [[[end]]] // cout << "dim: " << dim << endl; std::string options = dim.buildOptionsString(); int localMemoryRequirementsFullImage = dim.inputSize * dim.inputSize * 4 + dim.outputSize * dim.outputSize * 4; int availableLocal = cl->getLocalMemorySize(); // cout << "localmemoryrequirementsfullimage: " << localMemoryRequirementsFullImage << endl; // cout << "availablelocal: " << availableLocal << endl; // make the local memory used about one quarter of what is available? half of what is available? // let's try one quarter :-) int localWeCanUse = availableLocal / 4; numStripes = (localMemoryRequirementsFullImage + localWeCanUse - 1) / localWeCanUse; // cout << "numStripes: " << numStripes << endl; // make it a power of 2 numStripes = EasyCL::getNextPower2(numStripes); // cout << "numStripes: " << numStripes << endl; int inputStripeMarginRows = dim.filterSize - 1; int inputStripeInnerNumRows = dim.inputSize / numStripes; int inputStripeOuterNumRows = inputStripeInnerNumRows + 2 * inputStripeMarginRows; int inputStripeInnerSize = inputStripeInnerNumRows * dim.inputSize; inputStripeOuterSize = inputStripeOuterNumRows * dim.inputSize; int inputStripeMarginSize = inputStripeMarginRows * dim.inputSize; int outputStripeNumRows = (dim.outputSize + numStripes - 1) / numStripes; outputStripeSize = outputStripeNumRows * dim.outputSize; // [[[cog // import cog_optionswriter // cog_optionswriter.write_options(['numStripes','inputStripeMarginRows','inputStripeInnerNumRows', // 'inputStripeOuterNumRows', 'inputStripeInnerSize', 'inputStripeOuterSize', 'inputStripeMarginSize', // 'outputStripeNumRows', 'outputStripeSize' ]) // ]]] // generated, using cog: options += " -DgNumStripes=" + toString(numStripes); options += " -DgInputStripeMarginRows=" + toString(inputStripeMarginRows); options += " -DgInputStripeInnerNumRows=" + toString(inputStripeInnerNumRows); options += " -DgInputStripeOuterNumRows=" + toString(inputStripeOuterNumRows); options += " -DgInputStripeInnerSize=" + toString(inputStripeInnerSize); options += " -DgInputStripeOuterSize=" + toString(inputStripeOuterSize); options += " -DgInputStripeMarginSize=" + toString(inputStripeMarginSize); options += " -DgOutputStripeNumRows=" + toString(outputStripeNumRows); options += " -DgOutputStripeSize=" + toString(outputStripeSize); // [[[end]]] cout << "options: " << options << endl; // [[[cog // import stringify // stringify.write_kernel2("kernel", "cl/BackpropWeightsScratchLarge.cl", "backprop_floats_withscratch_dobias_striped", 'options') // ]]] // generated using cog, from cl/BackpropWeightsScratchLarge.cl: const char * kernelSource = "// Copyright Hugh Perkins 2014,2015 hughperkins at gmail\n" "//\n" "// This Source Code Form is subject to the terms of the Mozilla Public License,\n" "// v. 2.0. 
If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" "// expected defines:\n" "// BIASED (or not)\n" "\n" "// workgroupId: [outputPlane][inputPlane]\n" "// localId: [filterRow][filterCol]\n" "// per-thread iteration: [n][outputRow][outputCol]\n" "// local: errorimage: outputSize * outputSize\n" "// imageimage: inputSize * inputSize\n" "// specific characteristic: load one stripe of each image at a time,\n" "// so we dont run out of memory\n" "// number of stripes set in: gNumStripes\n" "// note that whilst we can stripe the gradOutput simply,\n" "// we actually need to add a half-filter widthed additional few rows\n" "// onto the images stripe, otherwise we will be missing data\n" "// we will call the size of the non-overlapping image stripes: gInputStripeInnerSize\n" "// the outersize, including the two margins is: gInputStripeOuterSize\n" "// of course, the first and last stripes will be missing a bit off the top/bottom, where the\n" "// corresponding outer margin would be\n" "void kernel backprop_floats_withscratch_dobias_striped(\n" " const float learningRateMultiplier, const int batchSize,\n" " global const float *gradOutput, global const float *images,\n" " global float *gradWeights,\n" " #ifdef BIASED\n" " global float *gradBiasWeights,\n" " #endif\n" " local float *_errorStripe, local float *_imageStripe\n" " ) {\n" " // gHalfFilterSize\n" " // gInputSize\n" " //\n" " // gInputStripeMarginRows => basically equal to gHalfFilterSize\n" " // gInputStripeInnerNumRows = gInputSize / gNumStripes\n" " // gInputStripeOuterNumRows = gInputStripeInnerNumRows + 2 * gHalfFilterSize (note: one row less than\n" " // if we just added gFilterSize)\n" " // gInputStripeInnerSize = gInputStripeInnerNumRows * gInputSize\n" " // gInputStripeOuterSize = gInputStripeOuterNumRows * gInputSize\n" " // gInputStripeMarginSize = gInputStripeMarginRows * gInputSize\n" " //\n" " // gOutputStripeNumRows\n" " // gOutputStripeSize\n" "\n" " const int globalId = get_global_id(0);\n" " const int localId = get_local_id(0);\n" " const int workgroupId = get_group_id(0);\n" " const int workgroupSize = get_local_size(0);\n" "\n" " const int filterRow = localId / gFilterSize;\n" " const int filterCol = localId % gFilterSize;\n" "\n" " const int outPlane = workgroupId / gInputPlanes;\n" " const int upstreamPlane = workgroupId % gInputPlanes;\n" "\n" " // gradWeights: [outPlane][upstreamPlane][filterRow][filterCol]\n" " // aggregate over: [outRow][outCol][n]\n" " float thiswchange = 0;\n" "#ifdef BIASED\n" " float thisbiaschange = 0;\n" "#endif\n" " const int numLoopsForImageStripe = (gInputStripeOuterSize + workgroupSize - 1) / workgroupSize;\n" " const int numLoopsForErrorStripe = (gOutputSizeSquared + workgroupSize - 1) / workgroupSize;\n" " for (int n = 0; n < batchSize; n++) {\n" " const int imageImageGlobalOffset = (n * gInputPlanes + upstreamPlane) * gInputSizeSquared;\n" " const int imageImageGlobalOffsetAfter = imageImageGlobalOffset + gInputSizeSquared;\n" " const int errorImageGlobalOffset = (n * gNumFilters + outPlane) * gOutputSizeSquared;\n" " const int errorImageGlobalOffsetAfter = errorImageGlobalOffset + gOutputSizeSquared;\n" " for (int stripe = 0; stripe < gNumStripes; stripe++) {\n" " const int imageStripeInnerOffset = imageImageGlobalOffset + stripe * gInputStripeInnerSize;\n" " const int imageStripeOuterOffset = imageStripeInnerOffset - gInputStripeMarginSize;\n" " // need to fetch the image, but it's bigger than us, so will need to 
loop...\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" " for (int i = 0; i < numLoopsForImageStripe; i++) {\n" " int thisOffset = i * workgroupSize + localId;\n" " int thisGlobalImagesOffset = imageStripeOuterOffset + thisOffset;\n" " bool process = thisOffset < gInputStripeOuterSize\n" " && thisGlobalImagesOffset >= imageImageGlobalOffset\n" " && thisGlobalImagesOffset < imageImageGlobalOffsetAfter;\n" " if (process) {\n" " _imageStripe[thisOffset] = images[ thisGlobalImagesOffset ];\n" " }\n" " }\n" " int errorStripeOffset = errorImageGlobalOffset + stripe * gOutputStripeSize;\n" " for (int i = 0; i < numLoopsForErrorStripe; i++) {\n" " int thisOffset = i * workgroupSize + localId;\n" " int globalErrorsOffset = errorStripeOffset + thisOffset;\n" " bool process = thisOffset < gOutputStripeSize\n" " && globalErrorsOffset < errorImageGlobalOffsetAfter;\n" " if (process) {\n" " _errorStripe[thisOffset ] = gradOutput[globalErrorsOffset];\n" " }\n" " }\n" " const int stripeOutRowStart = stripe * gOutputStripeNumRows;\n" " const int stripeOutRowEndExcl = stripeOutRowStart + gOutputStripeNumRows;\n" " barrier(CLK_LOCAL_MEM_FENCE);\n" "// if (localId == 13) {\n" "// for (int i = 0; i < 12; i++) {\n" "// gradWeights[100 + stripe * 12 + i ] = _errorStripe[i * gOutputSize];\n" "// }\n" "// for (int i = 0; i < 20; i++) {\n" "// gradWeights[200 + stripe * 20 + i ] = _imageStripe[i * gInputSize];\n" "// }\n" "// }\n" " if (localId < gFilterSizeSquared) {\n" " for (int outRow = stripeOutRowStart; outRow < stripeOutRowEndExcl; outRow++) {\n" " int upstreamRow = outRow - gMargin + filterRow;\n" " for (int outCol = 0; outCol < gOutputSize; outCol++) {\n" " int upstreamCol = outCol - gMargin + filterCol;\n" " bool proceed =\n" " upstreamRow >= 0 && upstreamCol >= 0\n" " && upstreamRow < gInputSize && upstreamCol < gInputSize\n" " && outRow < gOutputSize;\n" " if (proceed) {\n" " int resultIndex = outRow * gOutputSize + outCol;\n" " float error = _errorStripe[resultIndex - stripe * gOutputStripeSize];\n" " int upstreamDataIndex = upstreamRow * gInputSize + upstreamCol;\n" " float upstreamResult = _imageStripe[upstreamDataIndex + gInputStripeMarginSize\n" " - stripe * gInputStripeInnerSize ];\n" " thiswchange += upstreamResult * error;\n" " #ifdef BIASED\n" " thisbiaschange += error;\n" " #endif\n" " }\n" " }\n" " }\n" " }\n" " }\n" " }\n" " if (localId < gFilterSizeSquared) {\n" " gradWeights[ workgroupId * gFilterSizeSquared + localId ] = learningRateMultiplier * thiswchange;\n" "// weightChanges[ workgroupId * gFilterSizeSquared + localId ] = workgroupId;\n" " }\n" "#ifdef BIASED\n" " bool writeBias = upstreamPlane == 0 && filterRow == gMargin && filterCol == gMargin;\n" " if (writeBias) {\n" " gradBiasWeights[outPlane] = learningRateMultiplier * thisbiaschange;\n" " }\n" "#endif\n" " // gradWeights: [outPlane][upstreamPlane][filterRow][filterCol]\n" " // aggregate over: [outRow][outCol][n]\n" "}\n" "\n" ""; kernel = cl->buildKernelFromString(kernelSource, "backprop_floats_withscratch_dobias_striped", options, "cl/BackpropWeightsScratchLarge.cl"); // [[[end]]] }
TEST( testforward, compare_break1_0_4 ) {
    LayerDimensions dim;
    dim.setInputPlanes( 1 ).setInputSize( 33 ).setNumFilters( 1 ).setFilterSize( 1 )
        .setPadZeros( false ).setBiased( false );
    compareSpecific( false, 1, 1, dim, 0, 4 );
}