Example #1
0
TEST(SLOW_testbackward, perf_kgsgo_32c5) {
    int batchSize = 128;
    LayerDimensions dim;
    dim.setInputPlanes(32).setInputSize(19).setNumFilters(32).setFilterSize(5)
        .setPadZeros(true).setBiased(true);  
    cout << dim.buildOptionsString() << endl;  
//    ActivationFunction *fn = new ReluActivation();

    measurePerf(2, batchSize, dim);
}
Example #2
0
TEST(SLOW_testbackward, compare_kgsgo_32c5mini) {
    int batchSize = 4;
    LayerDimensions dim;
    dim.setInputPlanes(2).setInputSize(3).setNumFilters(2).setFilterSize(3)
        .setPadZeros(true).setBiased(true);  
    cout << dim.buildOptionsString() << endl;  
//    ActivationFunction *fn = new ReluActivation();

    compareSpecific(1, 2, 1, batchSize, dim);

}
BackpropWeightsScratchBias::BackpropWeightsScratchBias( OpenCLHelper *cl, LayerDimensions dim, ActivationFunction const *fn ) :
        BackpropWeights( cl, dim, fn )
            {
    // [[[cog
    // import stringify
    // # stringify.write_kernel( "kernelSource", "ClConvolve.cl")
    // ]]]
    // [[[end]]]
    std::string options = dim.buildOptionsString();
    options += " -D " + fn->getDefineName();
    kernel = cl->buildKernel( "backpropweights.cl", "backprop_floats_withscratch_dobias", options );
//    kernel = cl->buildKernelFromString( kernelSource, "calcErrorsForUpstream", options );
}
Example #4
0
TEST(testbackward, compare_1_n_kgsgo_32c5) {
    int batchSize = 8;
    LayerDimensions dim;
    dim.setInputPlanes(32).setInputSize(19).setNumFilters(32).setFilterSize(5)
        .setPadZeros(true).setBiased(true);  
    cout << dim.buildOptionsString() << endl;  
//    ActivationFunction *fn = new ReluActivation();

    compareSpecific(0, 1, 1, batchSize, dim);
    for(int i=2; i < Backward::getNumImplementations(); i++) {
        compareSpecific(1, i, 1, batchSize, dim);
    }
}
Example #5
0
void testPerf( int instance, int N, int batchSize, LayerDimensions dim ) {
    cout << dim.buildOptionsString() << endl;  

    int inputsSize = batchSize * dim.inputCubeSize;
    int filtersSize = dim.filtersSize;
    int biasSize = dim.numFilters;
    int inputsAllocated = std::max( inputsSize, 10000 );
    int filtersAllocated = std::max( filtersSize, 10000 );
    int biasFiltersAllocated = std::max( biasSize, 10000 );
    float *inputs = new float[ inputsAllocated ];
    float *filters = new float[ filtersAllocated ];
    float *biasFilters = new float[ biasFiltersAllocated ];

    memset( inputs, 0, sizeof(float) * inputsAllocated );
    memset( filters, 0, sizeof(float) * filtersAllocated );
    memset( biasFilters, 0, sizeof(float) * biasFiltersAllocated );

    WeightRandomizer::randomize( inputs, inputsAllocated, -0.1f, 0.1f );
    WeightRandomizer::randomize( filters, filtersAllocated, -0.1f, 0.1f );
    WeightRandomizer::randomize( biasFilters, biasFiltersAllocated, -0.1f, 0.1f );

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    Forward *p1 = Forward::instanceSpecific( instance, cl, dim );
    for( int it = 0; it < (N + batchSize - 1 ) / batchSize; it++ ) {
        int thisBatchSize = it < N - 1 ? batchSize : N - batchSize * it;
        float *output1 = new float[p1->getOutputTotalSize(thisBatchSize)];
        p1->forward( thisBatchSize, inputs, filters, biasFilters, output1 );
        delete[] output1;
    }
    StatefulTimer::dump(true);

    delete p1;
    delete cl;
    delete[] inputs;
    delete[] filters;
    delete[] biasFilters;
}
Example #6
0
void testPerf( int instance, int N, int batchSize, LayerDimensions dim, ActivationFunction *fn ) {
    cout << dim.buildOptionsString() << endl;  

    int inputsSize = batchSize * dim.inputCubeSize;
    int filtersSize = dim.filtersSize;
    int biasSize = dim.numFilters;
    int inputsAllocated = std::max( inputsSize, 10000 );
    int filtersAllocated = std::max( filtersSize, 10000 );
    int biasFiltersAllocated = std::max( biasSize, 10000 );
    float *inputs = new float[ inputsAllocated ];
    float *filters = new float[ filtersAllocated ];
    float *biasFilters = new float[ biasFiltersAllocated ];

    memset( inputs, 0, sizeof(float) * inputsAllocated );
    memset( filters, 0, sizeof(float) * filtersAllocated );
    memset( biasFilters, 0, sizeof(float) * biasFiltersAllocated );

    WeightRandomizer::randomize( inputs, inputsAllocated, -0.1f, 0.1f );
    WeightRandomizer::randomize( filters, filtersAllocated, -0.1f, 0.1f );
    WeightRandomizer::randomize( biasFilters, biasFiltersAllocated, -0.1f, 0.1f );

    OpenCLHelper *cl = OpenCLHelper::createForFirstGpuOtherwiseCpu();
    Propagate *p1 = Propagate::instanceSpecific( instance, cl, dim, fn );
    for( int it = 0; it < (N + batchSize - 1 ) / batchSize; it++ ) {
        int thisBatchSize = it < N - 1 ? batchSize : N - batchSize * it;
        float *results1 = p1->propagate( thisBatchSize, inputs, filters, biasFilters );
        delete[] results1;
    }
    StatefulTimer::dump(true);

    delete p1;
    delete cl;
    delete[] inputs;
    delete[] filters;
    delete[] biasFilters;
}
Example #7
0
BackwardGpuNaive::BackwardGpuNaive( EasyCL *cl, LayerDimensions dim ) :
        Backward( cl, dim )
            {
    std::string options = dim.buildOptionsString();
    options += ""; // " -D " + upstreamFn->getDefineName();
    // [[[cog
    // import stringify
    // stringify.write_kernel2( "kernel", "cl/backward.cl", "calcGradInput", 'options' )
    // # stringify.write_kernel2( "broadcastMultiply", "cl/backproperrorsv2.cl", "broadcast_multiply", 'options' )
    // # stringify.write_kernel2( "applyActivationDeriv", "cl/applyActivationDeriv.cl", "applyActivationDeriv", 'options' )
    // # stringify.write_kernel( "kernelSource", "ClConvolve.cl")
    // ]]]
    // generated using cog, from cl/backward.cl:
    const char * kernelSource =  
    "// Copyright Hugh Perkins 2014 hughperkins at gmail\n" 
    "//\n" 
    "// This Source Code Form is subject to the terms of the Mozilla Public License,\n" 
    "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" 
    "// obtain one at http://mozilla.org/MPL/2.0/.\n" 
    "\n" 
    "// expected defines:\n" 
    "//  - none\n" 
    "\n" 
    "// globalid as: [n][upstreamPlane][upstreamrow][upstreamcol]\n" 
    "// inputdata: [n][upstreamPlane][upstreamrow][upstreamcol] 128 * 32 * 19 * 19 * 4 = 6MB\n" 
    "// gradOutput: [n][outPlane][outRow][outCol] 128 * 32 * 19 * 19 * 4 = 6MB\n" 
    "// weights: [filterId][inputPlane][filterRow][filterCol] 32 * 32 * 5 * 5 * 4 = 409KB\n" 
    "void kernel calcGradInput(\n" 
    "        const int batchSize,\n" 
    "        global const float *gradOutput, global float *weights, global float *gradInput ) {\n" 
    "    int globalId = get_global_id(0);\n" 
    "\n" 
    "    const int upstreamImage2dId = globalId / gInputImageSizeSquared;\n" 
    "\n" 
    "    const int intraImageOffset = globalId % gInputImageSizeSquared;\n" 
    "    const int upstreamRow = intraImageOffset / gInputImageSize;\n" 
    "    const int upstreamCol = intraImageOffset % gInputImageSize;\n" 
    "\n" 
    "    const int upstreamPlane = upstreamImage2dId % gInputPlanes;\n" 
    "    const int n = upstreamImage2dId / gInputPlanes;\n" 
    "\n" 
    "    if( n >= batchSize ) {\n" 
    "        return;\n" 
    "    }\n" 
    "\n" 
    "    const int minFilterRow = max( 0, upstreamRow + gMargin - (gOutputImageSize - 1) );\n" 
    "    const int maxFilterRow = min( gFilterSize - 1, upstreamRow + gMargin );\n" 
    "    const int minFilterCol = max( 0, upstreamCol + gMargin - (gOutputImageSize -1) );\n" 
    "    const int maxFilterCol = min( gFilterSize - 1, upstreamCol + gMargin );\n" 
    "\n" 
    "    float sumWeightTimesOutError = 0;\n" 
    "    // aggregate over [outPlane][outRow][outCol]\n" 
    "    for( int outPlane = 0; outPlane < gNumFilters; outPlane++ ) {\n" 
    "        for( int filterRow = minFilterRow; filterRow <= maxFilterRow; filterRow++ ) {\n" 
    "            int outRow = upstreamRow + gMargin - filterRow;\n" 
    "            for( int filterCol = minFilterCol; filterCol <= maxFilterCol; filterCol++ ) {\n" 
    "                int outCol = upstreamCol + gMargin - filterCol;\n" 
    "                int resultIndex = ( ( n * gNumFilters\n" 
    "                          + outPlane ) * gOutputImageSize\n" 
    "                          + outRow ) * gOutputImageSize\n" 
    "                          + outCol;\n" 
    "                float thisError = gradOutput[resultIndex];\n" 
    "                int thisWeightIndex = ( ( outPlane * gInputPlanes\n" 
    "                                    + upstreamPlane ) * gFilterSize\n" 
    "                                    + filterRow ) * gFilterSize\n" 
    "                                    + filterCol;\n" 
    "                float thisWeight = weights[thisWeightIndex];\n" 
    "                float thisWeightTimesError = thisWeight * thisError;\n" 
    "                sumWeightTimesOutError += thisWeightTimesError;\n" 
    "            }\n" 
    "        }\n" 
    "    }\n" 
    "    gradInput[globalId] = sumWeightTimesOutError;\n" 
    "}\n" 
    "\n" 
    "";
    kernel = cl->buildKernelFromString( kernelSource, "calcGradInput", options, "cl/backward.cl" );
    // [[[end]]]
//    kernel = cl->buildKernel( "backproperrorsv2.cl", "calcGradInput", options );
//    kernel = cl->buildKernelFromString( kernelSource, "calcGradInput", options );
}
Example #8
0
Forward1::Forward1( EasyCL *cl, LayerDimensions dim ) :
            Forward( cl, dim )
        {
    addBias = new AddBias( cl );

    std::string options = "";
    options += dim.buildOptionsString();

    // [[[cog
    // import stringify
    // stringify.write_kernel2( "kernel", "cl/forward1.cl", "convolve_imagecubes_float2", 'options' )
    // ]]]
    // generated using cog, from cl/forward1.cl:
    const char * kernelSource =  
    "// Copyright Hugh Perkins 2014, 2015 hughperkins at gmail\n" 
    "//\n" 
    "// This Source Code Form is subject to the terms of the Mozilla Public License,\n" 
    "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" 
    "// obtain one at http://mozilla.org/MPL/2.0/.\n" 
    "\n" 
    "// notes on non-odd filtersizes:\n" 
    "// for odd, imagesize and filtersize 3, padZeros = 0:\n" 
    "// output is a single square\n" 
    "// m and n should vary between -1,0,1\n" 
    "// for even, imagesize and filtersize 2, padzeros = 0\n" 
    "// output is a single square, which we can position at topleft or bottomrigth\n" 
    "// lets position it in bottomright\n" 
    "// then m and n should vary as -1,0\n" 
    "//\n" 
    "// for even, imagesize and filtersize 2, padzeros = 1\n" 
    "// output is 2 by 2\n" 
    "// well... if it is even:\n" 
    "// - if we are not padding zeros, then we simply move our filter around the image somehow\n" 
    "// - if we are padding zeros, then we conceptually pad the bottom and right edge of the image with zeros by 1\n" 
    "// filtersize remains the same\n" 
    "//      m will vary as -1,0,1\n" 
    "//       outputrow is fixed by globalid\n" 
    "//       inputrow should be unchanged...\n" 
    "// padzeros = 0:\n" 
    "//  x x .  . . .\n" 
    "//  x x .  . x x\n" 
    "//  . . .  . x x\n" 
    "// when filtersize even:\n" 
    "//    new imagesize = oldimagesize - filtersize + 1\n" 
    "// when filtersize odd:\n" 
    "//    x x x .\n" 
    "//    x x x .\n" 
    "//    x x x .\n" 
    "//    . . . .\n" 
    "//    new imagesize = oldimagesize - filtersize + 1\n" 
    "// padzeros = 1:\n" 
    "// x x\n" 
    "// x x . .   x x .    . . .     . . .\n" 
    "//   . . .   x x .    . x x     . . .\n" 
    "//   . . .   . . .    . x x     . . x x\n" 
    "// outrow=0 outrow=1  outrow=2      x x\n" 
    "// outcol=0 outcol=1  outcol=2    outrow=3\n" 
    "//                                outcol=3\n" 
    "// when filtersize is even, and padzeros, imagesize grows by 1 each time...\n" 
    "//    imagesize = oldimagesize + 1\n" 
    "// when filtersize is odd\n" 
    "//  x x x\n" 
    "//  x x x .   x x x    . . .\n" 
    "//  x x x .   x x x    . x x x\n" 
    "//    . . .   x x x    . x x x\n" 
    "//                       x x x\n" 
    "\n" 
    "// images are organized like [imageId][plane][row][col]\n" 
    "// filters are organized like [filterid][inplane][filterrow][filtercol]\n" 
    "// output are organized like [imageid][filterid][row][col]\n" 
    "// global id is organized like output, ie: [imageid][outplane][outrow][outcol]\n" 
    "// - no local memory used currently\n" 
    "// - each thread:\n" 
    "//     - loads a whole upstream cube\n" 
    "//     - loads a whole filter cube\n" 
    "//     - writes one output...\n" 
    "void kernel convolve_imagecubes_float2(\n" 
    "    const int numExamples,\n" 
    "      global const float *inputs, global const float *filters,\n" 
    "    global float *output ) {\n" 
    "    int globalId = get_global_id(0);\n" 
    "\n" 
    "    int outputImage2Id = globalId / gOutputImageSizeSquared;\n" 
    "    int exampleId = outputImage2Id / gNumFilters;\n" 
    "    int filterId = outputImage2Id % gNumFilters;\n" 
    "\n" 
    "    // intraimage coords\n" 
    "    int localid = globalId % gOutputImageSizeSquared;\n" 
    "    int outputRow = localid / gOutputImageSize;\n" 
    "    int outputCol = localid % gOutputImageSize;\n" 
    "\n" 
    "    global float const*inputCube = inputs + exampleId * gNumInputPlanes * gInputImageSizeSquared;\n" 
    "    global float const*filterCube = filters + filterId * gNumInputPlanes * gFilterSizeSquared;\n" 
    "\n" 
    "    float sum = 0;\n" 
    "    if( exampleId < numExamples ) {\n" 
    "        for( int inputPlaneIdx = 0; inputPlaneIdx < gNumInputPlanes; inputPlaneIdx++ ) {\n" 
    "            global float const*inputPlane = inputCube + inputPlaneIdx * gInputImageSizeSquared;\n" 
    "            global float const*filterPlane = filterCube + inputPlaneIdx * gFilterSizeSquared;\n" 
    "            for( int u = -gHalfFilterSize; u <= gHalfFilterSize - gEven; u++ ) {\n" 
    "                // trying to reduce register pressure...\n" 
    "                #if gPadZeros == 1\n" 
    "                    #define inputRowIdx ( outputRow + u )\n" 
    "                #else\n" 
    "                    #define inputRowIdx ( outputRow + u + gHalfFilterSize )\n" 
    "                #endif\n" 
    "                global float const *inputRow = inputPlane + inputRowIdx * gInputImageSize;\n" 
    "                global float const *filterRow = filterPlane + (u+gHalfFilterSize) * gFilterSize + gHalfFilterSize;\n" 
    "                bool rowOk = inputRowIdx >= 0 && inputRowIdx < gInputImageSize;\n" 
    "                #pragma unroll\n" 
    "                for( int v = -gHalfFilterSize; v <= gHalfFilterSize - gEven; v++ ) {\n" 
    "                    #if gPadZeros == 1\n" 
    "                        #define inputColIdx ( outputCol + v )\n" 
    "                    #else\n" 
    "                        #define inputColIdx ( outputCol + v + gHalfFilterSize )\n" 
    "                    #endif\n" 
    "                    bool process = rowOk && inputColIdx >= 0 && inputColIdx < gInputImageSize;\n" 
    "                    if( process ) {\n" 
    "                            sum += inputRow[inputColIdx] * filterRow[v];\n" 
    "                    }\n" 
    "                }\n" 
    "            }\n" 
    "        }\n" 
    "    }\n" 
    "\n" 
    "    if( exampleId < numExamples ) {\n" 
    "        output[globalId] = sum;\n" 
    "    }\n" 
    "}\n" 
    "\n" 
    "";
    kernel = cl->buildKernelFromString( kernelSource, "convolve_imagecubes_float2", options, "cl/forward1.cl" );
    // [[[end]]]
}
BackpropWeightsScratchLarge::BackpropWeightsScratchLarge(EasyCL *cl, LayerDimensions dim) :
        BackpropWeights(cl, dim)
            {
    if(square(dim.filterSize) > cl->getMaxWorkgroupSize()) {
        throw runtime_error("cannot use BackpropWeightsScratchLarge, since filterSize * filterSize > maxworkgroupsize");
    }

    // [[[cog
    // import stringify
    // # stringify.write_kernel("kernelSource", "ClConvolve.cl")
    // ]]]
    // [[[end]]]
//    cout << "dim: " << dim << endl;
    std::string options = dim.buildOptionsString();

    int localMemoryRequirementsFullImage = dim.inputSize * dim.inputSize * 4 + dim.outputSize * dim.outputSize * 4;
    int availableLocal = cl->getLocalMemorySize();
//    cout << "localmemoryrequirementsfullimage: " << localMemoryRequirementsFullImage << endl;
//    cout << "availablelocal: " << availableLocal << endl;
    // make the local memory used about one quarter of what is available? half of what is available?
    // let's try one quarter :-)
    int localWeCanUse = availableLocal / 4;
    numStripes = (localMemoryRequirementsFullImage + localWeCanUse - 1) / localWeCanUse;
//    cout << "numStripes: " << numStripes << endl;
    // make it a power of 2
    numStripes = EasyCL::getNextPower2(numStripes);
//    cout << "numStripes: " << numStripes << endl;

    int inputStripeMarginRows = dim.filterSize - 1;
    int inputStripeInnerNumRows = dim.inputSize / numStripes;
    int inputStripeOuterNumRows = inputStripeInnerNumRows + 2 * inputStripeMarginRows;

    int inputStripeInnerSize = inputStripeInnerNumRows * dim.inputSize;
    inputStripeOuterSize = inputStripeOuterNumRows * dim.inputSize;
    int inputStripeMarginSize = inputStripeMarginRows * dim.inputSize;

    int outputStripeNumRows = (dim.outputSize + numStripes - 1) / numStripes;
    outputStripeSize = outputStripeNumRows * dim.outputSize;

    // [[[cog
    // import cog_optionswriter
    // cog_optionswriter.write_options(['numStripes','inputStripeMarginRows','inputStripeInnerNumRows',
    //     'inputStripeOuterNumRows', 'inputStripeInnerSize', 'inputStripeOuterSize', 'inputStripeMarginSize',
    //     'outputStripeNumRows', 'outputStripeSize' ])
    // ]]]
    // generated, using cog:
    options += " -DgNumStripes=" + toString(numStripes);
    options += " -DgInputStripeMarginRows=" + toString(inputStripeMarginRows);
    options += " -DgInputStripeInnerNumRows=" + toString(inputStripeInnerNumRows);
    options += " -DgInputStripeOuterNumRows=" + toString(inputStripeOuterNumRows);
    options += " -DgInputStripeInnerSize=" + toString(inputStripeInnerSize);
    options += " -DgInputStripeOuterSize=" + toString(inputStripeOuterSize);
    options += " -DgInputStripeMarginSize=" + toString(inputStripeMarginSize);
    options += " -DgOutputStripeNumRows=" + toString(outputStripeNumRows);
    options += " -DgOutputStripeSize=" + toString(outputStripeSize);
    // [[[end]]]
    cout << "options: " << options << endl;

    // [[[cog
    // import stringify
    // stringify.write_kernel2("kernel", "cl/BackpropWeightsScratchLarge.cl", "backprop_floats_withscratch_dobias_striped", 'options')
    // ]]]
    // generated using cog, from cl/BackpropWeightsScratchLarge.cl:
    const char * kernelSource =  
    "// Copyright Hugh Perkins 2014,2015 hughperkins at gmail\n"
    "//\n"
    "// This Source Code Form is subject to the terms of the Mozilla Public License,\n"
    "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n"
    "// obtain one at http://mozilla.org/MPL/2.0/.\n"
    "\n"
    "// expected defines:\n"
    "// BIASED (or not)\n"
    "\n"
    "// workgroupId: [outputPlane][inputPlane]\n"
    "// localId: [filterRow][filterCol]\n"
    "// per-thread iteration: [n][outputRow][outputCol]\n"
    "// local: errorimage: outputSize * outputSize\n"
    "//        imageimage: inputSize * inputSize\n"
    "// specific characteristic: load one stripe of each image at a time,\n"
    "// so we dont run out of memory\n"
    "// number of stripes set in: gNumStripes\n"
    "// note that whilst we can stripe the gradOutput simply,\n"
    "// we actually need to add a half-filter widthed additional few rows\n"
    "// onto the images stripe, otherwise we will be missing data\n"
    "//   we will call the size of the non-overlapping image stripes: gInputStripeInnerSize\n"
    "//      the outersize, including the two margins is: gInputStripeOuterSize\n"
    "//      of course, the first and last stripes will be missing a bit off the top/bottom, where the\n"
    "//      corresponding outer margin would be\n"
    "void kernel backprop_floats_withscratch_dobias_striped(\n"
    "        const float learningRateMultiplier, const int batchSize,\n"
    "         global const float *gradOutput, global const float *images,\n"
    "        global float *gradWeights,\n"
    "        #ifdef BIASED\n"
    "             global float *gradBiasWeights,\n"
    "        #endif\n"
    "        local float *_errorStripe, local float *_imageStripe\n"
    " ) {\n"
    "    // gHalfFilterSize\n"
    "    // gInputSize\n"
    "    //\n"
    "    // gInputStripeMarginRows => basically equal to gHalfFilterSize\n"
    "    // gInputStripeInnerNumRows = gInputSize / gNumStripes\n"
    "    // gInputStripeOuterNumRows = gInputStripeInnerNumRows + 2 * gHalfFilterSize  (note: one row less than\n"
    "    //                                                         if we just added gFilterSize)\n"
    "    // gInputStripeInnerSize = gInputStripeInnerNumRows * gInputSize\n"
    "    // gInputStripeOuterSize = gInputStripeOuterNumRows * gInputSize\n"
    "    // gInputStripeMarginSize = gInputStripeMarginRows * gInputSize\n"
    "    //\n"
    "    // gOutputStripeNumRows\n"
    "    // gOutputStripeSize\n"
    "\n"
    "    const int globalId = get_global_id(0);\n"
    "    const int localId = get_local_id(0);\n"
    "    const int workgroupId = get_group_id(0);\n"
    "    const int workgroupSize = get_local_size(0);\n"
    "\n"
    "    const int filterRow = localId / gFilterSize;\n"
    "    const int filterCol = localId % gFilterSize;\n"
    "\n"
    "    const int outPlane = workgroupId / gInputPlanes;\n"
    "    const int upstreamPlane = workgroupId % gInputPlanes;\n"
    "\n"
    "    // gradWeights:     [outPlane][upstreamPlane][filterRow][filterCol]\n"
    "    //       aggregate over:  [outRow][outCol][n]\n"
    "    float thiswchange = 0;\n"
    "#ifdef BIASED\n"
    "    float thisbiaschange = 0;\n"
    "#endif\n"
    "    const int numLoopsForImageStripe = (gInputStripeOuterSize + workgroupSize - 1) / workgroupSize;\n"
    "    const int numLoopsForErrorStripe = (gOutputSizeSquared + workgroupSize - 1) / workgroupSize;\n"
    "    for (int n = 0; n < batchSize; n++) {\n"
    "        const int imageImageGlobalOffset = (n * gInputPlanes + upstreamPlane) * gInputSizeSquared;\n"
    "        const int imageImageGlobalOffsetAfter = imageImageGlobalOffset + gInputSizeSquared;\n"
    "        const int errorImageGlobalOffset = (n * gNumFilters + outPlane) * gOutputSizeSquared;\n"
    "        const int errorImageGlobalOffsetAfter = errorImageGlobalOffset + gOutputSizeSquared;\n"
    "        for (int stripe = 0; stripe < gNumStripes; stripe++) {\n"
    "            const int imageStripeInnerOffset = imageImageGlobalOffset + stripe * gInputStripeInnerSize;\n"
    "            const int imageStripeOuterOffset = imageStripeInnerOffset - gInputStripeMarginSize;\n"
    "            // need to fetch the image, but it's bigger than us, so will need to loop...\n"
    "            barrier(CLK_LOCAL_MEM_FENCE);\n"
    "            for (int i = 0; i < numLoopsForImageStripe; i++) {\n"
    "                int thisOffset = i * workgroupSize + localId;\n"
    "                int thisGlobalImagesOffset = imageStripeOuterOffset + thisOffset;\n"
    "                bool process = thisOffset < gInputStripeOuterSize\n"
    "                    && thisGlobalImagesOffset >= imageImageGlobalOffset\n"
    "                    && thisGlobalImagesOffset < imageImageGlobalOffsetAfter;\n"
    "                if (process) {\n"
    "                    _imageStripe[thisOffset] = images[ thisGlobalImagesOffset ];\n"
    "                }\n"
    "            }\n"
    "            int errorStripeOffset = errorImageGlobalOffset + stripe * gOutputStripeSize;\n"
    "            for (int i = 0; i < numLoopsForErrorStripe; i++) {\n"
    "                int thisOffset = i * workgroupSize + localId;\n"
    "                int globalErrorsOffset = errorStripeOffset + thisOffset;\n"
    "                bool process = thisOffset < gOutputStripeSize\n"
    "                    && globalErrorsOffset < errorImageGlobalOffsetAfter;\n"
    "                if (process) {\n"
    "                    _errorStripe[thisOffset ] = gradOutput[globalErrorsOffset];\n"
    "                }\n"
    "            }\n"
    "            const int stripeOutRowStart = stripe * gOutputStripeNumRows;\n"
    "            const int stripeOutRowEndExcl = stripeOutRowStart + gOutputStripeNumRows;\n"
    "            barrier(CLK_LOCAL_MEM_FENCE);\n"
    "//            if (localId == 13) {\n"
    "//                for (int i = 0; i < 12; i++) {\n"
    "//                    gradWeights[100 + stripe * 12 + i ] = _errorStripe[i * gOutputSize];\n"
    "//                }\n"
    "//                for (int i = 0; i < 20; i++) {\n"
    "//                    gradWeights[200 + stripe * 20 + i ] = _imageStripe[i * gInputSize];\n"
    "//                }\n"
    "//            }\n"
    "            if (localId < gFilterSizeSquared) {\n"
    "                for (int outRow = stripeOutRowStart; outRow < stripeOutRowEndExcl; outRow++) {\n"
    "                    int upstreamRow = outRow - gMargin + filterRow;\n"
    "                    for (int outCol = 0; outCol < gOutputSize; outCol++) {\n"
    "                        int upstreamCol = outCol - gMargin + filterCol;\n"
    "                        bool proceed =\n"
    "                            upstreamRow >= 0 && upstreamCol >= 0\n"
    "                            && upstreamRow < gInputSize && upstreamCol < gInputSize\n"
    "                            && outRow < gOutputSize;\n"
    "                        if (proceed) {\n"
    "                            int resultIndex = outRow * gOutputSize + outCol;\n"
    "                            float error = _errorStripe[resultIndex - stripe * gOutputStripeSize];\n"
    "                            int upstreamDataIndex = upstreamRow * gInputSize + upstreamCol;\n"
    "                            float upstreamResult = _imageStripe[upstreamDataIndex +  gInputStripeMarginSize\n"
    "                                        - stripe * gInputStripeInnerSize ];\n"
    "                            thiswchange += upstreamResult * error;\n"
    "        #ifdef BIASED\n"
    "                            thisbiaschange += error;\n"
    "        #endif\n"
    "                        }\n"
    "                    }\n"
    "                }\n"
    "            }\n"
    "        }\n"
    "    }\n"
    "    if (localId < gFilterSizeSquared) {\n"
    "        gradWeights[ workgroupId * gFilterSizeSquared + localId ] = learningRateMultiplier * thiswchange;\n"
    "//        weightChanges[ workgroupId * gFilterSizeSquared + localId ] = workgroupId;\n"
    "    }\n"
    "#ifdef BIASED\n"
    "    bool writeBias = upstreamPlane == 0 && filterRow == gMargin && filterCol == gMargin;\n"
    "    if (writeBias) {\n"
    "        gradBiasWeights[outPlane] = learningRateMultiplier * thisbiaschange;\n"
    "    }\n"
    "#endif\n"
    "    // gradWeights:     [outPlane][upstreamPlane][filterRow][filterCol]\n"
    "    //       aggregate over:  [outRow][outCol][n]\n"
    "}\n"
    "\n"
    "";
    kernel = cl->buildKernelFromString(kernelSource, "backprop_floats_withscratch_dobias_striped", options, "cl/BackpropWeightsScratchLarge.cl");
    // [[[end]]]
}
Example #10
0
BackpropWeightsNaive::BackpropWeightsNaive(EasyCL *cl, LayerDimensions dim) :
        BackpropWeights(cl, dim)
            {
    std::string options = dim.buildOptionsString();

    // [[[cog
    // import stringify
    // stringify.write_kernel2("kernel", "cl/backpropweights.cl", "backprop_floats", 'options')
    // ]]]
    // generated using cog, from cl/backpropweights.cl:
    const char * kernelSource =  
    "// Copyright Hugh Perkins 2014,2015 hughperkins at gmail\n"
    "//\n"
    "// This Source Code Form is subject to the terms of the Mozilla Public License,\n"
    "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n"
    "// obtain one at http://mozilla.org/MPL/2.0/.\n"
    "\n"
    "// expected defines:\n"
    "// BIASED (or not)\n"
    "\n"
    "// globalId: [outPlane][inputPlane][filterRow][filterCol]\n"
    "// per-thread iteration: [n][outputRow][outputCol]\n"
    "void kernel backprop_floats(const float learningRateMultiplier,\n"
    "        const int batchSize,\n"
    "         global const float *gradOutput, global const float *images,\n"
    "        global float *gradWeights\n"
    "        #ifdef BIASED\n"
    "            , global float *gradBiasWeights\n"
    "        #endif\n"
    " ) {\n"
    "    int globalId = get_global_id(0);\n"
    "    if (globalId >= gNumFilters * gInputPlanes * gFilterSize * gFilterSize) {\n"
    "        return;\n"
    "    }\n"
    "\n"
    "    int IntraFilterOffset = globalId % gFilterSizeSquared;\n"
    "    int filterRow = IntraFilterOffset / gFilterSize;\n"
    "    int filterCol = IntraFilterOffset % gFilterSize;\n"
    "\n"
    "    int filter2Id = globalId / gFilterSizeSquared;\n"
    "    int outPlane = filter2Id / gInputPlanes;\n"
    "    int upstreamPlane = filter2Id % gInputPlanes;\n"
    "\n"
    "    float thiswchange = 0;\n"
    "    // gradWeights:     [outPlane][upstreamPlane][filterRow][filterCol]\n"
    "    //       aggregate over:  [outRow][outCol][n]\n"
    "#ifdef BIASED\n"
    "    float thisbiaschange = 0;\n"
    "#endif\n"
    "    for (int n = 0; n < batchSize; n++) {\n"
    "        for (int outRow = 0; outRow < gOutputSize; outRow++) {\n"
    "            int upstreamRow = outRow - gMargin + filterRow;\n"
    "            for (int outCol = 0; outCol < gOutputSize; outCol++) {\n"
    "                int upstreamCol = outCol - gMargin + filterCol;\n"
    "                bool proceed = upstreamRow >= 0 && upstreamCol >= 0 && upstreamRow < gInputSize\n"
    "                    && upstreamCol < gInputSize;\n"
    "                if (proceed) {\n"
    "                    int resultIndex = (( n * gNumFilters\n"
    "                              + outPlane) * gOutputSize\n"
    "                              + outRow) * gOutputSize\n"
    "                              + outCol;\n"
    "                    float error = gradOutput[resultIndex];\n"
    "                    int upstreamDataIndex = (( n * gInputPlanes\n"
    "                                     + upstreamPlane) * gInputSize\n"
    "                                     + upstreamRow) * gInputSize\n"
    "                                     + upstreamCol;\n"
    "                    float upstreamResult = images[upstreamDataIndex];\n"
    "                    float thisimagethiswchange = upstreamResult * error;\n"
    "                    thiswchange += thisimagethiswchange;\n"
    "    #ifdef BIASED\n"
    "                    thisbiaschange += error;\n"
    "    #endif\n"
    "                }\n"
    "            }\n"
    "        }\n"
    "    }\n"
    "    // gradWeights:     [outPlane][upstreamPlane][filterRow][filterCol]\n"
    "    //       aggregate over:  [outRow][outCol][n]\n"
    "    gradWeights[ globalId ] = learningRateMultiplier * thiswchange;\n"
    "#ifdef BIASED\n"
    "    bool writeBias = upstreamPlane == 0 && filterRow == gMargin && filterCol == gMargin;\n"
    "    if (writeBias) {\n"
    "        gradBiasWeights[outPlane] = learningRateMultiplier * thisbiaschange;\n"
    "    }\n"
    "#endif\n"
    "}\n"
    "\n"
    "\n"
    "\n"
    "";
    kernel = cl->buildKernelFromString(kernelSource, "backprop_floats", options, "cl/backpropweights.cl");
    // [[[end]]]
}
ForwardFc_workgroupPerFilterPlane::ForwardFc_workgroupPerFilterPlane( EasyCL *cl, LayerDimensions dim ) :
        Forward( cl, dim )
            {

    std::string options = ""; // "-D " + fn->getDefineName();
    options += dim.buildOptionsString();

    // [[[cog
    // import stringify
    // stringify.write_kernel2( "kernel1", "cl/forward_fc_wgperrow.cl", "forward_fc_workgroup_perrow", 'options' )
    // stringify.write_kernel2( "kernel2", "cl/forward_fc.cl", "reduce_rows", 'options' )
    // ]]]
    // generated using cog, from cl/forward_fc_wgperrow.cl:
    const char * kernel1Source =  
    "// Copyright Hugh Perkins 2014, 2015 hughperkins at gmail\n" 
    "//\n" 
    "// This Source Code Form is subject to the terms of the Mozilla Public License,\n" 
    "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" 
    "// obtain one at http://mozilla.org/MPL/2.0/.\n" 
    "\n" 
    "void copyLocal( local float *restrict target, global float const *restrict source, int N ) {\n" 
    "    int numLoops = ( N + get_local_size(0) - 1 ) / get_local_size(0);\n" 
    "    for( int loop = 0; loop < numLoops; loop++ ) {\n" 
    "        int offset = loop * get_local_size(0) + get_local_id(0);\n" 
    "        if( offset < N ) {\n" 
    "            target[offset] = source[offset];\n" 
    "        }\n" 
    "    }\n" 
    "}\n" 
    "\n" 
    "// concept:\n" 
    "//  we want to share each input example across multiple filters\n" 
    "//   but an entire filter plane is 19*19*4 = 1.4KB\n" 
    "//   so eg 500 filter planes is 500* 1.4KB = 700KB, much larger than local storage\n" 
    "//   of ~43KB\n" 
    "//  - we could take eg 16 filters at a time, store one filter plane from each in local storage,\n" 
    "//  and then bring down one example plane at a time, into local storage, during iteration over n\n" 
    "//  - here though, we are going to store one row from one plane from each filter,\n" 
    "//  and process against one row, from same plane, from each example\n" 
    "//  so each workgroup will have one thread per filterId, eg 351 threads\n" 
    "//    each thread will add up over its assigned row\n" 
    "//  then, later we need to reduce over the rows\n" 
    "//   ... and also over the input planes?\n" 
    "//\n" 
    "// workgroupid [inputplane][filterrow]\n" 
    "// localid: [filterId]\n" 
    "//  each thread iterates over: [n][filtercol]\n" 
    "//  each thread is assigned to: one row, of one filter\n" 
    "//  workgroup is assigned to: same row, from each input plane\n" 
    "// local memory: one row from each output, = 128 * 19 * 4 = 9.8KB\n" 
    "//             1 * input row = \"0.076KB\"\n" 
    "// output1 structured as: [n][inputplane][filter][row], need to reduce again after\n" 
    "// this kernel assumes:\n" 
    "//   padzeros == 0 (mandatory)\n" 
    "//   filtersize == inputimagesize (mandatory)\n" 
    "//   inputimagesize == 19\n" 
    "//   filtersize == 19\n" 
    "//   outputImageSize == 1\n" 
    "//   lots of outplanes/filters, hundreds, but less than max work groupsize, eg 350, 500, 361\n" 
    "//   lots of inplanes, eg 32-128\n" 
    "//   inputimagesize around 19, not too small\n" 
    "#if (gFilterSize == gInputImageSize) && (gPadZeros == 0)\n" 
    "void kernel forward_fc_workgroup_perrow( const int batchSize,\n" 
    "    global const float *images, global const float *filters,\n" 
    "    global float *output1,\n" 
    "    local float *_imageRow, local float *_filterRows ) {\n" 
    "    const int globalId = get_global_id(0);\n" 
    "\n" 
    "    const int workgroupId = get_group_id(0);\n" 
    "    const int workgroupSize = get_local_size(0);\n" 
    "    const int localId = get_local_id(0);\n" 
    "\n" 
    "    const int inputPlaneId = workgroupId / gFilterSize;\n" 
    "    const int filterRowId = workgroupId % gFilterSize;\n" 
    "\n" 
    "    const int filterId = localId;\n" 
    "\n" 
    "    // first copy down filter row, which is per-thread, so we have to copy it all ourselves...\n" 
    "    global const float *filterRow = filters\n" 
    "        + filterId * gNumInputPlanes * gFilterSizeSquared\n" 
    "        + inputPlaneId * gFilterSizeSquared\n" 
    "        + filterRowId * gFilterSize;\n" 
    "    local float *_threadFilterRow = _filterRows + localId * gFilterSize;\n" 
    "    for( int i = 0; i < gFilterSize; i++ ) {\n" 
    "        _threadFilterRow[i] = filterRow[i];\n" 
    "    }\n" 
    "    const int loopsPerExample = ( gInputImageSize + workgroupSize - 1 ) / workgroupSize;\n" 
    "    // now loop over examples...\n" 
    "    for( int n = 0; n < batchSize; n++ ) {\n" 
    "        // copy down example row, which is global to all threads in workgroup\n" 
    "        // hopefully should be enough threads....\n" 
    "        // but we should check anyway really, since depends on number of filters configured,\n" 
    "        // not on relative size of filter and input image\n" 
    "        barrier(CLK_LOCAL_MEM_FENCE);\n" 
    "        copyLocal( _imageRow,  images\n" 
    "            + ( ( n\n" 
    "                * gNumInputPlanes + inputPlaneId )\n" 
    "                * gInputImageSize + filterRowId )\n" 
    "                * gInputImageSize,\n" 
    "            gInputImageSize );\n" 
    "        barrier(CLK_LOCAL_MEM_FENCE);\n" 
    "        // add up the values in our row...\n" 
    "        float sum = 0;\n" 
    "        for( int filterCol = 0; filterCol < gFilterSize; filterCol++ ) {\n" 
    "            sum += _imageRow[ filterCol ] * _threadFilterRow[ filterCol ];\n" 
    "        }\n" 
    "        // note: dont activate yet, since need to reduce again\n" 
    "        // output structured as: [n][filter][inputplane][filterrow], need to reduce again after\n" 
    "        if( localId < gNumFilters ) {\n" 
    "            output1[ n * gNumInputPlanes * gNumFilters * gFilterSize\n" 
    "                + inputPlaneId * gFilterSize\n" 
    "                + filterId * gNumInputPlanes * gFilterSize + filterRowId ] = sum;\n" 
    "        }\n" 
    "    }\n" 
    "}\n" 
    "#endif\n" 
    "\n" 
    "";
    kernel1 = cl->buildKernelFromString( kernel1Source, "forward_fc_workgroup_perrow", options, "cl/forward_fc_wgperrow.cl" );
    // generated using cog, from cl/forward_fc.cl:
    const char * kernel2Source =  
    "// Copyright Hugh Perkins 2014, 2015 hughperkins at gmail\n" 
    "//\n" 
    "// This Source Code Form is subject to the terms of the Mozilla Public License,\n" 
    "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" 
    "// obtain one at http://mozilla.org/MPL/2.0/.\n" 
    "\n" 
    "// expected defines:\n" 
    "// one of: [ TANH | RELU | LINEAR ]\n" 
    "// BIASED (or not)\n" 
    "\n" 
    "#ifdef TANH\n" 
    "    #define ACTIVATION_FUNCTION(output) (tanh(output))\n" 
    "#elif defined SCALEDTANH\n" 
    "    #define ACTIVATION_FUNCTION(output) ( 1.7159f * tanh( 0.66667f * output))\n" 
    "#elif SIGMOID\n" 
    "    #define ACTIVATION_FUNCTION(output) (1.0f / (1 + exp(-output)))\n" 
    "#elif defined RELU\n" 
    "    #define ACTIVATION_FUNCTION(output) (output> 0 ? output : 0)\n" 
    "#elif defined LINEAR\n" 
    "    #define ACTIVATION_FUNCTION(output) (output)\n" 
    "#endif\n" 
    "\n" 
    "\n" 
    "// each thread handles one filter, ie globalId as [n][inputplane][filterId]\n" 
    "// output1: [n][inputplane][filter][filterrow]\n" 
    "// output2: [n][inputplane][filter]\n" 
    "#ifdef ACTIVATION_FUNCTION // protect against not defined\n" 
    "kernel void reduce_rows( const int batchSize, global float const *output1, global float*output2 ) {\n" 
    "    const int globalId = get_global_id(0);\n" 
    "    const int n = globalId / gNumInputPlanes / gNumFilters;\n" 
    "    if( n >= batchSize ) {\n" 
    "        return;\n" 
    "    }\n" 
    "    const int filterId = globalId % gNumFilters;\n" 
    "    float sum = 0;\n" 
    "    global const float *output1Col = output1 + globalId * gFilterSize;\n" 
    "    for( int filterRow = 0; filterRow < gFilterSize; filterRow++ ) {\n" 
    "        sum += output1Col[filterRow];\n" 
    "    }\n" 
    "    output2[globalId] = sum;\n" 
    "}\n" 
    "#endif\n" 
    "\n" 
    "// each thread handles one filter, ie globalId as [n][filterId]\n" 
    "// output2: [n][inputplane][filter]\n" 
    "// output: [n][filter]\n" 
    "#ifdef ACTIVATION_FUNCTION // protect against not defined\n" 
    "kernel void reduce_inputplanes( const int batchSize, global float const *output2, global float*output ) {\n" 
    "    const int globalId = get_global_id(0);\n" 
    "    const int n = globalId / gNumFilters;\n" 
    "    if( n >= batchSize ) {\n" 
    "        return;\n" 
    "    }\n" 
    "    const int filterId = globalId % gNumFilters;\n" 
    "    float sum = 0;\n" 
    "    global const float *output2Col = output2 + globalId * gNumInputPlanes;\n" 
    "    for( int inputPlane = 0; inputPlane < gNumInputPlanes; inputPlane++ ) {\n" 
    "        sum += output2Col[inputPlane];\n" 
    "    }\n" 
    "    // activate...\n" 
    "    output[globalId] = ACTIVATION_FUNCTION(sum);\n" 
    "}\n" 
    "#endif\n" 
    "\n" 
    "#ifdef gOutImageSize // for previous tests that dont define it\n" 
    "#ifdef ACTIVATION_FUNCTION // protect against not defined\n" 
    "// workgroupid [n][outputplane]\n" 
    "// localid: [filterrow][filtercol]\n" 
    "//  each thread iterates over: [inplane]\n" 
    "// this kernel assumes:\n" 
    "//   padzeros == 0 (mandatory)\n" 
    "//   filtersize == inputimagesize (mandatory)\n" 
    "//   outputImageSize == 1\n" 
    "//   lots of outplanes, hundreds, but less than max work groupsize, eg 350, 500, 361\n" 
    "//   lots of inplanes, eg 32\n" 
    "//   inputimagesize around 19, not too small\n" 
    "#if gFilterSize == gInputImageSize && gPadZeros == 0\n" 
    "void kernel forward_filter_matches_inimage( const int batchSize,\n" 
    "      global const float *images, global const float *filters,\n" 
    "        #ifdef BIASED\n" 
    "            global const float*biases,\n" 
    "        #endif\n" 
    "    global float *output,\n" 
    "    local float *_upstreamImage, local float *_filterImage ) {\n" 
    "    const int globalId = get_global_id(0);\n" 
    "\n" 
    "    const int workgroupId = get_group_id(0);\n" 
    "    const int workgroupSize = get_local_size(0);\n" 
    "    const int n = workgroupId / gNumOutPlanes;\n" 
    "    const int outPlane = workgroupId % gNumOutPlanes;\n" 
    "\n" 
    "    const int localId = get_local_id(0);\n" 
    "    const int filterRow = localId / gFilterSize;\n" 
    "    const int filterCol = localId % gFilterSize;\n" 
    "\n" 
    "    float sum = 0;\n" 
    "    for( int upstreamPlane = 0; upstreamPlane < gUpstreamNumPlanes; upstreamPlane++ ) {\n" 
    "        int thisUpstreamImageOffset = ( n * gUpstreamNumPlanes + upstreamPlane ) * gUpstreamImageSizeSquared;\n" 
    "        barrier(CLK_LOCAL_MEM_FENCE);\n" 
    "        for( int i = 0; i < numUpstreamsPerThread; i++ ) {\n" 
    "            int thisOffset = workgroupSize * i + localId;\n" 
    "            if( thisOffset < gUpstreamImageSizeSquared ) {\n" 
    "                _upstreamImage[ thisOffset ] = images[ thisUpstreamImageOffset + thisOffset ];\n" 
    "            }\n" 
    "        }\n" 
    "        const int filterGlobalOffset = ( outPlane * gUpstreamNumPlanes + upstreamPlane ) * gFilterSizeSquared;\n" 
    "        for( int i = 0; i < numFilterPixelsPerThread; i++ ) {\n" 
    "            int thisOffset = workgroupSize * i + localId;\n" 
    "            if( thisOffset < gFilterSizeSquared ) {\n" 
    "                _filterCube[thisOffset] = filters[filterGlobalOffset + thisOffset];\n" 
    "            }\n" 
    "        }\n" 
    "        barrier(CLK_LOCAL_MEM_FENCE);\n" 
    "        if( localId < gOutImageSizeSquared ) {\n" 
    "            for( int u = minu; u <= maxu; u++ ) {\n" 
    "                int inputRow = outputRow + u + ( gPadZeros ? 0 : gHalfFilterSize );\n" 
    "                int inputimagerowoffset = inputRow * gUpstreamImageSize;\n" 
    "                int filterrowoffset = (u+gHalfFilterSize) * gFilterSize + gHalfFilterSize;\n" 
    "                for( int v = minv; v <= maxv; v++ ) {\n" 
    "                    int inputCol = outputCol + v + ( gPadZeros ? 0 : gHalfFilterSize );\n" 
    "                    sum += _upstreamImage[ inputimagerowoffset + inputCol] * _filterCube[ filterrowoffset + v ];\n" 
    "                }\n" 
    "            }\n" 
    "        }\n" 
    "    }\n" 
    "    #ifdef BIASED\n" 
    "        sum += biases[outPlane];\n" 
    "    #endif\n" 
    "    // output are organized like [imageid][filterid][row][col]\n" 
    "    int resultIndex = ( n * gNumOutPlanes + outPlane ) * gOutImageSizeSquared + localId;\n" 
    "    if( localId < gOutImageSizeSquared ) {\n" 
    "        output[resultIndex ] = ACTIVATION_FUNCTION(sum);\n" 
    "//        output[resultIndex ] = 123;\n" 
    "    }\n" 
    "}\n" 
    "#endif\n" 
    "#endif\n" 
    "#endif\n" 
    "\n" 
    "\n" 
    "";
    kernel2 = cl->buildKernelFromString( kernel2Source, "reduce_rows", options, "cl/forward_fc.cl" );
    // [[[end]]]
}
Example #12
0
BackwardGpuCached::BackwardGpuCached(EasyCL *cl, LayerDimensions dim) :
        Backward(cl, dim)
            {
    if(square(dim.inputSize) > cl->getMaxWorkgroupSize()) {
        throw runtime_error("cannot use BackwardGpuCached, since inputSize * inputSize > maxworkgroupsize");
    }

    std::string options = dim.buildOptionsString();
    options += ""; // " -D " + upstreamFn->getDefineName();
    // [[[cog
    // import stringify
    // stringify.write_kernel2("kernel", "cl/backward_cached.cl", "calcGradInputCached", 'options')
    // # stringify.write_kernel2("broadcastMultiply", "cl/backproperrorsv2.cl", "broadcast_multiply", 'options')
    // # stringify.write_kernel2("applyActivationDeriv", "cl/applyActivationDeriv.cl", "applyActivationDeriv", 'options')
    // # stringify.write_kernel("kernelSource", "ClConvolve.cl")
    // ]]]
    // generated using cog, from cl/backward_cached.cl:
    const char * kernelSource =  
    "// Copyright Hugh Perkins 2014, 2015 hughperkins at gmail\n"
    "//\n"
    "// This Source Code Form is subject to the terms of the Mozilla Public License,\n"
    "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n"
    "// obtain one at http://mozilla.org/MPL/2.0/.\n"
    "\n"
    "void copyLocal(local float *target, global float const *source, int N) {\n"
    "    int numLoops = (N + get_local_size(0) - 1) / get_local_size(0);\n"
    "    for (int loop = 0; loop < numLoops; loop++) {\n"
    "        int offset = loop * get_local_size(0) + get_local_id(0);\n"
    "        if (offset < N) {\n"
    "            target[offset] = source[offset];\n"
    "        }\n"
    "    }\n"
    "}\n"
    "\n"
    "// as calcGradInput, but with local cache\n"
    "// convolve weights with gradOutput to produce gradInput\n"
    "// workgroupid: [n][inputPlane]\n"
    "// localid: [upstreamrow][upstreamcol]\n"
    "// per-thread aggregation: [outPlane][filterRow][filterCol]\n"
    "// need to store locally:\n"
    "// - _gradOutputPlane. size = outputSizeSquared\n"
    "// - _filterPlane. size = filtersizesquared\n"
    "// note: currently doesnt use bias as input.  thats probably an error?\n"
    "// inputs: gradOutput :convolve: filters => gradInput\n"
    "//\n"
    "// global:\n"
    "// gradOutput: [n][outPlane][outRow][outCol] 128 * 32 * 19 * 19 * 4\n"
    "// weights: [filterId][upstreamplane][filterRow][filterCol] 32 * 32 * 5 * 5 * 4\n"
    "// per workgroup:\n"
    "// gradOutput: [outPlane][outRow][outCol] 32 * 19 * 19 * 4 = 46KB\n"
    "// weights: [filterId][filterRow][filterCol] 32 * 5 * 5 * 4 = 3.2KB\n"
    "// gradOutputforupstream: [n][upstreamPlane][upstreamRow][upstreamCol]\n"
    "void kernel calcGradInputCached(\n"
    "        const int batchSize,\n"
    "        global const float *gradOutputGlobal,\n"
    "        global const float *filtersGlobal,\n"
    "        global float *gradInput,\n"
    "        local float *_gradOutputPlane,\n"
    "        local float *_filterPlane) {\n"
    "\n"
    "    #define globalId get_global_id(0)\n"
    "    #define localId get_local_id(0)\n"
    "    #define workgroupId get_group_id(0)\n"
    "    #define workgroupSize get_local_size(0)\n"
    "\n"
    "    const int n = workgroupId / gInputPlanes;\n"
    "    const int upstreamPlane = workgroupId % gInputPlanes;\n"
    "\n"
    "    const int upstreamRow = localId / gInputSize;\n"
    "    const int upstreamCol = localId % gInputSize;\n"
    "\n"
    "    float sumWeightTimesOutError = 0;\n"
    "    for (int outPlane = 0; outPlane < gNumFilters; outPlane++) {\n"
    "        barrier(CLK_LOCAL_MEM_FENCE);\n"
    "        copyLocal(_filterPlane, filtersGlobal + (outPlane * gInputPlanes + upstreamPlane) * gFilterSizeSquared, gFilterSizeSquared);\n"
    "        copyLocal(_gradOutputPlane, gradOutputGlobal + (n * gNumFilters + outPlane) * gOutputSizeSquared, gOutputSizeSquared);\n"
    "        barrier(CLK_LOCAL_MEM_FENCE);\n"
    "        for (int filterRow = 0; filterRow < gFilterSize; filterRow++) {\n"
    "            int outRow = upstreamRow + gMargin - filterRow;\n"
    "            for (int filterCol = 0; filterCol < gFilterSize; filterCol++) {\n"
    "                int outCol = upstreamCol + gMargin - filterCol;\n"
    "                if (outCol >= 0 && outCol < gOutputSize && outRow >= 0 && outRow < gOutputSize) {\n"
    "                    float thisWeightTimesError =\n"
    "                        _gradOutputPlane[outRow * gOutputSize + outCol] *\n"
    "                        _filterPlane[filterRow * gFilterSize + filterCol];\n"
    "                    sumWeightTimesOutError += thisWeightTimesError;\n"
    "                }\n"
    "            }\n"
    "        }\n"
    "    }\n"
    "    const int upstreamImageGlobalOffset = (n * gInputPlanes + upstreamPlane) * gInputSizeSquared;\n"
    "    if (localId < gInputSizeSquared) {\n"
    "        gradInput[upstreamImageGlobalOffset + localId] = sumWeightTimesOutError;\n"
    "    }\n"
    "}\n"
    "\n"
    "";
    kernel = cl->buildKernelFromString(kernelSource, "calcGradInputCached", options, "cl/backward_cached.cl");
    // [[[end]]]
//    kernel = cl->buildKernel("backproperrorsv2.cl", "calcGradInput", options);
//    kernel = cl->buildKernelFromString(kernelSource, "calcGradInput", options);
}
Example #13
0
Forward3::Forward3( EasyCL *cl, LayerDimensions dim ) :
        Forward( cl, dim )
            {

    addBias = new AddBias( cl );

    if( square( dim.outputImageSize ) > cl->getMaxWorkgroupSize() ) {
        throw runtime_error("cannot use forward3, since outputimagesize * outputimagesize > maxworkgroupsize");
    }

    std::string options = ""; // "-D " + fn->getDefineName();
    options += dim.buildOptionsString();

    // [[[cog
    // import stringify
    // stringify.write_kernel2( "kernel", "cl/forward3.cl", "forward_3_by_n_outplane", 'options' )
    // # stringify.write_kernel2( "repeatedAdd", "cl/per_element_add.cl", "repeated_add", 'options' )
    // ]]]
    // generated using cog, from cl/forward3.cl:
    const char * kernelSource =  
    "// Copyright Hugh Perkins 2014, 2015 hughperkins at gmail\n" 
    "//\n" 
    "// This Source Code Form is subject to the terms of the Mozilla Public License,\n" 
    "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" 
    "// obtain one at http://mozilla.org/MPL/2.0/.\n" 
    "\n" 
    "// concept: each workgroup handles convolving one input example with one filtercube\n" 
    "// and writing out one single output plane\n" 
    "//\n" 
    "// workgroup id organized like: [imageid][outplane]\n" 
    "// local id organized like: [outrow][outcol]\n" 
    "// each thread iterates over: [upstreamplane][filterrow][filtercol]\n" 
    "// number workgroups = 32\n" 
    "// one filter plane takes up 5 * 5 * 4 = 100 bytes\n" 
    "// one filter cube (corresponding to one outplane) = 5*5 * 32 * 4 = 3.2KB (ok)\n" 
    "// all filter cubes = 3.2KB * 32 = 102KB (too big)\n" 
    "// output are organized like [imageid][filterid][row][col]\n" 
    "void kernel forward_3_by_n_outplane( const int batchSize,\n" 
    "      global const float *images, global const float *filters,\n" 
    "    global float *output,\n" 
    "    local float *_upstreamImage, local float *_filterCube ) {\n" 
    "    const int globalId = get_global_id(0);\n" 
    "\n" 
    "    const int workgroupId = get_group_id(0);\n" 
    "    const int workgroupSize = get_local_size(0);\n" 
    "    const int n = workgroupId / gNumFilters;\n" 
    "    const int outPlane = workgroupId % gNumFilters;\n" 
    "\n" 
    "    const int localId = get_local_id(0);\n" 
    "    const int outputRow = localId / gOutputImageSize;\n" 
    "    const int outputCol = localId % gOutputImageSize;\n" 
    "\n" 
    "    const int minu = gPadZeros ? max( -gHalfFilterSize, -outputRow ) : -gHalfFilterSize;\n" 
    "    const int maxu = gPadZeros ? min( gHalfFilterSize - gEven, gOutputImageSize - 1 - outputRow  - gEven) : gHalfFilterSize - gEven;\n" 
    "    const int minv = gPadZeros ? max( -gHalfFilterSize, -outputCol ) : - gHalfFilterSize;\n" 
    "    const int maxv = gPadZeros ? min( gHalfFilterSize - gEven, gOutputImageSize - 1 - outputCol - gEven) : gHalfFilterSize - gEven;\n" 
    "\n" 
    "    const int numUpstreamsPerThread = ( gInputImageSizeSquared + workgroupSize - 1 ) / workgroupSize;\n" 
    "\n" 
    "    const int filterCubeLength = gInputPlanes * gFilterSizeSquared;\n" 
    "    const int filterCubeGlobalOffset = outPlane * filterCubeLength;\n" 
    "    const int numPixelsPerThread = ( filterCubeLength + workgroupSize - 1 ) / workgroupSize;\n" 
    "    for( int i = 0; i < numPixelsPerThread; i++ ) {\n" 
    "        int thisOffset = localId + i * workgroupSize;\n" 
    "        if( thisOffset < filterCubeLength ) {\n" 
    "            _filterCube[thisOffset] = filters[filterCubeGlobalOffset + thisOffset];\n" 
    "        }\n" 
    "    }\n" 
    "    // dont need a barrier, since we'll just run behind the barrier from the upstream image download\n" 
    "\n" 
    "    float sum = 0;\n" 
    "    for( int upstreamPlane = 0; upstreamPlane < gInputPlanes; upstreamPlane++ ) {\n" 
    "        int thisUpstreamImageOffset = ( n * gInputPlanes + upstreamPlane ) * gInputImageSizeSquared;\n" 
    "        barrier(CLK_LOCAL_MEM_FENCE);\n" 
    "        for( int i = 0; i < numUpstreamsPerThread; i++ ) {\n" 
    "            int thisOffset = workgroupSize * i + localId;\n" 
    "            if( thisOffset < gInputImageSizeSquared ) {\n" 
    "                _upstreamImage[ thisOffset ] = images[ thisUpstreamImageOffset + thisOffset ];\n" 
    "            }\n" 
    "        }\n" 
    "        barrier(CLK_LOCAL_MEM_FENCE);\n" 
    "        int filterImageOffset = upstreamPlane * gFilterSizeSquared;\n" 
    "        for( int u = minu; u <= maxu; u++ ) {\n" 
    "            int inputRow = outputRow + u;\n" 
    "            #if gPadZeros == 0\n" 
    "                inputRow += gHalfFilterSize;\n" 
    "            #endif\n" 
    "            int inputimagerowoffset = inputRow * gInputImageSize;\n" 
    "            int filterrowoffset = filterImageOffset + (u+gHalfFilterSize) * gFilterSize + gHalfFilterSize;\n" 
    "            for( int v = minv; v <= maxv; v++ ) {\n" 
    "                int inputCol = outputCol + v;\n" 
    "                #if gPadZeros == 0\n" 
    "                    inputCol += gHalfFilterSize;\n" 
    "                #endif\n" 
    "                if( localId < gOutputImageSizeSquared ) {\n" 
    "                    sum += _upstreamImage[ inputimagerowoffset + inputCol] * _filterCube[ filterrowoffset + v ];\n" 
    "                }\n" 
    "            }\n" 
    "        }\n" 
    "    }\n" 
    "\n" 
    "    // output are organized like [imageid][filterid][row][col]\n" 
    "    int resultIndex = ( n * gNumFilters + outPlane ) * gOutputImageSizeSquared + localId;\n" 
    "    if( localId < gOutputImageSizeSquared ) {\n" 
    "        output[resultIndex ] = sum;\n" 
    "    }\n" 
    "}\n" 
    "\n" 
    "";
    kernel = cl->buildKernelFromString( kernelSource, "forward_3_by_n_outplane", options, "cl/forward3.cl" );
    // [[[end]]]
}