Пример #1
0
VIRTUAL void PoolingForward::forward(int batchSize, float *input, int *selectors, float *output) {
//    cout << "PoolingForward::forward(float *)" << endl;
    CLWrapper *inputWrapper = cl->wrap(getInputNumElements(batchSize), input);
    CLWrapper *selectorsWrapper = cl->wrap(getOutputNumElements(batchSize), selectors);
    CLWrapper *outputWrapper = cl->wrap(getOutputNumElements(batchSize), output);

    inputWrapper->copyToDevice();
    forward(batchSize, inputWrapper, selectorsWrapper, outputWrapper);
    selectorsWrapper->copyToHost();    
    outputWrapper->copyToHost();    

    delete outputWrapper;
    delete selectorsWrapper;
    delete inputWrapper;
}
Пример #2
0
TEST( SLOW_testintwrapper_huge, testreadwrite ) {

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test_stress");
    const int N = 1000000;
    int *in = new int[N];
    for( int i = 0; i < N; i++ ) {
        in[i] = i * 3;
    }
    int *out = new int[N];
    CLWrapper *inwrapper = cl->wrap(N, in);
    CLWrapper *outwrapper = cl->wrap(N, out);
    inwrapper->copyToDevice();
    outwrapper->createOnDevice();
    kernel->input( inwrapper );
    kernel->output( outwrapper );
    int globalSize = N;
    int workgroupsize = cl->getMaxWorkgroupSize();
    globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize;
    cout << "globalsize: " << globalSize << " workgroupsize " << workgroupsize << endl;
    kernel->run_1d( globalSize, workgroupsize );
    outwrapper->copyToHost();
    for( int i = 0; i < N; i++ ) {
       if( out[i] != 689514 ) {
           cout << "out[" << i << "] != 689514: " << out[i] << endl;
           exit(-1);
       }
    }

    delete outwrapper;
    delete inwrapper;
    delete kernel;
    delete cl;
}
Пример #3
0
TEST( SLOW_testintwrapper_huge, testread ) {
    Timer timer;
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test_read");
//    const int N = 4500000;
//    const int N = (4500000/512)*512;
    int N = 100000;
    int *out = new int[N];
    CLWrapper *outwrapper = cl->wrap(N, out);
    kernel->in(3)->in(7);
    kernel->output( outwrapper );
    int globalSize = N;
    int workgroupsize = cl->getMaxWorkgroupSize();
    globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize;
    cout << "globalsize: " << globalSize << " workgroupsize " << workgroupsize << endl;
    timer.timeCheck("before kernel");
    kernel->run_1d( globalSize, workgroupsize );
    timer.timeCheck("after kernel");
    outwrapper->copyToHost();
    timer.timeCheck("after copy to host");
    for( int i = 0; i < N; i++ ) {
       if( out[i] != 4228 ) {
           cout << "out[" << i << "] != 4228: " << out[i] << endl;
           exit(-1);
       }
    }

    delete outwrapper;
    delete kernel;
    delete cl;
}
Пример #4
0
void CopyBuffer::copy( EasyCL *cl, CLWrapper *sourceWrapper, int *target ) {
    // first we will copy it to another buffer, so we can copy it out
    int bufferSize = sourceWrapper->size();
//    float *copiedBuffer = new float[ bufferSize ];
    CLWrapper *targetWrapper = cl->wrap( bufferSize, target );
    targetWrapper->createOnDevice();

    // now copy it, via a kernel
    const string kernelSource = "\n"
        "kernel void copy( int N, global int const *source, global int *dest ) {\n"
          "  #define globalId ( get_global_id(0) )\n"
          "  if( (int)globalId < N ) {\n"
          "      dest[globalId] = source[globalId];\n"
          "  }\n"
       " }\n";
    CLKernel *kernel = cl->buildKernelFromString( kernelSource, "copy", "" );
    kernel->in( bufferSize )->in( sourceWrapper )->out( targetWrapper );
    int workgroupSize = 32;
    int numWorkgroups = ( bufferSize + workgroupSize - 1 ) / workgroupSize;
    kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize );
    cl->finish();
    targetWrapper->copyToHost();

    delete targetWrapper;
    delete kernel;
//    delete[] copiedBuffer;
}
Пример #5
0
void forwardWithWipe( Forward *prop, int batchSize, LayerDimensions dim, float *inputData, float *filters, float *biases, float *output ) {
    int inputDataSize = batchSize * dim.inputCubeSize;
    CLWrapper *dataWrapper = prop->cl->wrap( inputDataSize, inputData );
    dataWrapper->copyToDevice();

    int weightsSize = dim.filtersSize;
    CLWrapper *weightsWrapper = prop->cl->wrap( weightsSize, filters );
    weightsWrapper->copyToDevice();

    CLWrapper *biasWrapper = 0;
    if( dim.biased ) {
        biasWrapper = prop->cl->wrap( dim.numFilters, biases );
        biasWrapper->copyToDevice();
    }

    CLWrapper *outputWrapper = prop->cl->wrap( batchSize * dim.outputCubeSize, output );
    memset( output, 99, sizeof(float) * batchSize * dim.outputCubeSize );
    outputWrapper->copyToDevice(); // so we can wipe it...

    StatefulTimer::timeCheck("testforward: after data wrapper processing");
    prop->forward( batchSize, dataWrapper, weightsWrapper, biasWrapper,
            outputWrapper );
//    StatefulTimer::timeCheck("Forward::forward after call forward");
    outputWrapper->copyToHost();
//    StatefulTimer::timeCheck("Forward::forward after copytohost");
    delete outputWrapper;

    delete dataWrapper;
    delete weightsWrapper;
    if( dim.biased ) {
        delete biasWrapper;
    }
}
Пример #6
0
void propagateWithWipe( Propagate *prop, int batchSize, LayerDimensions dim, float *inputData, float *filters, float *biases, float *results ) {
    int inputDataSize = batchSize * dim.inputCubeSize;
    CLWrapper *dataWrapper = prop->cl->wrap( inputDataSize, inputData );
    dataWrapper->copyToDevice();

    int weightsSize = dim.filtersSize;
    CLWrapper *weightsWrapper = prop->cl->wrap( weightsSize, filters );
    weightsWrapper->copyToDevice();

    CLWrapper *biasWeightsWrapper = 0;
    if( dim.biased ) {
        biasWeightsWrapper = prop->cl->wrap( dim.numFilters, biases );
        biasWeightsWrapper->copyToDevice();
    }

    CLWrapper *resultsWrapper = prop->cl->wrap( batchSize * dim.outputCubeSize, results );
    memset( results, 99, sizeof(float) * batchSize * dim.outputCubeSize );
    resultsWrapper->copyToDevice(); // so we can wipe it...

    StatefulTimer::timeCheck("testpropagate: after data wrapper processing");
    prop->propagate( batchSize, dataWrapper, weightsWrapper, biasWeightsWrapper,
            resultsWrapper );
//    StatefulTimer::timeCheck("Propagate::propagate after call propagate");
    resultsWrapper->copyToHost();
//    StatefulTimer::timeCheck("Propagate::propagate after copytohost");
    delete resultsWrapper;

    delete dataWrapper;
    delete weightsWrapper;
    if( dim.biased ) {
        delete biasWeightsWrapper;
    }
}
Пример #7
0
TEST(testfloatwrapperconst, main) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernelFromString(getKernel(), "test", "");
    float in[5];
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3;
    }
    float out[5];
    CLWrapper *inwrapper = cl->wrap(5, (float const *)in);
    CLWrapper *outwrapper = cl->wrap(5, out);
    inwrapper->copyToDevice();
    kernel->input(inwrapper);
    kernel->output(outwrapper);
    kernel->run_1d(5, 5);
    outwrapper->copyToHost();
    assertEquals(out[0] , 7);
    assertEquals(out[1] , 10);
    assertEquals(out[2] , 13);
    assertEquals(out[3] , 16);
    assertEquals(out[4] , 19);
    cout << "tests completed ok" << endl;

    delete inwrapper;
    delete outwrapper;
    delete kernel;
    delete cl;
}
Пример #8
0
TEST( testpoolingforward, fromwrappers ) {
    int batchSize = 1;
    int numPlanes = 1;
    int imageSize = 4;
    int poolingSize = 2;
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    PoolingForward *poolingForward = PoolingForward::instanceSpecific( 1, cl, false, numPlanes, imageSize, poolingSize );
    float input[] = { 1, 2, 5, 3,
                     3, 8, 4, 1,
                     3, 33, 14,23,
                     -1, -3.5f,37.4f,5
    };
    int outputSize = poolingForward->getOutputSize( batchSize );
    int *selectors = new int[outputSize];
    float *output = new float[outputSize];

    const int inputSize = batchSize * numPlanes * imageSize * imageSize;
    CLWrapper *inputWrapper = cl->wrap( inputSize, input );
    CLWrapper *selectorsWrapper = cl->wrap( outputSize, selectors );
    CLWrapper *outputWrapper = cl->wrap( outputSize, output );

    inputWrapper->copyToDevice();

    poolingForward->forward( batchSize, inputWrapper, selectorsWrapper, outputWrapper );

    selectorsWrapper->copyToHost();
    outputWrapper->copyToHost();

    EXPECT_EQ( selectors[0], 3 );
    EXPECT_EQ( selectors[1], 0 );
    EXPECT_EQ( selectors[2], 1 );
    EXPECT_EQ( selectors[3], 2 );

    EXPECT_EQ( output[0], 8 );
    EXPECT_EQ( output[1], 5 );
    EXPECT_EQ( output[2], 33 );
    EXPECT_EQ( output[3], 37.4f );

    delete inputWrapper;
    delete selectorsWrapper;
    delete outputWrapper;
    delete poolingForward;
    delete[] selectors;
    delete[] output;
    delete cl;
}
Пример #9
0
TEST(testcopybuffer, withoffset) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    //CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test");
    float in[10];
    float in2[10];
    for(int i = 0; i < 10; i++) {
        in[i] = i * 3;
        in2[i] = 23 + i;
    }
    float out[10];
    CLWrapper *inwrapper = cl->wrap(10, in);
    CLWrapper *in2wrapper = cl->wrap(10, in2);
    CLWrapper *outwrapper = cl->wrap(10, out);
    inwrapper->copyToDevice();
    in2wrapper->copyToDevice();

    EXPECT_FALSE(in2wrapper->isDeviceDirty());
    inwrapper->copyTo(in2wrapper, 2, 5, 4);
    EXPECT_TRUE(in2wrapper->isDeviceDirty());
  //  cl->finish();
    // check that in2 host-side unchanged:
    for(int i = 0; i < 10; i++) {
        in[i] = i * 3;
        EXPECT_EQ(23 + i, in2[i]);
    }

    in2wrapper->copyToHost();
    // check that in2 is now a partial copy of in:
    for(int i = 0; i < 10; i++) {
//        in[i] = i * 3;
        if(i >= 5 && i < 9) {
          EXPECT_EQ((i-3) * 3, in2[i]);
        } else {
          EXPECT_EQ(23 + i, in2[i]);
        }
    }

    // check that modifying in2 doesnt modfiy in:
    in2[1] = 27;
    in2wrapper->copyToDevice();
    inwrapper->copyToHost();
    EXPECT_EQ(1 * 3, in[1]);

    in2wrapper->copyToHost();
    EXPECT_EQ(1 * 3, in[1]);
    EXPECT_EQ(27, in2[1]);
    
    delete inwrapper;
    delete in2wrapper;
    delete outwrapper;
    delete cl;
}
Пример #10
0
VIRTUAL void BackpropWeights2::backpropWeights( int batchSize, float learningRate, float *derivLossBySum, float *inputData, float *filters, float *biasWeights ) {
    StatefulTimer::timeCheck("BackpropWeights2::backprop begin");

//    const float learningMultiplier = learningRate / batchSize / sqrt( dim.outputImageSize * dim.outputImageSize );

    int resultsSize = batchSize * dim.outputCubeSize;
    CLWrapper *derivLossBySumWrapper = cl->wrap( resultsSize, derivLossBySum );
    derivLossBySumWrapper->copyToDevice();

    int inputSize = batchSize * dim.inputCubeSize;
    CLWrapper *inputDataWrapper = cl->wrap( inputSize, inputData );
    inputDataWrapper->copyToDevice();

    CLWrapper *weightsWrapper = 0;
    int weightsSize = debug ? std::max(10000, dim.filtersSize ) : dim.filtersSize;
    weightsWrapper = cl->wrap( weightsSize, filters );
    weightsWrapper->copyToDevice();

//    cout << "backpropweights2::backpropweights resultsSize=" << resultsSize << " inputSize=" << inputSize << 
//        " weightSize=" << weightsSize << endl;

    CLWrapper *biasWeightsWrapper = 0;
    if( dim.biased ) {
        biasWeightsWrapper = cl->wrap( dim.numFilters, biasWeights );
        biasWeightsWrapper->copyToDevice();
    }

    StatefulTimer::timeCheck("BackpropWeights2::backprop after copied to device");
    backpropWeights( batchSize, learningRate, derivLossBySumWrapper, inputDataWrapper, weightsWrapper, biasWeightsWrapper );
    StatefulTimer::timeCheck("BackpropWeights2::backprop after call backprop");
    weightsWrapper->copyToHost();
    if( dim.biased ) {
        biasWeightsWrapper->copyToHost();
    }
    StatefulTimer::timeCheck("BackpropWeights2::backprop after copytohost");

    delete derivLossBySumWrapper;
    delete inputDataWrapper;
    delete weightsWrapper;
    if( dim.biased ) {
        delete biasWeightsWrapper;
    }
}
Пример #11
0
TEST(testcopybuffer, main) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    //CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test");
    float in[5];
    float in2[5];
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3.0f;
        in2[i] = 23.0f + i;
    }
    float out[5];
    CLWrapper *inwrapper = cl->wrap(5, in);
    CLWrapper *in2wrapper = cl->wrap(5, in2);
    CLWrapper *outwrapper = cl->wrap(5, out);
    inwrapper->copyToDevice();
    in2wrapper->copyToDevice();

    EXPECT_FALSE(in2wrapper->isDeviceDirty());
    inwrapper->copyTo(in2wrapper);
    EXPECT_TRUE(in2wrapper->isDeviceDirty());
  //  cl->finish();
    // check that in2 host-side unchanged:
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3.0f;
        EXPECT_EQ(23.0f + i, in2[i]);
    }

    in2wrapper->copyToHost();
    // check that in2 is now a copy of in:
    for(int i = 0; i < 5; i++) {
        in[i] = i * 3.0f;
        EXPECT_EQ(i * 3.0f, in2[i]);
    }

    // check that modifying in2 doesnt modfiy in:
    in2[1] = 27;
    in2wrapper->copyToDevice();
    inwrapper->copyToHost();
    EXPECT_EQ(1 * 3.0f, in[1]);

    in2wrapper->copyToHost();
    EXPECT_EQ(1 * 3.0f, in[1]);
    EXPECT_EQ(27.0f, in2[1]);
    
    delete inwrapper;
    delete in2wrapper;
    delete outwrapper;
    delete cl;
}
Пример #12
0
VIRTUAL void PoolingBackprop::backpropErrors( int batchSize, float *errors, int *selectors, float *errorsForUpstream ) {
//    cout << "PoolingBackprop::backpropErrors( float * )" << endl;
    StatefulTimer::instance()->timeCheck("PoolingBackprop::backpropErrors float->wrapper start" );
    CLWrapper *errorsWrapper = cl->wrap( getResultsSize(batchSize), errors );
    CLWrapper *selectorsWrapper = cl->wrap( getResultsSize(batchSize), selectors );
    CLWrapper *errorsForUpstreamWrapper = cl->wrap( getInputSize(batchSize), errorsForUpstream );

    errorsWrapper->copyToDevice();
    selectorsWrapper->copyToDevice();

    backpropErrors( batchSize, errorsWrapper, selectorsWrapper, errorsForUpstreamWrapper );

    selectorsWrapper->copyToHost();
    errorsForUpstreamWrapper->copyToHost();

    delete errorsWrapper;
    delete selectorsWrapper;
    delete errorsForUpstreamWrapper;
    StatefulTimer::instance()->timeCheck("PoolingBackprop::backpropErrors float->wrapper end" );
}
Пример #13
0
VIRTUAL void ActivationPropagate::propagate( int batchSize, float *input, float *output ) {
//    cout << "ActivationPropagate::propagate( float * )" << endl;
    CLWrapper *inputWrapper = cl->wrap( getInputSize( batchSize ), input );
    CLWrapper *outputWrapper = cl->wrap( getResultsSize( batchSize ), output );

    inputWrapper->copyToDevice();
    propagate( batchSize, inputWrapper, outputWrapper );
    outputWrapper->copyToHost();    

    delete outputWrapper;
    delete inputWrapper;
}
Пример #14
0
TEST(SLOW_testcopybuffer, larger) {
    if(!EasyCL::isOpenCLAvailable()) {
        cout << "opencl library not found" << endl;
        exit(-1);
    }
    cout << "found opencl library" << endl;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    //CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test");
    const int bufferSize = 100 * 1024 * 1024 / 4; // 100MB (of floats)
    float *in = new float[bufferSize];
    float *in2 = new float[bufferSize];
    for(int i = 0; i < bufferSize; i++) {
        in[i] = i * 3;
        in2[i] = 23 + i;
    }
    CLWrapper *inwrapper = cl->wrap(bufferSize, in);
    CLWrapper *in2wrapper = cl->wrap(bufferSize, in2);
    inwrapper->copyToDevice();
    in2wrapper->copyToDevice();

    inwrapper->copyTo(in2wrapper);
//    cl->finish();
    // check that in2 host-side unchanged:
    for(int i = 0; i < bufferSize; i++) {
        in[i] = i * 3;
        EXPECT_EQ(23 + i, in2[i]);
    }

    in2wrapper->copyToHost();
    // check that in2 is now a copy of in:
    for(int i = 0; i < bufferSize; i++) {
        in[i] = i * 3;
        EXPECT_EQ(i * 3, in2[i]);
    }

    // check that modifying in2 doesnt modfiy in:
    in2[1] = 27;
    in2wrapper->copyToDevice();
    inwrapper->copyToHost();
    EXPECT_EQ(1 * 3, in[1]);

    in2wrapper->copyToHost();
    EXPECT_EQ(1 * 3, in[1]);
    EXPECT_EQ(27, in2[1]);
    
    delete inwrapper;
    delete in2wrapper;
    delete[] in;
    delete[] in2;
    delete cl;
}
Пример #15
0
VIRTUAL void ConvolutionalLayer::backProp( float learningRate ) {
//        Timer timer;
    StatefulTimer::instance()->timeCheck("backprop(): start, layer " + toString( layerIndex ) );

    CLWrapper *biasWeightsWrapper = 0;
    if( dim.biased ) {
        biasWeightsWrapper = cl->wrap( getBiasWeightsSize(), biasWeights );
        biasWeightsWrapper->copyToDevice();
    }

    CLWrapper *imagesWrapper = 0;
    if( previousLayer->hasResultsWrapper() ) {
        imagesWrapper = previousLayer->getResultsWrapper();
    } else {
        imagesWrapper = cl->wrap( previousLayer->getResultsSize(), previousLayer->getResults() );
        imagesWrapper->copyToDevice();
    }

    CLWrapper *errorsWrapper = 0;
    bool weOwnErrorsWrapper = false;
    if( nextLayer->providesErrorsForUpstreamWrapper() ) {
        errorsWrapper = nextLayer->getErrorsForUpstreamWrapper();
    } else {
        errorsWrapper = cl->wrap( getResultsSize(), nextLayer->getErrorsForUpstream() );
        errorsWrapper->copyToDevice();
//        int resultsSize = getResultsSize();
//        for( int i = 0; i < resultsSize; i++ ) {
//            cout << "convolutional::backproperrors errorsfromupstream[" << i << "]=" << nextLayer->getErrorsForUpstream()[i] << endl;
//        }
        weOwnErrorsWrapper = true;
    }
    if( previousLayer->needsBackProp() ) {
        backpropErrorsImpl->backpropErrors( batchSize, imagesWrapper, errorsWrapper, weightsWrapper, errorsForUpstreamWrapper );
        StatefulTimer::instance()->timeCheck("backproperrors(): calced errors for upstream, layer " + ::toString( layerIndex ) );
    }

    backpropWeightsImpl->backpropWeights( batchSize, learningRate, errorsWrapper, imagesWrapper,  weightsWrapper, biasWeightsWrapper );
    weightsCopiedToHost = false;
    StatefulTimer::instance()->timeCheck("backproperrors(): done weight backprop, layer " + ::toString( layerIndex ) );

    if( dim.biased ) {
        biasWeightsWrapper->copyToHost();
        delete biasWeightsWrapper;
    }
    if( !previousLayer->hasResultsWrapper() ) {
        delete imagesWrapper;
    }
    if( weOwnErrorsWrapper ) {
        delete errorsWrapper;
    }
    StatefulTimer::instance()->timeCheck("backproperrors(): updated weights, layer " + ::toString( layerIndex ) );
}
Пример #16
0
TEST(testClBlas, colMajor2) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();

    float A[] = {1, 3,
                 2, 7,
                 9, 5,
                 0, -2};
    float B[] = {3,2,8,
                 -1,0,4};

    float C[4*3];
    transpose(A, 4, 2);
    transpose(B, 2, 3);
//    for(int row=0; row < 2; row++) {
//        for(int col=0; col < 1; col++) {
//            cout << B[row*1 + col] << " ";
//        }
//        cout << endl;
//    }
    ClBlasInstance clblasInstance;
//    ClBlasInstance::initializeIfNecessary();
    CLWrapper *AWrap = cl->wrap(4*2, A);
    CLWrapper *BWrap = cl->wrap(2*3, B);
    CLWrapper *CWrap = cl->wrap(4*3, C);
    AWrap->copyToDevice();
    BWrap->copyToDevice();
    ClBlasHelper::Gemm(
        cl,
        clblasColumnMajor,
        clblasNoTrans, clblasNoTrans,
        4, 2, 3,
        1,
        AWrap, 0,
        BWrap, 0,
        0,
        CWrap, 0
    );
//    cl->finish();
    CWrap->copyToHost();
    transpose(C, 3, 4);
    EXPECT_EQ(1*3-1*3, C[0]);
    EXPECT_EQ(1*2+3*0, C[1]);
    EXPECT_EQ(1*8+4*3, C[2]);
    EXPECT_EQ(-8, C[11]);

    delete CWrap;
    delete BWrap;
    delete AWrap;

    delete cl;
}
Пример #17
0
TEST(testClBlas, colMajorTransB) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();

    float A[] = {1, 3,
                 2, 7,
                 9, 5};
    float B[] = {3,
                 -1};

    float C[3];
    transpose(A, 3, 2);
//    transpose(B, 2, 1);
//    for(int row=0; row < 2; row++) {
//        for(int col=0; col < 1; col++) {
//            cout << B[row*1 + col] << " ";
//        }
//        cout << endl;
//    }
    ClBlasInstance clblasInstance;
//    ClBlasInstance::initializeIfNecessary();
    CLWrapper *AWrap = cl->wrap(6, A);
    CLWrapper *BWrap = cl->wrap(2, B);
    CLWrapper *CWrap = cl->wrap(3, C);
    AWrap->copyToDevice();
    BWrap->copyToDevice();
    ClBlasHelper::Gemm(
        cl,
        clblasColumnMajor,
        clblasNoTrans, clblasTrans,
        3, 2, 1,
        1,
        AWrap, 0,
        BWrap, 0,
        0,
        CWrap, 0
    );
//    cl->finish();
    CWrap->copyToHost();
    transpose(C, 1, 3);
    EXPECT_EQ(0, C[0]);
    EXPECT_EQ(-1, C[1]);
    EXPECT_EQ(22, C[2]);

    delete CWrap;
    delete BWrap;
    delete AWrap;

    delete cl;
}
Пример #18
0
TEST( testMemset, basic ) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();

    CLKernel *kMemset = 0;
    // [[[cog
    // import stringify
    // stringify.write_kernel2( "kMemset", "cl/memset.cl", "cl_memset", '""' )
    // ]]]
    // generated using cog, from cl/memset.cl:
    const char * kMemsetSource =  
    "// Copyright Hugh Perkins 2015 hughperkins at gmail\n"
    "//\n"
    "// This Source Code Form is subject to the terms of the Mozilla Public License,\n"
    "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n"
    "// obtain one at http://mozilla.org/MPL/2.0/.\n"
    "\n"
    "kernel void cl_memset(global float *target, const float value, const int N) {\n"
    "    #define globalId get_global_id(0)\n"
    "    if ((int)globalId < N) {\n"
    "        target[globalId] = value;\n"
    "    }\n"
    "}\n"
    "\n"
    "";
    kMemset = cl->buildKernelFromString(kMemsetSource, "cl_memset", "", "cl/memset.cl");
    // [[[end]]]

    int N = 10000;
    float *myArray = new float[N];
    CLWrapper *myArrayWrapper = cl->wrap( N, myArray );
    myArrayWrapper->createOnDevice();
    kMemset->out( myArrayWrapper )->in( 99.0f )->in( N );
    int workgroupSize = 64;
    kMemset->run_1d( ( N + workgroupSize - 1 ) / workgroupSize * workgroupSize, workgroupSize );
    cl->finish();
    myArrayWrapper->copyToHost();
    for( int i = 0; i < 10; i++ ) {
//        cout << "myArray[" << i << "]=" << myArray[i] << endl;
    }
    for( int i = 0; i < N; i++ ) {
        EXPECT_EQ( 99.0f, myArray[i] );
    }

    delete kMemset;

    delete cl;
}
Пример #19
0
TEST(testClBlas, basic) {
    EasyCL *cl = DeepCLGtestGlobals_createEasyCL();

    float A[] = {1, 3,
                 2, 7,
                 9, 5};
    float B[] = {3,
                 -1};

    float C[3];
    ClBlasInstance clblasInstance;
    CLWrapper *AWrap = cl->wrap(6, A);
    CLWrapper *BWrap = cl->wrap(2, B);
    CLWrapper *CWrap = cl->wrap(3, C);
    AWrap->copyToDevice();
    BWrap->copyToDevice();
    CWrap->createOnDevice();
    ClBlasHelper::Gemm(
        cl,
        clblasRowMajor,
        clblasNoTrans, clblasNoTrans,
        3, 2, 1,
        1,
        AWrap, 0,
        BWrap, 0,
        0,
        CWrap, 0
    );
    cl->finish();
    CWrap->copyToHost();
    EXPECT_EQ(0, C[0]);
    EXPECT_EQ(-1, C[1]);
    EXPECT_EQ(22, C[2]);

    cl->finish();

    delete CWrap;
    delete BWrap;
    delete AWrap;

    cl->finish();

    delete cl;
    clblasTeardown();
}
Пример #20
0
// must allocate output yourself before the call
VIRTUAL void Forward::forward( int batchSize, float *inputData, float *filters, float *biases, float *output ) {
    StatefulTimer::timeCheck("Forward::forward begin");
    int inputDataSize = batchSize * dim.inputCubeSize;
    CLWrapper *dataWrapper = cl->wrap( inputDataSize, inputData );
    dataWrapper->copyToDevice();

    int weightsSize = dim.filtersSize;
    CLWrapper *weightsWrapper = cl->wrap( weightsSize, filters );
    weightsWrapper->copyToDevice();

    CLWrapper *biasWrapper = 0;
    if( dim.biased ) {
        int biasWrapperSize = dim.numFilters;
        biasWrapper = cl->wrap( biasWrapperSize, biases );
        biasWrapper->copyToDevice();
    }

//    int outputDataSize = batchSize * dim.outputCubeSize;
//    cout << " batchsize " << batchSize << " " << dim << endl;
//    int allocatedOutputSize = std::max(5000, outputDataSize );
//    int allocatedOutputSize = outputDataSize;
//    float *output = new float[allocatedOutputSize];
    CLWrapper *outputWrapper = cl->wrap( batchSize * dim.outputCubeSize, output );
    cl->finish();

    StatefulTimer::timeCheck("Forward::forward after copied to device");
    forward( batchSize, dataWrapper, weightsWrapper, biasWrapper,
            outputWrapper );
    StatefulTimer::timeCheck("Forward::forward after call forward");
    outputWrapper->copyToHost();
    StatefulTimer::timeCheck("Forward::forward after copytohost");
//    for( int i = 0; i < 20; i++ ) {
//        cout << "output[" << i << "]=" << output[i] << endl;
//    }
    delete outputWrapper;

    delete dataWrapper;
    delete weightsWrapper;
    if( dim.biased ) {
        delete biasWrapper;
    }

//    return output;
}
Пример #21
0
void compareSpecific( CompareSpecificArgs args ) {
    cout << "instance0: " << args._instance0 << endl;
    cout << "instance1: " << args._instance1 << endl;

    int batchSize = args._batchSize;
    int numPlanes = args._numPlanes;
    int imageSize = args._imageSize;
    int poolingSize = args._poolingSize;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();

    PoolingForward *poolingForward0 = PoolingForward::instanceSpecific( args._instance0, cl, args._padZeros, numPlanes, imageSize, poolingSize );
    PoolingForward *poolingForward1 = PoolingForward::instanceSpecific( args._instance1, cl, args._padZeros, numPlanes, imageSize, poolingSize );

    const int inputSize = batchSize * numPlanes * imageSize * imageSize;
    int outputSize = poolingForward0->getOutputSize( batchSize );

    float *input = new float[ inputSize ];
    int *selectors = new int[ outputSize ];
    float *output = new float[ outputSize ];

    CLWrapper *inputWrapper = cl->wrap( inputSize, input );
    CLWrapper *selectorsWrapper = cl->wrap( outputSize, selectors );
    CLWrapper *outputWrapper = cl->wrap( outputSize, output );

    WeightRandomizer::randomize( input, inputSize, -0.1f, 0.1f );

    memset( selectors, 99, sizeof(int) * outputSize );
    memset( output, 99, sizeof(int) * outputSize );

    inputWrapper->copyToDevice();
    selectorsWrapper->copyToDevice();
    outputWrapper->copyToDevice();

    poolingForward0->forward( batchSize, inputWrapper, selectorsWrapper, outputWrapper );
    selectorsWrapper->copyToHost();
    outputWrapper->copyToHost();

    int *selectors0 = new int[ outputSize ];
    float *output0 = new float[ outputSize ];
    memcpy( selectors0, selectors, sizeof(int) * outputSize );
    memcpy( output0, output, sizeof(float) * outputSize );
    
    memset( selectors, 99, sizeof(int) * outputSize );
    memset( output, 99, sizeof(int) * outputSize );

    inputWrapper->copyToDevice();
    selectorsWrapper->copyToDevice();
    outputWrapper->copyToDevice();

    poolingForward1->forward( batchSize, inputWrapper, selectorsWrapper, outputWrapper );
    selectorsWrapper->copyToHost();
    outputWrapper->copyToHost();
    
    int numErrors = 0;
    for( int i = 0; i < outputSize; i++ ) {
        if( selectors[i] != selectors0[i] ) {
            cout << "ERROR: selectors[" << i << "] instance0:" << selectors0[i] << " != instance1:" << selectors[i] << endl;
            numErrors++;
        }
        if( output[i] != output0[i] ) {
            cout << "ERROR: output[" << i << "] instance0:" << output0[i] << " != instance1:" << output[i] << endl;
            numErrors++;
        }
        if( numErrors >= 10 ) {
            cout << "More than 10 errors. Skipping the rest :-)" << endl;
            break;
        }
    }
    EXPECT_EQ( 0, numErrors );
    if( numErrors > 0 ) {
        int num2dPlanes = inputSize / imageSize / imageSize;
        for( int plane = 0; plane < num2dPlanes; plane++ ) {
            cout << "2dplane " << plane << ":" << endl;
            for( int i = 0; i < imageSize; i++ ) {
                string line = "";
                for( int j = 0; j < imageSize; j++ ) {
                    line += toString( input[ plane * imageSize * imageSize + i * imageSize + j] ) + " ";
                }
                cout << line << endl;
            }
            cout << endl;
        }
    }

    delete inputWrapper;
    delete selectorsWrapper;
    delete outputWrapper;
    delete poolingForward0;
    delete poolingForward1;
    delete[] selectors0;
    delete[] output0;
    delete[] selectors;
    delete[] output;
    delete[] input;
    delete cl;
}
Пример #22
0
float THClBlas_dot(THClState *state, long n, 
    CLWrapper *xwrapper, long xoffset, long incx, 
    CLWrapper *ywrapper, long yoffset, long incy)
{
  StatefulTimer::timeCheck("THClBlas_dot START");
  if(n == 1)
  {
    incx = 1;
    incy = 1;
  }

  if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
  {
    int i_n = (int)n;
    int i_incx = (int)incx;
    int i_incy = (int)incy;
    float result;

    cl_int err;

    EasyCL *cl = ywrapper->getCl();
    CLWrapper *resultWrapper = cl->wrap( 1, &result );
    float *scratch = new float[i_n];
    CLWrapper *scratchWrapper = cl->wrap(i_n, scratch);
    scratchWrapper->createOnDevice();
    resultWrapper->createOnDevice();

    cl_event *event = 0;
//    if(state->addFinish) {
      event = new cl_event();
//    }
    err = clblasSdot( i_n, resultWrapper->getBuffer(), 0, 
          xwrapper->getBuffer(), xoffset, i_incx, 
          ywrapper->getBuffer(), yoffset, i_incy, 
          scratchWrapper->getBuffer(),
          1, cl->queue, 0, NULL, event);
    if (err != CL_SUCCESS) {
        THError("clblasSdot() failed with %d", err);
    }
    else {
//      if(state->addFinish) {
        err = clWaitForEvents(1, event);
        if (err != CL_SUCCESS) {
//              throw runtime_error("clblasSger() failed with " + easycl::toString(err));
          THError("clblasSger: wait for event failed with %d", err);
        }
        clReleaseEvent(*event);
        delete event;
//      }
    }
    // TODO: NOT copy to host here, use pointtensor
    resultWrapper->copyToHost();
    //resultWrapper->markDeviceDirty();

    delete resultWrapper;
    delete scratchWrapper;
    delete[] scratch;
    StatefulTimer::timeCheck("THClBlas_dot END");
    return result;
  }
  THError("Cublas_dot only supports n, incx and incy "
          "upto signed integer limits: %d", INT_MAX);
    THError("Not implemented");
  return -1;
}
Пример #23
0
float THClBlas_dot(THClState *state, long n, 
    CLWrapper *xwrapper, long xoffset, long incx, 
    CLWrapper *ywrapper, long yoffset, long incy)
{
  if(n == 1)
  {
    incx = 1;
    incy = 1;
  }

  if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
  {
    int i_n = (int)n;
    int i_incx = (int)incx;
    int i_incy = (int)incy;
    float result;

    cl_int err;

    err = clblasSetup();
    if (err != CL_SUCCESS) {
        THError("clblasSetup() failed with %d", err);
    }
    
    CLWrapper *resultWrapper = state->cl->wrap( 1, &result );
    float *scratch = new float[i_n];
    CLWrapper *scratchWrapper = state->cl->wrap(i_n, scratch);
    scratchWrapper->createOnDevice();
    resultWrapper->createOnDevice();

    cl_event event = NULL;
    err = clblasSdot( i_n, resultWrapper->getBuffer(), 0, 
          xwrapper->getBuffer(), xoffset, i_incx, 
          ywrapper->getBuffer(), yoffset, i_incy, 
          scratchWrapper->getBuffer(),
          1, state->cl->queue, 0, NULL, &event);
//    THCublasCheck(cublasSdot(*state->blasState->current_handle, i_n, x, i_incx, y, i_incy, &result));
//    ClDeviceSynchronize();
    if (err != CL_SUCCESS) {
        THError("clblasSdot() failed with %d", err);
    }
    else {
        /* Wait for calculations to be finished. */
        err = clWaitForEvents(1, &event);
    }
    resultWrapper->copyToHost();

    /* Finalize work with clblas. */
    clblasTeardown();


    delete resultWrapper;
    delete scratchWrapper;
    delete[] scratch;
    return result;
  }
  THError("Cublas_dot only supports n, incx and incy "
          "upto signed integer limits: %d", INT_MAX);
    THError("Not implemented");
  return -1;
}