Пример #1
0
TEST( testCopyBuffer, nits ) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();

    const int N = 10;
    int *a = new int[N];
    for( int i = 0; i < N; i++ ) {
        a[i] = 3 + i;
    }
    CLWrapper *aWrapper = cl->wrap( N, a );
    aWrapper->copyToDevice();
    memset( a, 0, sizeof(int) * N );

    int *b = new int[N];
    CopyBuffer::copy( cl, aWrapper, b );
    
    for( int i = 0; i < N; i++ ) {
        cout << b[i] << endl;
        EXPECT_EQ( 3 + i, b[i] );
    }

    memset( b, 0, sizeof(int) * N );
    CopyBuffer::copy( cl, aWrapper, b );
    
    for( int i = 0; i < N; i++ ) {
        cout << b[i] << endl;
        EXPECT_EQ( 3 + i, b[i] );
    }

    PrintBuffer::printInts( cl, aWrapper, 10, 1 );
    
    delete[] b;
    delete aWrapper;
    delete[] a;

    delete cl;
}
Пример #2
0
TEST(SLOW_testlocal, reduceviascratch_multipleworkgroups_ints_3levels) {
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    CLKernel *kernel = cl->buildKernel("testlocal.cl", "reduceViaScratch_multipleworkgroups_ints");
    int workgroupSize = min(512, cl->getMaxWorkgroupSize());
    const int numWorkgroups = workgroupSize;
    const int level3size = numWorkgroups / 4;
    const int N = workgroupSize * numWorkgroups * level3size;
    cout << "numworkgroups " << numWorkgroups << " workgroupsize " << workgroupSize << " N " << N << endl;
    int *myarray = new int[N];
    int sumViaCpu = 0;
    int localSumViaCpu = 0;
    int localSumViaCpu2 = 0;
    int *localSumsViaCpu = new int[numWorkgroups * level3size];
    memset(localSumsViaCpu, 0, sizeof(int)*numWorkgroups* level3size);
    for(int i = 0; i < N; i++) {
        myarray[i] = ((i + 7) * 3) % 50;
        sumViaCpu += myarray[i];
        if(i < workgroupSize) {
            localSumViaCpu += myarray[i];
        }
        if(i >= workgroupSize && i < workgroupSize * 2) {
            localSumViaCpu2 += myarray[i];
        }
        int workgroupId = i / workgroupSize;
        localSumsViaCpu[workgroupId] += myarray[i];
    }
    ASSERT_EQ(localSumViaCpu, localSumsViaCpu[0]);
    ASSERT_EQ(localSumViaCpu2, localSumsViaCpu[1]);
    ASSERT_NE(myarray[0], sumViaCpu);

//    Timer timer;

    CLWrapper *a1wrapper = cl->wrap(N, myarray);
    a1wrapper->copyToDevice();
//    timer.timeCheck("copied array to device");
    int *a2 = new int[numWorkgroups*level3size];
    CLWrapper *a2wrapper = cl->wrap(numWorkgroups * level3size, a2);
    kernel->in(a1wrapper);
    kernel->out(a2wrapper);
    kernel->localInts(workgroupSize);
    kernel->run_1d(N, workgroupSize);

    int *a3 = new int[numWorkgroups];
    CLWrapper *a3wrapper = cl->wrap(level3size, a3);
    kernel->in(a2wrapper);
    kernel->out(a3wrapper);
    kernel->localInts(workgroupSize);
    kernel->run_1d(workgroupSize * level3size, workgroupSize);

    int finalSum;
    kernel->in(a3wrapper);
    kernel->out(1, &finalSum);
    kernel->localInts(level3size);
    kernel->run_1d(level3size, level3size);
//    timer.timeCheck("finished 3-level reduce");

    EXPECT_EQ(sumViaCpu, finalSum);

    delete a1wrapper;
    delete a2wrapper;
    delete a3wrapper;
    delete[] a3;
    delete[] a2;
    delete[]myarray;
    delete kernel;
    delete cl;
}
Пример #3
0
void compareSpecific(int instance0, int instance1, int numIts, int batchSize, LayerDimensions dim) {
    cout << "batchsize=" << batchSize << " " << dim << endl;
    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();
    ClBlasInstance clblasInstance;

    int inputNumElements = dim.inputCubeSize * batchSize;
    int errorsSize = dim.outputCubeSize * batchSize;
    int weightsSize = dim.filtersSize;
    int errorsForUpstreamSize = dim.inputCubeSize * batchSize;

    float *input = new float[inputNumElements];
    float *errors = new float[errorsSize];
    float *weights = new float[weightsSize];
    float *errorsForUpstream0 = new float[errorsForUpstreamSize];
    float *errorsForUpstream1 = new float[errorsForUpstreamSize];

    WeightRandomizer::randomize(0, input, inputNumElements, -0.1f, 0.1f);
    WeightRandomizer::randomize(1, errors, errorsSize, -0.1f, 0.1f);
    WeightRandomizer::randomize(2, weights, weightsSize, -0.1f, 0.1f);

    CLWrapper *inputWrapper = cl->wrap(inputNumElements, input);
    CLWrapper *errorsWrapper = cl->wrap(errorsSize, errors);
    CLWrapper *weightsWrapper = cl->wrap(weightsSize, weights);
    CLWrapper *errorsForUpstreamWrapper0 = cl->wrap(errorsForUpstreamSize, errorsForUpstream0);
    CLWrapper *errorsForUpstreamWrapper1 = cl->wrap(errorsForUpstreamSize, errorsForUpstream1);

    inputWrapper->copyToDevice();
    errorsWrapper->copyToDevice();
    weightsWrapper->copyToDevice();
    errorsForUpstreamWrapper0->createOnDevice();
    errorsForUpstreamWrapper1->createOnDevice();

    Backward *bp0 = Backward::instanceSpecific(instance0, cl, dim);
    Backward *bp1 = Backward::instanceSpecific(instance1, cl, dim);
    
    for(int it=0; it < numIts; it++ ) {
        bp0->backward(batchSize, 
                inputWrapper, errorsWrapper, weightsWrapper,
                errorsForUpstreamWrapper0);
        bp1->backward(batchSize, 
                inputWrapper, errorsWrapper, weightsWrapper,
                errorsForUpstreamWrapper1);

        errorsForUpstreamWrapper0->copyToHost();
        errorsForUpstreamWrapper1->copyToHost();

        int outputNumElements = errorsForUpstreamSize;
        cout << dim << endl;
        bool same = true;
        for(int i = 0; i < max(20, outputNumElements); i++) {
            if(i < outputNumElements) {
                if(abs(errorsForUpstream0[i] - errorsForUpstream1[i]) < 0.000001 || abs(errorsForUpstream0[i] - errorsForUpstream1[i]) <= 0.001 * max(abs(errorsForUpstream0[i]), abs(errorsForUpstream1[i]))) {
                    if(it == 0 && i < 20) {
                        cout << "output[" << i << "]=" << errorsForUpstream0[i] << " " << errorsForUpstream1[i];
                        cout << " SAME";
                    }
                } else {
                    cout << "output[" << i << "]=" << errorsForUpstream0[i] << " " << errorsForUpstream1[i];
                    cout << " DIFF";
                    same = false;
                }
            } else {
                 if(it == 0 && i < 20) {
                     cout << "     ";
                 }
            }
            if(it == 0 && i < 20) {
                cout << "  || " << errorsForUpstream1[100+i] ;
                cout << "  || " << errorsForUpstream1[200+i] ;
                cout << "  || " << errorsForUpstream1[300+i] ;
                cout << "  || " << errorsForUpstream1[400+i] ;
                cout << "  || " << errorsForUpstream1[500+i] ;
                cout << "  || " << errorsForUpstream1[600+i] ;
                cout << "  || " << errorsForUpstream1[700+i] << endl;
            }
        }
        EXPECT_EQ(true, same);
    }

    delete inputWrapper;
    delete errorsWrapper;
    delete weightsWrapper;
    delete errorsForUpstreamWrapper0;
    delete errorsForUpstreamWrapper1;

    delete[] errorsForUpstream0;
    delete[] errorsForUpstream1;
    delete bp0;
    delete bp1;
    delete cl;
    delete[] input;
    delete[] errors;
    delete[] weights;
}
Пример #4
0
void compareSpecific( CompareSpecificArgs args ) {
    cout << "instance0: " << args._instance0 << endl;
    cout << "instance1: " << args._instance1 << endl;

    int batchSize = args._batchSize;
    int numPlanes = args._numPlanes;
    int imageSize = args._imageSize;
    int poolingSize = args._poolingSize;

    EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu();

    PoolingForward *poolingForward0 = PoolingForward::instanceSpecific( args._instance0, cl, args._padZeros, numPlanes, imageSize, poolingSize );
    PoolingForward *poolingForward1 = PoolingForward::instanceSpecific( args._instance1, cl, args._padZeros, numPlanes, imageSize, poolingSize );

    const int inputSize = batchSize * numPlanes * imageSize * imageSize;
    int outputSize = poolingForward0->getOutputSize( batchSize );

    float *input = new float[ inputSize ];
    int *selectors = new int[ outputSize ];
    float *output = new float[ outputSize ];

    CLWrapper *inputWrapper = cl->wrap( inputSize, input );
    CLWrapper *selectorsWrapper = cl->wrap( outputSize, selectors );
    CLWrapper *outputWrapper = cl->wrap( outputSize, output );

    WeightRandomizer::randomize( input, inputSize, -0.1f, 0.1f );

    memset( selectors, 99, sizeof(int) * outputSize );
    memset( output, 99, sizeof(int) * outputSize );

    inputWrapper->copyToDevice();
    selectorsWrapper->copyToDevice();
    outputWrapper->copyToDevice();

    poolingForward0->forward( batchSize, inputWrapper, selectorsWrapper, outputWrapper );
    selectorsWrapper->copyToHost();
    outputWrapper->copyToHost();

    int *selectors0 = new int[ outputSize ];
    float *output0 = new float[ outputSize ];
    memcpy( selectors0, selectors, sizeof(int) * outputSize );
    memcpy( output0, output, sizeof(float) * outputSize );
    
    memset( selectors, 99, sizeof(int) * outputSize );
    memset( output, 99, sizeof(int) * outputSize );

    inputWrapper->copyToDevice();
    selectorsWrapper->copyToDevice();
    outputWrapper->copyToDevice();

    poolingForward1->forward( batchSize, inputWrapper, selectorsWrapper, outputWrapper );
    selectorsWrapper->copyToHost();
    outputWrapper->copyToHost();
    
    int numErrors = 0;
    for( int i = 0; i < outputSize; i++ ) {
        if( selectors[i] != selectors0[i] ) {
            cout << "ERROR: selectors[" << i << "] instance0:" << selectors0[i] << " != instance1:" << selectors[i] << endl;
            numErrors++;
        }
        if( output[i] != output0[i] ) {
            cout << "ERROR: output[" << i << "] instance0:" << output0[i] << " != instance1:" << output[i] << endl;
            numErrors++;
        }
        if( numErrors >= 10 ) {
            cout << "More than 10 errors. Skipping the rest :-)" << endl;
            break;
        }
    }
    EXPECT_EQ( 0, numErrors );
    if( numErrors > 0 ) {
        int num2dPlanes = inputSize / imageSize / imageSize;
        for( int plane = 0; plane < num2dPlanes; plane++ ) {
            cout << "2dplane " << plane << ":" << endl;
            for( int i = 0; i < imageSize; i++ ) {
                string line = "";
                for( int j = 0; j < imageSize; j++ ) {
                    line += toString( input[ plane * imageSize * imageSize + i * imageSize + j] ) + " ";
                }
                cout << line << endl;
            }
            cout << endl;
        }
    }

    delete inputWrapper;
    delete selectorsWrapper;
    delete outputWrapper;
    delete poolingForward0;
    delete poolingForward1;
    delete[] selectors0;
    delete[] output0;
    delete[] selectors;
    delete[] output;
    delete[] input;
    delete cl;
}
Пример #5
0
float THClBlas_dot(THClState *state, long n, 
    CLWrapper *xwrapper, long xoffset, long incx, 
    CLWrapper *ywrapper, long yoffset, long incy)
{
  StatefulTimer::timeCheck("THClBlas_dot START");
  if(n == 1)
  {
    incx = 1;
    incy = 1;
  }

  if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) )
  {
    int i_n = (int)n;
    int i_incx = (int)incx;
    int i_incy = (int)incy;
    float result;

    cl_int err;

    EasyCL *cl = ywrapper->getCl();
    CLWrapper *resultWrapper = cl->wrap( 1, &result );
    float *scratch = new float[i_n];
    CLWrapper *scratchWrapper = cl->wrap(i_n, scratch);
    scratchWrapper->createOnDevice();
    resultWrapper->createOnDevice();

    cl_event *event = 0;
//    if(state->addFinish) {
      event = new cl_event();
//    }
    err = clblasSdot( i_n, resultWrapper->getBuffer(), 0, 
          xwrapper->getBuffer(), xoffset, i_incx, 
          ywrapper->getBuffer(), yoffset, i_incy, 
          scratchWrapper->getBuffer(),
          1, cl->queue, 0, NULL, event);
    if (err != CL_SUCCESS) {
        THError("clblasSdot() failed with %d", err);
    }
    else {
//      if(state->addFinish) {
        err = clWaitForEvents(1, event);
        if (err != CL_SUCCESS) {
//              throw runtime_error("clblasSger() failed with " + easycl::toString(err));
          THError("clblasSger: wait for event failed with %d", err);
        }
        clReleaseEvent(*event);
        delete event;
//      }
    }
    // TODO: NOT copy to host here, use pointtensor
    resultWrapper->copyToHost();
    //resultWrapper->markDeviceDirty();

    delete resultWrapper;
    delete scratchWrapper;
    delete[] scratch;
    StatefulTimer::timeCheck("THClBlas_dot END");
    return result;
  }
  THError("Cublas_dot only supports n, incx and incy "
          "upto signed integer limits: %d", INT_MAX);
    THError("Not implemented");
  return -1;
}