TEST( SLOW_testintwrapper_huge, testreadwrite ) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); CLKernel *kernel = cl->buildKernel("testeasycl.cl", "test_stress"); const int N = 1000000; int *in = new int[N]; for( int i = 0; i < N; i++ ) { in[i] = i * 3; } int *out = new int[N]; CLWrapper *inwrapper = cl->wrap(N, in); CLWrapper *outwrapper = cl->wrap(N, out); inwrapper->copyToDevice(); outwrapper->createOnDevice(); kernel->input( inwrapper ); kernel->output( outwrapper ); int globalSize = N; int workgroupsize = cl->getMaxWorkgroupSize(); globalSize = ( ( globalSize + workgroupsize - 1 ) / workgroupsize ) * workgroupsize; cout << "globalsize: " << globalSize << " workgroupsize " << workgroupsize << endl; kernel->run_1d( globalSize, workgroupsize ); outwrapper->copyToHost(); for( int i = 0; i < N; i++ ) { if( out[i] != 689514 ) { cout << "out[" << i << "] != 689514: " << out[i] << endl; exit(-1); } } delete outwrapper; delete inwrapper; delete kernel; delete cl; }
VIRTUAL void Adagrad::updateWeights(CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper, AdagradState *trainerState) { int numWeights = trainerState->numWeights; float *working = new float[ numWeights ]; CLWrapper *workingWrapper = cl->wrap(numWeights, working); workingWrapper->createOnDevice(); CLMathWrapper clWeights(weightsWrapper); CLMathWrapper clGradWeights(gradWeightsWrapper); CLMathWrapper clSumSquares(trainerState->sumSquaresWrapper); CLMathWrapper clWorking(workingWrapper); // following all happens on gpu, via clmathwrapper: clWorking = clGradWeights; clWorking.squared(); clSumSquares += clWorking; clWorking = clSumSquares; clWorking.sqrt(); clWorking.inv(); clWorking *= clGradWeights; clWorking *= - learningRate; clWeights += clWorking; delete workingWrapper; delete[] working; }
void CopyBuffer::copy( EasyCL *cl, CLWrapper *sourceWrapper, int *target ) { // first we will copy it to another buffer, so we can copy it out int bufferSize = sourceWrapper->size(); // float *copiedBuffer = new float[ bufferSize ]; CLWrapper *targetWrapper = cl->wrap( bufferSize, target ); targetWrapper->createOnDevice(); // now copy it, via a kernel const string kernelSource = "\n" "kernel void copy( int N, global int const *source, global int *dest ) {\n" " #define globalId ( get_global_id(0) )\n" " if( (int)globalId < N ) {\n" " dest[globalId] = source[globalId];\n" " }\n" " }\n"; CLKernel *kernel = cl->buildKernelFromString( kernelSource, "copy", "" ); kernel->in( bufferSize )->in( sourceWrapper )->out( targetWrapper ); int workgroupSize = 32; int numWorkgroups = ( bufferSize + workgroupSize - 1 ) / workgroupSize; kernel->run_1d( numWorkgroups * workgroupSize, workgroupSize ); cl->finish(); targetWrapper->copyToHost(); delete targetWrapper; delete kernel; // delete[] copiedBuffer; }
THClStorage* THClStorage_newWithSize(THClState *state, int device, long size) { THArgCheck(size >= 0, 2, "invalid size"); if(size > 0) { StatefulTimer::timeCheck("THClStorage_newWithSize START"); THClStorage *storage = (THClStorage*)THAlloc(sizeof(THClStorage)); float *data = new float[size]; storage->device = device; storage->cl = THClState_getClv2(state, storage->device); CLWrapper *wrapper = storage->cl->wrap( size, data ); if(state->trace) cout << "new wrapper, size " << size << endl; if(state->trace) cout << "wrapper->createOnDevice()" << endl; wrapper->createOnDevice(); storage->data = data; storage->wrapper = wrapper; storage->size = size; storage->refcount = 1; storage->flag = TH_STORAGE_REFCOUNTED | TH_STORAGE_RESIZABLE | TH_STORAGE_FREEMEM; StatefulTimer::timeCheck("THClStorage_newWithSize END"); return storage; } else { return THClStorage_newv2(state, device); } }
void measurePerf(int instance, int batchSize, LayerDimensions dim) { EasyCL *cl = EasyCL::createForFirstGpuOtherwiseCpu(); int inputNumElements = dim.inputCubeSize * batchSize; int errorsSize = dim.outputCubeSize * batchSize; int weightsSize = dim.filtersSize; int errorsForUpstreamSize = dim.inputCubeSize * batchSize; float *input = new float[inputNumElements]; float *errors = new float[errorsSize]; float *weights = new float[weightsSize]; WeightRandomizer::randomize(input, inputNumElements, -0.1f, 0.1f); WeightRandomizer::randomize(errors, errorsSize, -0.1f, 0.1f); WeightRandomizer::randomize(weights, weightsSize, -0.1f, 0.1f); float *errorsForUpstream = new float[errorsForUpstreamSize]; CLWrapper *inputWrapper = cl->wrap(inputNumElements, input); CLWrapper *errorsWrapper = cl->wrap(errorsSize, errors); CLWrapper *weightsWrapper = cl->wrap(weightsSize, weights); CLWrapper *errorsForUpstreamWrapper = cl->wrap(errorsForUpstreamSize, errorsForUpstream); inputWrapper->copyToDevice(); errorsWrapper->copyToDevice(); weightsWrapper->copyToDevice(); errorsForUpstreamWrapper->createOnDevice(); StatefulTimer::timeCheck("after init"); Backward *backwardImpl = Backward::instanceSpecific(instance, cl, dim); for(int it = 0; it < 40; it++) { backwardImpl->backward(batchSize, inputWrapper, errorsWrapper, weightsWrapper, errorsForUpstreamWrapper); } StatefulTimer::timeCheck("after backprop"); StatefulTimer::dump(true); delete errorsForUpstreamWrapper; delete weightsWrapper; delete inputWrapper; delete errorsWrapper; delete[] errors; delete[] weights; delete[] input; delete[] errorsForUpstream; delete backwardImpl; delete cl; }
TEST( testMemset, basic ) { EasyCL *cl = DeepCLGtestGlobals_createEasyCL(); CLKernel *kMemset = 0; // [[[cog // import stringify // stringify.write_kernel2( "kMemset", "cl/memset.cl", "cl_memset", '""' ) // ]]] // generated using cog, from cl/memset.cl: const char * kMemsetSource = "// Copyright Hugh Perkins 2015 hughperkins at gmail\n" "//\n" "// This Source Code Form is subject to the terms of the Mozilla Public License,\n" "// v. 2.0. If a copy of the MPL was not distributed with this file, You can\n" "// obtain one at http://mozilla.org/MPL/2.0/.\n" "\n" "kernel void cl_memset(global float *target, const float value, const int N) {\n" " #define globalId get_global_id(0)\n" " if ((int)globalId < N) {\n" " target[globalId] = value;\n" " }\n" "}\n" "\n" ""; kMemset = cl->buildKernelFromString(kMemsetSource, "cl_memset", "", "cl/memset.cl"); // [[[end]]] int N = 10000; float *myArray = new float[N]; CLWrapper *myArrayWrapper = cl->wrap( N, myArray ); myArrayWrapper->createOnDevice(); kMemset->out( myArrayWrapper )->in( 99.0f )->in( N ); int workgroupSize = 64; kMemset->run_1d( ( N + workgroupSize - 1 ) / workgroupSize * workgroupSize, workgroupSize ); cl->finish(); myArrayWrapper->copyToHost(); for( int i = 0; i < 10; i++ ) { // cout << "myArray[" << i << "]=" << myArray[i] << endl; } for( int i = 0; i < N; i++ ) { EXPECT_EQ( 99.0f, myArray[i] ); } delete kMemset; delete cl; }
TEST(testClBlas, basic) { EasyCL *cl = DeepCLGtestGlobals_createEasyCL(); float A[] = {1, 3, 2, 7, 9, 5}; float B[] = {3, -1}; float C[3]; ClBlasInstance clblasInstance; CLWrapper *AWrap = cl->wrap(6, A); CLWrapper *BWrap = cl->wrap(2, B); CLWrapper *CWrap = cl->wrap(3, C); AWrap->copyToDevice(); BWrap->copyToDevice(); CWrap->createOnDevice(); ClBlasHelper::Gemm( cl, clblasRowMajor, clblasNoTrans, clblasNoTrans, 3, 2, 1, 1, AWrap, 0, BWrap, 0, 0, CWrap, 0 ); cl->finish(); CWrap->copyToHost(); EXPECT_EQ(0, C[0]); EXPECT_EQ(-1, C[1]); EXPECT_EQ(22, C[2]); cl->finish(); delete CWrap; delete BWrap; delete AWrap; cl->finish(); delete cl; clblasTeardown(); }
VIRTUAL void Adadelta::updateWeights( CLWrapper *weightsWrapper, CLWrapper *gradWeightsWrapper, AdadeltaState *trainerState ) { // need to calculate // sumGradSquared = decay * sumGradSquared + (1 - decay ) * grad.square() // update = - sumUpdateSquared.sqrt() / sumGradSquared.sqrt() * grad // sumUpdateSquared = decay * sumUpdateSquared + ( 1 - decay ) * update.squared() // weights += update int numWeights = trainerState->numWeights; float *working = new float[ numWeights ]; CLWrapper *workingWrapper = cl->wrap( numWeights, working ); workingWrapper->createOnDevice(); CLMathWrapper clWeights( weightsWrapper ); CLMathWrapper clGradWeights( gradWeightsWrapper ); CLMathWrapper clSumGradSquared( trainerState->sumGradSquaredWrapper ); CLMathWrapper clSumUpdateSquared( trainerState->sumUpdateSquaredWrapper ); CLMathWrapper clWorking( workingWrapper ); // following all happens on gpu, via clmathwrapper: clWorking = clGradWeights; clWorking.squared(); clWorking *= ( 1 - decay ); clSumGradSquared *= decay; clSumGradSquared += clWorking; clWorking = clSumGradSquared; clWorking.inv(); clWorking *= clSumUpdateSquared; clWorking.sqrt(); clWorking *= clGradWeights; clWorking *= - 1; clWeights += clWorking; clSumUpdateSquared *= decay; clWorking.squared(); clWorking *= ( 1 - decay ); clSumUpdateSquared += clWorking; delete workingWrapper; delete[] working; }
float THClBlas_dot(THClState *state, long n, CLWrapper *xwrapper, long xoffset, long incx, CLWrapper *ywrapper, long yoffset, long incy) { if(n == 1) { incx = 1; incy = 1; } if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) { int i_n = (int)n; int i_incx = (int)incx; int i_incy = (int)incy; float result; cl_int err; err = clblasSetup(); if (err != CL_SUCCESS) { THError("clblasSetup() failed with %d", err); } CLWrapper *resultWrapper = state->cl->wrap( 1, &result ); float *scratch = new float[i_n]; CLWrapper *scratchWrapper = state->cl->wrap(i_n, scratch); scratchWrapper->createOnDevice(); resultWrapper->createOnDevice(); cl_event event = NULL; err = clblasSdot( i_n, resultWrapper->getBuffer(), 0, xwrapper->getBuffer(), xoffset, i_incx, ywrapper->getBuffer(), yoffset, i_incy, scratchWrapper->getBuffer(), 1, state->cl->queue, 0, NULL, &event); // THCublasCheck(cublasSdot(*state->blasState->current_handle, i_n, x, i_incx, y, i_incy, &result)); // ClDeviceSynchronize(); if (err != CL_SUCCESS) { THError("clblasSdot() failed with %d", err); } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); } resultWrapper->copyToHost(); /* Finalize work with clblas. */ clblasTeardown(); delete resultWrapper; delete scratchWrapper; delete[] scratch; return result; } THError("Cublas_dot only supports n, incx and incy " "upto signed integer limits: %d", INT_MAX); THError("Not implemented"); return -1; }
void THClBlas_gemm2(THClState *state, char orderchar, char transa, char transb, long m, long n, long k, float alpha, THClTensor *a, long lda, THClTensor *b, long ldb, float beta, THClTensor *c, long ldc) { StatefulTimer::timeCheck("THClBlas_gemm START"); CLWrapper *aWrapper = THClTensor_wrapper(state, a); CLWrapper *bWrapper = THClTensor_wrapper(state, b); CLWrapper *cWrapper = THClTensor_wrapper(state, c); long offseta = THClTensor_storageOffset(state, a); long offsetb = THClTensor_storageOffset(state, b); long offsetc = THClTensor_storageOffset(state, c); // adjustLd(transa, transb, m, n, k, &lda, &ldb, &ldc); clblasTranspose opa = convertTransToClblasOperation(transa); clblasTranspose opb = convertTransToClblasOperation(transb); clblasOrder order = orderchar == 'c' ? clblasColumnMajor : clblasRowMajor; if( (m <= INT_MAX) && (n <= INT_MAX) && (k <= INT_MAX) && (lda <= INT_MAX) && (ldb <= INT_MAX) && (ldc <= INT_MAX) ) { int i_m = (int)m; int i_n = (int)n; int i_k = (int)k; int i_lda = (int)lda; int i_ldb = (int)ldb; int i_ldc = (int)ldc; cl_int err; if( !aWrapper->isOnDevice() ) { aWrapper->createOnDevice(); } if( !bWrapper->isOnDevice() ) { bWrapper->createOnDevice(); } if( !cWrapper->isOnDevice() ) { cWrapper->createOnDevice(); } EasyCL *cl = cWrapper->getCl(); cl_event *event = 0; if(state->addFinish) { event = new cl_event(); } err = clblasSgemm(order, opa, opb, i_m, i_n, i_k, alpha, aWrapper->getBuffer(), offseta, i_lda, bWrapper->getBuffer(), offsetb, i_ldb, beta, cWrapper->getBuffer(), offsetc, i_ldc, 1, cl->queue, 0, NULL, event); if (err != CL_SUCCESS) { THError("clblasSgemm() failed with %d", err); } else { if(state->addFinish) { err = clWaitForEvents(1, event); if (err != CL_SUCCESS) { // throw runtime_error("clblasSger() failed with " + easycl::toString(err)); THError("clblasSger: wait for event failed with %d", err); } clReleaseEvent(*event); delete event; } } cWrapper->markDeviceDirty(); StatefulTimer::timeCheck("THClBlas_gemm END"); return; } THError("Clblas_gemm only supports m, n, k, lda, ldb, ldc" "with the bound [val] <= %d", INT_MAX); }
void THClBlas_ger(THClState *state, long m, long n, float alpha, THClTensor *x, long incx, THClTensor *y, long incy, THClTensor *a, long lda) { StatefulTimer::timeCheck("THClBlas_ger START"); if(n == 1) lda = m; if( (m <= INT_MAX) && (n <= INT_MAX) && (lda <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) { int i_m = (int)m; int i_n = (int)n; int i_lda = (int)lda; int i_incx = (int)incx; int i_incy = (int)incy; cl_int err; CLWrapper *xwrap = THClTensor_wrapper(state, x); CLWrapper *ywrap = THClTensor_wrapper(state, y); CLWrapper *awrap = THClTensor_wrapper(state, a); long x_offset = THClTensor_storageOffset(state, x); long y_offset = THClTensor_storageOffset(state, y); long a_offset = THClTensor_storageOffset(state, a); if(!awrap->isOnDevice()) { awrap->createOnDevice(); } EasyCL *cl = ywrap->getCl(); cl_event *event = 0; if(state->addFinish) { event = new cl_event(); } err = clblasSger(clblasColumnMajor, i_m, i_n, alpha, xwrap->getBuffer(), x_offset, i_incx, ywrap->getBuffer(), y_offset, i_incy, awrap->getBuffer(), a_offset, i_lda, 1, (cl->queue), 0, NULL, event); if (err != CL_SUCCESS) { // throw runtime_error("clblasSger() failed with " + easycl::toString(err)); THError("clblasSger() failed with %d", err); } else { if(state->addFinish) { err = clWaitForEvents(1, event); if (err != CL_SUCCESS) { // throw runtime_error("clblasSger() failed with " + easycl::toString(err)); THError("clblasSger: wait for event failed with %d", err); } clReleaseEvent(*event); delete event; } } awrap->markDeviceDirty(); StatefulTimer::timeCheck("THClBlas_ger END"); return; } THError("Cublas_ger only supports m, n, lda, incx, incy" "with the bound [val] <= %d", INT_MAX); }
float THClBlas_dot(THClState *state, long n, CLWrapper *xwrapper, long xoffset, long incx, CLWrapper *ywrapper, long yoffset, long incy) { StatefulTimer::timeCheck("THClBlas_dot START"); if(n == 1) { incx = 1; incy = 1; } if( (n <= INT_MAX) && (incx <= INT_MAX) && (incy <= INT_MAX) ) { int i_n = (int)n; int i_incx = (int)incx; int i_incy = (int)incy; float result; cl_int err; EasyCL *cl = ywrapper->getCl(); CLWrapper *resultWrapper = cl->wrap( 1, &result ); float *scratch = new float[i_n]; CLWrapper *scratchWrapper = cl->wrap(i_n, scratch); scratchWrapper->createOnDevice(); resultWrapper->createOnDevice(); cl_event *event = 0; // if(state->addFinish) { event = new cl_event(); // } err = clblasSdot( i_n, resultWrapper->getBuffer(), 0, xwrapper->getBuffer(), xoffset, i_incx, ywrapper->getBuffer(), yoffset, i_incy, scratchWrapper->getBuffer(), 1, cl->queue, 0, NULL, event); if (err != CL_SUCCESS) { THError("clblasSdot() failed with %d", err); } else { // if(state->addFinish) { err = clWaitForEvents(1, event); if (err != CL_SUCCESS) { // throw runtime_error("clblasSger() failed with " + easycl::toString(err)); THError("clblasSger: wait for event failed with %d", err); } clReleaseEvent(*event); delete event; // } } // TODO: NOT copy to host here, use pointtensor resultWrapper->copyToHost(); //resultWrapper->markDeviceDirty(); delete resultWrapper; delete scratchWrapper; delete[] scratch; StatefulTimer::timeCheck("THClBlas_dot END"); return result; } THError("Cublas_dot only supports n, incx and incy " "upto signed integer limits: %d", INT_MAX); THError("Not implemented"); return -1; }