/**
 * Builds the OpenCL environment used by the benchmark and initializes clBLAS.
 *
 * Picks the first platform and the first device of type `devType`, creates one
 * context and one command queue (properties 0) on it, asks the timer for a
 * unique id under the label "clfunc", and queries the device's maximum
 * allocation size.
 *
 * @param _timer  Timer the instance keeps a reference to.
 * @param devType OpenCL device type forwarded to clGetDeviceIDs.
 *
 * Every OpenCL setup status is routed through OPENCL_V_THROW (defined outside
 * this block; presumably throws on failure). A clblasSetup() failure is only
 * reported on std::cerr and the queue/context are released.
 * NOTE(review): after that path queue_ and ctx_ still hold the released
 * handles; confirm how the destructor and callers deal with that.
 */
clblasFunc(StatisticalTimer& _timer, cl_device_type devType)
    : timer(_timer)
{
    cl_int err;

    /* Setup OpenCL environment. */
    OPENCL_V_THROW(clGetPlatformIDs(1, &platform_, NULL),
                   "getting platform IDs");
    OPENCL_V_THROW(clGetDeviceIDs(platform_, devType, 1, &device_, NULL),
                   "getting device IDs");

    props_[0] = CL_CONTEXT_PLATFORM;
    props_[1] = (cl_context_properties)platform_;
    props_[2] = 0;
    ctx_ = clCreateContext(props_, 1, &device_, NULL, NULL, &err);
    OPENCL_V_THROW(err, "creating context");

    /* The queue status used to be dropped silently; check it like the context. */
    queue_ = clCreateCommandQueue(ctx_, device_, 0, &err);
    OPENCL_V_THROW(err, "creating command queue");

    timer_id = timer.getUniqueID( "clfunc", 0 );

    maxMemAllocSize = queryMemAllocSize( device_ );

    /* Setup clblas. */
    err = clblasSetup();
    if (err != CL_SUCCESS) {
        /* Stream the status code itself: iostreams do not expand "%d". */
        std::cerr << "clblasSetup() failed with " << err << "\n";
        clReleaseCommandQueue(queue_);
        clReleaseContext(ctx_);
    }
}
/*
 * Runs clblasDgemm (column-major) on env->queues[0] and copies the result
 * back into c.
 *
 * a, b        host inputs, handed to create_mem as read-only buffers
 * c           host output; it is also handed to create_mem as initial content
 *             only when beta != 0, otherwise the device buffer is created
 *             from NULL
 * alpha, beta GEMM scalars
 * transA/B    transpose flags forwarded unchanged
 * ar, ac, br, bc, cr, cc
 *             matrix dimensions; M = ar, N = bc, K = ac and the leading
 *             dimensions are ar, br, cr (cc is not used)
 * size_a/b/c  sizes forwarded to create_mem / read_mem
 *
 * Returns CL_SUCCESS when the end of the function is reached. Every status
 * goes through CHECK, whose failure behaviour is defined outside this block.
 * clblasSetup()/clblasTeardown() bracket each call.
 */
cl_int Dgemm_internal(
    cl_env *env, double *a, double *b, double *c, double alpha, double beta,
    clblasTranspose transA, clblasTranspose transB,
    int ar, int ac, int br, int bc, int cr, int cc,
    int size_a, int size_b, int size_c)
{
    CHECK(clblasSetup());

    cl_event events[NEVENTS];
    int nevent = 0;

    /* Each create_mem call that is given an event slot advances nevent. */
    cl_mem mem_a = create_mem(env, a, size_a, CL_MEM_READ_ONLY, &(events[nevent++]));
    cl_mem mem_b = create_mem(env, b, size_b, CL_MEM_READ_ONLY, &(events[nevent++]));
    cl_mem mem_c = (beta != 0)
        ? create_mem(env, c, size_c, CL_MEM_READ_WRITE, &(events[nevent++]))
        : create_mem(env, NULL, size_c, CL_MEM_READ_WRITE, NULL);

    /* The GEMM waits on the first nevent events and signals events[nevent]. */
    cl_int err = clblasDgemm(clblasColumnMajor, transA, transB,
                             ar, bc, ac,
                             alpha,
                             mem_a, 0, ar,
                             mem_b, 0, br,
                             beta,
                             mem_c, 0, cr,
                             1, &(env->queues[0]),
                             nevent, events, &(events[nevent]));
    CHECK(err);

    /* Read-back depends on the GEMM event; its own event lands one slot later. */
    events[nevent+1] = *read_mem(env, mem_c, c, size_c, 1, &(events[nevent]));
    CHECK(clWaitForEvents(1, &(events[nevent+1])));

    CHECK(clReleaseMemObject(mem_a));
    CHECK(clReleaseMemObject(mem_b));
    CHECK(clReleaseMemObject(mem_c));

    clblasTeardown();
    return CL_SUCCESS;
}
/*
 * Takes one reference on the shared clBLAS state for ctx.
 *
 * clblasSetup() is run only while the shared counter refcnt is zero; its
 * status goes through CLB_CHECK together with ctx->err (macro defined outside
 * this block). A context without a blas_handle gets the address of refcnt as
 * its handle. Returns GA_NO_ERROR when the end of the function is reached.
 */
static int setup(gpucontext *ctx)
{
  if (!refcnt) {
    CLB_CHECK(ctx->err, clblasSetup());
  }

  if (!ctx->blas_handle) {
    ctx->blas_handle = &refcnt;
  }

  ++refcnt;
  return GA_NO_ERROR;
}
// ======================================== // initialization // -------------------- extern "C" magma_int_t magma_init() { g_runtime.init(); g_runtime.load_kernels( 1, &clmagma_kernels ); gContext = g_runtime.get_context(); cl_int err = clblasSetup(); check_error( err ); g_event = NULL; return err; }
/*
 * Takes one reference on the shared clBLAS state for the context passed as c
 * (treated as a cl_ctx *).
 *
 * clblasSetup() is run only while the shared counter refcnt is zero; if it
 * does not return clblasSuccess the function returns GA_BLAS_ERROR without
 * touching the counter. Otherwise a context without a blas_handle gets the
 * address of refcnt as its handle, the counter is incremented and
 * GA_NO_ERROR is returned.
 */
static int setup(void *c)
{
  cl_ctx *ctx = (cl_ctx *)c;

  if (!refcnt) {
    clblasStatus err = clblasSetup();
    if (err != clblasSuccess) {
      return GA_BLAS_ERROR;
    }
  }

  if (!ctx->blas_handle) {
    ctx->blas_handle = &refcnt;
  }

  ++refcnt;
  return GA_NO_ERROR;
}
void InitJTorch(const bool use_cpu, const uint32_t requested_deviceid, const bool verbose_startup) { std::lock_guard<std::mutex> lck(cl_context_lock_); // Check we haven't already called init. RASSERT(cl_context == nullptr); if (verbose_startup) { std::cout << "Valid OpenCL devices attached:" << std::endl; const uint32_t num_devices = jcl::OpenCLContext::printDevices(); static_cast<void>(num_devices); } jcl::CLDevice device = use_cpu ? jcl::CLDeviceCPU : jcl::CLDeviceGPU; jcl::CLVendor vendor = jcl::CLVendorAny; const bool device_exists = jcl::OpenCLContext::queryDeviceExists(device, vendor); if (!device_exists) { if (use_cpu) { std::cerr << "No CPU devices attached."; } else { std::cerr << "No GPU devices attached."; } } RASSERT(device_exists); // Otherwise, initialize the context. cl_context.reset(new jcl::OpenCLContext()); cl_context->init(device, jcl::CLVendorAny, verbose_startup); // Make sure the user is requesting a device id that exists. RASSERT(requested_deviceid < cl_context->getNumDevices()); deviceid = requested_deviceid; std::cout << "Jtorch is using device " << deviceid << ": " << cl_context->getDeviceName(deviceid) << std::endl; // Startup clblas. // TODO(tompson): I have NO idea what device ID this will run on. const cl_int blas_ret = clblasSetup(); const bool blas_ok = (blas_ret == CL_SUCCESS); if (!blas_ok) { std::cout << "ERROR - InitJTorchInternal: clblasSetup returned error: " << jcl::OpenCLContext::getErrorString(blas_ret); } RASSERT(blas_ok); }
/*
 * Runs clblasDtrmm (column-major) on env->queues[0] and copies the result
 * back into b.
 *
 * a            host input, handed to create_mem as a read-only buffer
 * b            host input/output, handed to create_mem as a read-write buffer
 *              and overwritten by the read-back
 * alpha        scalar forwarded to clblasDtrmm
 * side, transA, uplo, diag
 *              flags forwarded unchanged
 * ar, br, bc   M = br, N = bc; leading dimensions are ar (A) and br (B)
 * ac           not used
 * size_a/b     sizes forwarded to create_mem / read_mem
 *
 * Returns CL_SUCCESS when the end of the function is reached. Every status
 * goes through CHECK, whose failure behaviour is defined outside this block.
 * clblasSetup()/clblasTeardown() bracket each call.
 */
cl_int Dtrmm_internal(
    cl_env *env, double *a, double *b, double alpha,
    clblasSide side, clblasTranspose transA, clblasUplo uplo, clblasDiag diag,
    int ar, int ac, int br, int bc, int size_a, int size_b)
{
    CHECK(clblasSetup());

    cl_event events[NEVENTS];
    int nevent = 0;

    /* Both create_mem calls are given an event slot, so nevent ends up at 2. */
    cl_mem mem_a = create_mem(env, a, size_a, CL_MEM_READ_ONLY, &(events[nevent++]));
    cl_mem mem_b = create_mem(env, b, size_b, CL_MEM_READ_WRITE, &(events[nevent++]));

    /* TRMM waits on the first nevent events and signals events[nevent]. */
    cl_int err = clblasDtrmm(clblasColumnMajor, side, uplo, transA, diag,
                             br, bc,
                             alpha,
                             mem_a, 0, ar,
                             mem_b, 0, br,
                             1, &(env->queues[0]),
                             nevent, events, &(events[nevent]));
    CHECK(err);

    /* Read-back depends on the TRMM event; its own event lands one slot later. */
    events[nevent+1] = *read_mem(env, mem_b, b, size_b, 1, &(events[nevent]));
    CHECK(clWaitForEvents(1, &(events[nevent+1])));

    CHECK(clReleaseMemObject(mem_a));
    CHECK(clReleaseMemObject(mem_b));

    clblasTeardown();
    return CL_SUCCESS;
}
void Caffe::SetDevice(const int device_id) { std::vector<int> devices; devices.push_back(device_id); Caffe::SetDevices(devices); Get().default_device_context_ = GetDeviceContext(device_id); if (Get().default_device_context_->backend() == Backend::BACKEND_CUDA) { #ifdef USE_CUDA int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); if (current_device == device_id) { return; } // The call to cudaSetDevice must come before any calls to Get, which // may perform initialization using the GPU. CUDA_CHECK(cudaSetDevice(device_id)); if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_)); if (Get().curand_generator_) { CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_)); } CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_)); CURAND_CHECK( curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK( curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); #endif // USE_CUDA } else { #ifdef USE_GREENTEA #ifdef USE_CLBLAS clblasSetup(); #endif // USE_CLBLAS #endif // USE_GREENTEA } }
/*
 * Runs clblasDsyrk (column-major) on env->queues[0] and copies the result
 * back into c.
 *
 * a            host input, handed to create_mem as a read-only buffer
 * c            host output; it is also handed to create_mem as initial content
 *              only when beta != 0, otherwise the device buffer is created
 *              from NULL
 * alpha, beta  SYRK scalars
 * transA, uplo flags forwarded unchanged
 * ar, ac       dimensions of a; K is ar for clblasNoTrans and ac otherwise,
 *              and ac is passed as lda
 * n            order of c, also passed as ldc
 * size_a/c     sizes forwarded to create_mem / read_mem
 *
 * NOTE(review): for a column-major ar x ac matrix one would expect K = ac and
 * lda = ar in the no-transpose case; the choice here may reflect a row-major
 * caller convention -- confirm against the callers before changing it.
 *
 * Returns CL_SUCCESS when the end of the function is reached. Every status
 * goes through CHECK, whose failure behaviour is defined outside this block.
 * clblasSetup()/clblasTeardown() bracket each call.
 */
cl_int Dsyrk_internal(
    cl_env *env, double *a, double *c, double alpha, double beta,
    clblasTranspose transA, clblasUplo uplo,
    int ar, int ac, int n, int size_a, int size_c)
{
    CHECK(clblasSetup());

    cl_event events[NEVENTS];
    int nevent = 0;

    /* Each create_mem call that is given an event slot advances nevent. */
    cl_mem mem_a = create_mem(env, a, size_a, CL_MEM_READ_ONLY, &(events[nevent++]));
    cl_mem mem_c = (beta != 0)
        ? create_mem(env, c, size_c, CL_MEM_READ_WRITE, &(events[nevent++]))
        : create_mem(env, NULL, size_c, CL_MEM_READ_WRITE, NULL);

    int k;
    if (transA == clblasNoTrans) {
        k = ar;
    } else {
        k = ac;
    }

    /* SYRK waits on the first nevent events and signals events[nevent]. */
    cl_int err = clblasDsyrk(clblasColumnMajor, uplo, transA,
                             n, k,
                             alpha,
                             mem_a, 0, ac,
                             beta,
                             mem_c, 0, n,
                             1, &(env->queues[0]),
                             nevent, events, &(events[nevent]));
    CHECK(err);

    /* Read-back depends on the SYRK event; its own event lands one slot later. */
    events[nevent+1] = *read_mem(env, mem_c, c, size_c, 1, &(events[nevent]));
    CHECK(clWaitForEvents(1, &(events[nevent+1])));

    CHECK(clReleaseMemObject(mem_a));
    CHECK(clReleaseMemObject(mem_c));

    clblasTeardown();
    return CL_SUCCESS;
}
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufY, bufDotP, scratchBuff; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); int lenY = 1 + (N-1)*abs(incy); /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float)), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenY*sizeof(cl_float)), NULL, &err); // Allocate 1 element space for dotProduct bufDotP = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (sizeof(cl_float)), NULL, &err); // Allocate minimum of N elements scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (N*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); /* Call clblas function. 
*/ err = clblasSdot( N, bufDotP, 0, bufX, 0, incx, bufY, 0, incy, scratchBuff, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSdot() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufDotP, CL_TRUE, 0, sizeof(cl_float), &dotProduct, 0, NULL, NULL); printf("Result dot product: %f\n", dotProduct); } /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); clReleaseMemObject(bufDotP); clReleaseMemObject(scratchBuff); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
bool OpenCLManager::Init() { if (instance_.initialized_) { return true; } LOG(INFO) << "Initialize OpenCL"; instance_.Query(); if ( OpenCLManager::GetNumPlatforms() <= 0 ) { LOG(FATAL) << "No OpenCL platforms found."; return false; } // TODO: mechanism for choosing the correct platform. instance_.current_platform_index_ = 0; std::tr1::shared_ptr<OpenCLPlatform> pf = CurrentPlatform(); pf->print(); if (!pf->createContext()) { LOG(FATAL) << "failed to create OpenCL context for platform " << pf->name(); return false; } std::vector<std::string> cl_files; cl_files.push_back("src/caffe/util/OpenCL/math_functions.cl"); cl_files.push_back("src/caffe/util/OpenCL/gemm.cl"); cl_files.push_back("src/caffe/util/OpenCL/im2col.cl"); cl_files.push_back("src/caffe/layers/OpenCL/pooling_layer.cl"); cl_files.push_back("src/caffe/layers/OpenCL/relu_layer.cl"); cl_files.push_back("src/caffe/layers/OpenCL/prelu_layer.cl"); cl_files.push_back("src/caffe/layers/OpenCL/sigmoid_layer.cl"); cl_files.push_back("src/caffe/layers/OpenCL/tanh_layer.cl"); cl_files.push_back("src/caffe/layers/OpenCL/dropout_layer.cl"); cl_files.push_back("src/caffe/layers/OpenCL/bnll_layer.cl"); cl_files.push_back("src/caffe/layers/OpenCL/contrastive_loss_layer.cl"); cl_files.push_back("src/caffe/layers/OpenCL/eltwise_layer.cl"); cl_files.push_back("src/caffe/layers/OpenCL/lrn_layer.cl"); cl_files.push_back("src/caffe/layers/OpenCL/softmax_layer.cl"); cl_files.push_back("src/caffe/layers/OpenCL/softmax_loss_layer.cl"); cl_files.push_back("src/caffe/layers/OpenCL/threshold_layer.cl"); cl_files.push_back("src/caffe/layers/OpenCL/mvn_layer.cl"); std::vector<std::string>::iterator it; for ( it = cl_files.begin(); it != cl_files.end(); it++ ) { if ( !pf->compile(*it) ) { LOG(FATAL) << "failed to create to create OpenCL program for platform " << pf->name(); return false; } } if ( pf->getNumGPUDevices() < 1 ) { LOG(FATAL) << "No GPU devices available at platform " << pf->name(); return false; } 
pf->SetCurrentDevice(CL_DEVICE_TYPE_GPU, instance_.device_id_); OpenCLDevice& device = pf->CurrentDevice(); if (!device.createQueue()) { LOG(FATAL) << "failed to create OpenCL command queue for device " << device.name(); return false; } if ( clblasSetup() != CL_SUCCESS ) { LOG(FATAL) << "failed to initialize clBlas"; return false; } device.print(); instance_.initialized_ = true; return true; }
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX, bufY; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(*A), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(*X), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(*Y), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * lda * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(*X), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(*Y), Y, 0, NULL, NULL); /* Call clblas function. 
*/ err = clblasZhemv(order, uplo, N, alpha, bufA, 0 /*offA */, lda, bufX, 0 /*offx*/, incx, beta, bufY, 0 /*offx*/, incy, 1, &queue, 0, NULL, &event); // blasZhemv(order, uplo, N, alpha, (DoubleComplex*)A, 0, lda, (DoubleComplex*)X, 0, incx, beta, (DoubleComplex*)Y, 0, incy); // err = CL_SUCCESS; //err = clblasZtrmv(order, uplo, clblasNoTrans, clblasNonUnit, N, bufA, 0 /*offA */, lda, // bufX, 0 /*offx*/, incx, // bufY, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasZhemv() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); printResult(); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(*Y), Y, 0, NULL, NULL); /* At this point you will get the result of SSYMV placed in Y array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
int main( void ) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufB, bufC; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs( 1, &platform, NULL ); err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL ); props[1] = (cl_context_properties)platform; ctx = clCreateContext( props, 1, &device, NULL, NULL, &err ); queue = clCreateCommandQueue( ctx, device, 0, &err ); /* Setup clBLAS */ err = clblasSetup( ); /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A), NULL, &err ); bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B), NULL, &err ); bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C), NULL, &err ); err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0, M * K * sizeof( *A ), A, 0, NULL, NULL ); err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0, K * N * sizeof( *B ), B, 0, NULL, NULL ); err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0, M * N * sizeof( *C ), C, 0, NULL, NULL ); /* Call clBLAS extended function. Perform gemm for the lower right sub-matrices */ err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans, M, N, K, alpha, bufA, 0, lda, bufB, 0, ldb, beta, bufC, 0, ldc, 1, &queue, 0, NULL, &event ); /* Wait for calculations to be finished. */ err = clWaitForEvents( 1, &event ); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0, M * N * sizeof(*result), result, 0, NULL, NULL ); /* Release OpenCL memory objects. */ clReleaseMemObject( bufC ); clReleaseMemObject( bufB ); clReleaseMemObject( bufA ); /* Finalize work with clBLAS */ clblasTeardown( ); /* Release OpenCL working objects. */ clReleaseCommandQueue( queue ); clReleaseContext( ctx ); return ret; }
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufAP, bufX, bufY; cl_event event = NULL; int ret = 0, numElementsAP; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } numElementsAP = (N * (N+1)) / 2; // To get number of elements in a packed matrix /* Prepare OpenCL memory objects and place matrices inside them. 
*/ bufAP = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (numElementsAP * sizeof(cl_float)), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float), NULL, &err); err = clEnqueueWriteBuffer(queue, bufAP, CL_TRUE, 0, numElementsAP * sizeof(cl_float), AP, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(cl_float), Y, 0, NULL, NULL); err = clblasSspr2(order, uplo, N, alpha, bufX, 0 /*offx */, incx, bufY, 0 /*offy*/, incy, bufAP, 0 /*offa */, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSspr2() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufAP, CL_TRUE, 0, (numElementsAP * sizeof(cl_float)), AP, 0, NULL, NULL); /* At this point you will get the result of SSPR2 placed in A array. */ printResult(); } /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufAP); clReleaseMemObject(bufY); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufB, bufC; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } // print device name int valueSize=0; clGetDeviceInfo(device, CL_DEVICE_NAME, 0, NULL, &valueSize); char * value = (char*) malloc(valueSize); clGetDeviceInfo(device, CL_DEVICE_NAME, valueSize, value, NULL); printf("Device: %sn\n", value); free(value); props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A), NULL, &err); bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B), NULL, &err); bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, M * K * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, K * N * sizeof(*B), B, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, M * N * sizeof(*C), C, 0, NULL, NULL); /* Call clblas extended function. 
Perform gemm for the lower right sub-matrices */ err = clblasSgemm(order, transA, transB, M - off, N - off, K - off, alpha, bufA, offA, lda, bufB, offB, ldb, beta, bufC, offC, ldc, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSgemmEx() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, M * N * sizeof(*result), result, 0, NULL, NULL); /* At this point you will get the result of SGEMM placed in 'result' array. */ puts(""); printResult("clblasSgemmEx result"); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufC); clReleaseMemObject(bufB); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
/**
 * Computes C = alpha * op(A) * op(B) + beta * C with clBLAS on `device`,
 * treating all matrices as column-major.
 *
 * @param device      OpenCL device a context and a queue are created on.
 * @param inMatrixA   Host data of A, nrowA x ncolA elements.
 * @param nrowA,ncolA Dimensions of A before any transpose; nrowA is used as lda.
 * @param transposeA  true to use A transposed.
 * @param inMatrixB   Host data of B, nrowB x ncolB elements.
 * @param nrowB,ncolB Dimensions of B before any transpose; nrowB is used as ldb.
 * @param transposeB  true to use B transposed.
 * @param alpha,beta  GEMM scalars (narrowed to float when use_float is true).
 * @param outMatrix   Host data of C: copied to the device as the initial C and
 *                    overwritten with the result on success.
 * @param use_float   true: elements are float and clblasSgemm is called;
 *                    false: elements are double and clblasDgemm is called.
 * @return ErrorStatus built from { OpenCL error code, clBLAS status }.
 *
 * Each step runs only while the accumulated error code is CL_SUCCESS. When the
 * file-level `debug` flag is set, a trace of the steps is written to CERR.
 *
 * NOTE(review): clblasSetup() is called here without a matching
 * clblasTeardown(); that is left untouched because other code may rely on
 * clBLAS staying initialized -- confirm.
 */
ErrorStatus gemm_clblas(cl_device_id device,
                        const void *inMatrixA, int nrowA, int ncolA, bool transposeA,
                        const void *inMatrixB, int nrowB, int ncolB, bool transposeB,
                        double alpha, double beta, void *outMatrix, bool use_float)
{
    std::stringstream result;

    // clCreateBuffer takes a non-const host pointer even for a read-only copy.
    void *hostA = const_cast<void *>(inMatrixA);
    void *hostB = const_cast<void *>(inMatrixB);

    if (debug) {
        result << "gemm_clblas( " << (use_float ? "FLOAT" : "DOUBLE") << ")"
               << std::endl << std::endl;
    }

    cl_int err = CL_SUCCESS;

    clblasStatus status = clblasSetup();
    if (status != CL_SUCCESS) {
        if (debug) {
            result << "clblasSetup: " << clblasErrorToString(status) << std::endl;
        }
        err = CL_INVALID_OPERATION;
    }

    // get first platform
    cl_platform_id platform = NULL;
    if (err == CL_SUCCESS) {
        err = clGetPlatformIDs(1, &platform, NULL);
    }

    if (debug && err == CL_SUCCESS) {
        result << "Platform: " << getPlatformInfoString(platform, CL_PLATFORM_NAME) << std::endl;
        result << "Device: " << getDeviceInfoString(device, CL_DEVICE_NAME) << std::endl;
    }

    // context
    cl_context context = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateContext:" << std::endl;
        }
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    }

    // queue
    cl_command_queue queue = NULL;
    if (err == CL_SUCCESS) {
#ifdef CL_VERSION_2_0
        if (debug) {
            result << "clCreateCommandQueueWithProperties:" << std::endl;
        }
        queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);
#else
        if (debug) {
            result << "clCreateCommandQueue:" << std::endl;
        }
        queue = clCreateCommandQueue(context, device, 0, &err);
#endif
    }

    // Output shape and buffer sizes. The products are formed in size_t so
    // that large matrices do not overflow int arithmetic.
    const int nrowC = transposeA ? ncolA : nrowA;
    const int ncolC = transposeB ? nrowB : ncolB;
    const size_t elemSize = use_float ? sizeof(float) : sizeof(double);
    const size_t bytesA = static_cast<size_t>(nrowA) * static_cast<size_t>(ncolA) * elemSize;
    const size_t bytesB = static_cast<size_t>(nrowB) * static_cast<size_t>(ncolB) * elemSize;
    const size_t bytesC = static_cast<size_t>(nrowC) * static_cast<size_t>(ncolC) * elemSize;

    // buffers
    cl_mem cl_input_matrixA = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_input_matrixA:" << std::endl;
        }
        cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                          bytesA, hostA, &err);
    }

    cl_mem cl_input_matrixB = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_input_matrixB:" << std::endl;
        }
        cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                          bytesB, hostB, &err);
    }

    cl_mem cl_output_matrix = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_output_vector:" << std::endl;
        }
        cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                          bytesC, outMatrix, &err);
    }

    // ++++++++++++
    const int lda = nrowA;  // first dimension of A (rows), before any transpose
    const int ldb = nrowB;  // first dimension of B (rows), before any transpose
    const int ldc = nrowC;  // first dimension of C (rows)

    const int M = transposeA ? ncolA : nrowA;  // rows in A (after transpose, if any) and C
    const int N = transposeB ? nrowB : ncolB;  // cols in B (after transpose, if any) and C
    const int K = transposeA ? nrowA : ncolA;  // cols in A and rows in B (after transposes, if any)

    const clblasOrder order = clblasColumnMajor;
    const clblasTranspose transA = transposeA ? clblasTrans : clblasNoTrans;
    const clblasTranspose transB = transposeB ? clblasTrans : clblasNoTrans;

    cl_event event = NULL;

    if (err == CL_SUCCESS) {
        if (use_float) {
            if (debug) {
                result << "clblasSgemm:" << std::endl;
            }
            status = clblasSgemm(order, transA, transB, M, N, K,
                                 static_cast<cl_float>(alpha), cl_input_matrixA, 0, lda,
                                 cl_input_matrixB, 0, ldb, static_cast<cl_float>(beta),
                                 cl_output_matrix, 0, ldc,
                                 1, &queue, 0, NULL, &event);
            if (status != CL_SUCCESS) {
                if (debug) {
                    result << "clblasSgemm error:" << clblasErrorToString(status) << std::endl;
                }
                // Propagate the failure exactly like the double-precision path.
                err = status;
            }
        } else {
            if (debug) {
                result << "clblasDgemm:" << std::endl;
            }
            status = clblasDgemm(order, transA, transB, M, N, K,
                                 alpha, cl_input_matrixA, 0, lda,
                                 cl_input_matrixB, 0, ldb, beta,
                                 cl_output_matrix, 0, ldc,
                                 1, &queue, 0, NULL, &event);
            if (status != CL_SUCCESS) {
                if (debug) {
                    result << "clblasDgemm error:" << clblasErrorToString(status) << std::endl;
                }
                err = status;
            }
        }
    }

    if (err == CL_SUCCESS) {
        /* Wait for calculations to be finished. */
        if (debug) {
            result << "clWaitForEvents:" << std::endl;
        }
        err = clWaitForEvents(1, &event);
    }

    // retrieve result
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "Retrieve result:" << std::endl;
        }
        // Keep the status so a failed read-back is reported to the caller.
        err = clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0,
                                  bytesC, outMatrix, 0, NULL, NULL);
    }

    std::string err_str = clErrorToString(err);
    result << std::endl << err_str << std::endl;

    // cleanup: release only what was actually created
    if (event != NULL) {
        clReleaseEvent(event);
        event = NULL;
    }
    if (cl_output_matrix != NULL) {
        clReleaseMemObject(cl_output_matrix);
        cl_output_matrix = NULL;
    }
    if (cl_input_matrixA != NULL) {
        clReleaseMemObject(cl_input_matrixA);
        cl_input_matrixA = NULL;
    }
    if (cl_input_matrixB != NULL) {
        clReleaseMemObject(cl_input_matrixB);
        cl_input_matrixB = NULL;
    }
    if (queue != NULL) {
        clReleaseCommandQueue(queue);
        queue = NULL;
    }
    if (context != NULL) {
        clReleaseContext(context);
        context = NULL;
    }

    if (debug) {
        CERR << result.str();
    }

    ErrorStatus errorStatus = { err, status };

    return errorStatus;
}
/**
 * Sets up the OpenCL interface.
 *
 * Allocates the bookkeeping members (platform list, current platform /
 * context / device slots, kernel argument structs and kernel objects), wraps
 * every available platform in a PlatformContainer, initializes clBLAS, then
 * selects device (0,0), builds the kernels and runs test().
 *
 * Throws the int -1 when clBLAS setup fails or when a cl::Error is caught
 * while the platforms are being set up.
 */
SpatialSEIR::OCLProvider::OCLProvider()
{
    lssCout << "Setting up OpenCL Interface\n";
    try
    {
        // Allocate space for platforms and current config
        platforms = new std::vector<PlatformContainer*>();
        currentPlatform = new cl::Platform*;
        currentContext = new cl::Context*;
        currentDevice = new DeviceContainer*;
        R_star_args = new FC_R_Star_KernelData();
        R_star_args -> totalWorkUnits = -1;
        p_se_args = new P_SE_Calculation_KernelData();
        p_se_args -> totalWorkUnits = -1;

        // Allocate space for kernels
        test_kernel = new cl::Kernel();
        R_Star_kernel = new cl::Kernel();
        p_se_kernel1 = new cl::Kernel();
        p_se_kernel2 = new cl::Kernel();

        // Build platforms, devices, contexts
        cl_uint i;
        // NOTE(review): pformVec is never deleted. Each PlatformContainer is
        // constructed from a pointer to one of its elements, so the vector
        // presumably has to outlive them -- confirm before adding a delete.
        std::vector<cl::Platform> *pformVec = new std::vector<cl::Platform>;
        cl::Platform::get(pformVec);

        PlatformContainer* newPlatform;
        for (i = 0; i < pformVec -> size(); i++)
        {
            newPlatform = new PlatformContainer((&(*pformVec)[i]));
            platforms -> push_back(newPlatform);
        }

        // Initialize clBLAS library
        clblasStatus err = clblasSetup();
        if (err != CL_SUCCESS)
        {
            lssCout << "Error setting up clBLAS library: " << err << "\n";
            // An int is not handled by the catch below; it leaves the constructor.
            throw(-1);
        }

        // Flag for existence of current<item>s
        isSetup = new int;
        *isSetup = 0;
    }
    catch(const cl::Error& e)  // by const reference: no copy of the exception object
    {
        cout << "Problem getting platforms:" << endl;
        cout << e.what() << ": Error Code " << e.err() << endl;
        throw(-1);
    }

    // Create Kernels
    // Dummy code to pick device 0,0
    setDevice(0,0);
    buildKernels();
    test();
}
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufY; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); int lenY = 1 + (N-1)*abs(incy); /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); printResult(); /* Call clblas function. */ err = clblasSrot(N, bufX, 0, incx, bufY, 0, incy, C, S, 1, &queue, 0, NULL, &event); // printf("here\n"); if (err != CL_SUCCESS) { printf("clblasSrot() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. 
*/ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); /* At this point you will get the result of SROT placed in vector Y. */ printResult(); } //printf("here\n"); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufC, bufB; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * K * sizeof(*A), NULL, &err); bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * K * sizeof(*B), NULL, &err); bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * N * sizeof(*C), NULL, &err); if ((bufA == NULL) || (bufC == NULL) || (bufB == NULL)) { printf("Failed to create buffern"); return 1; } err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * K * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, N * K * sizeof(*B), B, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, N * N * sizeof(*C), C, 0, NULL, NULL); /* Call clblas function. 
*/ err = clblasCher2k(order, uplo, transA, N, K, alpha, bufA, 0, lda, bufB, 0, ldb, beta, bufC, 0, ldc, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasCher2k() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, N * N * sizeof(*C), C, 0, NULL, NULL); /* At this point you will get the result of SSYRK placed in C array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufC); clReleaseMemObject(bufB); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
/**
 * Runs clblasSsyrk / clblasDsyrk on an nrow x ncol input matrix and writes the
 * ncol x ncol result to outMatrix.
 *
 * The routine is called with column-major order, clblasTrans, the upper
 * triangle, alpha = 1 and beta = 0; afterwards symmetrizeSquare_f/_d is applied
 * to the host result. Presumably this yields t(A) %*% A as the function name
 * suggests (per the BLAS SYRK definition) -- confirm against the callers.
 *
 * @param device     OpenCL device used for the context and the command queue.
 * @param inMatrix   Host input, read as float or double depending on use_float.
 * @param outMatrix  Host output buffer with room for ncol * ncol elements; its
 *                   current contents are also copied to the device buffer.
 * @param nrow       Number of rows of the input (passed as K and lda).
 * @param ncol       Number of columns of the input (passed as N and ldc).
 * @param use_float  true selects the single precision path, false double.
 * @return ErrorStatus built from the last OpenCL error code and the last
 *         clblas status.
 *
 * NOTE(review): clblasSetup() is called here but clblasTeardown() is not;
 * confirm whether the library is intentionally left initialized across calls.
 */
ErrorStatus crossprod_clblas(cl_device_id device, void *inMatrix, void *outMatrix, int nrow, int ncol, bool use_float)
{
    std::stringstream result;

    // Byte sizes are computed in size_t: multiplying the int dimensions first
    // could overflow for large matrices before the widening to size_t happens.
    const size_t element_size = use_float ? sizeof(float) : sizeof(double);
    const size_t input_bytes = static_cast<size_t>(nrow) * static_cast<size_t>(ncol) * element_size;
    const size_t output_bytes = static_cast<size_t>(ncol) * static_cast<size_t>(ncol) * element_size;

    if (debug) {
        result << "crossprod_clblas( " << (use_float ? "FLOAT" : "DOUBLE")
               << ", nrow = " << nrow << ", ncol = " << ncol << ")" << std::endl << std::endl;
    }

    cl_int err = CL_SUCCESS;

    clblasStatus status = clblasSetup();
    if (status != CL_SUCCESS) {
        if (debug) {
            result << "clblasSetup: " << clblasErrorToString(status) << std::endl;
        }
        err = CL_INVALID_OPERATION;
    }

    // get first platform (only used for the debug output below)
    cl_platform_id platform = NULL;
    if (err == CL_SUCCESS) {
        err = clGetPlatformIDs(1, &platform, NULL);
    }

    if (debug && err == CL_SUCCESS) {
        result << "Platform: " << getPlatformInfoString(platform, CL_PLATFORM_NAME) << std::endl;
        result << "Device: " << getDeviceInfoString(device, CL_DEVICE_NAME) << std::endl;
    }

    // context
    cl_context context = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateContext:" << std::endl;
        }
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    }

    // queue
    cl_command_queue queue = NULL;
    if (err == CL_SUCCESS) {
#ifdef CL_VERSION_2_0
        if (debug) {
            result << "clCreateCommandQueueWithProperties:" << std::endl;
        }
        queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);
#else
        if (debug) {
            result << "clCreateCommandQueue:" << std::endl;
        }
        queue = clCreateCommandQueue(context, device, 0, &err);
#endif
    }

    // buffers: the host pointers are passed through unchanged, only the byte
    // size depends on the precision
    cl_mem cl_input_matrix = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_input_matrix:" << std::endl;
        }
        cl_input_matrix = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                         input_bytes, inMatrix, &err);
    }

    cl_mem cl_output_matrix = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_output_vector:" << std::endl;
        }
        cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                          output_bytes, outMatrix, &err);
    }

    // ++++++++++++
    const clblasOrder order = clblasColumnMajor;
    const clblasTranspose transA = clblasTrans;

    const size_t lda = nrow;
    const size_t ldc = ncol;

    const cl_float alpha = 1.0;

    clblasUplo uplo = clblasUpper;

    cl_event event = NULL;

    if (err == CL_SUCCESS) {
        const char *routine = use_float ? "clblasSsyrk" : "clblasDsyrk";

        if (debug) {
            result << routine << ":" << std::endl;
        }

        if (use_float) {
            status = clblasSsyrk(order, uplo, transA, ncol, nrow, alpha, cl_input_matrix, 0, lda,
                                 0.0, cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event);
        } else {
            status = clblasDsyrk(order, uplo, transA, ncol, nrow, alpha, cl_input_matrix, 0, lda,
                                 0.0, cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event);
        }

        // Both precisions propagate a clblas failure into err so that the
        // wait and read-back steps below are skipped.
        if (status != CL_SUCCESS) {
            if (debug) {
                result << routine << " error:" << clblasErrorToString(status) << std::endl;
            }
            err = status;
        }
    }

    if (err == CL_SUCCESS) {
        /* Wait for calculations to be finished. */
        if (debug) {
            result << "clWaitForEvents:" << std::endl;
        }
        err = clWaitForEvents(1, &event);
    }

    // retrieve result
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "Retrieve result:" << std::endl;
        }

        err = clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, output_bytes, outMatrix, 0, NULL, NULL);

        // Only post-process the host matrix when the read-back succeeded.
        if (err == CL_SUCCESS) {
            if (use_float) {
                symmetrizeSquare_f(static_cast<float *>(outMatrix), ncol);
            } else {
                symmetrizeSquare_d(static_cast<double *>(outMatrix), ncol);
            }
        }
    }

    std::string err_str = clErrorToString(err);
    result << std::endl << err_str << std::endl;

    // cleanup: handles are still NULL when an earlier step failed
    if (event != NULL) {
        clReleaseEvent(event);
        event = NULL;
    }
    if (cl_output_matrix != NULL) {
        clReleaseMemObject(cl_output_matrix);
        cl_output_matrix = NULL;
    }
    if (cl_input_matrix != NULL) {
        clReleaseMemObject(cl_input_matrix);
        cl_input_matrix = NULL;
    }
    if (queue != NULL) {
        clReleaseCommandQueue(queue);
        queue = NULL;
    }
    if (context != NULL) {
        clReleaseContext(context);
        context = NULL;
    }

    if (debug) {
        CERR << result.str();
    }

    ErrorStatus errorStatus = { err, status };
    return errorStatus;
}
int main(int argc, char *argv[]) { clMath::Config cfg; cfg.setDefaultConfig("ktest.cfg"); if (!cfg.parseCommandLine(argc, argv) || !cfg.isSane()) { return 1; } clblasSetup(); parseEnvImplementation(); clMath::Step *masterStep = getMasterStep(cfg.blasFunctionID(), cfg.platform(), cfg.device()); if (masterStep == NULL) { std::cerr << "Function support not implemented yet" << std::endl; return 1; } CLBlasKargs kargs; SubproblemDim subdims[MAX_SUBDIMS]; cfg.kargs(&kargs); masterStep->setKargs(kargs); masterStep->fixLD(); ListHead seq; listInitHead(&seq); bool severalKernels = false; /* Single kernel for this function */ if (cfg.decomposition(subdims)) { masterStep->setDecomposition(subdims); } masterStep->completeDecompositionSingle(); if (cfg.permitMultiKernels()) { masterStep->makeSolutionSequence(&seq, getPlatform(cfg.platform().c_str())); if (listLength(&seq) > 1) { severalKernels = true; } } if (severalKernels) { std::ofstream fs; ListNode *node; std::vector<clMath::Step*> steps; masterStep->declareVars(NULL); for (node = listNodeFirst(&seq); node != &seq; node = node->next) { steps.push_back(getStep(node)); } std::string str; for (unsigned int i = 0; i < steps.size(); i++) { std::stringstream kernelFileName; kernelFileName << i << "_" << steps[i]->getBlasFunctionName() << "_" << cfg.cl(); steps[i]->setKernelName(kernelFileName.str()); if (cfg.decomposition(subdims)) { steps[i]->setDecomposition(subdims); } steps[i]->completeDecompositionSingle(); steps[i]->declareVars(masterStep); std::cout << "Generating '" << steps[i]->kernelName() << "' ..." << std::endl; str = steps[i]->generate(); if (str.empty()) { std::cerr << "failed" << std::endl; abort(); } fs.open(kernelFileName.str().c_str()); fs << str; fs.close(); } clMath::KTest *ktest = new clMath::KTest(masterStep, &steps, &cfg); std::cout << "Generating '" << cfg.cpp() << "' ..." 
<< std::endl; str = ktest->generate(cfg.withAccuracy()); if (str.empty()) { std::cerr << "failed" << std::endl; abort(); } fs.open(cfg.cpp().c_str()); fs << str; fs.close(); delete ktest; for (std::vector<clMath::Step*>::iterator it = steps.begin(); it != steps.end(); ++it) { delete (*it); } steps.clear(); } else { std::ofstream fs; masterStep->setKernelName(cfg.cl()); std::cout << "Generating '" << masterStep->kernelName() << "' ..." << std::endl; masterStep->declareVars(NULL); std::string str; str = masterStep->generate(); if (str.empty()) { std::cerr << "failed" << std::endl; abort(); } fs.open(cfg.cl().c_str()); fs << str; fs.close(); clMath::KTest *ktest = new clMath::KTest(masterStep, &cfg); std::cout << "Generating '" << cfg.cpp() << "' ..." << std::endl; str = ktest->generate(cfg.withAccuracy()); if (str.empty()) { std::cerr << "failed" << std::endl; abort(); } fs.open(cfg.cpp().c_str()); fs << str; fs.close(); delete ktest; } if (cfg.permitMultiKernels()) { masterStep->freeSolutionSequence(&seq); } delete masterStep; return 0; }
// Runs the performance benchmark: for each of args.num_steps problem sizes it sets the buffer
// sizes through 'set_sizes', fills host vectors via PopulateVector, uploads them to freshly
// created device buffers, times the CLBlast routine (and, when requested through
// args.compare_clblas / args.compare_cblas, the reference routines) and prints one table row.
// Between steps args.m, n, k and the three leading dimensions grow by args.step. At least one
// step is always executed.
void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) {

  // Table header first
  PrintTableHeader(args);

  // OpenCL objects and, if it is used as a reference, the clBLAS library
  Platform platform(args.platform_id);
  Device device(platform, args.device_id);
  Context context(device);
  Queue queue(context, device);
  #ifdef CLBLAST_REF_CLBLAS
    if (args.compare_clblas) { clblasSetup(); }
  #endif

  // One iteration per step; the exit test sits in the middle of the loop body so that the
  // problem sizes are not advanced after the final step
  auto steps_done = size_t{0};
  for (;;) {

    // Routine-specific buffer sizes for the current problem size
    set_sizes(args);

    // Host-side input data, filled in the order x, y, a, b, c, ap, scalar
    std::vector<T> host_x(args.x_size);
    PopulateVector(host_x);
    std::vector<T> host_y(args.y_size);
    PopulateVector(host_y);
    std::vector<T> host_a(args.a_size);
    PopulateVector(host_a);
    std::vector<T> host_b(args.b_size);
    PopulateVector(host_b);
    std::vector<T> host_c(args.c_size);
    PopulateVector(host_c);
    std::vector<T> host_ap(args.ap_size);
    PopulateVector(host_ap);
    std::vector<T> host_scalar(args.scalar_size);
    PopulateVector(host_scalar);

    // Device-side buffers of matching sizes
    Buffer<T> dev_x(context, args.x_size);
    Buffer<T> dev_y(context, args.y_size);
    Buffer<T> dev_a(context, args.a_size);
    Buffer<T> dev_b(context, args.b_size);
    Buffer<T> dev_c(context, args.c_size);
    Buffer<T> dev_ap(context, args.ap_size);
    Buffer<T> dev_scalar(context, args.scalar_size);

    // Uploads the host data
    dev_x.Write(queue, args.x_size, host_x);
    dev_y.Write(queue, args.y_size, host_y);
    dev_a.Write(queue, args.a_size, host_a);
    dev_b.Write(queue, args.b_size, host_b);
    dev_c.Write(queue, args.c_size, host_c);
    dev_ap.Write(queue, args.ap_size, host_ap);
    dev_scalar.Write(queue, args.scalar_size, host_scalar);
    Buffers<T> buffers{dev_x, dev_y, dev_a, dev_b, dev_c, dev_ap, dev_scalar};

    // Times the routine under test and the requested references
    std::vector<std::pair<std::string, double>> timings;
    const auto ms_clblast = TimedExecution(args.num_runs, args, buffers, queue, run_routine_, "CLBlast");
    timings.emplace_back("CLBlast", ms_clblast);
    if (args.compare_clblas) {
      const auto ms_clblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference1_, "clBLAS");
      timings.emplace_back("clBLAS", ms_clblas);
    }
    if (args.compare_cblas) {
      const auto ms_cblas = TimedExecution(args.num_runs, args, buffers, queue, run_reference2_, "CPU BLAS");
      timings.emplace_back("CPU BLAS", ms_cblas);
    }

    // One table row per step
    PrintTableRow(args, timings);

    // Stops after the last step, otherwise moves on to the next problem size
    if (++steps_done >= args.num_steps) { break; }
    args.m += args.step;
    args.n += args.step;
    args.k += args.step;
    args.a_ld += args.step;
    args.b_ld += args.step;
    args.c_ld += args.step;
  }

  // Shuts the clBLAS library down again if it was set up above
  #ifdef CLBLAST_REF_CLBLAS
    if (args.compare_clblas) { clblasTeardown(); }
  #endif
}
void CLHelper::Init(unsigned int platform_number, unsigned int device_number) { #ifdef BUILD_OPENCL cl_uint platform_count = 0; clGetPlatformIDs ( 0, 0, &platform_count ); if ( platform_count == 0 ) { FATAL ( "No OpenCL platforms detected!" ); } cl_platform_id* platform_ids = new cl_platform_id[platform_count]; clGetPlatformIDs ( platform_count, platform_ids, NULL ); cl_uint device_count = 0; clGetDeviceIDs ( platform_ids[platform_number], CL_DEVICE_TYPE_ALL, 0, NULL, &device_count ); if ( device_count == 0 ) { FATAL ( "No OpenCL devices detected!" ); } cl_device_id* device_ids = new cl_device_id[device_count]; clGetDeviceIDs ( platform_ids[platform_number], CL_DEVICE_TYPE_ALL, device_count, device_ids, NULL ); char device_name_buffer[256]; clGetDeviceInfo ( device_ids[device_number], CL_DEVICE_NAME, 256, device_name_buffer, 0 ); uint32_t support_buffer; clGetDeviceInfo ( device_ids[device_number], CL_DEVICE_IMAGE_SUPPORT, 4, &support_buffer, 0 ); LOGINFO << "Using OpenCL device: " << device_name_buffer; LOGDEBUG << "Image support: " << ( support_buffer ? 
"Yes" : "No" ); device = device_ids[device_number]; // Create context const cl_context_properties context_properties [] = { CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties> ( platform_ids [platform_number] ), 0, 0 }; LOGDEBUG << "Creating OpenCL context..."; cl_int error = 0; context = clCreateContext ( context_properties, 1, &device_ids[device_number], 0, 0, &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating OpenCL context: " << error ); } // Create command queue LOGDEBUG << "Creating OpenCL command queue..."; queue = clCreateCommandQueue ( context, device_ids[device_number], 0, &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating OpenCL command queue: " << error ); } delete[] device_ids; delete[] platform_ids; // Compile kernels cl_program p_crossCorrelation = CreateProgram ( "kernels/crossCorrelation.cl" ); cl_program p_biasedConvolution = CreateProgram ( "kernels/biasedConvolution.cl" ); cl_program p_fullConvolution = CreateProgram ( "kernels/fullConvolution.cl" ); cl_program p_foldWeights = CreateProgram ( "kernels/foldWeights.cl" ); cl_program p_biasedMatrixVector = CreateProgram ( "kernels/biasedMatrixVector.cl" ); cl_program p_biasGradient = CreateProgram ( "kernels/biasGradient.cl" ); cl_program p_matrixMatrix = CreateProgram ( "kernels/matrixMatrix.cl" ); cl_program p_maximum = CreateProgram ( "kernels/maximumPooling.cl" ); cl_program p_amaximum = CreateProgram ( "kernels/advmaximumPooling.cl" ); cl_program p_nonLinearFunctions = CreateProgram ( "kernels/nonLinearFunctions.cl" ); cl_program p_scaling = CreateProgram ( "kernels/scaling.cl" ); cl_program p_setValue = CreateProgram ( "kernels/setValue.cl" ); cl_program p_sms = CreateProgram ( "kernels/sms.cl" ); cl_program p_im2col = CreateProgram ( "kernels/im2col.cl" ); k_crossCorrelation = clCreateKernel ( p_crossCorrelation, "CROSS_CORRELATION", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_biasedConvolution = 
clCreateKernel ( p_biasedConvolution, "BIASED_CONVOLUTION", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_fullConvolution = clCreateKernel ( p_fullConvolution, "FULL_CONVOLUTION", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_foldWeights = clCreateKernel ( p_foldWeights, "FOLD_WEIGHTS", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_biasedMatrixVector = clCreateKernel ( p_biasedMatrixVector, "BIASED_MATRIX_VECTOR_FWD", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_biasedMatrixVectorGrad = clCreateKernel ( p_biasedMatrixVector, "BIASED_MATRIX_VECTOR_GRAD", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_biasedMatrixVectorBackward = clCreateKernel ( p_biasedMatrixVector, "BIASED_MATRIX_VECTOR_BWD", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_biasGradientPart1 = clCreateKernel ( p_biasGradient, "BIAS_GRADIENT_PART1", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_biasGradientPart2 = clCreateKernel ( p_biasGradient, "BIAS_GRADIENT_PART2", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_matrixMatrix = clCreateKernel ( p_matrixMatrix, "MATRIX_MATRIX", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_maximumForward = clCreateKernel ( p_maximum, "MAXIMUM_POOLING_FWD", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_maximumBackward = clCreateKernel ( p_maximum, "MAXIMUM_POOLING_BWD", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_amaximumForward = 
clCreateKernel ( p_amaximum, "AMAXIMUM_POOLING_FWD", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_amaximumBackward = clCreateKernel ( p_amaximum, "AMAXIMUM_POOLING_BWD", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_nlTanh = clCreateKernel ( p_nonLinearFunctions, "NL_TANH_FWD", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_nlTanhBackward = clCreateKernel ( p_nonLinearFunctions, "NL_TANH_BWD", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_nlSigm = clCreateKernel ( p_nonLinearFunctions, "NL_SIGM_FWD", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_nlSigmBackward = clCreateKernel ( p_nonLinearFunctions, "NL_SIGM_BWD", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_setValue = clCreateKernel ( p_setValue, "SET_VALUE", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_sms = clCreateKernel ( p_sms, "SMS", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_im2col = clCreateKernel ( p_im2col, "IM2COL", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_col2im = clCreateKernel ( p_im2col, "COL2IM", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_up = clCreateKernel ( p_scaling, "UP", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } k_down = clCreateKernel ( p_scaling, "DOWN", &error ); if ( error != CL_SUCCESS ) { FATAL ( "Error creating kernel: " << ( signed int ) error ); } #ifdef BUILD_CLBLAS cl_int err = clblasSetup(); if (err!=CL_SUCCESS) FATAL("Call to clblasSetup 
failed. Error: " << err); #endif #endif }