// Destructor. Calls clblasTeardown(), then releases the command queue held in
// queue_ and the context held in ctx_, in that order. The status of each
// release call is handed to OPENCL_V_THROW together with a short description
// of the step being performed.
// NOTE(review): OPENCL_V_THROW presumably throws on a non-success status; an
// exception escaping a destructor terminates the program under the default
// C++11 noexcept rules -- confirm against the macro definition.
virtual ~clblasFunc() { clblasTeardown(); OPENCL_V_THROW( clReleaseCommandQueue(queue_), "releasing command queue" ); OPENCL_V_THROW( clReleaseContext(ctx_), "releasing context" ); }
/*
 * Runs clblasDgemm (column-major) on the host arrays a, b and c.
 *
 * a and b are always uploaded through create_mem(); c is uploaded only when
 * beta is non-zero, otherwise a device buffer of size_c is created without
 * host data. The GEMM is enqueued on env->queues[0] and waits on the upload
 * events; the result is read back into c through read_mem(), the function
 * waits for that read, releases the three device buffers and calls
 * clblasTeardown().
 *
 * Every OpenCL/clBLAS status goes through CHECK (presumably an early return
 * on failure -- confirm against the macro). Returns CL_SUCCESS when the end
 * of the function is reached. The cc parameter is not used.
 */
cl_int Dgemm_internal(
    cl_env *env, double *a, double *b, double *c, double alpha, double beta,
    clblasTranspose transA, clblasTranspose transB,
    int ar, int ac, int br, int bc, int cr, int cc,
    int size_a, int size_b, int size_c)
{
    CHECK(clblasSetup());

    cl_event events[NEVENTS];
    int pending = 0;  /* number of upload events the GEMM has to wait for */

    cl_mem dev_a = create_mem(env, a, size_a, CL_MEM_READ_ONLY, events + pending++);
    cl_mem dev_b = create_mem(env, b, size_b, CL_MEM_READ_ONLY, events + pending++);

    cl_mem dev_c;
    if (beta == 0)
        dev_c = create_mem(env, NULL, size_c, CL_MEM_READ_WRITE, NULL);
    else
        dev_c = create_mem(env, c, size_c, CL_MEM_READ_WRITE, events + pending++);

    /* The slot right after the uploads receives the GEMM event, the slot
     * after that the read-back event. */
    cl_event *gemm_done = events + pending;
    cl_event *read_done = gemm_done + 1;

    cl_int status = clblasDgemm(clblasColumnMajor, transA, transB,
                                ar, bc, ac,
                                alpha, dev_a, 0, ar,
                                dev_b, 0, br,
                                beta, dev_c, 0, cr,
                                1, &(env->queues[0]),
                                pending, events, gemm_done);
    CHECK(status);

    *read_done = *read_mem(env, dev_c, c, size_c, 1, gemm_done);
    CHECK(clWaitForEvents(1, read_done));

    CHECK(clReleaseMemObject(dev_a));
    CHECK(clReleaseMemObject(dev_b));
    CHECK(clReleaseMemObject(dev_c));

    clblasTeardown();
    return CL_SUCCESS;
}
// -------------------- extern "C" magma_int_t magma_finalize() { clblasTeardown(); g_runtime.quit(); return MAGMA_SUCCESS; }
/*
 * Detaches the BLAS handle from ctx (if it has one) and decrements the
 * file-level reference count; calls clblasTeardown() whenever that count is
 * zero afterwards.
 *
 * NOTE(review): the zero check runs even when this context held no handle,
 * so clblasTeardown() can be invoked again while refcnt is already zero --
 * confirm that this is intended.
 */
static void teardown(gpucontext *ctx) {
  const int holds_handle = (ctx->blas_handle != NULL);

  if (holds_handle) {
    ctx->blas_handle = NULL;
    refcnt -= 1;
  }

  if (refcnt != 0)
    return;

  clblasTeardown();
}
/*
 * Detaches the BLAS handle from the cl_ctx passed as an untyped pointer (if
 * it has one) and decrements the file-level reference count; calls
 * clblasTeardown() whenever that count is zero afterwards.
 *
 * NOTE(review): the zero check runs even when this context held no handle,
 * so clblasTeardown() can be invoked again while refcnt is already zero --
 * confirm that this is intended.
 */
static void teardown(void *c) {
  cl_ctx *ctx = (cl_ctx *)c;
  const int holds_handle = (ctx->blas_handle != NULL);

  if (holds_handle) {
    ctx->blas_handle = NULL;
    refcnt -= 1;
  }

  if (refcnt != 0)
    return;

  clblasTeardown();
}
// Destructor. Calls clblasTeardown() first and then deletes, one by one, the
// objects referenced by the pointers below (presumably data members of
// OCLProvider -- their declarations are not visible here): platform, context
// and device handles, the platform list, the argument structures, the kernels
// and the isSetup flag.
// NOTE(review): every pointer is released with scalar `delete`; if any of
// them was allocated with `new[]` it needs `delete[]` -- confirm against the
// constructor.
SpatialSEIR::OCLProvider::~OCLProvider() { clblasTeardown(); delete currentPlatform; delete currentContext; delete currentDevice; delete platforms; delete R_star_args; delete p_se_args; delete test_kernel; delete R_Star_kernel; delete p_se_kernel1; delete p_se_kernel2; delete isSetup; }
/*
 * Runs clblasDtrmm (column-major) on the host arrays a and b.
 *
 * Both arrays are uploaded through create_mem(); the multiplication is
 * enqueued on env->queues[0] and waits on the two upload events. The result
 * is read back into b through read_mem(), the function waits for that read,
 * releases both device buffers and calls clblasTeardown().
 *
 * Every OpenCL/clBLAS status goes through CHECK (presumably an early return
 * on failure -- confirm against the macro). Returns CL_SUCCESS when the end
 * of the function is reached. The ac parameter is not used.
 */
cl_int Dtrmm_internal(
    cl_env *env, double *a, double *b, double alpha,
    clblasSide side, clblasTranspose transA, clblasUplo uplo, clblasDiag diag,
    int ar, int ac, int br, int bc, int size_a, int size_b)
{
    CHECK(clblasSetup());

    cl_event events[NEVENTS];
    int pending = 0;  /* number of upload events the TRMM has to wait for */

    cl_mem dev_a = create_mem(env, a, size_a, CL_MEM_READ_ONLY, events + pending++);
    cl_mem dev_b = create_mem(env, b, size_b, CL_MEM_READ_WRITE, events + pending++);

    /* The slot right after the uploads receives the TRMM event, the slot
     * after that the read-back event. */
    cl_event *trmm_done = events + pending;
    cl_event *read_done = trmm_done + 1;

    cl_int status = clblasDtrmm(clblasColumnMajor, side, uplo, transA, diag,
                                br, bc,
                                alpha, dev_a, 0, ar,
                                dev_b, 0, br,
                                1, &(env->queues[0]),
                                pending, events, trmm_done);
    CHECK(status);

    *read_done = *read_mem(env, dev_b, b, size_b, 1, trmm_done);
    CHECK(clWaitForEvents(1, read_done));

    CHECK(clReleaseMemObject(dev_a));
    CHECK(clReleaseMemObject(dev_b));

    clblasTeardown();
    return CL_SUCCESS;
}
/*
 * Runs clblasDsyrk (column-major) on the host arrays a and c.
 *
 * a is always uploaded through create_mem(); c is uploaded only when beta is
 * non-zero, otherwise a device buffer of size_c is created without host
 * data. The update is enqueued on env->queues[0] and waits on the upload
 * events. The result is read back into c through read_mem(), the function
 * waits for that read, releases both device buffers and calls
 * clblasTeardown().
 *
 * Every OpenCL/clBLAS status goes through CHECK (presumably an early return
 * on failure -- confirm against the macro). Returns CL_SUCCESS when the end
 * of the function is reached.
 *
 * NOTE(review): k is taken from ar when transA is clblasNoTrans (ac
 * otherwise) and the leading dimension passed for a is ac -- verify this
 * against the callers' storage convention.
 */
cl_int Dsyrk_internal(
    cl_env *env, double *a, double *c, double alpha, double beta,
    clblasTranspose transA, clblasUplo uplo,
    int ar, int ac, int n, int size_a, int size_c)
{
    CHECK(clblasSetup());

    cl_event events[NEVENTS];
    int pending = 0;  /* number of upload events the SYRK has to wait for */

    cl_mem dev_a = create_mem(env, a, size_a, CL_MEM_READ_ONLY, events + pending++);

    cl_mem dev_c;
    if (beta == 0)
        dev_c = create_mem(env, NULL, size_c, CL_MEM_READ_WRITE, NULL);
    else
        dev_c = create_mem(env, c, size_c, CL_MEM_READ_WRITE, events + pending++);

    int k;
    if (transA == clblasNoTrans)
        k = ar;
    else
        k = ac;

    /* The slot right after the uploads receives the SYRK event, the slot
     * after that the read-back event. */
    cl_event *syrk_done = events + pending;
    cl_event *read_done = syrk_done + 1;

    cl_int status = clblasDsyrk(clblasColumnMajor, uplo, transA,
                                n, k,
                                alpha, dev_a, 0, ac,
                                beta, dev_c, 0, n,
                                1, &(env->queues[0]),
                                pending, events, syrk_done);
    CHECK(status);

    *read_done = *read_mem(env, dev_c, c, size_c, 1, syrk_done);
    CHECK(clWaitForEvents(1, read_done));

    CHECK(clReleaseMemObject(dev_a));
    CHECK(clReleaseMemObject(dev_c));

    clblasTeardown();
    return CL_SUCCESS;
}
// Benchmarks the routine under test for args.num_steps problem sizes. For each
// step the routine-specific buffer sizes are set through set_sizes, host data
// is generated and written to fresh device buffers, and CLBlast (plus clBLAS
// and the CPU BLAS when requested in args) is timed through TimedExecution.
// One table row is printed per step; between steps m, n, k and the leading
// dimensions grow by args.step.
void Client<T,U>::PerformanceTest(Arguments<U> &args, const SetMetric set_sizes) {
  using Timing = std::pair<std::string, double>;

  // Table header comes first
  PrintTableHeader(args);

  // OpenCL platform, device, context and queue used for every step
  auto platform = Platform(args.platform_id);
  auto device = Device(platform, args.device_id);
  auto context = Context(device);
  auto queue = Queue(context, device);

  // clBLAS is initialised only when it is compiled in and was requested
  #ifdef CLBLAST_REF_CLBLAS
    if (args.compare_clblas) { clblasSetup(); }
  #endif

  // One iteration per step; the loop is left from inside once the last step ran
  for (auto step_index = size_t{0}; ; ) {

    // Routine-specific buffer sizes for the current problem size
    set_sizes(args);

    // Host-side input data
    std::vector<T> x_source(args.x_size);
    std::vector<T> y_source(args.y_size);
    std::vector<T> a_source(args.a_size);
    std::vector<T> b_source(args.b_size);
    std::vector<T> c_source(args.c_size);
    std::vector<T> ap_source(args.ap_size);
    std::vector<T> scalar_source(args.scalar_size);
    PopulateVector(x_source);
    PopulateVector(y_source);
    PopulateVector(a_source);
    PopulateVector(b_source);
    PopulateVector(c_source);
    PopulateVector(ap_source);
    PopulateVector(scalar_source);

    // Device-side buffers, filled from the host data above
    auto x_vec = Buffer<T>(context, args.x_size);
    auto y_vec = Buffer<T>(context, args.y_size);
    auto a_mat = Buffer<T>(context, args.a_size);
    auto b_mat = Buffer<T>(context, args.b_size);
    auto c_mat = Buffer<T>(context, args.c_size);
    auto ap_mat = Buffer<T>(context, args.ap_size);
    auto scalar = Buffer<T>(context, args.scalar_size);
    x_vec.Write(queue, args.x_size, x_source);
    y_vec.Write(queue, args.y_size, y_source);
    a_mat.Write(queue, args.a_size, a_source);
    b_mat.Write(queue, args.b_size, b_source);
    c_mat.Write(queue, args.c_size, c_source);
    ap_mat.Write(queue, args.ap_size, ap_source);
    scalar.Write(queue, args.scalar_size, scalar_source);
    auto buffers = Buffers<T>{x_vec, y_vec, a_mat, b_mat, c_mat, ap_mat, scalar};

    // Timings: CLBlast always, the references only on request
    auto timings = std::vector<Timing>();
    const auto clblast_ms = TimedExecution(args.num_runs, args, buffers, queue,
                                           run_routine_, "CLBlast");
    timings.emplace_back("CLBlast", clblast_ms);
    if (args.compare_clblas) {
      const auto clblas_ms = TimedExecution(args.num_runs, args, buffers, queue,
                                            run_reference1_, "clBLAS");
      timings.emplace_back("clBLAS", clblas_ms);
    }
    if (args.compare_cblas) {
      const auto cblas_ms = TimedExecution(args.num_runs, args, buffers, queue,
                                           run_reference2_, "CPU BLAS");
      timings.emplace_back("CPU BLAS", cblas_ms);
    }

    // One output row per step
    PrintTableRow(args, timings);

    // Stop after the last step, otherwise grow the problem for the next one
    if (++step_index >= args.num_steps) { break; }
    args.m += args.step;
    args.n += args.step;
    args.k += args.step;
    args.a_ld += args.step;
    args.b_ld += args.step;
    args.c_ld += args.step;
  }

  // Matching clBLAS clean-up
  #ifdef CLBLAST_REF_CLBLAS
    if (args.compare_clblas) { clblasTeardown(); }
  #endif
}
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufY; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); int lenY = 1 + (N-1)*abs(incy); /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); printResult(); /* Call clblas function. */ err = clblasSrot(N, bufX, 0, incx, bufY, 0, incy, C, S, 1, &queue, 0, NULL, &event); // printf("here\n"); if (err != CL_SUCCESS) { printf("clblasSrot() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. 
*/ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); /* At this point you will get the result of SROT placed in vector Y. */ printResult(); } //printf("here\n"); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufC, bufB; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * K * sizeof(*A), NULL, &err); bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * K * sizeof(*B), NULL, &err); bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * N * sizeof(*C), NULL, &err); if ((bufA == NULL) || (bufC == NULL) || (bufB == NULL)) { printf("Failed to create buffern"); return 1; } err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * K * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, N * K * sizeof(*B), B, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, N * N * sizeof(*C), C, 0, NULL, NULL); /* Call clblas function. 
*/ err = clblasCher2k(order, uplo, transA, N, K, alpha, bufA, 0, lda, bufB, 0, ldb, beta, bufC, 0, ldc, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasCher2k() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, N * N * sizeof(*C), C, 0, NULL, NULL); /* At this point you will get the result of SSYRK placed in C array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufC); clReleaseMemObject(bufB); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufY, bufDotP, scratchBuff; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); int lenY = 1 + (N-1)*abs(incy); /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float)), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenY*sizeof(cl_float)), NULL, &err); // Allocate 1 element space for dotProduct bufDotP = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (sizeof(cl_float)), NULL, &err); // Allocate minimum of N elements scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (N*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); /* Call clblas function. 
*/ err = clblasSdot( N, bufDotP, 0, bufX, 0, incx, bufY, 0, incy, scratchBuff, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSdot() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufDotP, CL_TRUE, 0, sizeof(cl_float), &dotProduct, 0, NULL, NULL); printf("Result dot product: %f\n", dotProduct); } /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); clReleaseMemObject(bufDotP); clReleaseMemObject(scratchBuff); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX, bufY; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(*A), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(*X), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(*Y), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * lda * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(*X), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(*Y), Y, 0, NULL, NULL); /* Call clblas function. 
*/ err = clblasZhemv(order, uplo, N, alpha, bufA, 0 /*offA */, lda, bufX, 0 /*offx*/, incx, beta, bufY, 0 /*offx*/, incy, 1, &queue, 0, NULL, &event); // blasZhemv(order, uplo, N, alpha, (DoubleComplex*)A, 0, lda, (DoubleComplex*)X, 0, incx, beta, (DoubleComplex*)Y, 0, incy); // err = CL_SUCCESS; //err = clblasZtrmv(order, uplo, clblasNoTrans, clblasNonUnit, N, bufA, 0 /*offA */, lda, // bufX, 0 /*offx*/, incx, // bufY, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasZhemv() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); printResult(); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(*Y), Y, 0, NULL, NULL); /* At this point you will get the result of SSYMV placed in Y array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
// Shuts the library down while holding cl_context_lock_: calls
// clblasTeardown() and then resets cl_context to null. The lock is released
// when the function returns.
void ShutdownJTorch() {
  const std::lock_guard<std::mutex> guard(cl_context_lock_);

  clblasTeardown();
  cl_context.reset(nullptr);
}
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufAP, bufX, bufY; cl_event event = NULL; int ret = 0, numElementsAP; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } numElementsAP = (N * (N+1)) / 2; // To get number of elements in a packed matrix /* Prepare OpenCL memory objects and place matrices inside them. 
*/ bufAP = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (numElementsAP * sizeof(cl_float)), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float), NULL, &err); err = clEnqueueWriteBuffer(queue, bufAP, CL_TRUE, 0, numElementsAP * sizeof(cl_float), AP, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(cl_float), Y, 0, NULL, NULL); err = clblasSspr2(order, uplo, N, alpha, bufX, 0 /*offx */, incx, bufY, 0 /*offy*/, incy, bufAP, 0 /*offa */, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSspr2() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufAP, CL_TRUE, 0, (numElementsAP * sizeof(cl_float)), AP, 0, NULL, NULL); /* At this point you will get the result of SSPR2 placed in A array. */ printResult(); } /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufAP); clReleaseMemObject(bufY); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufB, bufC; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } // print device name int valueSize=0; clGetDeviceInfo(device, CL_DEVICE_NAME, 0, NULL, &valueSize); char * value = (char*) malloc(valueSize); clGetDeviceInfo(device, CL_DEVICE_NAME, valueSize, value, NULL); printf("Device: %sn\n", value); free(value); props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A), NULL, &err); bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B), NULL, &err); bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, M * K * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, K * N * sizeof(*B), B, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, M * N * sizeof(*C), C, 0, NULL, NULL); /* Call clblas extended function. 
Perform gemm for the lower right sub-matrices */ err = clblasSgemm(order, transA, transB, M - off, N - off, K - off, alpha, bufA, offA, lda, bufB, offB, ldb, beta, bufC, offC, ldc, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSgemmEx() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, M * N * sizeof(*result), result, 0, NULL, NULL); /* At this point you will get the result of SGEMM placed in 'result' array. */ puts(""); printResult("clblasSgemmEx result"); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufC); clReleaseMemObject(bufB); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
int main( void ) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufB, bufC; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs( 1, &platform, NULL ); err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL ); props[1] = (cl_context_properties)platform; ctx = clCreateContext( props, 1, &device, NULL, NULL, &err ); queue = clCreateCommandQueue( ctx, device, 0, &err ); /* Setup clBLAS */ err = clblasSetup( ); /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A), NULL, &err ); bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B), NULL, &err ); bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C), NULL, &err ); err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0, M * K * sizeof( *A ), A, 0, NULL, NULL ); err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0, K * N * sizeof( *B ), B, 0, NULL, NULL ); err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0, M * N * sizeof( *C ), C, 0, NULL, NULL ); /* Call clBLAS extended function. Perform gemm for the lower right sub-matrices */ err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans, M, N, K, alpha, bufA, 0, lda, bufB, 0, ldb, beta, bufC, 0, ldc, 1, &queue, 0, NULL, &event ); /* Wait for calculations to be finished. */ err = clWaitForEvents( 1, &event ); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0, M * N * sizeof(*result), result, 0, NULL, NULL ); /* Release OpenCL memory objects. */ clReleaseMemObject( bufC ); clReleaseMemObject( bufB ); clReleaseMemObject( bufA ); /* Finalize work with clBLAS */ clblasTeardown( ); /* Release OpenCL working objects. */ clReleaseCommandQueue( queue ); clReleaseContext( ctx ); return ret; }