/*
 * Single-precision GEMM on this library's single command queue via clBLAS.
 *
 * Computes C = alpha * op(A) * op(B) + beta * C using the OpenCL context
 * owned by buffer A. Returns GA_NO_ERROR on success.
 *
 * NOTE(review): CLB_CHECK is a project macro -- it presumably translates a
 * clBLAS failure into a library error and returns early; confirm against its
 * definition. If it does return early, ARRAY_FINI and clReleaseEvent below
 * are skipped on the error path.
 *
 * NOTE(review): num_ev is always 0 here, so the wait-list arguments passed
 * to clblasSgemm are effectively (0, NULL) and evl is never populated --
 * it looks like scaffolding for per-buffer event synchronization.
 */
static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, float alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb,
                 float beta, gpudata *C, size_t offC, size_t ldc) {
  /* All three buffers are assumed to live in A's context -- TODO confirm. */
  cl_ctx *ctx = A->ctx;
  cl_uint num_ev = 0;   /* number of wait events in evl (never incremented) */
  cl_event evl[3];      /* candidate wait list, one slot per buffer; unused */
  cl_event ev;          /* completion event returned by clblasSgemm */

  /* ARRAY_INIT/ARRAY_FINI are project macros -- presumably per-buffer
   * synchronization bookkeeping; verify against their definitions. */
  ARRAY_INIT(A);
  ARRAY_INIT(B);
  ARRAY_INIT(C);

  /* convO/convT translate this library's enums into clBLAS enums. */
  CLB_CHECK(ctx->err, clblasSgemm(convO(order), convT(transA), convT(transB),
                                  M, N, K, alpha,
                                  A->buf, offA, lda,
                                  B->buf, offB, ldb,
                                  beta, C->buf, offC, ldc,
                                  1, &ctx->q,
                                  num_ev, num_ev == 0 ? NULL : evl, &ev));

  ARRAY_FINI(A);
  ARRAY_FINI(B);
  ARRAY_FINI(C);

  /* Drop our reference; the runtime keeps the command alive until done. */
  clReleaseEvent(ev);
  return GA_NO_ERROR;
}
/*
 * Batched single-precision GEMM: issues one clblasSgemm per batch entry on
 * the context's queue, all with the same shapes/leading dimensions but
 * per-entry buffers and offsets.
 *
 * Returns GA_NO_ERROR on success.
 *
 * NOTE(review): CLB_CHECK is a project macro -- if it returns early on a
 * clBLAS failure, the ARRAY_FINI/clReleaseEvent calls for the failing
 * iteration are skipped; confirm against its definition.
 *
 * NOTE(review): num_ev stays 0, so every call passes an empty wait list
 * (0, NULL) and evl is never populated.
 */
static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, float alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb,
                      float beta, gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  /* All batch entries are assumed to share A[0]'s context -- TODO confirm. */
  cl_ctx *ctx = A[0]->ctx;
  cl_event evl[3];     /* candidate wait list; unused while num_ev == 0 */
  cl_event ev;         /* completion event of the current batch entry */
  size_t i;
  cl_uint num_ev = 0;

  for (i = 0; i < batchCount; i++) {
    /* Per-buffer bookkeeping (project macros). */
    ARRAY_INIT(A[i]);
    ARRAY_INIT(B[i]);
    ARRAY_INIT(C[i]);

    CLB_CHECK(ctx->err, clblasSgemm(convO(order), convT(transA), convT(transB),
                                    M, N, K, alpha,
                                    A[i]->buf, offA[i], lda,
                                    B[i]->buf, offB[i], ldb,
                                    beta, C[i]->buf, offC[i], ldc,
                                    1, &ctx->q,
                                    num_ev, num_ev == 0 ? NULL : evl, &ev));

    ARRAY_FINI(A[i]);
    ARRAY_FINI(B[i]);
    ARRAY_FINI(C[i]);

    /* Release the per-entry completion event before the next iteration. */
    clReleaseEvent(ev);
  }
  return GA_NO_ERROR;
}
// Benchmark driver: issues apiCallCount back-to-back SGEMM calls spread
// across numQueuesToUse queues, then optionally synchronizes.
//
// flush == true  : only the kernel time (library call) is timed, so flush
//                  every queue and block on the returned completion events.
// flush == false : memory time is also timed; no synchronization here.
void xGemm<cl_float>::xGemm_Function(bool flush, cl_uint apiCallCount)
{
  for (unsigned int i = 0; i < numQueues; i++) {
    events_[i] = NULL;
  }
  // NOTE(review): each iteration overwrites events_ without releasing the
  // previous iteration's events; when apiCallCount > 1 those references
  // leak. Presumably tolerable for a benchmark loop -- confirm.
  for (unsigned int i = 0; i < apiCallCount; i++) {
    clblasSgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
                buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
                buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                buffer_.beta_, buffer_.buf_c_, buffer_.offC_, buffer_.ldc_,
                numQueuesToUse, queues_, 0, NULL, events_);
  }

  if (flush == true) {
    // FIX: the previous code counted the valid events but still passed the
    // original events_ array to clWaitForEvents, so when valid and
    // NULL/invalid entries were interleaved the wait list contained bad
    // entries (CL_INVALID_EVENT). Compact the valid events to the front of
    // events_ so the first numValidEvents slots are all usable.
    cl_uint numValidEvents = 0;
    for (unsigned int i = 0; i < numQueuesToUse; i++) {
      if (events_[i]) {
        cl_uint clReferenceCount;
        cl_int err = clGetEventInfo(events_[i], CL_EVENT_REFERENCE_COUNT,
                                    sizeof(clReferenceCount),
                                    &clReferenceCount, NULL);
        if (err == CL_SUCCESS) {
          events_[numValidEvents++] = events_[i];
        }
      }
    }

    for (unsigned int i = 0; i < numQueuesToUse; i++) {
      clFlush(queues_[i]);
    }

    // FIX: clWaitForEvents(0, ...) is CL_INVALID_VALUE -- skip when empty.
    if (numValidEvents > 0) {
      clWaitForEvents(numValidEvents, events_);
    }
  }
}
/*
 * C = A * B for two row-major n x n matrices of element type F, computed on
 * the GPU with clBLAS using the pre-created buffers/queue in `cache`.
 *
 * Blocking writes upload A and B, the SGEMM is enqueued (alpha = 1, beta = 0),
 * we wait for its completion event, and a blocking read downloads C.
 */
void mat_mul_cl_clblas(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    cl_event event;
    size_t mat_sizeof = n * n * sizeof(F);

    /* clEnqueueWriteBuffer takes a const void *ptr, so the const inputs can
     * be passed directly (the previous (F*) casts discarded const for no
     * reason). Blocking writes: host memory is reusable on return. */
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0,
                         mat_sizeof, A, 0, NULL, NULL);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0,
                         mat_sizeof, B, 0, NULL, NULL);

    /* NOTE(review): the clblasSgemm status is not checked; on failure the
     * wait below would use an undefined event. Consider checking it. */
    clblasSgemm(
        clblasRowMajor, clblasNoTrans, clblasNoTrans,
        n, n, n,
        1.0,
        cache->buf_a, 0, n,
        cache->buf_b, 0, n,
        0.0,
        cache->buf_c, 0, n,
        1, &(cache->common.command_queue),
        0, NULL, &event
    );
    clWaitForEvents(1, &event);
    /* FIX: release the completion event -- it was leaked before. */
    clReleaseEvent(event);

    clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0,
                        mat_sizeof, C, 0, NULL, NULL);
}
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufB, bufC; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } // print device name int valueSize=0; clGetDeviceInfo(device, CL_DEVICE_NAME, 0, NULL, &valueSize); char * value = (char*) malloc(valueSize); clGetDeviceInfo(device, CL_DEVICE_NAME, valueSize, value, NULL); printf("Device: %sn\n", value); free(value); props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A), NULL, &err); bufB = clCreateBuffer(ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B), NULL, &err); bufC = clCreateBuffer(ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, M * K * sizeof(*A), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0, K * N * sizeof(*B), B, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0, M * N * sizeof(*C), C, 0, NULL, NULL); /* Call clblas extended function. 
Perform gemm for the lower right sub-matrices */ err = clblasSgemm(order, transA, transB, M - off, N - off, K - off, alpha, bufA, offA, lda, bufB, offB, ldb, beta, bufC, offC, ldc, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSgemmEx() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0, M * N * sizeof(*result), result, 0, NULL, NULL); /* At this point you will get the result of SGEMM placed in 'result' array. */ puts(""); printResult("clblasSgemmEx result"); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufC); clReleaseMemObject(bufB); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
void TensorMath::GEMM(const bool is_row_major, const bool transpose_A, const bool transpose_B, const int M, const int N, const int K, const datum alpha, const Conv::Tensor &A, const int smA, const int ldA, const Conv::Tensor &B, const int smB, const int ldB, const datum beta, Conv::Tensor &C, const int smC, const int ldC) { #ifdef BUILD_CLBLAS ((Tensor&)A).MoveToGPU(); ((Tensor&)B).MoveToGPU(); C.MoveToGPU(C.hint_ignore_content_ && beta == 0.0); cl_event done_event = NULL; const int offA = A.width() * A.height() * A.maps() * smA; const int offB = B.width() * B.height() * B.maps() * smB; const int offC = C.width() * C.height() * C.maps() * smC; cl_int err = clblasSgemm(is_row_major ? clblasRowMajor : clblasColumnMajor, transpose_A ? clblasTrans : clblasNoTrans, transpose_B ? clblasTrans : clblasNoTrans, M, N, K, alpha, (cl_mem)A.cl_data_ptr_, offA, ldA, (cl_mem)B.cl_data_ptr_, offB, ldB, beta, (cl_mem)C.cl_data_ptr_, offC, ldC, 1, &(CLHelper::queue), 0, NULL, &done_event); if(err!=CL_SUCCESS) FATAL("Call to clblasSgemm failed. Error: " << err); #else #ifdef BUILD_OPENCL ((Tensor&)A).MoveToCPU(); ((Tensor&)B).MoveToCPU(); C.MoveToCPU(C.hint_ignore_content_ && beta == 0.0); #endif #ifdef BUILD_BLAS INNERGEMM(is_row_major ? CblasRowMajor : CblasColMajor, transpose_A ? CblasTrans : CblasNoTrans, transpose_B ? CblasTrans : CblasNoTrans, M, N, K, alpha, A.data_ptr_const(0,0,0,smA), ldA, B.data_ptr_const(0,0,0,smB), ldB, beta, C.data_ptr(0,0,0,smC), ldC); #else if(!is_row_major) FATAL("Reference GEMM does not support column-major matrices!"); const datum* a_ptr = A.data_ptr_const(0, 0, 0, smA); const datum* b_ptr = B.data_ptr_const(0, 0, 0, smB); datum* c_ptr = C.data_ptr(0, 0, 0, smC); #pragma omp parallel for default(shared) for(int i = 0; i < M; i++) { for(int j = 0; j < N; j++) { datum sum = 0.0; for(int k = 0; k < K; k++) { const datum a_value = transpose_A ? a_ptr[k * ldA + i] : a_ptr[i * ldA + k]; const datum b_value = transpose_B ? 
b_ptr[j * ldB + k] : b_ptr[k * ldB + j]; sum += a_value * b_value; } if(beta == 0.0) c_ptr[ldC * i + j] = alpha * sum; else c_ptr[ldC * i + j] = beta * c_ptr[ldC * i + j] + alpha * sum; } } #endif // BUILD_BLAS #endif // BUILD_CLBLAS C.hint_ignore_content_ = false; }
// Run C = alpha * op(A) * op(B) + beta * C on `device` via clBLAS, in float
// or double precision depending on `use_float`.
//
// Matrices are column-major; nrowX/ncolX describe the stored (pre-transpose)
// shapes. The result is written back into outMatrix, which must hold
// nrowC x ncolC elements (nrowC/ncolC derived from the transpose flags).
// Returns an ErrorStatus carrying both the OpenCL and clBLAS codes.
ErrorStatus gemm_clblas(cl_device_id device,
                        const void *inMatrixA, int nrowA, int ncolA, bool transposeA,
                        const void *inMatrixB, int nrowB, int ncolB, bool transposeB,
                        double alpha, double beta, void *outMatrix, bool use_float)
{
  std::stringstream result;

  // View the void* inputs/outputs in the requested precision.
  float *input_matrixA_f = (float *)inMatrixA;
  float *input_matrixB_f = (float *)inMatrixB;
  float *output_matrix_f = (float *)outMatrix;
  double *input_matrixA_d = (double *)inMatrixA;
  double *input_matrixB_d = (double *)inMatrixB;
  double *output_matrix_d = (double *)outMatrix;

  if (debug) {
    result << "gemm_clblas( " << (use_float ? "FLOAT" : "DOUBLE") << ")"
           << std::endl << std::endl;
  }

  cl_int err = CL_SUCCESS;

  clblasStatus status = clblasSetup();
  if (status != CL_SUCCESS) {
    if (debug) {
      result << "clblasSetup: " << clblasErrorToString(status) << std::endl;
    }
    err = CL_INVALID_OPERATION;
  }

  // get first platform
  cl_platform_id platform = NULL;
  if (err == CL_SUCCESS) {
    err = clGetPlatformIDs(1, &platform, NULL);
  }

  if (debug && err == CL_SUCCESS) {
    result << "Platform: " << getPlatformInfoString(platform, CL_PLATFORM_NAME) << std::endl;
    result << "Device: " << getDeviceInfoString(device, CL_DEVICE_NAME) << std::endl;
  }

  // context
  cl_context context = NULL;
  if (err == CL_SUCCESS) {
    if (debug) {
      result << "clCreateContext:" << std::endl;
    }
    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
  }

  // queue
  cl_command_queue queue = NULL;
  if (err == CL_SUCCESS) {
#ifdef CL_VERSION_2_0
    if (debug) {
      result << "clCreateCommandQueueWithProperties:" << std::endl;
    }
    queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);
#else
    if (debug) {
      result << "clCreateCommandQueue:" << std::endl;
    }
    queue = clCreateCommandQueue(context, device, 0, &err);
#endif
  }

  // Buffers: inputs are read-only and copied from host; the output buffer
  // is seeded from outMatrix so beta != 0 has valid C contents.
  cl_mem cl_input_matrixA = NULL;
  if (err == CL_SUCCESS) {
    if (debug) {
      result << "clCreateBuffer cl_input_matrixA:" << std::endl;
    }
    if (use_float) {
      cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                        nrowA * ncolA * sizeof(float), input_matrixA_f, &err);
    } else {
      cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                        nrowA * ncolA * sizeof(double), input_matrixA_d, &err);
    }
  }

  cl_mem cl_input_matrixB = NULL;
  if (err == CL_SUCCESS) {
    if (debug) {
      result << "clCreateBuffer cl_input_matrixB:" << std::endl;
    }
    if (use_float) {
      cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                        nrowB * ncolB * sizeof(float), input_matrixB_f, &err);
    } else {
      cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                        nrowB * ncolB * sizeof(double), input_matrixB_d, &err);
    }
  }

  // Output shape after any transposes.
  int nrowC = transposeA ? ncolA : nrowA;
  int ncolC = transposeB ? nrowB : ncolB;

  cl_mem cl_output_matrix = NULL;
  if (err == CL_SUCCESS) {
    if (debug) {
      result << "clCreateBuffer cl_output_vector:" << std::endl;
    }
    if (use_float) {
      cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                        nrowC * ncolC * sizeof(float), output_matrix_f, &err);
    } else {
      cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
                                        nrowC * ncolC * sizeof(double), output_matrix_d, &err);
    }
  }

  // ++++++++++++ GEMM geometry (column-major).
  const int lda = nrowA;   // first dimension of A (rows), before any transpose
  const int ldb = nrowB;   // first dimension of B (rows), before any transpose
  const int ldc = nrowC;   // first dimension of C (rows)
  const int M = transposeA ? ncolA : nrowA;  // rows in op(A) and C
  const int N = transposeB ? nrowB : ncolB;  // cols in op(B) and C
  const int K = transposeA ? nrowA : ncolA;  // cols in op(A) == rows in op(B)
  const clblasOrder order = clblasColumnMajor;
  const clblasTranspose transA = transposeA ? clblasTrans : clblasNoTrans;
  const clblasTranspose transB = transposeB ? clblasTrans : clblasNoTrans;

  cl_event event = NULL;
  if (err == CL_SUCCESS) {
    if (use_float) {
      if (debug) {
        result << "clblasSgemm:" << std::endl;
      }
      status = clblasSgemm(order, transA, transB, M, N, K, alpha,
                           cl_input_matrixA, 0, lda,
                           cl_input_matrixB, 0, ldb, beta,
                           cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event);
      if (status != CL_SUCCESS) {
        if (debug) {
          result << "clblasSgemm error:" << clblasErrorToString(status) << std::endl;
        }
        // FIX: propagate the failure. Previously only the debug message was
        // emitted (and only when debug was set), so the code fell through and
        // called clWaitForEvents on a NULL event.
        err = status;
      }
    } else {
      if (debug) {
        result << "clblasDgemm:" << std::endl;
      }
      status = clblasDgemm(order, transA, transB, M, N, K, alpha,
                           cl_input_matrixA, 0, lda,
                           cl_input_matrixB, 0, ldb, beta,
                           cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event);
      if (status != CL_SUCCESS) {
        if (debug) {
          result << "clblasDgemm error:" << clblasErrorToString(status) << std::endl;
        }
        err = status;
      }
    }
  }

  if (err == CL_SUCCESS) {
    /* Wait for calculations to be finished. */
    if (debug) {
      result << "clWaitForEvents:" << std::endl;
    }
    err = clWaitForEvents(1, &event);
  }

  // retrieve result
  if (err == CL_SUCCESS) {
    if (debug) {
      result << "Retrieve result:" << std::endl;
    }
    if (use_float) {
      clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0,
                          nrowC * ncolC * sizeof(float), output_matrix_f, 0, NULL, NULL);
    } else {
      clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0,
                          nrowC * ncolC * sizeof(double), output_matrix_d, 0, NULL, NULL);
    }
  }

  std::string err_str = clErrorToString(err);
  result << std::endl << err_str << std::endl;

  // cleanup
  // FIX: release the completion event -- it was leaked before.
  if (event) {
    clReleaseEvent(event);
    event = NULL;
  }
  clReleaseMemObject(cl_output_matrix);
  cl_output_matrix = NULL;
  clReleaseMemObject(cl_input_matrixA);
  cl_input_matrixA = NULL;
  clReleaseMemObject(cl_input_matrixB);
  cl_input_matrixB = NULL;
  clReleaseCommandQueue(queue);
  queue = NULL;
  clReleaseContext(context);
  context = NULL;
  // FIX: balance the clblasSetup() at the top of this function; without it
  // every call leaked the library's global state. Harmless if setup failed.
  clblasTeardown();

  if (debug) {
    CERR << result.str();
  }

  ErrorStatus errorStatus = { err, status };
  return errorStatus;
}
int main( void ) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufB, bufC; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs( 1, &platform, NULL ); err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL ); props[1] = (cl_context_properties)platform; ctx = clCreateContext( props, 1, &device, NULL, NULL, &err ); queue = clCreateCommandQueue( ctx, device, 0, &err ); /* Setup clBLAS */ err = clblasSetup( ); /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A), NULL, &err ); bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B), NULL, &err ); bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C), NULL, &err ); err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0, M * K * sizeof( *A ), A, 0, NULL, NULL ); err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0, K * N * sizeof( *B ), B, 0, NULL, NULL ); err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0, M * N * sizeof( *C ), C, 0, NULL, NULL ); /* Call clBLAS extended function. Perform gemm for the lower right sub-matrices */ err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans, M, N, K, alpha, bufA, 0, lda, bufB, 0, ldb, beta, bufC, 0, ldc, 1, &queue, 0, NULL, &event ); /* Wait for calculations to be finished. */ err = clWaitForEvents( 1, &event ); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0, M * N * sizeof(*result), result, 0, NULL, NULL ); /* Release OpenCL memory objects. */ clReleaseMemObject( bufC ); clReleaseMemObject( bufB ); clReleaseMemObject( bufA ); /* Finalize work with clBLAS */ clblasTeardown( ); /* Release OpenCL working objects. */ clReleaseCommandQueue( queue ); clReleaseContext( ctx ); return ret; }