static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, double beta, gpudata *C, size_t offC, size_t ldc) { cl_ctx *ctx = A->ctx; cl_uint num_ev = 0; cl_event evl[3]; cl_event ev; ARRAY_INIT(A); ARRAY_INIT(B); ARRAY_INIT(C); CLB_CHECK(ctx->err, clblasDgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A->buf, offA, lda, B->buf, offB, ldb, beta, C->buf, offC, ldc, 1, &ctx->q, num_ev, num_ev == 0 ? NULL : evl, &ev)); ARRAY_FINI(A); ARRAY_FINI(B); ARRAY_FINI(C); clReleaseEvent(ev); return GA_NO_ERROR; }
/*
 * Runs a double-precision GEMM through clBLAS using host arrays a, b and c.
 *
 * Steps, in order:
 *   1. clblasSetup()
 *   2. device buffers are created for a and b via create_mem(); for c the host
 *      data is only handed to create_mem() when beta != 0 (otherwise a NULL
 *      host pointer and no event are passed)
 *   3. clblasDgemm() is enqueued on env->queues[0] in column-major order,
 *      waiting on the events collected in step 2
 *   4. the result is read back into c via read_mem(), and the function blocks
 *      on the event read_mem() returns
 *   5. the three buffers are released and clblasTeardown() is called
 *
 * Parameters:
 *   env             environment providing the command queue (env->queues[0])
 *   a, b, c         host matrices; c receives the result
 *   alpha, beta     GEMM scalars
 *   transA, transB  transpose flags forwarded to clBLAS
 *   ar, ac          used as M / lda and K respectively
 *   br, bc          used as ldb and N respectively
 *   cr              used as ldc; cc is not used by this function
 *   size_a/b/c      sizes forwarded to create_mem()/read_mem()
 *                   (units not visible here -- TODO confirm)
 *
 * Returns CL_SUCCESS on the success path; failures are handled by CHECK
 * (defined outside this block).
 *
 * NOTE(review): M, N, K and the leading dimensions are taken from ar/bc/ac and
 * ar/br/cr regardless of transA/transB.  That looks correct only for the
 * non-transposed case -- confirm how callers pass dimensions.
 * NOTE(review): the events filled in by create_mem() and the one returned by
 * read_mem() are not released here because their ownership is not visible
 * from this block -- confirm and release them if this function owns them.
 */
cl_int Dgemm_internal(
    cl_env *env, double *a, double *b, double *c, double alpha, double beta,
    clblasTranspose transA, clblasTranspose transB,
    int ar, int ac, int br, int bc, int cr, int cc,
    int size_a, int size_b, int size_c) {
  CHECK(clblasSetup());

  cl_event events[NEVENTS];
  int nevent = 0;

  /* Upload the inputs; each upload contributes one event to the wait list. */
  cl_mem mem_a = create_mem(env, a, size_a, CL_MEM_READ_ONLY, &(events[nevent++]));
  cl_mem mem_b = create_mem(env, b, size_b, CL_MEM_READ_ONLY, &(events[nevent++]));

  /* With beta == 0 the previous contents of C do not matter, so no host data
   * (and no event) is passed for it. */
  cl_mem mem_c;
  if (beta != 0)
    mem_c = create_mem(env, c, size_c, CL_MEM_READ_WRITE, &(events[nevent++]));
  else
    mem_c = create_mem(env, NULL, size_c, CL_MEM_READ_WRITE, NULL);

  /* events[nevent] receives the GEMM completion event. */
  cl_int err = clblasDgemm(clblasColumnMajor, transA, transB,
                           ar, bc, ac, alpha,
                           mem_a, 0, ar,
                           mem_b, 0, br, beta,
                           mem_c, 0, cr,
                           1, &(env->queues[0]),
                           nevent, events, &(events[nevent]));
  CHECK(err);

  /* Read the result back once the GEMM has finished, then block on the read. */
  events[nevent + 1] = *read_mem(env, mem_c, c, size_c, 1, &(events[nevent]));
  CHECK(clWaitForEvents(1, &(events[nevent + 1])));

  /* The event handed back by clblasDgemm belongs to the caller and was
   * previously leaked; the read that depended on it has completed, so it can
   * be released now. */
  CHECK(clReleaseEvent(events[nevent]));

  CHECK(clReleaseMemObject(mem_a));
  CHECK(clReleaseMemObject(mem_b));
  CHECK(clReleaseMemObject(mem_c));

  clblasTeardown();
  return CL_SUCCESS;
}
static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, double beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount) { cl_ctx *ctx = A[0]->ctx; cl_event evl[3]; cl_event ev; size_t i; cl_uint num_ev = 0; for (i = 0; i < batchCount; i++) { ARRAY_INIT(A[i]); ARRAY_INIT(B[i]); ARRAY_INIT(C[i]); CLB_CHECK(ctx->err, clblasDgemm(convO(order), convT(transA), convT(transB), M, N, K, alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb, beta, C[i]->buf, offC[i], ldc, 1, &ctx->q, num_ev, num_ev == 0 ? NULL : evl, &ev)); ARRAY_FINI(A[i]); ARRAY_FINI(B[i]); ARRAY_FINI(C[i]); clReleaseEvent(ev); } return GA_NO_ERROR; }
// Issues the double-precision GEMM described by buffer_ apiCallCount times
// across numQueuesToUse queues, and optionally blocks until the work is done.
//
// flush        true  -> flush every used queue and wait for the returned
//                       events (only the library call / kernel time is timed)
//              false -> return right after enqueueing (memory time is timed
//                       as well, by the caller)
// apiCallCount number of back-to-back clblasDgemm calls to issue
//
// Each clblasDgemm call writes its events into events_, so after the loop
// events_ holds the events of the last call only.
// NOTE(review): events from earlier iterations are overwritten without being
// released here -- confirm whether that is intended for this timing client.
void xGemm<cl_double>::
xGemm_Function(bool flush, cl_uint apiCallCount )
{
    // Start from a clean slate so stale handles are never waited on.
    for (unsigned int i = 0; i < numQueues; i++)
    {
        events_[i] = NULL;
    }

    for (unsigned int i = 0; i < apiCallCount; i++)
    {
        clblasDgemm(order_, buffer_.trans_a_, buffer_.trans_b_,
                    buffer_.m_, buffer_.n_, buffer_.k_, buffer_.alpha_,
                    buffer_.buf_a_, buffer_.offA_, buffer_.lda_,
                    buffer_.buf_b_, buffer_.offB_, buffer_.ldb_,
                    buffer_.beta_,
                    buffer_.buf_c_, buffer_.offC_, buffer_.ldc_,
                    numQueuesToUse, queues_, 0, NULL, events_);
    }

    //flush==true if only the kernel time (library call) is timed
    //flush==false if memory time is also timed
    if (!flush)
    {
        return;
    }

    for (unsigned int i = 0; i < numQueuesToUse; i++)
    {
        clFlush(queues_[i]);
    }

    // Wait on each usable event individually.  Counting the valid events and
    // then waiting on the first N slots of events_ would wait on the wrong
    // handles whenever a NULL or invalid entry precedes a valid one, and
    // would call clWaitForEvents with a zero count when none are valid.
    for (unsigned int i = 0; i < numQueuesToUse; i++)
    {
        if (events_[i] == NULL)
        {
            continue;
        }

        // Querying the reference count is only used as a validity probe.
        cl_uint clReferenceCount;
        cl_int err = clGetEventInfo(events_[i], CL_EVENT_REFERENCE_COUNT,
                                    sizeof(clReferenceCount), &clReferenceCount, NULL);
        if (err != CL_SUCCESS)
        {
            continue;
        }

        clWaitForEvents(1, &events_[i]);
    }
}
/**
 * Computes C = alpha * op(A) * op(B) + beta * C on the given OpenCL device
 * using clBLAS (clblasSgemm or clblasDgemm), with column-major storage.
 *
 * The function creates its own context and command queue on `device`, copies
 * the three host matrices into device buffers, runs the GEMM, waits for it,
 * reads the result back into `outMatrix` and releases everything it created.
 * Every step is skipped once an earlier step has failed.
 *
 * @param device      OpenCL device to run on.
 * @param inMatrixA   Host data of A (float or double, see use_float);
 *                    nrowA x ncolA, leading dimension nrowA.
 * @param transposeA  Use A transposed.
 * @param inMatrixB   Host data of B; nrowB x ncolB, leading dimension nrowB.
 * @param transposeB  Use B transposed.
 * @param alpha,beta  GEMM scalars (narrowed to float when use_float is set).
 * @param outMatrix   Host data of C; it is uploaded before the call and
 *                    overwritten with the result afterwards.
 * @param use_float   true: elements are float and clblasSgemm is used;
 *                    false: elements are double and clblasDgemm is used.
 * @return ErrorStatus holding the OpenCL error code and the last clBLAS status.
 *
 * When `debug` is set, a trace of the steps is written to CERR.
 *
 * NOTE(review): clblasSetup() is called on every invocation and there is no
 * matching clblasTeardown() here -- confirm that teardown happens elsewhere.
 */
ErrorStatus gemm_clblas(cl_device_id device, const void *inMatrixA, int nrowA, int ncolA, bool transposeA,
                        const void *inMatrixB, int nrowB, int ncolB, bool transposeB, double alpha, double beta,
                        void *outMatrix, bool use_float) {
    std::stringstream result;

    if (debug) {
        result << "gemm_clblas( " << (use_float ? "FLOAT" : "DOUBLE") << ")" << std::endl << std::endl;
    }

    cl_int err = CL_SUCCESS;

    clblasStatus status = clblasSetup();
    if (status != CL_SUCCESS) {
        if (debug) {
            result << "clblasSetup: " << clblasErrorToString(status) << std::endl;
        }
        err = CL_INVALID_OPERATION;
    }

    // get first platform
    cl_platform_id platform = NULL;
    if (err == CL_SUCCESS) {
        err = clGetPlatformIDs(1, &platform, NULL);
    }

    if (debug && err == CL_SUCCESS) {
        result << "Platform: " << getPlatformInfoString(platform, CL_PLATFORM_NAME) << std::endl;
        result << "Device: " << getDeviceInfoString(device, CL_DEVICE_NAME) << std::endl;
    }

    // context
    cl_context context = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateContext:" << std::endl;
        }
        context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
    }

    // queue
    cl_command_queue queue = NULL;
    if (err == CL_SUCCESS) {
#ifdef CL_VERSION_2_0
        if (debug) {
            result << "clCreateCommandQueueWithProperties:" << std::endl;
        }
        queue = clCreateCommandQueueWithProperties(context, device, NULL, &err);
#else
        if (debug) {
            result << "clCreateCommandQueue:" << std::endl;
        }
        queue = clCreateCommandQueue(context, device, 0, &err);
#endif
    }

    // Result dimensions follow from the (possibly transposed) operands.
    const int nrowC = transposeA ? ncolA : nrowA;
    const int ncolC = transposeB ? nrowB : ncolB;

    // Byte sizes are computed in size_t so that large matrices do not overflow
    // the int multiplication before the widening to size_t takes place.
    const size_t elem_size = use_float ? sizeof(float) : sizeof(double);
    const size_t bytesA = static_cast<size_t>(nrowA) * static_cast<size_t>(ncolA) * elem_size;
    const size_t bytesB = static_cast<size_t>(nrowB) * static_cast<size_t>(ncolB) * elem_size;
    const size_t bytesC = static_cast<size_t>(nrowC) * static_cast<size_t>(ncolC) * elem_size;

    // buffers
    // CL_MEM_COPY_HOST_PTR only reads from the host pointer, so casting away
    // const on the input matrices is safe here.
    cl_mem cl_input_matrixA = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_input_matrixA:" << std::endl;
        }
        cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytesA,
                                          const_cast<void *>(inMatrixA), &err);
    }

    cl_mem cl_input_matrixB = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_input_matrixB:" << std::endl;
        }
        cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytesB,
                                          const_cast<void *>(inMatrixB), &err);
    }

    // The output buffer is seeded with the host contents of C (used when beta != 0).
    cl_mem cl_output_matrix = NULL;
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "clCreateBuffer cl_output_vector:" << std::endl;
        }
        cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, bytesC,
                                          outMatrix, &err);
    }

    // ++++++++++++
    const int lda = nrowA;  // first dimension of A (rows), before any transpose
    const int ldb = nrowB;  // first dimension of B (rows), before any transpose
    const int ldc = nrowC;  // first dimension of C (rows)

    const int M = transposeA ? ncolA : nrowA;  // rows in A (after transpose, if any) and C
    const int N = transposeB ? nrowB : ncolB;  // cols in B (after transpose, if any) and C
    const int K = transposeA ? nrowA : ncolA;  // cols in A and rows in B (after transposes, if any)

    const clblasOrder order = clblasColumnMajor;
    const clblasTranspose transA = transposeA ? clblasTrans : clblasNoTrans;
    const clblasTranspose transB = transposeB ? clblasTrans : clblasNoTrans;

    cl_event event = NULL;
    if (err == CL_SUCCESS) {
        const char *routine = use_float ? "clblasSgemm" : "clblasDgemm";
        if (debug) {
            result << routine << ":" << std::endl;
        }

        if (use_float) {
            status = clblasSgemm(order, transA, transB, M, N, K,
                                 static_cast<cl_float>(alpha), cl_input_matrixA, 0, lda,
                                 cl_input_matrixB, 0, ldb,
                                 static_cast<cl_float>(beta), cl_output_matrix, 0, ldc,
                                 1, &queue, 0, NULL, &event);
        } else {
            status = clblasDgemm(order, transA, transB, M, N, K,
                                 alpha, cl_input_matrixA, 0, lda,
                                 cl_input_matrixB, 0, ldb,
                                 beta, cl_output_matrix, 0, ldc,
                                 1, &queue, 0, NULL, &event);
        }

        // Both precisions propagate a clBLAS failure into err, so the wait
        // below is never attempted on an event that was not produced.
        if (status != CL_SUCCESS) {
            if (debug) {
                result << routine << " error:" << clblasErrorToString(status) << std::endl;
            }
            err = status;
        }
    }

    if (err == CL_SUCCESS) {
        /* Wait for calculations to be finished. */
        if (debug) {
            result << "clWaitForEvents:" << std::endl;
        }
        err = clWaitForEvents(1, &event);
    }

    // retrieve result
    if (err == CL_SUCCESS) {
        if (debug) {
            result << "Retrieve result:" << std::endl;
        }
        // Blocking read; a failure is reported to the caller instead of being dropped.
        err = clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, bytesC, outMatrix, 0, NULL, NULL);
    }

    std::string err_str = clErrorToString(err);
    result << std::endl << err_str << std::endl;

    // cleanup: release only what was actually created
    if (event != NULL) {
        clReleaseEvent(event);
    }
    if (cl_output_matrix != NULL) {
        clReleaseMemObject(cl_output_matrix);
    }
    if (cl_input_matrixA != NULL) {
        clReleaseMemObject(cl_input_matrixA);
    }
    if (cl_input_matrixB != NULL) {
        clReleaseMemObject(cl_input_matrixB);
    }
    if (queue != NULL) {
        clReleaseCommandQueue(queue);
    }
    if (context != NULL) {
        clReleaseContext(context);
    }

    if (debug) {
        CERR << result.str();
    }

    ErrorStatus errorStatus = { err, status };
    return errorStatus;
}