/*
 * Double-precision GEMM through clBLAS: C = alpha*op(A)*op(B) + beta*C.
 * Returns GA_BLAS_ERROR if clBLAS rejects the call, GA_NO_ERROR otherwise.
 */
static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, double alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb, double beta,
                 gpudata *C, size_t offC, size_t ldc) {
  cl_ctx *ctx = A->ctx;
  cl_event wait_list[3];
  cl_event done;
  cl_uint wait_count = 0;
  clblasStatus status;

  ARRAY_INIT(A);
  ARRAY_INIT(B);
  ARRAY_INIT(C);
  status = clblasDgemm(convO(order), convT(transA), convT(transB), M, N, K,
                       alpha, A->buf, offA, lda, B->buf, offB, ldb, beta,
                       C->buf, offC, ldc, 1, &ctx->q, wait_count,
                       wait_count == 0 ? NULL : wait_list, &done);
  if (status != clblasSuccess)
    return GA_BLAS_ERROR;
  ARRAY_FINI(A);
  ARRAY_FINI(B);
  ARRAY_FINI(C);
  clReleaseEvent(done);
  return GA_NO_ERROR;
}
/*
 * Batched single-precision GEMM through clBLAS, emulated by issuing one
 * clblasSgemm call per batch entry (clBLAS has no native batched GEMM).
 */
static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, float alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb, float beta,
                      gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cl_ctx *ctx = A[0]->ctx;
  cl_event wait_list[3];
  cl_event done;
  cl_uint wait_count = 0;
  size_t batch;

  for (batch = 0; batch < batchCount; batch++) {
    ARRAY_INIT(A[batch]);
    ARRAY_INIT(B[batch]);
    ARRAY_INIT(C[batch]);
    CLB_CHECK(ctx->err,
              clblasSgemm(convO(order), convT(transA), convT(transB),
                          M, N, K, alpha,
                          A[batch]->buf, offA[batch], lda,
                          B[batch]->buf, offB[batch], ldb, beta,
                          C[batch]->buf, offC[batch], ldc,
                          1, &ctx->q, wait_count,
                          wait_count == 0 ? NULL : wait_list, &done));
    ARRAY_FINI(A[batch]);
    ARRAY_FINI(B[batch]);
    ARRAY_FINI(C[batch]);
    clReleaseEvent(done);
  }
  return GA_NO_ERROR;
}
/*
 * Double-precision GEMM through CLBlast: C = alpha*op(A)*op(B) + beta*C.
 * Error propagation is handled by the CLBT_CHECK macro.
 */
static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, double alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb, double beta,
                 gpudata *C, size_t offC, size_t ldc) {
  cl_ctx *ctx = A->ctx;
  cl_event done;

  ARRAY_INIT(A);
  ARRAY_INIT(B);
  ARRAY_INIT(C);
  CLBT_CHECK(ctx->err,
             CLBlastDgemm(convO(order), convT(transA), convT(transB),
                          M, N, K, alpha,
                          A->buf, offA, lda,
                          B->buf, offB, ldb, beta,
                          C->buf, offC, ldc, &ctx->q, &done));
  ARRAY_FINI(A);
  ARRAY_FINI(B);
  ARRAY_FINI(C);
  clReleaseEvent(done);
  return GA_NO_ERROR;
}
/*
 * Batched double-precision GEMM through clBLAS, emulated by issuing one
 * clblasDgemm call per batch entry.
 *
 * Returns GA_BLAS_ERROR on the first failing entry (leaving that entry's
 * ARRAY_FINI calls unexecuted, matching the file's existing error style),
 * GA_NO_ERROR otherwise.
 *
 * BUG FIX: the C operand was passed with offB[i] as its offset instead of
 * offC[i], so every batch entry read/wrote C at the wrong location whenever
 * the B and C offsets differed.
 */
static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, double alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb, double beta,
                      gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cl_ctx *ctx = A[0]->ctx;
  cl_event evl[3];
  cl_event ev;
  size_t i;
  cl_uint num_ev = 0;
  clblasStatus err;

  for (i = 0; i < batchCount; i++) {
    ARRAY_INIT(A[i]);
    ARRAY_INIT(B[i]);
    ARRAY_INIT(C[i]);
    err = clblasDgemm(convO(order), convT(transA), convT(transB), M, N, K,
                      alpha, A[i]->buf, offA[i], lda,
                      B[i]->buf, offB[i], ldb, beta,
                      C[i]->buf, offC[i], ldc, /* was offB[i] */
                      1, &ctx->q, num_ev, num_ev == 0 ? NULL : evl, &ev);
    if (err != clblasSuccess)
      return GA_BLAS_ERROR;
    ARRAY_FINI(A[i]);
    ARRAY_FINI(B[i]);
    ARRAY_FINI(C[i]);
    clReleaseEvent(ev);
  }
  return GA_NO_ERROR;
}
/*
 * Batched double-precision GEMM through CLBlast, emulated by issuing one
 * CLBlastDgemm call per batch entry.
 */
static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, double alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb, double beta,
                      gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cl_ctx *ctx = A[0]->ctx;
  cl_event done;
  size_t batch;

  for (batch = 0; batch < batchCount; batch++) {
    ARRAY_INIT(A[batch]);
    ARRAY_INIT(B[batch]);
    ARRAY_INIT(C[batch]);
    CLBT_CHECK(ctx->err,
               CLBlastDgemm(convO(order), convT(transA), convT(transB),
                            M, N, K, alpha,
                            A[batch]->buf, offA[batch], lda,
                            B[batch]->buf, offB[batch], ldb, beta,
                            C[batch]->buf, offC[batch], ldc,
                            &ctx->q, &done));
    ARRAY_FINI(A[batch]);
    ARRAY_FINI(B[batch]);
    ARRAY_FINI(C[batch]);
    clReleaseEvent(done);
  }
  return GA_NO_ERROR;
}
/*
 * Batched single-precision GEMM through CLBlast, emulated by issuing one
 * CLBlastSgemm call per batch entry.
 *
 * Returns GA_BLAS_ERROR on the first failing entry (leaving that entry's
 * ARRAY_FINI calls unexecuted, matching the file's existing error style),
 * GA_NO_ERROR otherwise.
 *
 * BUG FIX: the C operand was passed with offB[i] as its offset instead of
 * offC[i], so every batch entry read/wrote C at the wrong location whenever
 * the B and C offsets differed.
 */
static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, float alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb, float beta,
                      gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cl_ctx *ctx = A[0]->ctx;
  cl_event ev;
  size_t i;
  StatusCode err;

  for (i = 0; i < batchCount; i++) {
    ARRAY_INIT(A[i]);
    ARRAY_INIT(B[i]);
    ARRAY_INIT(C[i]);
    err = CLBlastSgemm(convO(order), convT(transA), convT(transB), M, N, K,
                       alpha, A[i]->buf, offA[i], lda,
                       B[i]->buf, offB[i], ldb, beta,
                       C[i]->buf, offC[i], ldc, /* was offB[i] */
                       &ctx->q, &ev);
    if (err != kSuccess)
      return GA_BLAS_ERROR;
    ARRAY_FINI(A[i]);
    ARRAY_FINI(B[i]);
    ARRAY_FINI(C[i]);
    clReleaseEvent(ev);
  }
  return GA_NO_ERROR;
}
/*
 * Single-precision GEMM through clBLAS: C = alpha*op(A)*op(B) + beta*C.
 * Error propagation is handled by the CLB_CHECK macro.
 */
static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, float alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb, float beta,
                 gpudata *C, size_t offC, size_t ldc) {
  cl_ctx *ctx = A->ctx;
  cl_uint wait_count = 0;
  cl_event wait_list[3];
  cl_event done;

  ARRAY_INIT(A);
  ARRAY_INIT(B);
  ARRAY_INIT(C);
  CLB_CHECK(ctx->err,
            clblasSgemm(convO(order), convT(transA), convT(transB), M, N, K,
                        alpha, A->buf, offA, lda, B->buf, offB, ldb, beta,
                        C->buf, offC, ldc, 1, &ctx->q, wait_count,
                        wait_count == 0 ? NULL : wait_list, &done));
  ARRAY_FINI(A);
  ARRAY_FINI(B);
  ARRAY_FINI(C);
  clReleaseEvent(done);
  return GA_NO_ERROR;
}
/*
 * Single-precision GEMM through CLBlast: C = alpha*op(A)*op(B) + beta*C.
 * Returns GA_BLAS_ERROR if CLBlast rejects the call, GA_NO_ERROR otherwise.
 */
static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, float alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb, float beta,
                 gpudata *C, size_t offC, size_t ldc) {
  cl_ctx *ctx = A->ctx;
  cl_event done;
  StatusCode status;

  ARRAY_INIT(A);
  ARRAY_INIT(B);
  ARRAY_INIT(C);
  status = CLBlastSgemm(convO(order), convT(transA), convT(transB), M, N, K,
                        alpha, A->buf, offA, lda, B->buf, offB, ldb, beta,
                        C->buf, offC, ldc, &ctx->q, &done);
  if (status != kSuccess)
    return GA_BLAS_ERROR;
  ARRAY_FINI(A);
  ARRAY_FINI(B);
  ARRAY_FINI(C);
  clReleaseEvent(done);
  return GA_NO_ERROR;
}
/*
 * Double-precision GEMM via cuBLAS: C = alpha*op(A)*op(B) + beta*C.
 * Returns GA_DEVSUP_ERROR on an architecture mismatch, GA_BLAS_ERROR on
 * any other cuBLAS failure, GA_NO_ERROR on success.
 */
static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, double alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb, double beta,
                 gpudata *C, size_t offC, size_t ldc) {
  cuda_context *ctx = A->ctx;
  gpudata *T;
  size_t t;
  cublasStatus_t err;
  cb_transpose transT;
  ASSERT_BUF(A);
  ASSERT_BUF(B);
  ASSERT_BUF(C);
  if (order == cb_c) {
    /* swap A and B: cuBLAS is column-major only, so a row-major request
       is computed as C^T = op(B)^T * op(A)^T by exchanging the A/B
       operands (with their offsets, leading dimensions and transpose
       flags) and the M/N extents.  C and offC stay as-is. */
    t = N; N = M; M = t;
    T = A; A = B; B = T;
    t = lda; lda = ldb; ldb = t;
    transT = transA; transA = transB; transB = transT;
    t = offA; offA = offB; offB = t;
  }
  cuda_enter(ctx);
  /* NOTE(review): cuda_wait/cuda_record results are ignored here; a
     newer dgemm variant in this file checks them via
     GA_CUDA_EXIT_ON_ERROR — confirm whether that matters on this path. */
  cuda_wait(A, CUDA_WAIT_READ);
  cuda_wait(B, CUDA_WAIT_READ);
  cuda_wait(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE);
  err = cublasDgemm(((blas_handle *)ctx->blas_handle)->h,
                    convT(transA), convT(transB), M, N, K, &alpha,
                    ((double *)A->ptr) + offA, lda,
                    ((double *)B->ptr) + offB, ldb, &beta,
                    ((double *)C->ptr) + offC, ldc);
  if (err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    if (err == CUBLAS_STATUS_ARCH_MISMATCH)
      return GA_DEVSUP_ERROR;
    return GA_BLAS_ERROR;
  }
  cuda_record(A, CUDA_WAIT_READ);
  cuda_record(B, CUDA_WAIT_READ);
  cuda_record(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE);
  cuda_exit(ctx);
  return GA_NO_ERROR;
}
/*
 * Half-precision GEMV through CLBlast: Y = alpha*op(A)*X + beta*Y.
 * The float scalars are converted to half via float_to_half before the
 * call; error propagation is handled by the CLBT_CHECK macro.
 */
static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
                 float alpha, gpudata *A, size_t offA, size_t lda,
                 gpudata *X, size_t offX, int incX, float beta,
                 gpudata *Y, size_t offY, int incY) {
  cl_ctx *ctx = A->ctx;
  cl_event done;

  ARRAY_INIT(A);
  ARRAY_INIT(X);
  ARRAY_INIT(Y);
  CLBT_CHECK(ctx->err,
             CLBlastHgemv(convO(order), convT(transA), M, N,
                          float_to_half(alpha), A->buf, offA, lda,
                          X->buf, offX, incX, float_to_half(beta),
                          Y->buf, offY, incY, &ctx->q, &done));
  ARRAY_FINI(A);
  ARRAY_FINI(X);
  ARRAY_FINI(Y);
  clReleaseEvent(done);
  return GA_NO_ERROR;
}
/*
 * Double-precision GEMV through clBLAS: Y = alpha*op(A)*X + beta*Y.
 * Error propagation is handled by the CLB_CHECK macro.
 */
static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
                 double alpha, gpudata *A, size_t offA, size_t lda,
                 gpudata *X, size_t offX, int incX, double beta,
                 gpudata *Y, size_t offY, int incY) {
  cl_ctx *ctx = A->ctx;
  cl_uint wait_count = 0;
  cl_event wait_list[3];
  cl_event done;

  ARRAY_INIT(A);
  ARRAY_INIT(X);
  ARRAY_INIT(Y);
  CLB_CHECK(ctx->err,
            clblasDgemv(convO(order), convT(transA), M, N, alpha,
                        A->buf, offA, lda, X->buf, offX, incX, beta,
                        Y->buf, offY, incY, 1, &ctx->q, wait_count,
                        wait_count == 0 ? NULL : wait_list, &done));
  ARRAY_FINI(A);
  ARRAY_FINI(X);
  ARRAY_FINI(Y);
  clReleaseEvent(done);
  return GA_NO_ERROR;
}
/*
 * Single-precision GEMV through clBLAS: Y = alpha*op(A)*X + beta*Y.
 * Returns GA_BLAS_ERROR if clBLAS rejects the call, GA_NO_ERROR otherwise.
 */
static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
                 float alpha, gpudata *A, size_t offA, size_t lda,
                 gpudata *X, size_t offX, int incX, float beta,
                 gpudata *Y, size_t offY, int incY) {
  cl_ctx *ctx = A->ctx;
  cl_uint wait_count = 0;
  cl_event wait_list[3];
  cl_event done;
  clblasStatus status;

  ARRAY_INIT(A);
  ARRAY_INIT(X);
  ARRAY_INIT(Y);
  status = clblasSgemv(convO(order), convT(transA), M, N, alpha,
                       A->buf, offA, lda, X->buf, offX, incX, beta,
                       Y->buf, offY, incY, 1, &ctx->q, wait_count,
                       wait_count == 0 ? NULL : wait_list, &done);
  if (status != clblasSuccess)
    return GA_BLAS_ERROR;
  ARRAY_FINI(A);
  ARRAY_FINI(X);
  ARRAY_FINI(Y);
  clReleaseEvent(done);
  return GA_NO_ERROR;
}
/*
 * Double-precision GEMV through CLBlast: Y = alpha*op(A)*X + beta*Y.
 * Returns GA_BLAS_ERROR if CLBlast rejects the call, GA_NO_ERROR otherwise.
 */
static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
                 double alpha, gpudata *A, size_t offA, size_t lda,
                 gpudata *X, size_t offX, int incX, double beta,
                 gpudata *Y, size_t offY, int incY) {
  cl_ctx *ctx = A->ctx;
  cl_event done;
  StatusCode status;

  ARRAY_INIT(A);
  ARRAY_INIT(X);
  ARRAY_INIT(Y);
  status = CLBlastDgemv(convO(order), convT(transA), M, N, alpha,
                        A->buf, offA, lda, X->buf, offX, incX, beta,
                        Y->buf, offY, incY, &ctx->q, &done);
  if (status != kSuccess)
    return GA_BLAS_ERROR;
  ARRAY_FINI(A);
  ARRAY_FINI(X);
  ARRAY_FINI(Y);
  clReleaseEvent(done);
  return GA_NO_ERROR;
}
/*
 * Double-precision GEMV via cuBLAS: Y = alpha*op(A)*X + beta*Y.
 * Returns GA_XLARGE_ERROR when any extent exceeds what cuBLAS's int
 * parameters can hold (LARGE_VAL), GA_DEVSUP_ERROR on an architecture
 * mismatch, GA_BLAS_ERROR on any other cuBLAS failure.
 */
static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
                 double alpha, gpudata *A, size_t offA, size_t lda,
                 gpudata *X, size_t offX, int incX, double beta,
                 gpudata *Y, size_t offY, int incY) {
  cuda_context *ctx = A->ctx;
  blas_handle *h = (blas_handle *)ctx->blas_handle;
  size_t t;
  ASSERT_BUF(A);
  ASSERT_BUF(X);
  ASSERT_BUF(Y);
  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) ||
      LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY))
    return GA_XLARGE_ERROR;
  if (order == cb_c) {
    /* cuBLAS is column-major only: a row-major GEMV is the same memory
       operation as a column-major GEMV with M/N exchanged and the
       transpose flag flipped. */
    t = N; N = M; M = t;
    if (transA == cb_no_trans) {
      transA = cb_trans;
    } else {
      transA = cb_no_trans;
    }
  }
  cuda_enter(ctx);
  /* GA_CUDA_EXIT_ON_ERROR leaves the context and returns on failure. */
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_ALL));
  h->err = cublasDgemv(h->h, convT(transA), M, N, &alpha,
                       ((double *)A->ptr) + offA, lda,
                       ((double *)X->ptr) + offX, incX, &beta,
                       ((double *)Y->ptr) + offY, incY);
  if (h->err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
      return GA_DEVSUP_ERROR;
    return GA_BLAS_ERROR;
  }
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_ALL));
  cuda_exit(ctx);
  return GA_NO_ERROR;
}
/*
 * Double-precision GEMV via cuBLAS: Y = alpha*op(A)*X + beta*Y.
 * Returns GA_DEVSUP_ERROR on an architecture mismatch, GA_BLAS_ERROR on
 * any other cuBLAS failure, GA_NO_ERROR on success.
 */
static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
                 double alpha, gpudata *A, size_t offA, size_t lda,
                 gpudata *X, size_t offX, int incX, double beta,
                 gpudata *Y, size_t offY, int incY) {
  cuda_context *ctx = A->ctx;
  cublasStatus_t err;
  size_t t;
  ASSERT_BUF(A);
  ASSERT_BUF(X);
  ASSERT_BUF(Y);
  if (order == cb_c) {
    /* cuBLAS is column-major only: a row-major GEMV is the same memory
       operation as a column-major GEMV with M/N exchanged and the
       transpose flag flipped. */
    t = N; N = M; M = t;
    if (transA == cb_no_trans) {
      transA = cb_trans;
    } else {
      transA = cb_no_trans;
    }
  }
  cuda_enter(ctx);
  /* NOTE(review): cuda_wait/cuda_record results are ignored and there is
     no LARGE_VAL size check, unlike the newer dgemv variant in this
     file — confirm whether that matters on this path. */
  cuda_wait(A, CUDA_WAIT_READ);
  cuda_wait(X, CUDA_WAIT_READ);
  cuda_wait(Y, CUDA_WAIT_READ|CUDA_WAIT_WRITE);
  err = cublasDgemv(((blas_handle *)ctx->blas_handle)->h, convT(transA),
                    M, N, &alpha,
                    ((double *)A->ptr) + offA, lda,
                    ((double *)X->ptr) + offX, incX, &beta,
                    ((double *)Y->ptr) + offY, incY);
  if (err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    if (err == CUBLAS_STATUS_ARCH_MISMATCH)
      return GA_DEVSUP_ERROR;
    return GA_BLAS_ERROR;
  }
  cuda_record(A, CUDA_WAIT_READ);
  cuda_record(X, CUDA_WAIT_READ);
  cuda_record(Y, CUDA_WAIT_READ|CUDA_WAIT_WRITE);
  cuda_exit(ctx);
  return GA_NO_ERROR;
}
/*
 * Batched double-precision GEMM via cuBLAS.
 * For large products the batch is dispatched as individual cublasDgemm
 * calls; otherwise a single cublasDgemmBatched call is used with a
 * device-side array of operand pointers.
 */
static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, double alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb, double beta,
                      gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cuda_context *ctx;
  size_t *lt, t;
  gpudata **T;
  size_t i;
  cb_transpose transT;
  cublasStatus_t err;
  if (batchCount == 0) return GA_NO_ERROR;
  ASSERT_BUF(A[0]);
  ctx = A[0]->ctx;
  cuda_enter(ctx);
  if (order == cb_c) {
    /* swap A and B: cuBLAS is column-major only, so compute
       C^T = op(B)^T * op(A)^T by exchanging the A/B operand arrays
       (with their offset arrays, leading dimensions and transpose
       flags) and the M/N extents. */
    t = N; N = M; M = t;
    T = A; A = B; B = T;
    t = lda; lda = ldb; ldb = t;
    transT = transA; transA = transB; transB = transT;
    lt = offA; offA = offB; offB = lt;
  }
  // use parallel cublasSgemm calls rather than cublasSgemmBatched for large products
  const size_t threshold = 650;
  const int multiple_dispatch = M * N * K > threshold * threshold * threshold;
  if (multiple_dispatch) {
    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      /* NOTE(review): cuda_wait/cuda_record results are ignored here. */
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(B[i], CUDA_WAIT_READ);
      cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      err = cublasDgemm(((blas_handle *)ctx->blas_handle)->h,
                        convT(transA), convT(transB), M, N, K, &alpha,
                        (double*)A[i]->ptr + offA[i], lda,
                        (double*)B[i]->ptr + offB[i], ldb, &beta,
                        (double*)C[i]->ptr + offC[i], ldc);
      if (err != CUBLAS_STATUS_SUCCESS) {
        cuda_exit(ctx);
        if (err == CUBLAS_STATUS_ARCH_MISMATCH)
          return GA_DEVSUP_ERROR;
        return GA_BLAS_ERROR;
      }
      cuda_record(A[i], CUDA_WAIT_READ);
      cuda_record(B[i], CUDA_WAIT_READ);
      cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    }
  } else {
    /* Build one host array holding the A, B and C device pointers for
       every batch entry, then mirror it on the device for
       cublasDgemmBatched. */
    double **T_l = alloca(sizeof(double *) * batchCount * 3);
    const double **A_l = (const double **)T_l;
    const double **B_l = (const double **)T_l + batchCount;
    double **C_l = T_l + (batchCount * 2);
    CUdeviceptr Ta, Aa, Ba, Ca;
    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(B[i], CUDA_WAIT_READ);
      cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      A_l[i] = ((double *)A[i]->ptr) + offA[i];
      B_l[i] = ((double *)B[i]->ptr) + offB[i];
      C_l[i] = ((double *)C[i]->ptr) + offC[i];
    }
    /* NOTE(review): cuMemAlloc and cuMemcpyHtoD results are unchecked —
       a failure here would pass bogus pointers to cuBLAS. */
    cuMemAlloc(&Ta, sizeof(double *) * batchCount * 3);
    Aa = Ta;
    Ba = Ta + (batchCount * sizeof(double *));
    Ca = Ta + (batchCount * sizeof(double *) * 2);
    cuMemcpyHtoD(Ta, T_l, sizeof(double *) * batchCount * 3);
    err = cublasDgemmBatched(((blas_handle *)ctx->blas_handle)->h,
                             convT(transA), convT(transB), M, N, K, &alpha,
                             (const double **)Aa, lda,
                             (const double **)Ba, ldb, &beta,
                             (double **)Ca, ldc, batchCount);
    cuMemFree(Ta);
    if (err != CUBLAS_STATUS_SUCCESS) {
      cuda_exit(ctx);
      if (err == CUBLAS_STATUS_ARCH_MISMATCH)
        return GA_DEVSUP_ERROR;
      return GA_BLAS_ERROR;
    }
    for (i = 0; i < batchCount; i++) {
      cuda_record(A[i], CUDA_WAIT_READ);
      cuda_record(B[i], CUDA_WAIT_READ);
      cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    }
  }
  cuda_exit(ctx);
  return GA_NO_ERROR;
}
/*
 * Half-precision GEMM via cuBLAS's cublasSgemmEx (storage is float16,
 * accumulation in float32).  Compiled only when HAVE_CUBLAS_SGEMMEX is
 * defined; otherwise reports GA_DEVSUP_ERROR.
 */
static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, float alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb, float beta,
                 gpudata *C, size_t offC, size_t ldc) {
#ifdef HAVE_CUBLAS_SGEMMEX
  /* This will use float32 for computation as it's the best we can
   * have right now. In the future when native float16 support will be
   * there we will switch to that. */
  cuda_context *ctx = A->ctx;
  gpudata *T;
  size_t t;
  cublasStatus_t err;
  cb_transpose transT;
  ASSERT_BUF(A);
  ASSERT_BUF(B);
  ASSERT_BUF(C);
  if (order == cb_c) {
    /* swap A and B: cuBLAS is column-major only, so compute
       C^T = op(B)^T * op(A)^T by exchanging the A/B operands (with
       their offsets, leading dimensions and transpose flags) and the
       M/N extents. */
    t = N; N = M; M = t;
    T = A; A = B; B = T;
    t = lda; lda = ldb; ldb = t;
    transT = transA; transA = transB; transB = transT;
    t = offA; offA = offB; offB = t;
  }
  cuda_enter(ctx);
  /* NOTE(review): cuda_wait/cuda_record results are ignored here. */
  cuda_wait(A, CUDA_WAIT_READ);
  cuda_wait(B, CUDA_WAIT_READ);
  cuda_wait(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE);
  /* Buffers hold raw 16-bit values, hence the uint16_t offset math with
     CUBLAS_DATA_HALF describing the element type. */
  err = cublasSgemmEx(((blas_handle *)ctx->blas_handle)->h,
                      convT(transA), convT(transB), M, N, K, &alpha,
                      ((uint16_t *)A->ptr) + offA, CUBLAS_DATA_HALF, lda,
                      ((uint16_t *)B->ptr) + offB, CUBLAS_DATA_HALF, ldb,
                      &beta,
                      ((uint16_t *)C->ptr) + offC, CUBLAS_DATA_HALF, ldc);
  if (err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    if (err == CUBLAS_STATUS_ARCH_MISMATCH)
      return GA_DEVSUP_ERROR;
    return GA_BLAS_ERROR;
  }
  cuda_record(A, CUDA_WAIT_READ);
  cuda_record(B, CUDA_WAIT_READ);
  cuda_record(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE);
  cuda_exit(ctx);
  return GA_NO_ERROR;
#else
  return GA_DEVSUP_ERROR;
#endif
}
/*
 * Batched single-precision GEMM via cuBLAS.
 * For large products the batch is dispatched as individual cublasSgemm
 * calls; otherwise a single cublasSgemmBatched call is used with a
 * device-side array of operand pointers.  Returns GA_XLARGE_ERROR when
 * any extent exceeds what cuBLAS's int parameters can hold (LARGE_VAL).
 */
static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, float alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb, float beta,
                      gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cuda_context *ctx;
  blas_handle *h;
  size_t *lt, t;
  gpudata **T;
  size_t i;
  const size_t threshold = 650;
  cb_transpose transT;
  if (batchCount == 0) return GA_NO_ERROR;
  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) ||
      LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) ||
      LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N))
    return GA_XLARGE_ERROR;
  ASSERT_BUF(A[0]);
  ctx = A[0]->ctx;
  h = (blas_handle *)ctx->blas_handle;
  cuda_enter(ctx);
  if (order == cb_c) {
    /* swap A and B: cuBLAS is column-major only, so compute
       C^T = op(B)^T * op(A)^T by exchanging the A/B operand arrays
       (with their offset arrays, leading dimensions and transpose
       flags) and the M/N extents. */
    t = N; N = M; M = t;
    T = A; A = B; B = T;
    t = lda; lda = ldb; ldb = t;
    transT = transA; transA = transB; transB = transT;
    lt = offA; offA = offB; offB = lt;
  }
  /* use parallel cublasSgemm calls rather than cublasSgemmBatched for
   * large products */
  if (M * N * K > threshold * threshold * threshold) {
    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      /* GA_CUDA_EXIT_ON_ERROR leaves the context and returns on failure. */
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C[i], CUDA_WAIT_ALL));
      h->err = cublasSgemm(h->h, convT(transA), convT(transB), M, N, K,
                           &alpha,
                           (float*)A[i]->ptr + offA[i], lda,
                           (float*)B[i]->ptr + offB[i], ldb, &beta,
                           (float*)C[i]->ptr + offC[i], ldc);
      if (h->err != CUBLAS_STATUS_SUCCESS) {
        cuda_exit(ctx);
        if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
          return GA_DEVSUP_ERROR;
        return GA_BLAS_ERROR;
      }
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C[i], CUDA_WAIT_ALL));
    }
  } else {
    /* Build one host array holding the A, B and C device pointers for
       every batch entry, then mirror it on the device for
       cublasSgemmBatched. */
    float **T_l = alloca(sizeof(float *) * batchCount * 3);
    const float **A_l = (const float **)T_l;
    const float **B_l = (const float **)T_l + batchCount;
    float **C_l = T_l + (batchCount * 2);
    CUdeviceptr Ta, Aa, Ba, Ca;
    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C[i], CUDA_WAIT_ALL));
      A_l[i] = ((float *)A[i]->ptr) + offA[i];
      B_l[i] = ((float *)B[i]->ptr) + offB[i];
      C_l[i] = ((float *)C[i]->ptr) + offC[i];
    }
    /* NOTE(review): cuMemAlloc and cuMemcpyHtoD results are unchecked —
       a failure here would pass bogus pointers to cuBLAS. */
    cuMemAlloc(&Ta, sizeof(float *) * batchCount * 3);
    Aa = Ta;
    Ba = Ta + (batchCount * sizeof(float *));
    Ca = Ta + (batchCount * sizeof(float *) * 2);
    cuMemcpyHtoD(Ta, T_l, sizeof(float *) * batchCount * 3);
    h->err = cublasSgemmBatched(h->h, convT(transA), convT(transB), M, N, K,
                                &alpha,
                                (const float **)Aa, lda,
                                (const float **)Ba, ldb, &beta,
                                (float **)Ca, ldc, batchCount);
    cuMemFree(Ta);
    if (h->err != CUBLAS_STATUS_SUCCESS) {
      cuda_exit(ctx);
      if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
        return GA_DEVSUP_ERROR;
      return GA_BLAS_ERROR;
    }
    for (i = 0; i < batchCount; i++) {
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C[i], CUDA_WAIT_ALL));
    }
  }
  cuda_exit(ctx);
  return GA_NO_ERROR;
}
/*
 * Double-precision GEMM via cuBLAS: C = alpha*op(A)*op(B) + beta*C.
 * Returns GA_XLARGE_ERROR when any extent exceeds what cuBLAS's int
 * parameters can hold (LARGE_VAL), GA_DEVSUP_ERROR on an architecture
 * mismatch, GA_BLAS_ERROR on any other cuBLAS failure.
 */
static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, double alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb, double beta,
                 gpudata *C, size_t offC, size_t ldc) {
  cuda_context *ctx = A->ctx;
  blas_handle *h = (blas_handle *)ctx->blas_handle;
  gpudata *T;
  size_t t;
  cb_transpose transT;
  ASSERT_BUF(A);
  ASSERT_BUF(B);
  ASSERT_BUF(C);
  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) ||
      LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) ||
      LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N))
    return GA_XLARGE_ERROR;
  if (order == cb_c) {
    /* swap A and B: cuBLAS is column-major only, so compute
       C^T = op(B)^T * op(A)^T by exchanging the A/B operands (with
       their offsets, leading dimensions and transpose flags) and the
       M/N extents.  C and offC stay as-is. */
    t = N; N = M; M = t;
    T = A; A = B; B = T;
    t = lda; lda = ldb; ldb = t;
    transT = transA; transA = transB; transB = transT;
    t = offA; offA = offB; offB = t;
  }
  cuda_enter(ctx);
  /* GA_CUDA_EXIT_ON_ERROR leaves the context and returns on failure. */
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL));
  h->err = cublasDgemm(h->h, convT(transA), convT(transB), M, N, K, &alpha,
                       ((double *)A->ptr) + offA, lda,
                       ((double *)B->ptr) + offB, ldb, &beta,
                       ((double *)C->ptr) + offC, ldc);
  if (h->err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
      return GA_DEVSUP_ERROR;
    return GA_BLAS_ERROR;
  }
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C, CUDA_WAIT_ALL));
  cuda_exit(ctx);
  return GA_NO_ERROR;
}