static int ddot(size_t N,
                gpudata *X, size_t offX, size_t incX,
                gpudata *Y, size_t offY, size_t incY,
                gpudata *Z, size_t offZ) {
  cl_ctx *ctx = X->ctx;
  clblasStatus err;
  cl_uint num_ev = 0;
  cl_event evl[3];
  cl_event ev;
  gpudata *wbuf;

  wbuf = opencl_ops.buffer_alloc((gpucontext *)ctx, N * sizeof(double), NULL,
                                 GA_BUFFER_READ_WRITE);
  if (wbuf == NULL)
    return ctx->err->code;

  ARRAY_INIT(X);
  ARRAY_INIT(Y);
  ARRAY_INIT(Z);

  err = clblasDdot(N, Z->buf, offZ,
                   X->buf, offX, incX,
                   Y->buf, offY, incY,
                   wbuf->buf,
                   1, &ctx->q, num_ev, num_ev ? evl : NULL, &ev);
  opencl_ops.buffer_release(wbuf);
  if (err != clblasSuccess)
    return error_clblas(ctx->err, "clblasDdot", err);

  ARRAY_FINI(X);
  ARRAY_FINI(Y);
  ARRAY_FINI(Z);

  clReleaseEvent(ev);

  return GA_NO_ERROR;
}
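/*
 * Editorial note on ddot above (illustrative, not part of the library):
 * clblasDdot stores its result in Z and requires a scratch buffer of at
 * least N elements, which is what wbuf provides.  The ARRAY_INIT/ARRAY_FINI
 * macros are defined elsewhere in this file; they are assumed to gather each
 * buffer's pending cl_event into evl (incrementing num_ev) before the call
 * and to attach the returned event ev to each buffer afterwards, so that
 * later operations on X, Y and Z synchronize with this dot product.  That
 * description is an assumption based on how evl, num_ev and ev are used
 * here, not a quotation of the macros' actual definitions.
 */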
static int dgemvBatch(cb_order order, cb_transpose transA,
                      size_t M, size_t N, double alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **x, size_t *offX, size_t incX,
                      double beta,
                      gpudata **y, size_t *offY, size_t incY,
                      size_t batchCount, int flags) {
  cuda_context *ctx;
  size_t t, i;
  size_t ls[2], gs[2];
  void *args[9];
  gpudata *Aa, *xa, *ya;
  int err;

  if (flags != 0) return GA_INVALID_ERROR;
  if (batchCount == 0) return GA_NO_ERROR;
  /* Only the alpha == 1, beta == 1 specialization is implemented. */
  if (alpha != 1.0 || beta != 1.0) return GA_UNSUPPORTED_ERROR;

  /* Pick the launch geometry: small row counts use a 32-wide block that
     packs several batch entries per block, large ones a 512x1 block. */
  if (M < 512) {
    ls[0] = 32;
    if (batchCount > 16)
      ls[1] = 16;
    else
      ls[1] = batchCount;
  } else {
    ls[0] = 512;
    ls[1] = 1;
  }
  gs[0] = (M + ls[0] - 1) / ls[0];
  gs[1] = (batchCount + ls[1] - 1) / ls[1];
  /* Clamp the grid to the 65535 limit on a grid dimension. */
  if (gs[0] * gs[1] / 65535) {
    gs[1] = (65535 / gs[0]);
  }

  if (order == cb_c) {
    /* Column-major input: compute with the transposed row-major view. */
    t = N;
    N = M;
    M = t;
    if (transA == cb_no_trans) {
      transA = cb_trans;
    } else {
      transA = cb_no_trans;
    }
  }

  ASSERT_BUF(A[0]);
  ctx = A[0]->ctx;
  cuda_enter(ctx);

  {
    /* Build host-side arrays of per-batch device pointers and upload them. */
    double **T_l = alloca(sizeof(double *) * batchCount * 3);
    const double **A_l = (const double **)T_l;
    const double **x_l = (const double **)T_l + batchCount;
    double **y_l = T_l + (batchCount * 2);

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(x[i]);
      ASSERT_BUF(y[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(x[i], CUDA_WAIT_READ);
      cuda_wait(y[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      A_l[i] = (double *)(A[i]->ptr + offA[i]);
      x_l[i] = (double *)(x[i]->ptr + offX[i]);
      y_l[i] = (double *)(y[i]->ptr + offY[i]);
    }

    Aa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount,
                               A_l, GA_BUFFER_INIT, &err);
    if (Aa == NULL)
      return err;
    xa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount,
                               x_l, GA_BUFFER_INIT, &err);
    if (xa == NULL) {
      cuda_ops.buffer_release(Aa);
      return err;
    }
    ya = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount,
                               y_l, GA_BUFFER_INIT, &err);
    if (ya == NULL) {
      cuda_ops.buffer_release(Aa);
      cuda_ops.buffer_release(xa);
      return err;
    }
  }

  args[0] = Aa;
  args[1] = &lda;
  args[2] = xa;
  args[3] = &incX;
  args[4] = ya;
  args[5] = &incY;
  args[6] = &batchCount;
  args[7] = &M;
  args[8] = &N;

  if (transA == cb_no_trans) {
    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small,
                         2, ls, gs, 0, args);
  } else {
    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small,
                         2, ls, gs, 0, args);
  }

  cuda_ops.buffer_release(Aa);
  cuda_ops.buffer_release(xa);
  cuda_ops.buffer_release(ya);

  if (err != GA_NO_ERROR) {
    cuda_exit(ctx);
    return err;
  }

  for (i = 0; i < batchCount; i++) {
    cuda_record(A[i], CUDA_WAIT_READ);
    cuda_record(x[i], CUDA_WAIT_READ);
    cuda_record(y[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
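/*
 * Illustrative sketch (not part of the library): the helper below mirrors the
 * launch-geometry selection in dgemvBatch above so it can be inspected or
 * sanity-checked on the host in isolation.  The name
 * example_dgemv_launch_geometry is hypothetical; the function relies only on
 * size_t from the surrounding file's includes and assumes batchCount > 0,
 * since dgemvBatch returns early when it is 0.
 */
static void example_dgemv_launch_geometry(size_t M, size_t batchCount,
                                          size_t ls[2], size_t gs[2]) {
  /* Small row counts: 32 threads over rows, up to 16 batch entries per
     block.  Large row counts: 512 threads over rows, one entry per block. */
  if (M < 512) {
    ls[0] = 32;
    ls[1] = batchCount > 16 ? 16 : batchCount;
  } else {
    ls[0] = 512;
    ls[1] = 1;
  }
  /* Ceiling division to cover all rows and all batch entries. */
  gs[0] = (M + ls[0] - 1) / ls[0];
  gs[1] = (batchCount + ls[1] - 1) / ls[1];
  /* Same clamp as dgemvBatch: keep the total grid under the 65535 limit by
     shrinking the batch dimension. */
  if (gs[0] * gs[1] / 65535)
    gs[1] = 65535 / gs[0];
}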
static int dgerBatch(cb_order order, size_t M, size_t N, double alpha,
                     gpudata **x, size_t *offX, size_t incX,
                     gpudata **y, size_t *offY, size_t incY,
                     gpudata **A, size_t *offA, size_t lda,
                     size_t batchCount, int flags) {
  cuda_context *ctx;
  size_t t, *tp, i;
  size_t ls[3] = {M, N, 1}, gs[3] = {1, 1, batchCount};
  void *args[10];
  gpudata **T;
  gpudata *Aa, *xa, *ya;
  int err;

  if (flags != 0) return GA_INVALID_ERROR;
  if (batchCount == 0) return GA_NO_ERROR;

  /* Pick the launch geometry, splitting along the contiguous dimension
     first and capping the block at 512 threads. */
  if (incX == 1) {
    if (ls[0] > 32) {
      gs[0] = (ls[0] + 31) / 32;
      ls[0] = 32;
    }
    if (ls[0] * ls[1] > 512) {
      gs[1] = (ls[1] + 15) / 16;
      ls[1] = 16;
    }
  } else {
    if (ls[1] > 32) {
      gs[1] = (ls[1] + 31) / 32;
      ls[1] = 32;
    }
    if (ls[0] * ls[1] > 512) {
      gs[0] = (ls[0] + 15) / 16;
      ls[0] = 16;
    }
  }
  /* Clamp the batch dimension of the grid to the 65535 limit. */
  if (gs[0] * gs[1] * gs[2] > 65535) {
    if (gs[0] * gs[1] > 65535)
      return GA_VALUE_ERROR;
    gs[2] = (65535 / (gs[0] * gs[1]));
  }

  if (order == cb_c) {
    /* Column-major input: transpose the update by swapping x/y and M/N. */
    t = M;
    M = N;
    N = t;
    tp = offX;
    offX = offY;
    offY = tp;
    t = incX;
    incX = incY;
    incY = t;
    T = x;
    x = y;
    y = T;
  }

  ASSERT_BUF(x[0]);
  ctx = x[0]->ctx;
  cuda_enter(ctx);

  {
    /* Build host-side arrays of per-batch device pointers and upload them. */
    double **T_l = alloca(sizeof(double *) * batchCount * 3);
    const double **A_l = (const double **)T_l;
    const double **x_l = (const double **)T_l + batchCount;
    double **y_l = T_l + (batchCount * 2);

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(x[i]);
      ASSERT_BUF(y[i]);
      cuda_wait(A[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      cuda_wait(x[i], CUDA_WAIT_READ);
      cuda_wait(y[i], CUDA_WAIT_READ);
      A_l[i] = (double *)(A[i]->ptr + offA[i]);
      x_l[i] = (double *)(x[i]->ptr + offX[i]);
      y_l[i] = (double *)(y[i]->ptr + offY[i]);
    }

    Aa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount,
                               A_l, GA_BUFFER_INIT, &err);
    if (Aa == NULL)
      return err;
    xa = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount,
                               x_l, GA_BUFFER_INIT, &err);
    if (xa == NULL) {
      cuda_ops.buffer_release(Aa);
      return err;
    }
    ya = cuda_ops.buffer_alloc((gpucontext *)ctx, sizeof(double *) * batchCount,
                               y_l, GA_BUFFER_INIT, &err);
    if (ya == NULL) {
      cuda_ops.buffer_release(Aa);
      cuda_ops.buffer_release(xa);
      return err;
    }
  }

  args[0] = xa;
  args[1] = &incX;
  args[2] = ya;
  args[3] = &incY;
  args[4] = &alpha;
  args[5] = Aa;
  args[6] = &lda;
  args[7] = &batchCount;
  args[8] = &M;
  args[9] = &N;

  /* Launch the double-precision batched ger kernel. */
  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgerBH_gen_small,
                       3, ls, gs, 0, args);

  cuda_ops.buffer_release(Aa);
  cuda_ops.buffer_release(xa);
  cuda_ops.buffer_release(ya);

  if (err != GA_NO_ERROR) {
    cuda_exit(ctx);
    return err;
  }

  /* A is the output of ger, so record it for read/write; the vectors are
     read-only, matching the waits issued above. */
  for (i = 0; i < batchCount; i++) {
    cuda_record(A[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    cuda_record(x[i], CUDA_WAIT_READ);
    cuda_record(y[i], CUDA_WAIT_READ);
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
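/*
 * Note on the cb_c path in dgerBatch above: for the rank-1 update
 *   A := alpha * x * y^T + A,
 * reading the column-major buffer as the row-major matrix A^T gives
 *   A^T := alpha * y * x^T + A^T,
 * which is why x and y (together with their offsets and strides) and M and N
 * are exchanged while offA and lda are left untouched.
 */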