/*
 * Double-precision GER through CLBlast.
 *
 * Hands the device buffers X, Y and A (with their offsets, increments and
 * leading dimension) to CLBlastDger on the queue stored in X's context.
 *
 * Returns GA_NO_ERROR when CLBlast reports kSuccess and GA_BLAS_ERROR
 * otherwise.  On failure the function returns before the ARRAY_FINI calls
 * and before the event is released.
 *
 * NOTE(review): ARRAY_INIT/ARRAY_FINI are macros defined outside this
 * block and may refer to the local `ev` by name -- confirm before
 * renaming it.
 */
static int dger(cb_order order, size_t M, size_t N, double alpha,
                gpudata *X, size_t offX, int incX,
                gpudata *Y, size_t offY, int incY,
                gpudata *A, size_t offA, size_t lda) {
  StatusCode status;
  cl_event ev;
  cl_ctx *ctx = X->ctx;

  ARRAY_INIT(X);
  ARRAY_INIT(Y);
  ARRAY_INIT(A);

  status = CLBlastDger(convO(order), M, N, alpha,
                       X->buf, offX, incX,
                       Y->buf, offY, incY,
                       A->buf, offA, lda,
                       &ctx->q, &ev);
  if (status != kSuccess) {
    return GA_BLAS_ERROR;
  }

  ARRAY_FINI(X);
  ARRAY_FINI(Y);
  ARRAY_FINI(A);

  /* Drop this function's own reference to the completion event. */
  clReleaseEvent(ev);

  return GA_NO_ERROR;
}
/*
 * Batched single-precision GEMM through CLBlast.
 *
 * Issues one CLBlastSgemm call per batch entry i, using A[i], B[i] and
 * C[i] at offsets offA[i], offB[i] and offC[i].  M, N, K, alpha, beta and
 * the leading dimensions are shared by every entry.  The queue is taken
 * from A[0]'s context.
 *
 * Returns GA_NO_ERROR once every entry has been submitted (an empty batch
 * returns immediately), or GA_BLAS_ERROR as soon as one call does not
 * report kSuccess; entries after the failing one are not submitted.
 *
 * NOTE(review): ARRAY_INIT/ARRAY_FINI are macros defined outside this
 * block and may refer to the local `ev` by name -- confirm before
 * renaming it.
 */
static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, float alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb,
                      float beta, gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cl_ctx *ctx;
  cl_event ev;
  size_t i;
  StatusCode err;

  /* Nothing to submit for an empty batch; also avoids reading A[0]. */
  if (batchCount == 0)
    return GA_NO_ERROR;

  ctx = A[0]->ctx;

  for (i = 0; i < batchCount; i++) {
    ARRAY_INIT(A[i]);
    ARRAY_INIT(B[i]);
    ARRAY_INIT(C[i]);
    /* C[i] is addressed with its own offset, offC[i] (not offB[i]). */
    err = CLBlastSgemm(convO(order), convT(transA), convT(transB), M, N, K,
                      alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb,
                      beta, C[i]->buf, offC[i], ldc, &ctx->q, &ev);
    if (err != kSuccess)
      return GA_BLAS_ERROR;
    ARRAY_FINI(A[i]);
    ARRAY_FINI(B[i]);
    ARRAY_FINI(C[i]);
    clReleaseEvent(ev);
  }

  return GA_NO_ERROR;
}
/*
 * Single-precision GEMV through CLBlast.
 *
 * Passes matrix A and vectors X and Y (device buffers with offsets, leading
 * dimension and increments) to CLBlastSgemv on the queue stored in A's
 * context.
 *
 * Returns GA_NO_ERROR when CLBlast reports kSuccess and GA_BLAS_ERROR
 * otherwise.  On failure the function returns before the ARRAY_FINI calls
 * and before the event is released.
 *
 * NOTE(review): ARRAY_INIT/ARRAY_FINI are macros defined outside this
 * block and may refer to the local `ev` by name -- confirm before
 * renaming it.
 */
static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
                 float alpha, gpudata *A, size_t offA, size_t lda,
                 gpudata *X, size_t offX, int incX, float beta,
                 gpudata *Y, size_t offY, int incY) {
  cl_event ev;
  StatusCode status;
  cl_ctx *ctx = A->ctx;

  ARRAY_INIT(A);
  ARRAY_INIT(X);
  ARRAY_INIT(Y);

  status = CLBlastSgemv(convO(order), convT(transA), M, N,
                        alpha,
                        A->buf, offA, lda,
                        X->buf, offX, incX,
                        beta,
                        Y->buf, offY, incY,
                        &ctx->q, &ev);
  if (status != kSuccess) {
    return GA_BLAS_ERROR;
  }

  ARRAY_FINI(A);
  ARRAY_FINI(X);
  ARRAY_FINI(Y);

  /* Drop this function's own reference to the completion event. */
  clReleaseEvent(ev);

  return GA_NO_ERROR;
}
/*
 * Single-precision GEMM through CLBlast.
 *
 * Passes matrices A, B and C (device buffers with offsets and leading
 * dimensions) to CLBlastSgemm on the queue stored in A's context.
 *
 * Returns GA_NO_ERROR when CLBlast reports kSuccess and GA_BLAS_ERROR
 * otherwise.  On failure the function returns before the ARRAY_FINI calls
 * and before the event is released.
 *
 * NOTE(review): ARRAY_INIT/ARRAY_FINI are macros defined outside this
 * block and may refer to the local `ev` by name -- confirm before
 * renaming it.
 */
static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, float alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb, float beta,
                 gpudata *C, size_t offC, size_t ldc) {
  cl_event ev;
  StatusCode status;
  cl_ctx *ctx = A->ctx;

  ARRAY_INIT(A);
  ARRAY_INIT(B);
  ARRAY_INIT(C);

  status = CLBlastSgemm(convO(order), convT(transA), convT(transB),
                        M, N, K,
                        alpha,
                        A->buf, offA, lda,
                        B->buf, offB, ldb,
                        beta,
                        C->buf, offC, ldc,
                        &ctx->q, &ev);
  if (status != kSuccess) {
    return GA_BLAS_ERROR;
  }

  ARRAY_FINI(A);
  ARRAY_FINI(B);
  ARRAY_FINI(C);

  /* Drop this function's own reference to the completion event. */
  clReleaseEvent(ev);

  return GA_NO_ERROR;
}
/*
 * Double-precision DOT through CLBlast.
 *
 * Passes the output buffer Z (at offZ) followed by the input vectors X and
 * Y (with their offsets and increments) to CLBlastDdot on the queue stored
 * in X's context.
 *
 * Returns GA_NO_ERROR when CLBlast reports kSuccess and GA_BLAS_ERROR
 * otherwise.  On failure the function returns before the ARRAY_FINI calls
 * and before the event is released.
 *
 * NOTE(review): ARRAY_INIT/ARRAY_FINI are macros defined outside this
 * block and may refer to the local `ev` by name -- confirm before
 * renaming it.
 */
static int ddot(
        size_t N,
        gpudata *X, size_t offX, size_t incX,
        gpudata *Y, size_t offY, size_t incY,
        gpudata *Z, size_t offZ) {
  cl_event ev;
  StatusCode status;
  cl_ctx *ctx = X->ctx;

  ARRAY_INIT(X);
  ARRAY_INIT(Y);
  ARRAY_INIT(Z);

  status = CLBlastDdot(N,
                       Z->buf, offZ,
                       X->buf, offX, incX,
                       Y->buf, offY, incY,
                       &ctx->q, &ev);
  if (status != kSuccess) {
    return GA_BLAS_ERROR;
  }

  ARRAY_FINI(X);
  ARRAY_FINI(Y);
  ARRAY_FINI(Z);

  /* Drop this function's own reference to the completion event. */
  clReleaseEvent(ev);

  return GA_NO_ERROR;
}
/*
 * Single-precision GER through clBLAS.
 *
 * Passes X, Y and A (device buffers with offsets, increments and leading
 * dimension) to clblasSger using the single queue stored in X's context.
 * The wait list handed to clBLAS is `evl` when num_ev is non-zero and NULL
 * otherwise.
 *
 * Returns GA_NO_ERROR when clBLAS reports clblasSuccess and GA_BLAS_ERROR
 * otherwise.  On failure the function returns before the ARRAY_FINI calls
 * and before the event is released.
 *
 * NOTE(review): ARRAY_INIT/ARRAY_FINI are macros defined outside this
 * block; nothing here writes `evl` or `num_ev` directly, so presumably
 * ARRAY_INIT fills them and the macros refer to `ev`, `evl` and `num_ev`
 * by name -- confirm before renaming any of them.
 */
static int sger(cb_order order, size_t M, size_t N, float alpha,
                gpudata *X, size_t offX, int incX,
                gpudata *Y, size_t offY, int incY,
                gpudata *A, size_t offA, size_t lda) {
  clblasStatus status;
  cl_uint num_ev = 0;
  cl_event evl[3];
  cl_event ev;
  cl_ctx *ctx = X->ctx;

  ARRAY_INIT(X);
  ARRAY_INIT(Y);
  ARRAY_INIT(A);

  status = clblasSger(convO(order), M, N, alpha,
                      X->buf, offX, incX,
                      Y->buf, offY, incY,
                      A->buf, offA, lda,
                      1, &ctx->q,
                      num_ev, num_ev != 0 ? evl : NULL, &ev);
  if (status != clblasSuccess) {
    return GA_BLAS_ERROR;
  }

  ARRAY_FINI(X);
  ARRAY_FINI(Y);
  ARRAY_FINI(A);

  /* Drop this function's own reference to the completion event. */
  clReleaseEvent(ev);

  return GA_NO_ERROR;
}
/*
 * Double-precision GEMM through clBLAS.
 *
 * Passes matrices A, B and C (device buffers with offsets and leading
 * dimensions) to clblasDgemm using the single queue stored in A's context.
 * The wait list handed to clBLAS is `evl` when num_ev is non-zero and NULL
 * otherwise.
 *
 * Returns GA_NO_ERROR when clBLAS reports clblasSuccess and GA_BLAS_ERROR
 * otherwise.  On failure the function returns before the ARRAY_FINI calls
 * and before the event is released.
 *
 * NOTE(review): ARRAY_INIT/ARRAY_FINI are macros defined outside this
 * block; nothing here writes `evl` or `num_ev` directly, so presumably
 * ARRAY_INIT fills them and the macros refer to `ev`, `evl` and `num_ev`
 * by name -- confirm before renaming any of them.
 */
static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, double alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb, double beta,
                 gpudata *C, size_t offC, size_t ldc) {
  cl_event evl[3];
  cl_event ev;
  cl_uint num_ev = 0;
  clblasStatus status;
  cl_ctx *ctx = A->ctx;

  ARRAY_INIT(A);
  ARRAY_INIT(B);
  ARRAY_INIT(C);

  status = clblasDgemm(convO(order), convT(transA), convT(transB),
                       M, N, K,
                       alpha,
                       A->buf, offA, lda,
                       B->buf, offB, ldb,
                       beta,
                       C->buf, offC, ldc,
                       1, &ctx->q,
                       num_ev, num_ev != 0 ? evl : NULL, &ev);
  if (status != clblasSuccess) {
    return GA_BLAS_ERROR;
  }

  ARRAY_FINI(A);
  ARRAY_FINI(B);
  ARRAY_FINI(C);

  /* Drop this function's own reference to the completion event. */
  clReleaseEvent(ev);

  return GA_NO_ERROR;
}
/*
 * Double-precision GEMV through clBLAS.
 *
 * Passes matrix A and vectors X and Y (device buffers with offsets, leading
 * dimension and increments) to clblasDgemv using the single queue stored
 * in A's context.  The wait list handed to clBLAS is `evl` when num_ev is
 * non-zero and NULL otherwise.
 *
 * Returns GA_NO_ERROR when clBLAS reports clblasSuccess and GA_BLAS_ERROR
 * otherwise.  On failure the function returns before the ARRAY_FINI calls
 * and before the event is released.
 *
 * NOTE(review): ARRAY_INIT/ARRAY_FINI are macros defined outside this
 * block; nothing here writes `evl` or `num_ev` directly, so presumably
 * ARRAY_INIT fills them and the macros refer to `ev`, `evl` and `num_ev`
 * by name -- confirm before renaming any of them.
 */
static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
                 double alpha, gpudata *A, size_t offA, size_t lda,
                 gpudata *X, size_t offX, int incX, double beta,
                 gpudata *Y, size_t offY, int incY) {
  cl_event evl[3];
  cl_event ev;
  cl_uint num_ev = 0;
  clblasStatus status;
  cl_ctx *ctx = A->ctx;

  ARRAY_INIT(A);
  ARRAY_INIT(X);
  ARRAY_INIT(Y);

  status = clblasDgemv(convO(order), convT(transA), M, N,
                       alpha,
                       A->buf, offA, lda,
                       X->buf, offX, incX,
                       beta,
                       Y->buf, offY, incY,
                       1, &ctx->q,
                       num_ev, num_ev != 0 ? evl : NULL, &ev);
  if (status != clblasSuccess) {
    return GA_BLAS_ERROR;
  }

  ARRAY_FINI(A);
  ARRAY_FINI(X);
  ARRAY_FINI(Y);

  /* Drop this function's own reference to the completion event. */
  clReleaseEvent(ev);

  return GA_NO_ERROR;
}
/*
 * Batched single-precision GEMM through clBLAS.
 *
 * Issues one clblasSgemm call per batch entry i, using A[i], B[i] and C[i]
 * at offsets offA[i], offB[i] and offC[i].  M, N, K, alpha, beta and the
 * leading dimensions are shared by every entry.  The queue is taken from
 * A[0]'s context.
 *
 * Returns GA_NO_ERROR when execution reaches the end of the loop (an empty
 * batch returns immediately).  Each call is wrapped in CLB_CHECK, a macro
 * defined outside this block; presumably it makes the function return
 * early with an error recorded through ctx->err when clBLAS fails --
 * confirm against the macro definition.
 *
 * NOTE(review): ARRAY_INIT/ARRAY_FINI are macros defined outside this
 * block and may refer to `ev`, `evl` and `num_ev` by name.  num_ev is set
 * to 0 once, before the loop, and is never reset here; if ARRAY_INIT
 * appends to evl (three slots), entries could accumulate across
 * iterations -- confirm against the macro definition.
 */
static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, float alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb,
                      float beta, gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cl_ctx *ctx;
  cl_event evl[3];
  cl_event ev;
  size_t i;
  cl_uint num_ev = 0;

  /* Nothing to submit for an empty batch; also avoids reading A[0]. */
  if (batchCount == 0)
    return GA_NO_ERROR;

  ctx = A[0]->ctx;

  for (i = 0; i < batchCount; i++) {
    ARRAY_INIT(A[i]);
    ARRAY_INIT(B[i]);
    ARRAY_INIT(C[i]);
    CLB_CHECK(ctx->err, clblasSgemm(convO(order), convT(transA), convT(transB),
                                    M, N, K,
                                    alpha, A[i]->buf, offA[i], lda,
                                    B[i]->buf, offB[i], ldb,
                                    beta, C[i]->buf, offC[i], ldc, 1, &ctx->q,
                                    num_ev, num_ev == 0 ? NULL : evl, &ev));
    ARRAY_FINI(A[i]);
    ARRAY_FINI(B[i]);
    ARRAY_FINI(C[i]);
    clReleaseEvent(ev);
  }

  return GA_NO_ERROR;
}
/*
 * Batched double-precision GEMM through clBLAS.
 *
 * Issues one clblasDgemm call per batch entry i, using A[i], B[i] and C[i]
 * at offsets offA[i], offB[i] and offC[i].  M, N, K, alpha, beta and the
 * leading dimensions are shared by every entry.  The queue is taken from
 * A[0]'s context.
 *
 * Returns GA_NO_ERROR once every entry has been submitted (an empty batch
 * returns immediately), or GA_BLAS_ERROR as soon as one call does not
 * report clblasSuccess; entries after the failing one are not submitted.
 *
 * NOTE(review): ARRAY_INIT/ARRAY_FINI are macros defined outside this
 * block and may refer to `ev`, `evl` and `num_ev` by name.  num_ev is set
 * to 0 once, before the loop, and is never reset here; if ARRAY_INIT
 * appends to evl (three slots), entries could accumulate across
 * iterations -- confirm against the macro definition.
 */
static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, double alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb,
                      double beta, gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cl_ctx *ctx;
  cl_event evl[3];
  cl_event ev;
  size_t i;
  cl_uint num_ev = 0;
  clblasStatus err;

  /* Nothing to submit for an empty batch; also avoids reading A[0]. */
  if (batchCount == 0)
    return GA_NO_ERROR;

  ctx = A[0]->ctx;

  for (i = 0; i < batchCount; i++) {
    ARRAY_INIT(A[i]);
    ARRAY_INIT(B[i]);
    ARRAY_INIT(C[i]);
    /* C[i] is addressed with its own offset, offC[i] (not offB[i]). */
    err = clblasDgemm(convO(order), convT(transA), convT(transB), M, N, K,
                      alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb,
                      beta, C[i]->buf, offC[i], ldc, 1, &ctx->q,
                      num_ev, num_ev == 0 ? NULL : evl, &ev);
    if (err != clblasSuccess)
      return GA_BLAS_ERROR;
    ARRAY_FINI(A[i]);
    ARRAY_FINI(B[i]);
    ARRAY_FINI(C[i]);
    clReleaseEvent(ev);
  }

  return GA_NO_ERROR;
}
/*
 * Double-precision GEMM through CLBlast.
 *
 * Passes matrices A, B and C (device buffers with offsets and leading
 * dimensions) to CLBlastDgemm on the queue stored in A's context.
 *
 * Returns GA_NO_ERROR when execution reaches the end.  The call is wrapped
 * in CLBT_CHECK, a macro defined outside this block; presumably it makes
 * the function return early with an error recorded through ctx->err when
 * CLBlast fails -- confirm against the macro definition.
 *
 * NOTE(review): ARRAY_INIT/ARRAY_FINI are macros defined outside this
 * block and may refer to the local `ev` by name -- confirm before
 * renaming it.
 */
static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, double alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb, double beta,
                 gpudata *C, size_t offC, size_t ldc) {
  cl_event ev;
  cl_ctx *ctx = A->ctx;

  ARRAY_INIT(A);
  ARRAY_INIT(B);
  ARRAY_INIT(C);

  CLBT_CHECK(ctx->err,
             CLBlastDgemm(convO(order), convT(transA), convT(transB),
                          M, N, K,
                          alpha,
                          A->buf, offA, lda,
                          B->buf, offB, ldb,
                          beta,
                          C->buf, offC, ldc,
                          &ctx->q, &ev));

  ARRAY_FINI(A);
  ARRAY_FINI(B);
  ARRAY_FINI(C);

  /* Drop this function's own reference to the completion event. */
  clReleaseEvent(ev);

  return GA_NO_ERROR;
}
/*
 * Batched double-precision GEMM through CLBlast.
 *
 * Issues one CLBlastDgemm call per batch entry i, using A[i], B[i] and
 * C[i] at offsets offA[i], offB[i] and offC[i].  M, N, K, alpha, beta and
 * the leading dimensions are shared by every entry.  The queue is taken
 * from A[0]'s context.
 *
 * Returns GA_NO_ERROR when execution reaches the end of the loop (an empty
 * batch returns immediately).  Each call is wrapped in CLBT_CHECK, a macro
 * defined outside this block; presumably it makes the function return
 * early with an error recorded through ctx->err when CLBlast fails --
 * confirm against the macro definition.
 *
 * NOTE(review): ARRAY_INIT/ARRAY_FINI are macros defined outside this
 * block and may refer to the local `ev` by name -- confirm before
 * renaming it.
 */
static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, double alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb,
                      double beta, gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cl_ctx *ctx;
  cl_event ev;
  size_t i;

  /* Nothing to submit for an empty batch; also avoids reading A[0]. */
  if (batchCount == 0)
    return GA_NO_ERROR;

  ctx = A[0]->ctx;

  for (i = 0; i < batchCount; i++) {
    ARRAY_INIT(A[i]);
    ARRAY_INIT(B[i]);
    ARRAY_INIT(C[i]);
    CLBT_CHECK(ctx->err, CLBlastDgemm(convO(order), convT(transA),
                                      convT(transB), M, N, K,
                                      alpha, A[i]->buf, offA[i], lda,
                                      B[i]->buf, offB[i], ldb, beta,
                                      C[i]->buf, offC[i], ldc, &ctx->q, &ev));
    ARRAY_FINI(A[i]);
    ARRAY_FINI(B[i]);
    ARRAY_FINI(C[i]);
    clReleaseEvent(ev);
  }

  return GA_NO_ERROR;
}
/*
 * Half-precision GEMV through CLBlast.
 *
 * The float scalars alpha and beta are each passed through float_to_half
 * before being handed to CLBlastHgemv, together with matrix A and vectors
 * X and Y (device buffers with offsets, leading dimension and increments).
 * The queue is the one stored in A's context.
 *
 * Returns GA_NO_ERROR when execution reaches the end.  The call is wrapped
 * in CLBT_CHECK, a macro defined outside this block; presumably it makes
 * the function return early with an error recorded through ctx->err when
 * CLBlast fails -- confirm against the macro definition.
 *
 * NOTE(review): ARRAY_INIT/ARRAY_FINI are macros defined outside this
 * block and may refer to the local `ev` by name -- confirm before
 * renaming it.
 */
static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
                 float alpha, gpudata *A, size_t offA, size_t lda,
                 gpudata *X, size_t offX, int incX, float beta,
                 gpudata *Y, size_t offY, int incY) {
  cl_event ev;
  cl_ctx *ctx = A->ctx;

  ARRAY_INIT(A);
  ARRAY_INIT(X);
  ARRAY_INIT(Y);

  CLBT_CHECK(ctx->err,
             CLBlastHgemv(convO(order), convT(transA), M, N,
                          float_to_half(alpha),
                          A->buf, offA, lda,
                          X->buf, offX, incX,
                          float_to_half(beta),
                          Y->buf, offY, incY,
                          &ctx->q, &ev));

  ARRAY_FINI(A);
  ARRAY_FINI(X);
  ARRAY_FINI(Y);

  /* Drop this function's own reference to the completion event. */
  clReleaseEvent(ev);

  return GA_NO_ERROR;
}
/*
 * Double-precision GER through clBLAS.
 *
 * Passes X, Y and A (device buffers with offsets, increments and leading
 * dimension) to clblasDger using the single queue stored in X's context.
 * The wait list handed to clBLAS is NULL when num_ev is zero and `evl`
 * otherwise.
 *
 * Returns GA_NO_ERROR when execution reaches the end.  The call is wrapped
 * in CLB_CHECK, a macro defined outside this block; presumably it makes
 * the function return early with an error recorded through ctx->err when
 * clBLAS fails -- confirm against the macro definition.
 *
 * NOTE(review): ARRAY_INIT/ARRAY_FINI are macros defined outside this
 * block; nothing here writes `evl` or `num_ev` directly, so presumably
 * ARRAY_INIT fills them and the macros refer to `ev`, `evl` and `num_ev`
 * by name -- confirm before renaming any of them.
 */
static int dger(cb_order order, size_t M, size_t N, double alpha,
                gpudata *X, size_t offX, int incX,
                gpudata *Y, size_t offY, int incY,
                gpudata *A, size_t offA, size_t lda) {
  cl_uint num_ev = 0;
  cl_event evl[3];
  cl_event ev;
  cl_ctx *ctx = X->ctx;

  ARRAY_INIT(X);
  ARRAY_INIT(Y);
  ARRAY_INIT(A);

  CLB_CHECK(ctx->err,
            clblasDger(convO(order), M, N, alpha,
                       X->buf, offX, incX,
                       Y->buf, offY, incY,
                       A->buf, offA, lda,
                       1, &ctx->q,
                       num_ev, num_ev == 0 ? NULL : evl, &ev));

  ARRAY_FINI(X);
  ARRAY_FINI(Y);
  ARRAY_FINI(A);

  /* Drop this function's own reference to the completion event. */
  clReleaseEvent(ev);

  return GA_NO_ERROR;
}
/*
 * Single-precision GEMM through clBLAS.
 *
 * Passes matrices A, B and C (device buffers with offsets and leading
 * dimensions) to clblasSgemm using the single queue stored in A's context.
 * The wait list handed to clBLAS is NULL when num_ev is zero and `evl`
 * otherwise.
 *
 * Returns GA_NO_ERROR when execution reaches the end.  The call is wrapped
 * in CLB_CHECK, a macro defined outside this block; presumably it makes
 * the function return early with an error recorded through ctx->err when
 * clBLAS fails -- confirm against the macro definition.
 *
 * NOTE(review): ARRAY_INIT/ARRAY_FINI are macros defined outside this
 * block; nothing here writes `evl` or `num_ev` directly, so presumably
 * ARRAY_INIT fills them and the macros refer to `ev`, `evl` and `num_ev`
 * by name -- confirm before renaming any of them.
 */
static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, float alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb, float beta,
                 gpudata *C, size_t offC, size_t ldc) {
  cl_event evl[3];
  cl_event ev;
  cl_uint num_ev = 0;
  cl_ctx *ctx = A->ctx;

  ARRAY_INIT(A);
  ARRAY_INIT(B);
  ARRAY_INIT(C);

  CLB_CHECK(ctx->err,
            clblasSgemm(convO(order), convT(transA), convT(transB),
                        M, N, K,
                        alpha,
                        A->buf, offA, lda,
                        B->buf, offB, ldb,
                        beta,
                        C->buf, offC, ldc,
                        1, &ctx->q,
                        num_ev, num_ev == 0 ? NULL : evl, &ev));

  ARRAY_FINI(A);
  ARRAY_FINI(B);
  ARRAY_FINI(C);

  /* Drop this function's own reference to the completion event. */
  clReleaseEvent(ev);

  return GA_NO_ERROR;
}
/*
 * Single-precision GEMV through clBLAS.
 *
 * Passes matrix A and vectors X and Y (device buffers with offsets, leading
 * dimension and increments) to clblasSgemv using the single queue stored
 * in A's context.  The wait list handed to clBLAS is NULL when num_ev is
 * zero and `evl` otherwise.
 *
 * Returns GA_NO_ERROR when execution reaches the end.  The call is wrapped
 * in CLB_CHECK, a macro defined outside this block; presumably it makes
 * the function return early with an error recorded through ctx->err when
 * clBLAS fails -- confirm against the macro definition.
 *
 * NOTE(review): ARRAY_INIT/ARRAY_FINI are macros defined outside this
 * block; nothing here writes `evl` or `num_ev` directly, so presumably
 * ARRAY_INIT fills them and the macros refer to `ev`, `evl` and `num_ev`
 * by name -- confirm before renaming any of them.
 */
static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
                 float alpha, gpudata *A, size_t offA, size_t lda,
                 gpudata *X, size_t offX, int incX, float beta,
                 gpudata *Y, size_t offY, int incY) {
  cl_event evl[3];
  cl_event ev;
  cl_uint num_ev = 0;
  cl_ctx *ctx = A->ctx;

  ARRAY_INIT(A);
  ARRAY_INIT(X);
  ARRAY_INIT(Y);

  CLB_CHECK(ctx->err,
            clblasSgemv(convO(order), convT(transA), M, N, alpha,
                        A->buf, offA, lda,
                        X->buf, offX, incX,
                        beta,
                        Y->buf, offY, incY,
                        1, &ctx->q,
                        num_ev, num_ev == 0 ? NULL : evl, &ev));

  ARRAY_FINI(A);
  ARRAY_FINI(X);
  ARRAY_FINI(Y);

  /* Drop this function's own reference to the completion event. */
  clReleaseEvent(ev);

  return GA_NO_ERROR;
}
/*
 * Double-precision DOT through clBLAS.
 *
 * Allocates a read/write scratch buffer of N doubles through
 * opencl_ops.buffer_alloc, then passes the output buffer Z (at offZ), the
 * input vectors X and Y (with offsets and increments) and the scratch
 * buffer to clblasDdot using the single queue stored in X's context.  The
 * scratch buffer is released right after the call returns, whether or not
 * the call succeeded.
 *
 * Returns:
 *   - ctx->err->code if the scratch allocation returns NULL;
 *   - the result of error_clblas(ctx->err, "clblasDdot", status) if clBLAS
 *     does not report clblasSuccess (ARRAY_FINI and the event release are
 *     skipped in that case);
 *   - GA_NO_ERROR otherwise.
 *
 * NOTE(review): ARRAY_INIT/ARRAY_FINI are macros defined outside this
 * block; nothing here writes `evl` or `num_ev` directly, so presumably
 * ARRAY_INIT fills them and the macros refer to `ev`, `evl` and `num_ev`
 * by name -- confirm before renaming any of them.
 */
static int ddot(
        size_t N,
        gpudata *X, size_t offX, size_t incX,
        gpudata *Y, size_t offY, size_t incY,
        gpudata *Z, size_t offZ) {
  gpudata *scratch;
  cl_event evl[3];
  cl_event ev;
  cl_uint num_ev = 0;
  clblasStatus status;
  cl_ctx *ctx = X->ctx;

  scratch = opencl_ops.buffer_alloc((gpucontext*)ctx, N*sizeof(double),
                                    NULL, GA_BUFFER_READ_WRITE);
  if (scratch == NULL) {
    return ctx->err->code;
  }

  ARRAY_INIT(X);
  ARRAY_INIT(Y);
  ARRAY_INIT(Z);

  status = clblasDdot(N,
                      Z->buf, offZ,
                      X->buf, offX, incX,
                      Y->buf, offY, incY,
                      scratch->buf,
                      1, &ctx->q,
                      num_ev, num_ev == 0 ? NULL : evl, &ev);

  /* Release the scratch buffer before looking at the status. */
  opencl_ops.buffer_release(scratch);

  if (status != clblasSuccess) {
    return error_clblas(ctx->err, "clblasDdot", status);
  }

  ARRAY_FINI(X);
  ARRAY_FINI(Y);
  ARRAY_FINI(Z);

  /* Drop this function's own reference to the completion event. */
  clReleaseEvent(ev);

  return GA_NO_ERROR;
}
/*
 * Double-precision GER through CLBlast.
 *
 * Passes X, Y and A (device buffers with offsets, increments and leading
 * dimension) to CLBlastDger on the queue stored in X's context.
 *
 * Returns GA_NO_ERROR when execution reaches the end.  The call is wrapped
 * in CLBT_CHECK, a macro defined outside this block; presumably it makes
 * the function return early with an error recorded through ctx->err when
 * CLBlast fails -- confirm against the macro definition.
 *
 * NOTE(review): ARRAY_INIT/ARRAY_FINI are macros defined outside this
 * block and may refer to the local `ev` by name -- confirm before
 * renaming it.
 */
static int dger(cb_order order, size_t M, size_t N, double alpha,
                gpudata *X, size_t offX, int incX,
                gpudata *Y, size_t offY, int incY,
                gpudata *A, size_t offA, size_t lda) {
  cl_event ev;
  cl_ctx *ctx = X->ctx;

  ARRAY_INIT(X);
  ARRAY_INIT(Y);
  ARRAY_INIT(A);

  CLBT_CHECK(ctx->err,
             CLBlastDger(convO(order), M, N, alpha,
                         X->buf, offX, incX,
                         Y->buf, offY, incY,
                         A->buf, offA, lda,
                         &ctx->q, &ev));

  ARRAY_FINI(X);
  ARRAY_FINI(Y);
  ARRAY_FINI(A);

  /* Drop this function's own reference to the completion event. */
  clReleaseEvent(ev);

  return GA_NO_ERROR;
}
/*
 * Double-precision DOT through CLBlast.
 *
 * Passes the output buffer Z (at offZ) followed by the input vectors X and
 * Y (with their offsets and increments) to CLBlastDdot on the queue stored
 * in X's context.
 *
 * Returns GA_NO_ERROR when execution reaches the end.  The call is wrapped
 * in CLBT_CHECK, a macro defined outside this block; presumably it makes
 * the function return early with an error recorded through ctx->err when
 * CLBlast fails -- confirm against the macro definition.
 *
 * NOTE(review): ARRAY_INIT/ARRAY_FINI are macros defined outside this
 * block and may refer to the local `ev` by name -- confirm before
 * renaming it.
 */
static int ddot(
        size_t N,
        gpudata *X, size_t offX, size_t incX,
        gpudata *Y, size_t offY, size_t incY,
        gpudata *Z, size_t offZ) {
  cl_event ev;
  cl_ctx *ctx = X->ctx;

  ARRAY_INIT(X);
  ARRAY_INIT(Y);
  ARRAY_INIT(Z);

  CLBT_CHECK(ctx->err,
             CLBlastDdot(N,
                         Z->buf, offZ,
                         X->buf, offX, incX,
                         Y->buf, offY, incY,
                         &ctx->q, &ev));

  ARRAY_FINI(X);
  ARRAY_FINI(Y);
  ARRAY_FINI(Z);

  /* Drop this function's own reference to the completion event. */
  clReleaseEvent(ev);

  return GA_NO_ERROR;
}