/* Double-precision GEMM through clBLAS: C = alpha*op(A)*op(B) + beta*C.
 * Returns GA_NO_ERROR on success, GA_BLAS_ERROR if clBLAS reports failure. */
static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, double alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb, double beta,
                 gpudata *C, size_t offC, size_t ldc) {
  cl_ctx *ctx = A->ctx;
  clblasStatus status;
  cl_uint n_wait = 0;          /* no prior events to wait on */
  cl_event wait_list[3];
  cl_event done;               /* completion event produced by clBLAS */

  ARRAY_INIT(A);
  ARRAY_INIT(B);
  ARRAY_INIT(C);

  status = clblasDgemm(convO(order), convT(transA), convT(transB), M, N, K,
                       alpha, A->buf, offA, lda, B->buf, offB, ldb,
                       beta, C->buf, offC, ldc, 1, &ctx->q,
                       n_wait, n_wait == 0 ? NULL : wait_list, &done);
  if (status != clblasSuccess)
    return GA_BLAS_ERROR;

  ARRAY_FINI(A);
  ARRAY_FINI(B);
  ARRAY_FINI(C);

  clReleaseEvent(done);

  return GA_NO_ERROR;
}
/* Batched single-precision GEMM through clBLAS.  clBLAS exposes no native
 * batched entry point, so each batch member gets its own clblasSgemm call. */
static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, float alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb,
                      float beta, gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cl_ctx *ctx = A[0]->ctx;
  cl_event wait_list[3];
  cl_event done;
  size_t b;
  cl_uint n_wait = 0;          /* no prior events to wait on */

  for (b = 0; b < batchCount; b++) {
    ARRAY_INIT(A[b]);
    ARRAY_INIT(B[b]);
    ARRAY_INIT(C[b]);
    CLB_CHECK(ctx->err, clblasSgemm(convO(order), convT(transA), convT(transB),
                                    M, N, K,
                                    alpha, A[b]->buf, offA[b], lda,
                                    B[b]->buf, offB[b], ldb,
                                    beta, C[b]->buf, offC[b], ldc, 1, &ctx->q,
                                    n_wait, n_wait == 0 ? NULL : wait_list,
                                    &done));
    ARRAY_FINI(A[b]);
    ARRAY_FINI(B[b]);
    ARRAY_FINI(C[b]);
    clReleaseEvent(done);
  }

  return GA_NO_ERROR;
}
/* Double-precision GEMM through CLBlast: C = alpha*op(A)*op(B) + beta*C. */
static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, double alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb, double beta,
                 gpudata *C, size_t offC, size_t ldc) {
  cl_ctx *ctx = A->ctx;
  cl_event done;               /* completion event produced by CLBlast */

  ARRAY_INIT(A);
  ARRAY_INIT(B);
  ARRAY_INIT(C);

  CLBT_CHECK(ctx->err, CLBlastDgemm(convO(order), convT(transA),
                                    convT(transB), M, N, K, alpha,
                                    A->buf, offA, lda,
                                    B->buf, offB, ldb, beta,
                                    C->buf, offC, ldc, &ctx->q, &done));

  ARRAY_FINI(A);
  ARRAY_FINI(B);
  ARRAY_FINI(C);

  clReleaseEvent(done);

  return GA_NO_ERROR;
}
/* Batched double-precision GEMM through clBLAS: one clblasDgemm call per
 * batch entry (clBLAS has no native batched interface).
 *
 * Fix: the C operand's offset was previously taken from offB[i] instead of
 * offC[i], so every output matrix was addressed at the wrong location
 * whenever the two offset arrays differed. */
static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, double alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb,
                      double beta, gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cl_ctx *ctx = A[0]->ctx;
  cl_event evl[3];
  cl_event ev;
  size_t i;
  cl_uint num_ev = 0;          /* no prior events to wait on */
  clblasStatus err;

  for (i = 0; i < batchCount; i++) {
    ARRAY_INIT(A[i]);
    ARRAY_INIT(B[i]);
    ARRAY_INIT(C[i]);
    err = clblasDgemm(convO(order), convT(transA), convT(transB), M, N, K,
                      alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb,
                      beta, C[i]->buf, offC[i], ldc, 1, &ctx->q,
                      num_ev, num_ev == 0 ? NULL : evl, &ev);
    if (err != clblasSuccess)
      return GA_BLAS_ERROR;
    ARRAY_FINI(A[i]);
    ARRAY_FINI(B[i]);
    ARRAY_FINI(C[i]);
    clReleaseEvent(ev);
  }

  return GA_NO_ERROR;
}
/* Batched double-precision GEMM through CLBlast: one CLBlastDgemm call per
 * batch entry. */
static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, double alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb,
                      double beta, gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cl_ctx *ctx = A[0]->ctx;
  cl_event done;               /* completion event produced by CLBlast */
  size_t b;

  for (b = 0; b < batchCount; b++) {
    ARRAY_INIT(A[b]);
    ARRAY_INIT(B[b]);
    ARRAY_INIT(C[b]);
    CLBT_CHECK(ctx->err, CLBlastDgemm(convO(order), convT(transA),
                                      convT(transB), M, N, K,
                                      alpha, A[b]->buf, offA[b], lda,
                                      B[b]->buf, offB[b], ldb, beta,
                                      C[b]->buf, offC[b], ldc, &ctx->q,
                                      &done));
    ARRAY_FINI(A[b]);
    ARRAY_FINI(B[b]);
    ARRAY_FINI(C[b]);
    clReleaseEvent(done);
  }

  return GA_NO_ERROR;
}
/* Batched single-precision GEMM through CLBlast: one CLBlastSgemm call per
 * batch entry.
 *
 * Fix: the C operand's offset was previously taken from offB[i] instead of
 * offC[i], so every output matrix was addressed at the wrong location
 * whenever the two offset arrays differed. */
static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, float alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb,
                      float beta, gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cl_ctx *ctx = A[0]->ctx;
  cl_event ev;
  size_t i;
  StatusCode err;

  for (i = 0; i < batchCount; i++) {
    ARRAY_INIT(A[i]);
    ARRAY_INIT(B[i]);
    ARRAY_INIT(C[i]);
    err = CLBlastSgemm(convO(order), convT(transA), convT(transB), M, N, K,
                      alpha, A[i]->buf, offA[i], lda, B[i]->buf, offB[i], ldb,
                      beta, C[i]->buf, offC[i], ldc, &ctx->q, &ev);
    if (err != kSuccess)
      return GA_BLAS_ERROR;
    ARRAY_FINI(A[i]);
    ARRAY_FINI(B[i]);
    ARRAY_FINI(C[i]);
    clReleaseEvent(ev);
  }

  return GA_NO_ERROR;
}
/* Single-precision GEMM through clBLAS: C = alpha*op(A)*op(B) + beta*C. */
static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, float alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb, float beta,
                 gpudata *C, size_t offC, size_t ldc) {
  cl_ctx *ctx = A->ctx;
  cl_uint n_wait = 0;          /* no prior events to wait on */
  cl_event wait_list[3];
  cl_event done;               /* completion event produced by clBLAS */

  ARRAY_INIT(A);
  ARRAY_INIT(B);
  ARRAY_INIT(C);

  CLB_CHECK(ctx->err, clblasSgemm(convO(order), convT(transA), convT(transB),
                                  M, N, K,
                                  alpha, A->buf, offA, lda, B->buf, offB, ldb,
                                  beta, C->buf, offC, ldc, 1, &ctx->q,
                                  n_wait, n_wait == 0 ? NULL : wait_list,
                                  &done));

  ARRAY_FINI(A);
  ARRAY_FINI(B);
  ARRAY_FINI(C);

  clReleaseEvent(done);

  return GA_NO_ERROR;
}
/* Single-precision GEMM through CLBlast: C = alpha*op(A)*op(B) + beta*C. */
static int sgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, float alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb, float beta,
                 gpudata *C, size_t offC, size_t ldc) {
  cl_ctx *ctx = A->ctx;
  StatusCode status;
  cl_event done;               /* completion event produced by CLBlast */

  ARRAY_INIT(A);
  ARRAY_INIT(B);
  ARRAY_INIT(C);

  status = CLBlastSgemm(convO(order), convT(transA), convT(transB), M, N, K,
                        alpha, A->buf, offA, lda, B->buf, offB, ldb,
                        beta, C->buf, offC, ldc, &ctx->q, &done);
  if (status != kSuccess)
    return GA_BLAS_ERROR;

  ARRAY_FINI(A);
  ARRAY_FINI(B);
  ARRAY_FINI(C);

  clReleaseEvent(done);

  return GA_NO_ERROR;
}
/* Double-precision GEMM on CUDA via cuBLAS:
 *   C = alpha * op(A) * op(B) + beta * C.
 * Returns GA_NO_ERROR, GA_DEVSUP_ERROR on an arch mismatch, or
 * GA_BLAS_ERROR on any other cuBLAS failure. */
static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, double alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb,
                 double beta, gpudata *C, size_t offC, size_t ldc) {
  cuda_context *ctx = A->ctx;
  gpudata *swap_buf;
  size_t swap_sz;
  cublasStatus_t status;
  cb_transpose swap_trans;

  ASSERT_BUF(A);
  ASSERT_BUF(B);
  ASSERT_BUF(C);

  if (order == cb_c) {
    /* cuBLAS is column-major; serve a cb_c request by swapping the A/B
     * operands together with M/N, lda/ldb, offsets and transpose flags. */
    swap_sz = N;
    N = M;
    M = swap_sz;
    swap_buf = A;
    A = B;
    B = swap_buf;
    swap_sz = lda;
    lda = ldb;
    ldb = swap_sz;
    swap_trans = transA;
    transA = transB;
    transB = swap_trans;
    swap_sz = offA;
    offA = offB;
    offB = swap_sz;
  }

  cuda_enter(ctx);

  /* Order the cuBLAS work after any pending operations on the operands. */
  cuda_wait(A, CUDA_WAIT_READ);
  cuda_wait(B, CUDA_WAIT_READ);
  cuda_wait(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  status = cublasDgemm(((blas_handle *)ctx->blas_handle)->h,
                       convT(transA), convT(transB), M, N, K,
                       &alpha, ((double *)A->ptr) + offA, lda,
                       ((double *)B->ptr) + offB, ldb, &beta,
                       ((double *)C->ptr) + offC, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    return status == CUBLAS_STATUS_ARCH_MISMATCH ?
      GA_DEVSUP_ERROR : GA_BLAS_ERROR;
  }

  cuda_record(A, CUDA_WAIT_READ);
  cuda_record(B, CUDA_WAIT_READ);
  cuda_record(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
/* Half-precision GEMV through CLBlast.  The float alpha/beta scalars are
 * converted to half on the host before the call. */
static int hgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
                 float alpha, gpudata *A, size_t offA, size_t lda,
                 gpudata *X, size_t offX, int incX, float beta,
                 gpudata *Y, size_t offY, int incY) {
  cl_ctx *ctx = A->ctx;
  cl_event done;               /* completion event produced by CLBlast */

  ARRAY_INIT(A);
  ARRAY_INIT(X);
  ARRAY_INIT(Y);

  CLBT_CHECK(ctx->err, CLBlastHgemv(convO(order), convT(transA), M, N,
                                    float_to_half(alpha),
                                    A->buf, offA, lda,
                                    X->buf, offX, incX,
                                    float_to_half(beta),
                                    Y->buf, offY, incY, &ctx->q, &done));

  ARRAY_FINI(A);
  ARRAY_FINI(X);
  ARRAY_FINI(Y);

  clReleaseEvent(done);

  return GA_NO_ERROR;
}
/* Double-precision GEMV through clBLAS: Y = alpha*op(A)*X + beta*Y. */
static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
                 double alpha, gpudata *A, size_t offA, size_t lda,
                 gpudata *X, size_t offX, int incX, double beta,
                 gpudata *Y, size_t offY, int incY) {
  cl_ctx *ctx = A->ctx;
  cl_uint n_wait = 0;          /* no prior events to wait on */
  cl_event wait_list[3];
  cl_event done;               /* completion event produced by clBLAS */

  ARRAY_INIT(A);
  ARRAY_INIT(X);
  ARRAY_INIT(Y);

  CLB_CHECK(ctx->err, clblasDgemv(convO(order), convT(transA), M, N, alpha,
                                  A->buf, offA, lda, X->buf, offX, incX,
                                  beta, Y->buf, offY, incY, 1, &ctx->q,
                                  n_wait, n_wait == 0 ? NULL : wait_list,
                                  &done));

  ARRAY_FINI(A);
  ARRAY_FINI(X);
  ARRAY_FINI(Y);

  clReleaseEvent(done);

  return GA_NO_ERROR;
}
/* Single-precision GEMV through clBLAS: Y = alpha*op(A)*X + beta*Y. */
static int sgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
                 float alpha, gpudata *A, size_t offA, size_t lda,
                 gpudata *X, size_t offX, int incX, float beta,
                 gpudata *Y, size_t offY, int incY) {
  cl_ctx *ctx = A->ctx;
  clblasStatus status;
  cl_uint n_wait = 0;          /* no prior events to wait on */
  cl_event wait_list[3];
  cl_event done;               /* completion event produced by clBLAS */

  ARRAY_INIT(A);
  ARRAY_INIT(X);
  ARRAY_INIT(Y);

  status = clblasSgemv(convO(order), convT(transA), M, N, alpha,
                       A->buf, offA, lda, X->buf, offX, incX,
                       beta, Y->buf, offY, incY, 1, &ctx->q,
                       n_wait, n_wait == 0 ? NULL : wait_list, &done);
  if (status != clblasSuccess)
    return GA_BLAS_ERROR;

  ARRAY_FINI(A);
  ARRAY_FINI(X);
  ARRAY_FINI(Y);

  clReleaseEvent(done);

  return GA_NO_ERROR;
}
/* Double-precision GEMV through CLBlast: Y = alpha*op(A)*X + beta*Y. */
static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
                 double alpha, gpudata *A, size_t offA, size_t lda,
                 gpudata *X, size_t offX, int incX, double beta,
                 gpudata *Y, size_t offY, int incY) {
  cl_ctx *ctx = A->ctx;
  StatusCode status;
  cl_event done;               /* completion event produced by CLBlast */

  ARRAY_INIT(A);
  ARRAY_INIT(X);
  ARRAY_INIT(Y);

  status = CLBlastDgemv(convO(order), convT(transA), M, N, alpha,
                        A->buf, offA, lda, X->buf, offX, incX,
                        beta, Y->buf, offY, incY, &ctx->q, &done);
  if (status != kSuccess)
    return GA_BLAS_ERROR;

  ARRAY_FINI(A);
  ARRAY_FINI(X);
  ARRAY_FINI(Y);

  clReleaseEvent(done);

  return GA_NO_ERROR;
}
/* Double-precision GEMV on CUDA via cuBLAS: Y = alpha*op(A)*X + beta*Y.
 * Rejects dimensions that overflow cuBLAS's parameter range with
 * GA_XLARGE_ERROR; otherwise returns GA_NO_ERROR, GA_DEVSUP_ERROR on an
 * arch mismatch, or GA_BLAS_ERROR. */
static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
                 double alpha, gpudata *A, size_t offA, size_t lda,
                 gpudata *X, size_t offX, int incX,
                 double beta, gpudata *Y, size_t offY, int incY) {
  cuda_context *ctx = A->ctx;
  blas_handle *handle = (blas_handle *)ctx->blas_handle;
  size_t swap;

  ASSERT_BUF(A);
  ASSERT_BUF(X);
  ASSERT_BUF(Y);

  /* Refuse sizes the backend cannot represent. */
  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) ||
      LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY))
    return GA_XLARGE_ERROR;

  if (order == cb_c) {
    /* cuBLAS is column-major: a cb_c matrix is handled by flipping the
     * transpose flag and exchanging M and N. */
    swap = N;
    N = M;
    M = swap;
    transA = (transA == cb_no_trans) ? cb_trans : cb_no_trans;
  }

  cuda_enter(ctx);

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_ALL));

  handle->err = cublasDgemv(handle->h,
                            convT(transA), M, N, &alpha,
                            ((double *)A->ptr) + offA, lda,
                            ((double *)X->ptr) + offX, incX,
                            &beta, ((double *)Y->ptr) + offY, incY);
  if (handle->err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    return handle->err == CUBLAS_STATUS_ARCH_MISMATCH ?
      GA_DEVSUP_ERROR : GA_BLAS_ERROR;
  }

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_ALL));

  cuda_exit(ctx);

  return GA_NO_ERROR;
}
/* Double-precision GEMV on CUDA via cuBLAS: Y = alpha*op(A)*X + beta*Y. */
static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
                 double alpha, gpudata *A, size_t offA, size_t lda,
                 gpudata *X, size_t offX, int incX,
                 double beta, gpudata *Y, size_t offY, int incY) {
  cuda_context *ctx = A->ctx;
  cublasStatus_t status;
  size_t swap;

  ASSERT_BUF(A);
  ASSERT_BUF(X);
  ASSERT_BUF(Y);

  if (order == cb_c) {
    /* cuBLAS is column-major: a cb_c matrix is handled by flipping the
     * transpose flag and exchanging M and N. */
    swap = N;
    N = M;
    M = swap;
    transA = (transA == cb_no_trans) ? cb_trans : cb_no_trans;
  }

  cuda_enter(ctx);

  /* Order the cuBLAS work after any pending operations on the operands. */
  cuda_wait(A, CUDA_WAIT_READ);
  cuda_wait(X, CUDA_WAIT_READ);
  cuda_wait(Y, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  status = cublasDgemv(((blas_handle *)ctx->blas_handle)->h,
                       convT(transA), M, N, &alpha,
                       ((double *)A->ptr) + offA, lda,
                       ((double *)X->ptr) + offX, incX,
                       &beta, ((double *)Y->ptr) + offY, incY);
  if (status != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    return status == CUBLAS_STATUS_ARCH_MISMATCH ?
      GA_DEVSUP_ERROR : GA_BLAS_ERROR;
  }

  cuda_record(A, CUDA_WAIT_READ);
  cuda_record(X, CUDA_WAIT_READ);
  cuda_record(Y, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  cuda_exit(ctx);

  return GA_NO_ERROR;
}
/*
 * Batched double-precision GEMM on CUDA:
 *   C[i] = alpha * op(A[i]) * op(B[i]) + beta * C[i]   for i < batchCount.
 *
 * All entries share the same dimensions, leading dimensions and transpose
 * flags; only the buffers and offsets vary per entry.  Two dispatch paths:
 * for large products, one cublasDgemm call per entry; otherwise a single
 * cublasDgemmBatched call with the per-entry pointer arrays staged on the
 * device.  Returns a GA_* error code (GA_NO_ERROR on success).
 */
static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, double alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb,
                      double beta, gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cuda_context *ctx;
  size_t *lt, t;
  gpudata **T;
  size_t i;
  cb_transpose transT;
  cublasStatus_t err;

  /* Empty batch: nothing to launch. */
  if (batchCount == 0) return GA_NO_ERROR;

  ASSERT_BUF(A[0]);
  ctx = A[0]->ctx;
  cuda_enter(ctx);

  if (order == cb_c) {
    /* swap A and B */
    /* cuBLAS is column-major; a cb_c request is served by swapping the
     * A/B operands along with M/N, lda/ldb, the transpose flags and the
     * offset arrays. */
    t = N;
    N = M;
    M = t;
    T = A;
    A = B;
    B = T;
    t = lda;
    lda = ldb;
    ldb = t;
    transT = transA;
    transA = transB;
    transB = transT;
    lt = offA;
    offA = offB;
    offB = lt;
  }

  // use parallel cublasSgemm calls rather than cublasSgemmBatched for large products
  const size_t threshold = 650;
  const int multiple_dispatch = M * N * K > threshold * threshold * threshold;
  if (multiple_dispatch) {
    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      /* Order the cuBLAS work after pending operations on each operand. */
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(B[i], CUDA_WAIT_READ);
      cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);

      err = cublasDgemm(((blas_handle *)ctx->blas_handle)->h,
                        convT(transA), convT(transB),
                        M, N, K, &alpha,
                        (double*)A[i]->ptr + offA[i], lda,
                        (double*)B[i]->ptr + offB[i], ldb,
                        &beta,
                        (double*)C[i]->ptr + offC[i], ldc);
      if (err != CUBLAS_STATUS_SUCCESS) {
        cuda_exit(ctx);
        if (err == CUBLAS_STATUS_ARCH_MISMATCH)
          return GA_DEVSUP_ERROR;
        return GA_BLAS_ERROR;
      }

      cuda_record(A[i], CUDA_WAIT_READ);
      cuda_record(B[i], CUDA_WAIT_READ);
      cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    }
  } else {
    /* Host-side staging area holding the A, B and C pointer lists packed
     * back to back in a single allocation.
     * NOTE(review): alloca is unbounded — a very large batchCount could
     * overflow the stack; confirm callers bound batchCount. */
    double **T_l = alloca(sizeof(double *) * batchCount * 3);
    const double **A_l = (const double **)T_l;
    const double **B_l = (const double **)T_l + batchCount;
    double **C_l = T_l + (batchCount * 2);
    CUdeviceptr Ta, Aa, Ba, Ca;

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(B[i], CUDA_WAIT_READ);
      cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      A_l[i] = ((double *)A[i]->ptr) + offA[i];
      B_l[i] = ((double *)B[i]->ptr) + offB[i];
      C_l[i] = ((double *)C[i]->ptr) + offC[i];
    }

    /* Copy the pointer lists to the device; cublasDgemmBatched takes
     * device-resident pointer arrays.
     * NOTE(review): the cuMemAlloc/cuMemcpyHtoD return codes are not
     * checked — a failed allocation would be handed to cuBLAS unnoticed. */
    cuMemAlloc(&Ta, sizeof(double *) * batchCount * 3);
    Aa = Ta;
    Ba = Ta + (batchCount * sizeof(double *));
    Ca = Ta + (batchCount * sizeof(double *) * 2);

    cuMemcpyHtoD(Ta, T_l, sizeof(double *) * batchCount * 3);

    err = cublasDgemmBatched(((blas_handle *)ctx->blas_handle)->h,
                             convT(transA), convT(transB),
                             M, N, K, &alpha, (const double **)Aa, lda,
                             (const double **)Ba, ldb, &beta,
                             (double **)Ca, ldc, batchCount);
    cuMemFree(Ta);
    if (err != CUBLAS_STATUS_SUCCESS) {
      cuda_exit(ctx);
      if (err == CUBLAS_STATUS_ARCH_MISMATCH)
        return GA_DEVSUP_ERROR;
      return GA_BLAS_ERROR;
    }

    for (i = 0; i < batchCount; i++) {
      cuda_record(A[i], CUDA_WAIT_READ);
      cuda_record(B[i], CUDA_WAIT_READ);
      cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    }
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
/* Half-precision GEMM on CUDA.  Computes in float32 via cublasSgemmEx with
 * half-typed operands; compiled out (GA_DEVSUP_ERROR) when the cuBLAS build
 * lacks SgemmEx support. */
static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, float alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb,
                 float beta, gpudata *C, size_t offC, size_t ldc) {
#ifdef HAVE_CUBLAS_SGEMMEX
  /* This will use float32 for computation as it's the best we can
   * have right now. In the future when native float16 support will be
   * there we will switch to that. */
  cuda_context *ctx = A->ctx;
  gpudata *swap_buf;
  size_t swap_sz;
  cublasStatus_t status;
  cb_transpose swap_trans;

  ASSERT_BUF(A);
  ASSERT_BUF(B);
  ASSERT_BUF(C);

  if (order == cb_c) {
    /* cuBLAS is column-major; serve a cb_c request by swapping the A/B
     * operands together with M/N, lda/ldb, offsets and transpose flags. */
    swap_sz = N;
    N = M;
    M = swap_sz;
    swap_buf = A;
    A = B;
    B = swap_buf;
    swap_sz = lda;
    lda = ldb;
    ldb = swap_sz;
    swap_trans = transA;
    transA = transB;
    transB = swap_trans;
    swap_sz = offA;
    offA = offB;
    offB = swap_sz;
  }

  cuda_enter(ctx);

  cuda_wait(A, CUDA_WAIT_READ);
  cuda_wait(B, CUDA_WAIT_READ);
  cuda_wait(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  status = cublasSgemmEx(((blas_handle *)ctx->blas_handle)->h,
                         convT(transA), convT(transB), M, N, K,
                         &alpha,
                         ((uint16_t *)A->ptr) + offA, CUBLAS_DATA_HALF, lda,
                         ((uint16_t *)B->ptr) + offB, CUBLAS_DATA_HALF, ldb,
                         &beta,
                         ((uint16_t *)C->ptr) + offC, CUBLAS_DATA_HALF, ldc);
  if (status != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    return status == CUBLAS_STATUS_ARCH_MISMATCH ?
      GA_DEVSUP_ERROR : GA_BLAS_ERROR;
  }

  cuda_record(A, CUDA_WAIT_READ);
  cuda_record(B, CUDA_WAIT_READ);
  cuda_record(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  cuda_exit(ctx);
  return GA_NO_ERROR;
#else
  return GA_DEVSUP_ERROR;
#endif
}
/*
 * Batched single-precision GEMM on CUDA:
 *   C[i] = alpha * op(A[i]) * op(B[i]) + beta * C[i]   for i < batchCount.
 *
 * All entries share the same dimensions, leading dimensions and transpose
 * flags; only the buffers and offsets vary per entry.  Two dispatch paths:
 * for large products, one cublasSgemm call per entry; otherwise a single
 * cublasSgemmBatched call with the per-entry pointer arrays staged on the
 * device.  Returns a GA_* error code (GA_NO_ERROR on success).
 */
static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, float alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb,
                      float beta, gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cuda_context *ctx;
  blas_handle *h;
  size_t *lt, t;
  gpudata **T;
  size_t i;
  const size_t threshold = 650;
  cb_transpose transT;

  /* Empty batch: nothing to launch. */
  if (batchCount == 0) return GA_NO_ERROR;

  /* Refuse sizes the backend cannot represent. */
  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) ||
      LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) ||
      LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N))
    return GA_XLARGE_ERROR;

  ASSERT_BUF(A[0]);
  ctx = A[0]->ctx;
  h = (blas_handle *)ctx->blas_handle;
  cuda_enter(ctx);

  if (order == cb_c) {
    /* swap A and B */
    /* cuBLAS is column-major; a cb_c request is served by swapping the
     * A/B operands along with M/N, lda/ldb, the transpose flags and the
     * offset arrays. */
    t = N;
    N = M;
    M = t;
    T = A;
    A = B;
    B = T;
    t = lda;
    lda = ldb;
    ldb = t;
    transT = transA;
    transA = transB;
    transB = transT;
    lt = offA;
    offA = offB;
    offB = lt;
  }

  /* use parallel cublasSgemm calls rather than cublasSgemmBatched for
   * large products */
  if (M * N * K > threshold * threshold * threshold) {
    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      /* Order the cuBLAS work after pending operations on each operand. */
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C[i], CUDA_WAIT_ALL));

      h->err = cublasSgemm(h->h,
                           convT(transA), convT(transB),
                           M, N, K, &alpha,
                           (float*)A[i]->ptr + offA[i], lda,
                           (float*)B[i]->ptr + offB[i], ldb,
                           &beta,
                           (float*)C[i]->ptr + offC[i], ldc);
      if (h->err != CUBLAS_STATUS_SUCCESS) {
        cuda_exit(ctx);
        if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
          return GA_DEVSUP_ERROR;
        return GA_BLAS_ERROR;
      }

      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C[i], CUDA_WAIT_ALL));
    }
  } else {
    /* Host-side staging area holding the A, B and C pointer lists packed
     * back to back in a single allocation.
     * NOTE(review): alloca is unbounded — a very large batchCount could
     * overflow the stack; confirm callers bound batchCount. */
    float **T_l = alloca(sizeof(float *) * batchCount * 3);
    const float **A_l = (const float **)T_l;
    const float **B_l = (const float **)T_l + batchCount;
    float **C_l = T_l + (batchCount * 2);
    CUdeviceptr Ta, Aa, Ba, Ca;

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C[i], CUDA_WAIT_ALL));
      A_l[i] = ((float *)A[i]->ptr) + offA[i];
      B_l[i] = ((float *)B[i]->ptr) + offB[i];
      C_l[i] = ((float *)C[i]->ptr) + offC[i];
    }

    /* Copy the pointer lists to the device; cublasSgemmBatched takes
     * device-resident pointer arrays.
     * NOTE(review): the cuMemAlloc/cuMemcpyHtoD return codes are not
     * checked — a failed allocation would be handed to cuBLAS unnoticed. */
    cuMemAlloc(&Ta, sizeof(float *) * batchCount * 3);
    Aa = Ta;
    Ba = Ta + (batchCount * sizeof(float *));
    Ca = Ta + (batchCount * sizeof(float *) * 2);

    cuMemcpyHtoD(Ta, T_l, sizeof(float *) * batchCount * 3);

    h->err = cublasSgemmBatched(h->h,
                                convT(transA), convT(transB),
                                M, N, K, &alpha,
                                (const float **)Aa, lda,
                                (const float **)Ba, ldb, &beta,
                                (float **)Ca, ldc, batchCount);
    cuMemFree(Ta);
    if (h->err != CUBLAS_STATUS_SUCCESS) {
      cuda_exit(ctx);
      if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
        return GA_DEVSUP_ERROR;
      return GA_BLAS_ERROR;
    }

    for (i = 0; i < batchCount; i++) {
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C[i], CUDA_WAIT_ALL));
    }
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
/* Double-precision GEMM on CUDA via cuBLAS:
 *   C = alpha * op(A) * op(B) + beta * C.
 * Rejects dimensions outside cuBLAS's parameter range with GA_XLARGE_ERROR;
 * otherwise returns GA_NO_ERROR, GA_DEVSUP_ERROR on an arch mismatch, or
 * GA_BLAS_ERROR. */
static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, double alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb,
                 double beta, gpudata *C, size_t offC, size_t ldc) {
  cuda_context *ctx = A->ctx;
  blas_handle *handle = (blas_handle *)ctx->blas_handle;
  gpudata *swap_buf;
  size_t swap_sz;
  cb_transpose swap_trans;

  ASSERT_BUF(A);
  ASSERT_BUF(B);
  ASSERT_BUF(C);

  /* Refuse sizes the backend cannot represent. */
  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) ||
      LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) ||
      LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N))
    return GA_XLARGE_ERROR;

  if (order == cb_c) {
    /* cuBLAS is column-major; serve a cb_c request by swapping the A/B
     * operands together with M/N, lda/ldb, offsets and transpose flags. */
    swap_sz = N;
    N = M;
    M = swap_sz;
    swap_buf = A;
    A = B;
    B = swap_buf;
    swap_sz = lda;
    lda = ldb;
    ldb = swap_sz;
    swap_trans = transA;
    transA = transB;
    transB = swap_trans;
    swap_sz = offA;
    offA = offB;
    offB = swap_sz;
  }

  cuda_enter(ctx);

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL));

  handle->err = cublasDgemm(handle->h,
                            convT(transA), convT(transB), M, N, K,
                            &alpha, ((double *)A->ptr) + offA, lda,
                            ((double *)B->ptr) + offB, ldb, &beta,
                            ((double *)C->ptr) + offC, ldc);
  if (handle->err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    return handle->err == CUBLAS_STATUS_ARCH_MISMATCH ?
      GA_DEVSUP_ERROR : GA_BLAS_ERROR;
  }

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C, CUDA_WAIT_ALL));

  cuda_exit(ctx);
  return GA_NO_ERROR;
}