static int ddot(
        size_t N,
        gpudata *X, size_t offX, size_t incX,
        gpudata *Y, size_t offY, size_t incY,
        gpudata *Z, size_t offZ) {
  cl_ctx *ctx = X->ctx;
  clblasStatus err;
  cl_uint num_ev = 0;
  cl_event evl[3];
  cl_event ev;
  gpudata *wbuf;

  wbuf = opencl_ops.buffer_alloc((gpucontext*)ctx,
                                 N*sizeof(double), NULL, GA_BUFFER_READ_WRITE);
  if (wbuf == NULL)
      return ctx->err->code;

  ARRAY_INIT(X);
  ARRAY_INIT(Y);
  ARRAY_INIT(Z);

  err = clblasDdot(
          N, Z->buf, offZ,
          X->buf, offX, incX,
          Y->buf, offY, incY,
          wbuf->buf, 1, &ctx->q,
          num_ev, num_ev ? evl : NULL, &ev);
  opencl_ops.buffer_release(wbuf);
  if (err != clblasSuccess)
    return error_clblas(ctx->err, "clblasDdot", err);

  ARRAY_FINI(X);
  ARRAY_FINI(Y);
  ARRAY_FINI(Z);

  clReleaseEvent(ev);

  return GA_NO_ERROR;
}
static int dgemvBatch(cb_order order, cb_transpose transA,
                      size_t M, size_t N, double alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **x, size_t *offX, size_t incX,
                      double beta, gpudata **y, size_t *offY, size_t incY,
                      size_t batchCount, int flags) {
  cuda_context *ctx;
  size_t t, i;
  size_t ls[2], gs[2];
  void *args[9];
  gpudata *Aa, *xa, *ya;
  int err;

  if (flags != 0) return GA_INVALID_ERROR;
  if (batchCount == 0) return GA_NO_ERROR;

  if (alpha != 1.0 || beta != 1.0) return GA_UNSUPPORTED_ERROR;

  if (M < 512) {
    ls[0] = 32;
    if (batchCount > 16)
      ls[1] = 16;
    else
      ls[1] = batchCount;
  } else {
    ls[0] = 512;
    ls[1] = 1;
  }
  gs[0] = (M + ls[0] - 1) / ls[0];
  gs[1] = (batchCount + ls[1] - 1) / ls[1];
  if (gs[0] * gs[1] / 65535) {
    gs[1] = (65535 / gs[0]);
  }

  if (order == cb_c) {
    t = N;
    N = M;
    M = t;
    if (transA == cb_no_trans) {
      transA = cb_trans;
    } else {
      transA = cb_no_trans;
    }
  }

  ASSERT_BUF(A[0]);

  ctx = A[0]->ctx;

  cuda_enter(ctx);

  {
    double **T_l = alloca(sizeof(double *) * batchCount * 3);
    const double **A_l = (const double **)T_l;
    const double **x_l = (const double **)T_l + batchCount;
    double **y_l = T_l + (batchCount * 2);

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(x[i]);
      ASSERT_BUF(y[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(x[i], CUDA_WAIT_READ);
      cuda_wait(y[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      A_l[i] = (double *)(A[i]->ptr + offA[i]);
      x_l[i] = (double *)(x[i]->ptr + offX[i]);
      y_l[i] = (double *)(y[i]->ptr + offY[i]);
    }

    Aa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, A_l,
                               GA_BUFFER_INIT, &err);
    if (Aa == NULL)
      return err;
    xa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, x_l,
                               GA_BUFFER_INIT, &err);
    if (xa == NULL) {
      cuda_ops.buffer_release(Aa);
      return err;
    }
    ya = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, y_l,
                               GA_BUFFER_INIT, &err);
    if (ya == NULL) {
      cuda_ops.buffer_release(Aa);
      cuda_ops.buffer_release(xa);
      return err;
    }
  }

  args[0] = Aa;
  args[1] = &lda;
  args[2] = xa;
  args[3] = &incX;
  args[4] = ya;
  args[5] = &incY;
  args[6] = &batchCount;
  args[7] = &M;
  args[8] = &N;

  if (transA == cb_no_trans) {
    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small, 2, ls, gs, 0, args);
  } else {
    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small, 2, ls, gs, 0, args);
  }

  cuda_ops.buffer_release(Aa);
  cuda_ops.buffer_release(xa);
  cuda_ops.buffer_release(ya);

  if (err != GA_NO_ERROR) {
    cuda_exit(ctx);
    return err;
  }

  for (i = 0; i < batchCount; i++) {
    cuda_record(A[i], CUDA_WAIT_READ);
    cuda_record(x[i], CUDA_WAIT_READ);
    cuda_record(y[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
static int dgerBatch(cb_order order, size_t M, size_t N, double alpha,
                     gpudata **x, size_t *offX, size_t incX,
                     gpudata **y, size_t *offY, size_t incY,
                     gpudata **A, size_t *offA, size_t lda,
                     size_t batchCount, int flags) {
  cuda_context *ctx;
  size_t t, *tp, i;
  size_t ls[3] = {M, N, 1}, gs[3] = {1, 1, batchCount};
  void *args[10];
  gpudata **T;
  gpudata *Aa, *xa, *ya;
  int err;

  if (flags != 0) return GA_INVALID_ERROR;
  if (batchCount == 0) return GA_NO_ERROR;

  if (incX == 1) {
    if (ls[0] > 32) {
      gs[0] = (ls[0] + 31) / 32;
      ls[0] = 32;
    }
    if (ls[0] * ls[1] > 512) {
      gs[1] = (ls[1] + 15) / 16;
      ls[1] = 16;
    }
  } else {
    if (ls[1] > 32) {
      gs[1] = (ls[1] + 31) / 32;
      ls[1] = 32;
    }
    if (ls[0] * ls[1] > 512) {
      gs[0] = (ls[0] + 15) / 16;
      ls[0] = 16;
    }
  }
  if (gs[0] * gs[1] * gs[2] > 65535) {
    if (gs[0] * gs[1] > 65535)
      return GA_VALUE_ERROR;
    gs[2] = (65535 / (gs[0] * gs[1]));
  }

  if (order == cb_c) {
    t = M;
    M = N;
    N = t;
    tp = offX;
    offX = offY;
    offY = tp;
    t = incX;
    incX = incY;
    incY = t;
    T = x;
    x = y;
    y = T;
  }

  ASSERT_BUF(x[0]);

  ctx = x[0]->ctx;

  cuda_enter(ctx);

  {
    double **T_l = alloca(sizeof(double *) * batchCount * 3);
    const double **A_l = (const double **)T_l;
    const double **x_l = (const double **)T_l + batchCount;
    double **y_l = T_l + (batchCount * 2);

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(x[i]);
      ASSERT_BUF(y[i]);
      cuda_wait(A[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      cuda_wait(x[i], CUDA_WAIT_READ);
      cuda_wait(y[i], CUDA_WAIT_READ);
      A_l[i] = (double *)(A[i]->ptr + offA[i]);
      x_l[i] = (double *)(x[i]->ptr + offX[i]);
      y_l[i] = (double *)(y[i]->ptr + offY[i]);
    }

    Aa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, A_l,
                               GA_BUFFER_INIT, &err);
    if (Aa == NULL)
      return err;
    xa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, x_l,
                               GA_BUFFER_INIT, &err);
    if (xa == NULL) {
      cuda_ops.buffer_release(Aa);
      return err;
    }
    ya = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, y_l,
                               GA_BUFFER_INIT, &err);
    if (ya == NULL) {
      cuda_ops.buffer_release(Aa);
      cuda_ops.buffer_release(xa);
      return err;
    }
  }

  args[0] = xa;
  args[1] = &incX;
  args[2] = ya;
  args[3] = &incY;
  args[4] = &alpha;
  args[5] = Aa;
  args[6] = &lda;
  args[7] = &batchCount;
  args[8] = &M;
  args[9] = &N;

  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, ls, gs, 0, args);

  cuda_ops.buffer_release(Aa);
  cuda_ops.buffer_release(xa);
  cuda_ops.buffer_release(ya);

  if (err != GA_NO_ERROR) {
    cuda_exit(ctx);
    return err;
  }


  for (i = 0; i < batchCount; i++) {
    cuda_record(A[i], CUDA_WAIT_READ);
    cuda_record(x[i], CUDA_WAIT_READ);
    cuda_record(y[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}