Example #1
int APPLY_SPECIFIC(tstgpueye)(PyArrayObject *n, PyArrayObject *m,
                              PyGpuArrayObject **z, PyGpuContextObject *ctx) {
  size_t dims[2] = {0, 0};
  size_t ls, gs;
  void *args[3];
  int err;

  dims[0] = ((DTYPE_INPUT_0 *)PyArray_DATA(n))[0];
  dims[1] = ((DTYPE_INPUT_1 *)PyArray_DATA(m))[0];

  Py_XDECREF(*z);
  *z = pygpu_zeros(2, dims,
                   TYPECODE,
                   GA_C_ORDER,
                   ctx, Py_None);
  if (*z == NULL)
    return -1;

  args[0] = (*z)->ga.data;
  args[1] = &dims[0];
  args[2] = &dims[1];
  ls = 1;
  gs = 256;
  /* The k_eye name comes from the kernel declaration above. */
  err = GpuKernel_call(&k_eye, 1, &ls, &gs, 0, args);
  if (err != GA_NO_ERROR) {
    PyErr_Format(PyExc_RuntimeError,
                 "gpuarray error: kEye: %s. n%lu, m=%lu.",
                 GpuKernel_error(&k_eye, err),
                 (unsigned long)dims[0], (unsigned long)dims[1]);
    return -1;
  }
  return 0;
}
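
The launch code above only shows the host side; the k_eye kernel it refers to (declared earlier in the full op source) is not included in this excerpt. As a rough sketch only, assuming the usual cluda macros (KERNEL, GLOBAL_MEM, ga_size, LID_0, LDIM_0, GID_0, GDIM_0) and a DTYPE_OUTPUT_0 macro for the output element type, the device side could look like the following; the output is already zero-filled by pygpu_zeros, so only the diagonal has to be written:

/* Hypothetical sketch of the device side: set 1 on the diagonal of an
 * n-by-m, C-ordered output created with pygpu_zeros.  The grid-stride
 * loop keeps the kernel correct for any ls/gs combination. */
KERNEL void k_eye(GLOBAL_MEM DTYPE_OUTPUT_0 *z, ga_size n, ga_size m) {
  ga_size nb = n < m ? n : m;   /* number of diagonal elements */
  ga_size i;
  for (i = LID_0 + LDIM_0 * GID_0; i < nb; i += LDIM_0 * GDIM_0)
    z[i * m + i] = 1;
}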
static int maxandargmaxInvoke(maxandargmax_ctx *ctx){
	void* args[11];

	/**
	 * Argument Marshalling. This is the grossest gross thing in here.
	 */

	const int flags       = GA_BUFFER_READ_ONLY|GA_BUFFER_INIT;
	ctx->srcStepsGD       = gpudata_alloc(ctx->gpuCtx, ctx->nds    * sizeof(size_t),
	                                      ctx->src->strides,       flags, 0);
	ctx->srcSizeGD        = gpudata_alloc(ctx->gpuCtx, ctx->nds    * sizeof(size_t),
	                                      ctx->src->dimensions,    flags, 0);
	ctx->chunkSizeGD      = gpudata_alloc(ctx->gpuCtx, ctx->ndh * sizeof(size_t),
	                                      ctx->chunkSize,          flags, 0);
	ctx->dstMaxStepsGD    = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t),
	                                      ctx->dstMax->strides,    flags, 0);
	ctx->dstArgmaxStepsGD = gpudata_alloc(ctx->gpuCtx, ctx->ndd * sizeof(size_t),
	                                      ctx->dstArgmax->strides, flags, 0);
	args[ 0] = (void*) ctx->src->data;
	args[ 1] = (void*)&ctx->src->offset;
	args[ 2] = (void*) ctx->srcStepsGD;
	args[ 3] = (void*) ctx->srcSizeGD;
	args[ 4] = (void*) ctx->chunkSizeGD;
	args[ 5] = (void*) ctx->dstMax->data;
	args[ 6] = (void*)&ctx->dstMax->offset;
	args[ 7] = (void*) ctx->dstMaxStepsGD;
	args[ 8] = (void*) ctx->dstArgmax->data;
	args[ 9] = (void*)&ctx->dstArgmax->offset;
	args[10] = (void*) ctx->dstArgmaxStepsGD;

	if(ctx->srcStepsGD      &&
	   ctx->srcSizeGD       &&
	   ctx->chunkSizeGD     &&
	   ctx->dstMaxStepsGD   &&
	   ctx->dstArgmaxStepsGD){
		ctx->ret = GpuKernel_call(&ctx->kernel,
		                          ctx->ndh>0 ? ctx->ndh : 1,
		                          ctx->gridSize,
		                          ctx->blockSize,
		                          0,
		                          args);
	}else{
		ctx->ret = GA_MEMORY_ERROR;
	}

	gpudata_release(ctx->srcStepsGD);
	gpudata_release(ctx->srcSizeGD);
	gpudata_release(ctx->chunkSizeGD);
	gpudata_release(ctx->dstMaxStepsGD);
	gpudata_release(ctx->dstArgmaxStepsGD);

	return ctx->ret;
}
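
maxandargmaxInvoke ships the source strides, dimensions and chunk sizes to the kernel by copying each small host array into a device buffer with gpudata_alloc and GA_BUFFER_INIT, then releasing those buffers once the call has returned. The helper below is a minimal sketch of that upload step, assuming the context handle is a gpucontext * and ignoring the optional error-code output of gpudata_alloc; the helper name is made up for illustration:

/* Hypothetical helper showing the pattern used above: copy a small host
 * array of size_t values into a fresh, read-only device buffer so its
 * handle can be passed as a kernel argument.  Returns NULL on failure. */
static gpudata *copySizesToDevice(gpucontext *gpuCtx, const size_t *hostData,
                                  size_t count) {
  return gpudata_alloc(gpuCtx, count * sizeof(size_t), (void *)hostData,
                       GA_BUFFER_READ_ONLY | GA_BUFFER_INIT, NULL);
}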
static int dgerBatch(cb_order order, size_t M, size_t N, double alpha,
                     gpudata **x, size_t *offX, size_t incX,
                     gpudata **y, size_t *offY, size_t incY,
                     gpudata **A, size_t *offA, size_t lda,
                     size_t batchCount, int flags) {
  cuda_context *ctx;
  size_t t, *tp, i;
  size_t ls[3] = {M, N, 1}, gs[3] = {1, 1, batchCount};
  void *args[10];
  gpudata **T;
  gpudata *Aa, *xa, *ya;
  int err;

  if (flags != 0) return GA_INVALID_ERROR;
  if (batchCount == 0) return GA_NO_ERROR;

  if (incX == 1) {
    if (ls[0] > 32) {
      gs[0] = (ls[0] + 31) / 32;
      ls[0] = 32;
    }
    if (ls[0] * ls[1] > 512) {
      gs[1] = (ls[1] + 15) / 16;
      ls[1] = 16;
    }
  } else {
    if (ls[1] > 32) {
      gs[1] = (ls[1] + 31) / 32;
      ls[1] = 32;
    }
    if (ls[0] * ls[1] > 512) {
      gs[0] = (ls[0] + 15) / 16;
      ls[0] = 16;
    }
  }
  if (gs[0] * gs[1] * gs[2] > 65535) {
    if (gs[0] * gs[1] > 65535)
      return GA_VALUE_ERROR;
    gs[2] = (65535 / (gs[0] * gs[1]));
  }

  if (order == cb_c) {
    t = M;
    M = N;
    N = t;
    tp = offX;
    offX = offY;
    offY = tp;
    t = incX;
    incX = incY;
    incY = t;
    T = x;
    x = y;
    y = T;
  }

  ASSERT_BUF(x[0]);

  ctx = x[0]->ctx;

  cuda_enter(ctx);

  {
    double **T_l = alloca(sizeof(double *) * batchCount * 3);
    const double **A_l = (const double **)T_l;
    const double **x_l = (const double **)T_l + batchCount;
    double **y_l = T_l + (batchCount * 2);

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(x[i]);
      ASSERT_BUF(y[i]);
      cuda_wait(A[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      cuda_wait(x[i], CUDA_WAIT_READ);
      cuda_wait(y[i], CUDA_WAIT_READ);
      A_l[i] = (double *)(A[i]->ptr + offA[i]);
      x_l[i] = (double *)(x[i]->ptr + offX[i]);
      y_l[i] = (double *)(y[i]->ptr + offY[i]);
    }

    Aa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, A_l,
                               GA_BUFFER_INIT, &err);
    if (Aa == NULL) {
      cuda_exit(ctx);
      return err;
    }
    xa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, x_l,
                               GA_BUFFER_INIT, &err);
    if (xa == NULL) {
      cuda_ops.buffer_release(Aa);
      cuda_exit(ctx);
      return err;
    }
    ya = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, y_l,
                               GA_BUFFER_INIT, &err);
    if (ya == NULL) {
      cuda_ops.buffer_release(Aa);
      cuda_ops.buffer_release(xa);
      cuda_exit(ctx);
      return err;
    }
  }

  args[0] = xa;
  args[1] = &incX;
  args[2] = ya;
  args[3] = &incY;
  args[4] = &alpha;
  args[5] = Aa;
  args[6] = &lda;
  args[7] = &batchCount;
  args[8] = &M;
  args[9] = &N;

  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgerBH_gen_small, 3, ls, gs, 0, args);

  cuda_ops.buffer_release(Aa);
  cuda_ops.buffer_release(xa);
  cuda_ops.buffer_release(ya);

  if (err != GA_NO_ERROR) {
    cuda_exit(ctx);
    return err;
  }


  for (i = 0; i < batchCount; i++) {
    cuda_record(A[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    cuda_record(x[i], CUDA_WAIT_READ);
    cuda_record(y[i], CUDA_WAIT_READ);
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
static int dgemvBatch(cb_order order, cb_transpose transA,
                      size_t M, size_t N, double alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **x, size_t *offX, size_t incX,
                      double beta, gpudata **y, size_t *offY, size_t incY,
                      size_t batchCount, int flags) {
  cuda_context *ctx;
  size_t t, i;
  size_t ls[2], gs[2];
  void *args[9];
  gpudata *Aa, *xa, *ya;
  int err;

  if (flags != 0) return GA_INVALID_ERROR;
  if (batchCount == 0) return GA_NO_ERROR;

  if (alpha != 1.0 || beta != 1.0) return GA_UNSUPPORTED_ERROR;

  if (M < 512) {
    ls[0] = 32;
    if (batchCount > 16)
      ls[1] = 16;
    else
      ls[1] = batchCount;
  } else {
    ls[0] = 512;
    ls[1] = 1;
  }
  gs[0] = (M + ls[0] - 1) / ls[0];
  gs[1] = (batchCount + ls[1] - 1) / ls[1];
  if (gs[0] * gs[1] > 65535) {
    gs[1] = (65535 / gs[0]);
  }

  if (order == cb_c) {
    t = N;
    N = M;
    M = t;
    if (transA == cb_no_trans) {
      transA = cb_trans;
    } else {
      transA = cb_no_trans;
    }
  }

  ASSERT_BUF(A[0]);

  ctx = A[0]->ctx;

  cuda_enter(ctx);

  {
    double **T_l = alloca(sizeof(double *) * batchCount * 3);
    const double **A_l = (const double **)T_l;
    const double **x_l = (const double **)T_l + batchCount;
    double **y_l = T_l + (batchCount * 2);

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(x[i]);
      ASSERT_BUF(y[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(x[i], CUDA_WAIT_READ);
      cuda_wait(y[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      A_l[i] = (double *)(A[i]->ptr + offA[i]);
      x_l[i] = (double *)(x[i]->ptr + offX[i]);
      y_l[i] = (double *)(y[i]->ptr + offY[i]);
    }

    Aa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, A_l,
                               GA_BUFFER_INIT, &err);
    if (Aa == NULL) {
      cuda_exit(ctx);
      return err;
    }
    xa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, x_l,
                               GA_BUFFER_INIT, &err);
    if (xa == NULL) {
      cuda_ops.buffer_release(Aa);
      cuda_exit(ctx);
      return err;
    }
    ya = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, y_l,
                               GA_BUFFER_INIT, &err);
    if (ya == NULL) {
      cuda_ops.buffer_release(Aa);
      cuda_ops.buffer_release(xa);
      cuda_exit(ctx);
      return err;
    }
  }

  args[0] = Aa;
  args[1] = &lda;
  args[2] = xa;
  args[3] = &incX;
  args[4] = ya;
  args[5] = &incY;
  args[6] = &batchCount;
  args[7] = &M;
  args[8] = &N;

  if (transA == cb_no_trans) {
    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small, 2, ls, gs, 0, args);
  } else {
    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small, 2, ls, gs, 0, args);
  }

  cuda_ops.buffer_release(Aa);
  cuda_ops.buffer_release(xa);
  cuda_ops.buffer_release(ya);

  if (err != GA_NO_ERROR) {
    cuda_exit(ctx);
    return err;
  }

  for (i = 0; i < batchCount; i++) {
    cuda_record(A[i], CUDA_WAIT_READ);
    cuda_record(x[i], CUDA_WAIT_READ);
    cuda_record(y[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
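
Both dgerBatch and dgemvBatch above marshal their per-batch operands the same way: a host array of raw device pointers (base pointer plus byte offset) is built with alloca and uploaded as a single GA_BUFFER_INIT buffer, so the kernel receives one pointer-array argument per operand. The sketch below illustrates that step for one operand, reusing cuda_ops.buffer_alloc exactly as the functions above do; the helper name is hypothetical:

/* Hypothetical helper mirroring the marshalling above: gather one device
 * pointer per batch entry into a temporary host array, then upload that
 * array as a single initialized device buffer the kernel can index. */
static gpudata *uploadPtrArray(cuda_context *ctx, gpudata **bufs,
                               const size_t *offs, size_t batchCount,
                               int *err) {
  size_t i;
  double **ptrs = alloca(sizeof(double *) * batchCount);
  for (i = 0; i < batchCount; i++)
    ptrs[i] = (double *)(bufs[i]->ptr + offs[i]);
  return cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, ptrs,
                               GA_BUFFER_INIT, err);
}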
Example #5
int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
                   int check_error) {
  size_t n[2], ls[2] = {0, 0}, gs[2] = {0, 0};
  size_t pl;
  gpudata *errbuf;
#if DEBUG
  char *errstr = NULL;
#endif
  GpuKernel k;
  unsigned int j;
  unsigned int argp;
  int err, kerr = 0;
  int addr32 = 0;

  if (!GpuArray_ISWRITEABLE(a))
    return GA_INVALID_ERROR;

  if (!GpuArray_ISALIGNED(a) || !GpuArray_ISALIGNED(v) ||
      !GpuArray_ISALIGNED(i))
    return GA_UNALIGNED_ERROR;

  /* a and i have to be C contiguous */
  if (!GpuArray_IS_C_CONTIGUOUS(a) || !GpuArray_IS_C_CONTIGUOUS(i))
    return GA_INVALID_ERROR;

  /* Check that the dimensions match namely a[0] == i[0] and a[>0] == v[>0] */
  if (v->nd == 0 || a->nd == 0 || i->nd != 1 || a->nd != v->nd ||
      a->dimensions[0] != i->dimensions[0])
    return GA_INVALID_ERROR;

  n[0] = i->dimensions[0];
  n[1] = 1;

  for (j = 1; j < v->nd; j++) {
    if (a->dimensions[j] != v->dimensions[j])
      return GA_INVALID_ERROR;
    n[1] *= v->dimensions[j];
  }

  if (n[0] * n[1] < SADDR32_MAX) {
    addr32 = 1;
  }

  err = gpudata_property(v->data, GA_CTX_PROP_ERRBUF, &errbuf);
  if (err != GA_NO_ERROR)
    return err;

  err = gen_take1_kernel(&k, GpuArray_context(a),
#if DEBUG
                         &errstr,
#else
                         NULL,
#endif
                         a, v, i, addr32);
#if DEBUG
  if (errstr != NULL) {
    fprintf(stderr, "%s\n", errstr);
    free(errstr);
  }
#endif
  if (err != GA_NO_ERROR)
    return err;

  err = GpuKernel_sched(&k, n[0]*n[1], &gs[1], &ls[1]);
  if (err != GA_NO_ERROR)
    goto out;

  /* This may not be the best scheduling, but it's good enough */
  err = gpukernel_property(k.k, GA_KERNEL_PROP_PREFLSIZE, &pl);
  ls[0] = ls[1] / pl;
  ls[1] = pl;
  if (n[1] > n[0]) {
    pl = ls[0];
    ls[0] = ls[1];
    ls[1] = pl;
    gs[0] = 1;
  } else {
    gs[0] = gs[1];
    gs[1] = 1;
  }

  argp = 0;
  GpuKernel_setarg(&k, argp++, a->data);
  GpuKernel_setarg(&k, argp++, (void *)&a->offset);
  GpuKernel_setarg(&k, argp++, v->data);
  /* The cast is to avoid a warning about const */
  GpuKernel_setarg(&k, argp++, (void *)&v->offset);
  for (j = 0; j < v->nd; j++) {
    GpuKernel_setarg(&k, argp++, &v->strides[j]);
    GpuKernel_setarg(&k, argp++, &v->dimensions[j]);
  }
  GpuKernel_setarg(&k, argp++, i->data);
  GpuKernel_setarg(&k, argp++, (void *)&i->offset);
  GpuKernel_setarg(&k, argp++, &n[0]);
  GpuKernel_setarg(&k, argp++, &n[1]);
  GpuKernel_setarg(&k, argp++, errbuf);

  err = GpuKernel_call(&k, 2, gs, ls, 0, NULL);
  if (check_error && err == GA_NO_ERROR) {
    err = gpudata_read(&kerr, errbuf, 0, sizeof(int));
    if (err == GA_NO_ERROR && kerr != 0) {
      err = GA_VALUE_ERROR;
      kerr = 0;
      /* We suppose this will not fail */
      gpudata_write(errbuf, 0, &kerr, sizeof(int));
    }
  }

out:
  GpuKernel_clear(&k);
  return err;
}