static void teardown(void *c) {
  cuda_context *ctx = (cuda_context *)c;
  blas_handle *handle = (blas_handle *)ctx->blas_handle;

  if (ctx->blas_handle == NULL)
    return;

  cuda_enter(ctx);
  cublasDestroy(handle->h);
  GpuKernel_clear(&handle->sgemvBH_N_a1_b1_small);
  GpuKernel_clear(&handle->sgemvBH_T_a1_b1_small);
  GpuKernel_clear(&handle->dgemvBH_N_a1_b1_small);
  GpuKernel_clear(&handle->dgemvBH_T_a1_b1_small);
  GpuKernel_clear(&handle->sgerBH_gen_small);
  GpuKernel_clear(&handle->dgerBH_gen_small);
  cuda_exit(ctx);
  free(ctx->blas_handle);
  ctx->blas_handle = NULL;
}
static int setup(void *c) {
  cuda_context *ctx = (cuda_context *)c;
  blas_handle *handle;
  const char *tmp[2];
  cublasStatus_t err;
  int e;
  int types[10];

  if (ctx->blas_handle != NULL)
    return GA_NO_ERROR;

  handle = calloc(1, sizeof(*handle));
  if (handle == NULL)
    return GA_MEMORY_ERROR;

  cuda_enter(ctx);
  err = cublasCreate(&handle->h);
  if (err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    free(handle);
    return GA_BLAS_ERROR;
  }

  err = cublasSetStream(handle->h, ctx->s);
  if (err != CUBLAS_STATUS_SUCCESS) {
    e = GA_BLAS_ERROR;
    goto e1;
  }

  cublasSetPointerMode(handle->h, CUBLAS_POINTER_MODE_HOST);
  cublasSetAtomicsMode(handle->h, CUBLAS_ATOMICS_ALLOWED);

  types[0] = GA_BUFFER;
  types[1] = GA_SIZE;
  types[2] = GA_BUFFER;
  types[3] = GA_SIZE;
  types[4] = GA_BUFFER;
  types[5] = GA_SIZE;
  types[6] = GA_SIZE;
  types[7] = GA_SIZE;
  types[8] = GA_SIZE;
  e = GpuKernel_init(&handle->sgemvBH_N_a1_b1_small, &cuda_ops, ctx, 1, &code_sgemvBH_N_a1_b1_small, NULL, "sgemv", 9, types, 0, NULL);
  if (e != GA_NO_ERROR) goto e1;
  e = GpuKernel_init(&handle->sgemvBH_T_a1_b1_small, &cuda_ops, ctx, 1, &code_sgemvBH_T_a1_b1_small, NULL, "sgemv", 9, types, 0, NULL);
  if (e != GA_NO_ERROR) goto e2;
  tmp[0] = atomicadd_double;
  tmp[1] = code_dgemvBH_N_a1_b1_small;
  e = GpuKernel_init(&handle->dgemvBH_N_a1_b1_small, &cuda_ops, ctx, 2, tmp, NULL, "dgemv", 9, types, GA_USE_DOUBLE, NULL);
  if (e != GA_NO_ERROR) goto e3;
  tmp[0] = atomicadd_double;
  tmp[1] = code_dgemvBH_T_a1_b1_small;
  e = GpuKernel_init(&handle->dgemvBH_T_a1_b1_small, &cuda_ops, ctx, 2, tmp, NULL, "dgemv", 9, types, GA_USE_DOUBLE, NULL);
  if (e != GA_NO_ERROR) goto e4;

  types[0] = GA_BUFFER;
  types[1] = GA_SIZE;
  types[2] = GA_BUFFER;
  types[3] = GA_SIZE;
  types[4] = GA_FLOAT;
  types[5] = GA_BUFFER;
  types[6] = GA_SIZE;
  types[7] = GA_SIZE;
  types[8] = GA_SIZE;
  types[9] = GA_SIZE;
  e = GpuKernel_init(&handle->sgerBH_gen_small, &cuda_ops, ctx, 1, &code_sgerBH_gen_small, NULL, "_sgerBH_gen_small", 10, types, 0, NULL);
  if (e != GA_NO_ERROR) goto e5;
  types[4] = GA_DOUBLE;
  tmp[0] = atomicadd_double;
  tmp[1] = code_dgerBH_gen_small;
  e = GpuKernel_init(&handle->dgerBH_gen_small, &cuda_ops, ctx, 2, tmp, NULL, "_dgerBH_gen_small", 10, types, GA_USE_DOUBLE, NULL);
  if (e != GA_NO_ERROR) goto e6;

  ctx->blas_handle = handle;

  cuda_exit(ctx);

  return GA_NO_ERROR;

 e6:
  GpuKernel_clear(&handle->sgerBH_gen_small);
 e5:
  GpuKernel_clear(&handle->dgemvBH_T_a1_b1_small);
 e4:
  GpuKernel_clear(&handle->dgemvBH_N_a1_b1_small);
 e3:
  GpuKernel_clear(&handle->sgemvBH_T_a1_b1_small);
 e2:
  GpuKernel_clear(&handle->sgemvBH_N_a1_b1_small);
 e1:
  cublasDestroy(handle->h);
  cuda_exit(ctx);
  free(handle);
  return e;
}
Example #3
0
int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
                   int check_error) {
  size_t n[2], ls[2] = {0, 0}, gs[2] = {0, 0};
  size_t pl;
  gpudata *errbuf;
#if DEBUG
  char *errstr = NULL;
#endif
  GpuKernel k;
  unsigned int j;
  unsigned int argp;
  int err, kerr = 0;
  int addr32 = 0;

  if (!GpuArray_ISWRITEABLE(a))
    return GA_INVALID_ERROR;

  if (!GpuArray_ISALIGNED(a) || !GpuArray_ISALIGNED(v) ||
      !GpuArray_ISALIGNED(i))
    return GA_UNALIGNED_ERROR;

  /* a and i have to be C contiguous */
  if (!GpuArray_IS_C_CONTIGUOUS(a) || !GpuArray_IS_C_CONTIGUOUS(i))
    return GA_INVALID_ERROR;

  /* Check that the dimensions match namely a[0] == i[0] and a[>0] == v[>0] */
  if (v->nd == 0 || a->nd == 0 || i->nd != 1 || a->nd != v->nd ||
      a->dimensions[0] != i->dimensions[0])
    return GA_INVALID_ERROR;

  n[0] = i->dimensions[0];
  n[1] = 1;

  for (j = 1; j < v->nd; j++) {
    if (a->dimensions[j] != v->dimensions[j])
      return GA_INVALID_ERROR;
    n[1] *= v->dimensions[j];
  }

  if (n[0] * n[1] < SADDR32_MAX) {
    addr32 = 1;
  }

  err = gpudata_property(v->data, GA_CTX_PROP_ERRBUF, &errbuf);
  if (err != GA_NO_ERROR)
    return err;

  err = gen_take1_kernel(&k, GpuArray_context(a),
#if DEBUG
                         &errstr,
#else
                         NULL,
#endif
                         a, v, i, addr32);
#if DEBUG
  if (errstr != NULL) {
    fprintf(stderr, "%s\n", errstr);
    free(errstr);
  }
#endif
  if (err != GA_NO_ERROR)
    return err;

  err = GpuKernel_sched(&k, n[0]*n[1], &gs[1], &ls[1]);
  if (err != GA_NO_ERROR)
    goto out;

  /* This may not be the best scheduling, but it's good enough */
  err = gpukernel_property(k.k, GA_KERNEL_PROP_PREFLSIZE, &pl);
  ls[0] = ls[1] / pl;
  ls[1] = pl;
  if (n[1] > n[0]) {
    pl = ls[0];
    ls[0] = ls[1];
    ls[1] = pl;
    gs[0] = 1;
  } else {
    gs[0] = gs[1];
    gs[1] = 1;
  }

  argp = 0;
  GpuKernel_setarg(&k, argp++, a->data);
  GpuKernel_setarg(&k, argp++, (void *)&a->offset);
  GpuKernel_setarg(&k, argp++, v->data);
  /* The cast is to avoid a warning about const */
  GpuKernel_setarg(&k, argp++, (void *)&v->offset);
  for (j = 0; j < v->nd; j++) {
    GpuKernel_setarg(&k, argp++, &v->strides[j]);
    GpuKernel_setarg(&k, argp++, &v->dimensions[j]);
  }
  GpuKernel_setarg(&k, argp++, i->data);
  GpuKernel_setarg(&k, argp++, (void *)&i->offset);
  GpuKernel_setarg(&k, argp++, &n[0]);
  GpuKernel_setarg(&k, argp++, &n[1]);
  GpuKernel_setarg(&k, argp++, errbuf);

  err = GpuKernel_call(&k, 2, gs, ls, 0, NULL);
  if (check_error && err == GA_NO_ERROR) {
    err = gpudata_read(&kerr, errbuf, 0, sizeof(int));
    if (err == GA_NO_ERROR && kerr != 0) {
      err = GA_VALUE_ERROR;
      kerr = 0;
      /* We suppose this will not fail */
      gpudata_write(errbuf, 0, &kerr, sizeof(int));
    }
  }

out:
  GpuKernel_clear(&k);
  return err;
}