static void teardown(void *c) {
  cuda_context *ctx = (cuda_context *)c;
  blas_handle *handle = (blas_handle *)ctx->blas_handle;

  if (ctx->blas_handle == NULL)
    return;

  cuda_enter(ctx);
  /* Release the cuBLAS handle and the pre-compiled helper kernels. */
  cublasDestroy(handle->h);
  GpuKernel_clear(&handle->sgemvBH_N_a1_b1_small);
  GpuKernel_clear(&handle->sgemvBH_T_a1_b1_small);
  GpuKernel_clear(&handle->dgemvBH_N_a1_b1_small);
  GpuKernel_clear(&handle->dgemvBH_T_a1_b1_small);
  GpuKernel_clear(&handle->sgerBH_gen_small);
  GpuKernel_clear(&handle->dgerBH_gen_small);
  cuda_exit(ctx);

  free(ctx->blas_handle);
  ctx->blas_handle = NULL;
}
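/*
 * Note: teardown() is idempotent -- the NULL check above makes a second
 * call a no-op, and resetting ctx->blas_handle to NULL keeps a later
 * setup() (below) from seeing a stale pointer.  Illustrative sketch only
 * (not compiled; `some_ctx` is a hypothetical context pointer):
 */
#if 0
  teardown(some_ctx);
  teardown(some_ctx);   /* safe: returns immediately on the NULL check */
  setup(some_ctx);      /* re-creates the handle from scratch */
#endif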
static int setup(void *c) {
  cuda_context *ctx = (cuda_context *)c;
  blas_handle *handle;
  const char *tmp[2];
  cublasStatus_t err;
  int e;
  int types[10];

  if (ctx->blas_handle != NULL)
    return GA_NO_ERROR;

  handle = calloc(1, sizeof(*handle));
  if (handle == NULL)
    return GA_MEMORY_ERROR;

  cuda_enter(ctx);
  err = cublasCreate(&handle->h);
  if (err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    free(handle);
    return GA_BLAS_ERROR;
  }

  err = cublasSetStream(handle->h, ctx->s);
  if (err != CUBLAS_STATUS_SUCCESS) {
    e = GA_BLAS_ERROR;
    goto e1;
  }

  cublasSetPointerMode(handle->h, CUBLAS_POINTER_MODE_HOST);
  cublasSetAtomicsMode(handle->h, CUBLAS_ATOMICS_ALLOWED);

  /* Argument type signature shared by the four small batched gemv
     kernels: three buffer/offset pairs plus three sizes (9 args). */
  types[0] = GA_BUFFER;
  types[1] = GA_SIZE;
  types[2] = GA_BUFFER;
  types[3] = GA_SIZE;
  types[4] = GA_BUFFER;
  types[5] = GA_SIZE;
  types[6] = GA_SIZE;
  types[7] = GA_SIZE;
  types[8] = GA_SIZE;
  e = GpuKernel_init(&handle->sgemvBH_N_a1_b1_small, &cuda_ops, ctx, 1,
                     &code_sgemvBH_N_a1_b1_small, NULL, "sgemv", 9, types,
                     0, NULL);
  if (e != GA_NO_ERROR) goto e1;
  e = GpuKernel_init(&handle->sgemvBH_T_a1_b1_small, &cuda_ops, ctx, 1,
                     &code_sgemvBH_T_a1_b1_small, NULL, "sgemv", 9, types,
                     0, NULL);
  if (e != GA_NO_ERROR) goto e2;
  /* The double-precision kernels need the atomicadd_double helper
     prepended as a second source fragment. */
  tmp[0] = atomicadd_double;
  tmp[1] = code_dgemvBH_N_a1_b1_small;
  e = GpuKernel_init(&handle->dgemvBH_N_a1_b1_small, &cuda_ops, ctx, 2,
                     tmp, NULL, "dgemv", 9, types, GA_USE_DOUBLE, NULL);
  if (e != GA_NO_ERROR) goto e3;
  tmp[0] = atomicadd_double;
  tmp[1] = code_dgemvBH_T_a1_b1_small;
  e = GpuKernel_init(&handle->dgemvBH_T_a1_b1_small, &cuda_ops, ctx, 2,
                     tmp, NULL, "dgemv", 9, types, GA_USE_DOUBLE, NULL);
  if (e != GA_NO_ERROR) goto e4;

  /* The ger kernels add a scalar alpha (types[4]) and one more size,
     for 10 arguments in total. */
  types[0] = GA_BUFFER;
  types[1] = GA_SIZE;
  types[2] = GA_BUFFER;
  types[3] = GA_SIZE;
  types[4] = GA_FLOAT;
  types[5] = GA_BUFFER;
  types[6] = GA_SIZE;
  types[7] = GA_SIZE;
  types[8] = GA_SIZE;
  types[9] = GA_SIZE;
  e = GpuKernel_init(&handle->sgerBH_gen_small, &cuda_ops, ctx, 1,
                     &code_sgerBH_gen_small, NULL, "_sgerBH_gen_small", 10,
                     types, 0, NULL);
  if (e != GA_NO_ERROR) goto e5;
  types[4] = GA_DOUBLE;
  tmp[0] = atomicadd_double;
  tmp[1] = code_dgerBH_gen_small;
  e = GpuKernel_init(&handle->dgerBH_gen_small, &cuda_ops, ctx, 2, tmp,
                     NULL, "_dgerBH_gen_small", 10, types, GA_USE_DOUBLE,
                     NULL);
  if (e != GA_NO_ERROR) goto e6;

  ctx->blas_handle = handle;
  cuda_exit(ctx);
  return GA_NO_ERROR;

  /* Unwind in reverse order of initialization: a failure at step N
     lands on label eN and falls through, releasing everything that was
     acquired before it. */
 e6:
  GpuKernel_clear(&handle->sgerBH_gen_small);
 e5:
  GpuKernel_clear(&handle->dgemvBH_T_a1_b1_small);
 e4:
  GpuKernel_clear(&handle->dgemvBH_N_a1_b1_small);
 e3:
  GpuKernel_clear(&handle->sgemvBH_T_a1_b1_small);
 e2:
  GpuKernel_clear(&handle->sgemvBH_N_a1_b1_small);
 e1:
  cublasDestroy(handle->h);
  cuda_exit(ctx);
  free(handle);
  return e;
}
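/*
 * The error handling in setup() is the classic reverse-order goto
 * ladder: each acquisition step gets a label, and a failure at step N
 * jumps to the label that releases step N-1's resource, falling through
 * the rest.  A minimal self-contained sketch of the same pattern (the
 * acquire_*/release_* names are hypothetical, not part of this file):
 */
#if 0
static int ladder_example(void) {
  int e;
  if ((e = acquire_a()) != 0) return e;   /* nothing to unwind yet */
  if ((e = acquire_b()) != 0) goto e1;
  if ((e = acquire_c()) != 0) goto e2;
  return 0;
 e2:
  release_b();   /* c failed: undo b, then fall through to undo a */
 e1:
  release_a();
  return e;
}
#endif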
int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
                   int check_error) {
  size_t n[2], ls[2] = {0, 0}, gs[2] = {0, 0};
  size_t pl;
  gpudata *errbuf;
#if DEBUG
  char *errstr = NULL;
#endif
  GpuKernel k;
  unsigned int j;
  unsigned int argp;
  int err, kerr = 0;
  int addr32 = 0;

  if (!GpuArray_ISWRITEABLE(a))
    return GA_INVALID_ERROR;

  if (!GpuArray_ISALIGNED(a) || !GpuArray_ISALIGNED(v) ||
      !GpuArray_ISALIGNED(i))
    return GA_UNALIGNED_ERROR;

  /* a and i have to be C contiguous */
  if (!GpuArray_IS_C_CONTIGUOUS(a) || !GpuArray_IS_C_CONTIGUOUS(i))
    return GA_INVALID_ERROR;

  /* Check that the dimensions match, namely a[0] == i[0] and
     a[>0] == v[>0] */
  if (v->nd == 0 || a->nd == 0 || i->nd != 1 || a->nd != v->nd ||
      a->dimensions[0] != i->dimensions[0])
    return GA_INVALID_ERROR;

  /* n[0] is the number of rows gathered, n[1] the row length. */
  n[0] = i->dimensions[0];
  n[1] = 1;
  for (j = 1; j < v->nd; j++) {
    if (a->dimensions[j] != v->dimensions[j])
      return GA_INVALID_ERROR;
    n[1] *= v->dimensions[j];
  }

  /* Use 32-bit addressing when the problem is small enough. */
  if (n[0] * n[1] < SADDR32_MAX) {
    addr32 = 1;
  }

  err = gpudata_property(v->data, GA_CTX_PROP_ERRBUF, &errbuf);
  if (err != GA_NO_ERROR)
    return err;

  err = gen_take1_kernel(&k, GpuArray_context(a),
#if DEBUG
                         &errstr,
#else
                         NULL,
#endif
                         a, v, i, addr32);
#if DEBUG
  if (errstr != NULL) {
    fprintf(stderr, "%s\n", errstr);
    free(errstr);
  }
#endif
  if (err != GA_NO_ERROR)
    return err;

  err = GpuKernel_sched(&k, n[0]*n[1], &gs[1], &ls[1]);
  if (err != GA_NO_ERROR)
    goto out;

  /* This may not be the best scheduling, but it's good enough */
  err = gpukernel_property(k.k, GA_KERNEL_PROP_PREFLSIZE, &pl);
  ls[0] = ls[1] / pl;
  ls[1] = pl;
  if (n[1] > n[0]) {
    pl = ls[0];
    ls[0] = ls[1];
    ls[1] = pl;
    gs[0] = 1;
  } else {
    gs[0] = gs[1];
    gs[1] = 1;
  }

  argp = 0;
  GpuKernel_setarg(&k, argp++, a->data);
  GpuKernel_setarg(&k, argp++, (void *)&a->offset);
  GpuKernel_setarg(&k, argp++, v->data);
  /* The cast is to avoid a warning about const */
  GpuKernel_setarg(&k, argp++, (void *)&v->offset);
  for (j = 0; j < v->nd; j++) {
    GpuKernel_setarg(&k, argp++, &v->strides[j]);
    GpuKernel_setarg(&k, argp++, &v->dimensions[j]);
  }
  GpuKernel_setarg(&k, argp++, i->data);
  GpuKernel_setarg(&k, argp++, (void *)&i->offset);
  GpuKernel_setarg(&k, argp++, &n[0]);
  GpuKernel_setarg(&k, argp++, &n[1]);
  GpuKernel_setarg(&k, argp++, errbuf);

  err = GpuKernel_call(&k, 2, gs, ls, 0, NULL);
  if (check_error && err == GA_NO_ERROR) {
    /* The kernel reports bad indices through errbuf; translate a
       nonzero value into GA_VALUE_ERROR and reset the buffer. */
    err = gpudata_read(&kerr, errbuf, 0, sizeof(int));
    if (err == GA_NO_ERROR && kerr != 0) {
      err = GA_VALUE_ERROR;
      kerr = 0;
      /* We suppose this will not fail */
      gpudata_write(errbuf, 0, &kerr, sizeof(int));
    }
  }
 out:
  GpuKernel_clear(&k);
  return err;
}
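/*
 * Usage sketch for GpuArray_take1().  Semantically it gathers rows:
 * for each k in [0, i->dimensions[0]), a[k, ...] = v[i[k], ...].
 * The sketch below assumes the public array API from gpuarray/array.h
 * (GpuArray_empty, GA_C_ORDER); `ctx`, `src`, `idx`, `n_indices` and
 * `n_cols` are hypothetical.  Treat it as illustrative, not compiled:
 */
#if 0
  GpuArray out;
  size_t dims[2] = { n_indices, n_cols };
  int err = GpuArray_empty(&out, ctx, src.typecode, 2, dims, GA_C_ORDER);
  if (err == GA_NO_ERROR)
    /* check_error != 0: an out-of-range entry in idx makes the call
       return GA_VALUE_ERROR rather than failing silently. */
    err = GpuArray_take1(&out, &src, &idx, 1);
#endif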