static void cuda_free_ctx(cuda_context *ctx) {
  gpuarray_blas_ops *blas_ops;
  gpudata *next, *curr;

  ASSERT_CTX(ctx);
  ctx->refcnt--;
  if (ctx->refcnt == 0) {
    assert(ctx->enter == 0 && "Context was active when freed!");
    if (ctx->blas_handle != NULL) {
      ctx->err = cuda_property(ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS,
                               &blas_ops);
      blas_ops->teardown(ctx);
    }
    cuMemFreeHost((void *)ctx->errbuf->ptr);
    deallocate(ctx->errbuf);

    cuStreamDestroy(ctx->s);

    /* Clear out the freelist */
    for (curr = ctx->freeblocks; curr != NULL; curr = next) {
      next = curr->next;
      cuMemFree(curr->ptr);
      deallocate(curr);
    }

    if (!(ctx->flags & DONTFREE))
      cuCtxDestroy(ctx->ctx);
    cache_destroy(ctx->extcopy_cache);
    CLEAR(ctx);
    free(ctx);
  }
}
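/*
 * The teardown above is driven entirely by the reference count: resources
 * are released only when the last reference to the context is dropped, and
 * the assert catches a release that happens while the context is still
 * entered.  Below is a minimal, self-contained sketch of that retain/release
 * idiom; the refctx type and the refctx_* names are illustrative only and
 * are not part of this file or of the gpuarray API.
 */
#include <assert.h>
#include <stdlib.h>

typedef struct refctx {
  unsigned int refcnt;
  unsigned int enter; /* non-zero while the context is active */
} refctx;

static refctx *refctx_new(void) {
  refctx *c = calloc(1, sizeof(*c));
  if (c != NULL)
    c->refcnt = 1; /* the creator holds the first reference */
  return c;
}

static void refctx_retain(refctx *c) {
  c->refcnt++;
}

static void refctx_release(refctx *c) {
  c->refcnt--;
  if (c->refcnt == 0) {
    /* Mirrors the assert in cuda_free_ctx above. */
    assert(c->enter == 0 && "Context was active when freed!");
    free(c);
  }
}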
static int cuda_extcopy(gpudata *input, size_t ioff, gpudata *output,
                        size_t ooff, int intype, int outtype,
                        unsigned int a_nd, const size_t *a_dims,
                        const ssize_t *a_str, unsigned int b_nd,
                        const size_t *b_dims, const ssize_t *b_str) {
  cuda_context *ctx = input->ctx;
  void *args[2];
  int res = GA_SYS_ERROR;
  unsigned int i;
  size_t nEls = 1, ls, gs;
  gpukernel *k;
  extcopy_args a, *aa;

  ASSERT_BUF(input);
  ASSERT_BUF(output);

  if (input->ctx != output->ctx)
    return GA_INVALID_ERROR;

  for (i = 0; i < a_nd; i++) {
    nEls *= a_dims[i];
  }
  if (nEls == 0) return GA_NO_ERROR;

  a.ind = a_nd;
  a.ond = b_nd;
  a.itype = intype;
  a.otype = outtype;
  a.ioff = ioff;
  a.ooff = ooff;
  a.idims = a_dims;
  a.odims = b_dims;
  a.istr = a_str;
  a.ostr = b_str;

  k = cache_get(ctx->extcopy_cache, &a);
  if (k == NULL) {
    res = gen_extcopy_kernel(&a, input->ctx, &k, nEls);
    if (res != GA_NO_ERROR)
      return res;

    /* Cache the kernel */
    aa = memdup(&a, sizeof(a));
    if (aa == NULL)
      goto done;
    aa->idims = memdup(a_dims, a_nd*sizeof(size_t));
    aa->odims = memdup(b_dims, b_nd*sizeof(size_t));
    aa->istr = memdup(a_str, a_nd*sizeof(ssize_t));
    aa->ostr = memdup(b_str, b_nd*sizeof(ssize_t));
    if (aa->idims == NULL || aa->odims == NULL ||
        aa->istr == NULL || aa->ostr == NULL) {
      extcopy_free(aa);
      goto done;
    }
    /* One ref is given to the cache, we manage the other */
    cuda_retainkernel(k);
    cache_add(ctx->extcopy_cache, aa, k);
  } else {
    /* This is our reference */
    cuda_retainkernel(k);
  }
done:
  /* Cheap kernel scheduling */
  res = cuda_property(NULL, NULL, k, GA_KERNEL_PROP_MAXLSIZE, &ls);
  if (res != GA_NO_ERROR) goto fail;

  gs = ((nEls - 1) / ls) + 1; /* ceiling division: enough groups to cover nEls */
  args[0] = input;
  args[1] = output;
  res = cuda_callkernel(k, 1, &ls, &gs, 0, args);

fail:
  /* We free our reference here */
  cuda_freekernel(k);
  return res;
}
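/*
 * Hedged usage sketch for cuda_extcopy: copy a 2x3 float buffer into a
 * double buffer, both C-contiguous and starting at offset 0.  `in` and
 * `out` are assumed to be gpudata buffers already allocated in the same
 * context, strides are in bytes, and GA_FLOAT/GA_DOUBLE are the gpuarray
 * element-type codes.  This helper is illustrative only and is not part
 * of this file.
 */
static int example_extcopy_f32_to_f64(gpudata *in, gpudata *out) {
  size_t dims[2] = {2, 3};
  ssize_t istr[2] = {3 * sizeof(float), sizeof(float)};   /* row-major */
  ssize_t ostr[2] = {3 * sizeof(double), sizeof(double)}; /* row-major */
  return cuda_extcopy(in, 0, out, 0, GA_FLOAT, GA_DOUBLE,
                      2, dims, istr, 2, dims, ostr);
}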