static int cuda_move(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, size_t sz) { cuda_context *ctx = dst->ctx; int res = GA_NO_ERROR; ASSERT_BUF(dst); ASSERT_BUF(src); if (src->ctx != dst->ctx) return GA_VALUE_ERROR; if (sz == 0) return GA_NO_ERROR; if ((dst->sz - dstoff) < sz || (src->sz - srcoff) < sz) return GA_VALUE_ERROR; cuda_enter(ctx); cuda_wait(src, CUDA_WAIT_READ); cuda_wait(dst, CUDA_WAIT_WRITE); ctx->err = cuMemcpyDtoDAsync(dst->ptr + dstoff, src->ptr + srcoff, sz, ctx->s); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } cuda_record(src, CUDA_WAIT_READ); cuda_record(dst, CUDA_WAIT_WRITE); cuda_exit(ctx); return res; }
gpudata *cuda_make_buf(void *c, CUdeviceptr p, size_t sz) { cuda_context *ctx = (cuda_context *)c; gpudata *res; int flags = CU_EVENT_DISABLE_TIMING; res = malloc(sizeof(*res)); if (res == NULL) return NULL; res->refcnt = 1; cuda_enter(ctx); if (ctx->err != CUDA_SUCCESS) { free(res); return NULL; } res->ptr = p; if (ctx->flags & GA_CTX_MULTI_THREAD) flags |= CU_EVENT_BLOCKING_SYNC; ctx->err = cuEventCreate(&res->ev, flags); if (ctx->err != CUDA_SUCCESS) { free(res); cuda_exit(ctx); return NULL; } res->sz = sz; res->flags = DONTFREE; res->ctx = ctx; ctx->refcnt++; cuda_exit(ctx); TAG_BUF(res); return res; }
static int cuda_write(gpudata *dst, size_t dstoff, const void *src, size_t sz) { cuda_context *ctx = dst->ctx; ASSERT_BUF(dst); if (sz == 0) return GA_NO_ERROR; if ((dst->sz - dstoff) < sz) return GA_VALUE_ERROR; cuda_enter(ctx); if (dst->flags & CUDA_MAPPED_PTR) { ctx->err = cuEventSynchronize(dst->rev); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } memcpy((void *)(dst->ptr + dstoff), src, sz); } else { cuda_waits(dst, CUDA_WAIT_WRITE, ctx->mem_s); ctx->err = cuMemcpyHtoDAsync(dst->ptr + dstoff, src, sz, ctx->mem_s); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } cuda_records(dst, CUDA_WAIT_WRITE, ctx->mem_s); } cuda_exit(ctx); return GA_NO_ERROR; }
static int cuda_read(void *dst, gpudata *src, size_t srcoff, size_t sz) { cuda_context *ctx = src->ctx; ASSERT_BUF(src); if (sz == 0) return GA_NO_ERROR; if ((src->sz - srcoff) < sz) return GA_VALUE_ERROR; cuda_enter(ctx); if (src->flags & CUDA_MAPPED_PTR) { ctx->err = cuEventSynchronize(src->wev); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } memcpy(dst, (void *)(src->ptr + srcoff), sz); } else { cuda_waits(src, CUDA_WAIT_READ, ctx->mem_s); ctx->err = cuMemcpyDtoHAsync(dst, src->ptr + srcoff, sz, ctx->mem_s); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } cuda_records(src, CUDA_WAIT_READ, ctx->mem_s); } cuda_exit(ctx); return GA_NO_ERROR; }
static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, double beta, gpudata *C, size_t offC, size_t ldc) { cuda_context *ctx = A->ctx; gpudata *T; size_t t; cublasStatus_t err; cb_transpose transT; ASSERT_BUF(A); ASSERT_BUF(B); ASSERT_BUF(C); if (order == cb_c) { /* swap A and B */ t = N; N = M; M = t; T = A; A = B; B = T; t = lda; lda = ldb; ldb = t; transT = transA; transA = transB; transB = transT; t = offA; offA = offB; offB = t; } cuda_enter(ctx); cuda_wait(A, CUDA_WAIT_READ); cuda_wait(B, CUDA_WAIT_READ); cuda_wait(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE); err = cublasDgemm(((blas_handle *)ctx->blas_handle)->h, convT(transA), convT(transB), M, N, K, &alpha, ((double *)A->ptr) + offA, lda, ((double *)B->ptr) + offB, ldb, &beta, ((double *)C->ptr) + offC, ldc); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR; return GA_BLAS_ERROR; } cuda_record(A, CUDA_WAIT_READ); cuda_record(B, CUDA_WAIT_READ); cuda_record(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE); cuda_exit(ctx); return GA_NO_ERROR; }
static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; gpudata *td; size_t t; ASSERT_BUF(X); ASSERT_BUF(Y); ASSERT_BUF(A); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) || LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) return GA_XLARGE_ERROR; if (order == cb_c) { t = M; M = N; N = t; t = offX; offX = offY; offY = t; t = incX; incX = incY; incY = t; td = X; X = Y; Y = td; } cuda_enter(ctx); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_ALL)); h->err = cublasDger(h->h, M, N, &alpha, ((double *)X->ptr) + offX, incX, ((double *)Y->ptr) + offY, incY, ((double *)A->ptr) + offA, lda); if (h->err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR; return GA_BLAS_ERROR; } GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_ALL)); cuda_exit(ctx); return GA_NO_ERROR; }
static gpudata *cuda_transfer(gpudata *src, size_t offset, size_t sz, void *dst_c, int may_share) { cuda_context *ctx = src->ctx; cuda_context *dst_ctx = (cuda_context *)dst_c; gpudata *dst; ASSERT_BUF(src); ASSERT_CTX(ctx); ASSERT_CTX(dst_ctx); if (ctx == dst_ctx) { if (may_share && offset == 0) { cuda_retain(src); return src; } dst = cuda_alloc(ctx, sz, NULL, 0, NULL); if (dst == NULL) return NULL; cuda_enter(ctx); cuda_wait(src, CUDA_WAIT_READ); cuda_wait(dst, CUDA_WAIT_WRITE); ctx->err = cuMemcpyDtoDAsync(dst->ptr, src->ptr+offset, sz, ctx->s); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); cuda_free(dst); return NULL; } cuda_record(src, CUDA_WAIT_READ); cuda_record(dst, CUDA_WAIT_WRITE); cuda_exit(ctx); return dst; } dst = cuda_alloc(dst_ctx, sz, NULL, 0, NULL); if (dst == NULL) return NULL; cuda_enter(ctx); cuda_waits(src, CUDA_WAIT_READ, dst_ctx->mem_s); cuda_waits(dst, CUDA_WAIT_WRITE, dst_ctx->mem_s); ctx->err = cuMemcpyPeerAsync(dst->ptr, dst->ctx->ctx, src->ptr+offset, src->ctx->ctx, sz, dst_ctx->mem_s); if (ctx->err != CUDA_SUCCESS) { cuda_free(dst); cuda_exit(ctx); return NULL; } cuda_records(dst, CUDA_WAIT_WRITE, dst_ctx->mem_s); cuda_records(src, CUDA_WAIT_READ, dst_ctx->mem_s); cuda_exit(ctx); return dst; }
static gpudata *cuda_alloc(void *c, size_t size, void *data, int flags, int *ret) { gpudata *res; cuda_context *ctx = (cuda_context *)c; int fl = CU_EVENT_DISABLE_TIMING; if ((flags & GA_BUFFER_INIT) && data == NULL) FAIL(NULL, GA_VALUE_ERROR); if ((flags & (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY)) == (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY)) FAIL(NULL, GA_VALUE_ERROR); /* TODO: figure out how to make this work */ if (flags & GA_BUFFER_HOST) FAIL(NULL, GA_DEVSUP_ERROR); res = malloc(sizeof(*res)); if (res == NULL) FAIL(NULL, GA_SYS_ERROR); res->refcnt = 1; res->sz = size; res->flags = flags & (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY); cuda_enter(ctx); if (ctx->err != CUDA_SUCCESS) { free(res); FAIL(NULL, GA_IMPL_ERROR); } if (ctx->flags & GA_CTX_MULTI_THREAD) fl |= CU_EVENT_BLOCKING_SYNC; ctx->err = cuEventCreate(&res->ev, fl); if (ctx->err != CUDA_SUCCESS) { free(res); cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } if (size == 0) size = 1; ctx->err = cuMemAlloc(&res->ptr, size); if (ctx->err != CUDA_SUCCESS) { cuEventDestroy(res->ev); free(res); cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } res->ctx = ctx; ctx->refcnt++; if (flags & GA_BUFFER_INIT) { ctx->err = cuMemcpyHtoD(res->ptr, data, size); if (ctx->err != CUDA_SUCCESS) { cuda_free(res); FAIL(NULL, GA_IMPL_ERROR) }
static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, double beta, gpudata *Y, size_t offY, int incY) { cuda_context *ctx = A->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; size_t t; ASSERT_BUF(A); ASSERT_BUF(X); ASSERT_BUF(Y); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) || LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) return GA_XLARGE_ERROR; if (order == cb_c) { t = N; N = M; M = t; if (transA == cb_no_trans) { transA = cb_trans; } else { transA = cb_no_trans; } } cuda_enter(ctx); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_ALL)); h->err = cublasDgemv(h->h, convT(transA), M, N, &alpha, ((double *)A->ptr) + offA, lda, ((double *)X->ptr) + offX, incX, &beta, ((double *)Y->ptr) + offY, incY); if (h->err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR; return GA_BLAS_ERROR; } GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_ALL)); cuda_exit(ctx); return GA_NO_ERROR; }
static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cuda_context *ctx = X->ctx; gpudata *td; size_t t; cublasStatus_t err; ASSERT_BUF(X); ASSERT_BUF(Y); ASSERT_BUF(A); if (order == cb_c) { t = M; M = N; N = t; t = offX; offX = offY; offY = t; t = incX; incX = incY; incY = t; td = X; X = Y; Y = td; } cuda_enter(ctx); cuda_wait(X, CUDA_WAIT_READ); cuda_wait(Y, CUDA_WAIT_READ); cuda_wait(A, CUDA_WAIT_READ|CUDA_WAIT_WRITE); err = cublasDger(((blas_handle *)ctx->blas_handle)->h, M, N, &alpha, ((double *)X->ptr) + offX, incX, ((double *)Y->ptr) + offY, incY, ((double *)A->ptr) + offA, lda); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR; return GA_BLAS_ERROR; } cuda_record(X, CUDA_WAIT_READ); cuda_record(Y, CUDA_WAIT_READ); cuda_record(A, CUDA_WAIT_READ|CUDA_WAIT_WRITE); cuda_exit(ctx); return GA_NO_ERROR; }
static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, double beta, gpudata *Y, size_t offY, int incY) { cuda_context *ctx = A->ctx; cublasStatus_t err; size_t t; ASSERT_BUF(A); ASSERT_BUF(X); ASSERT_BUF(Y); if (order == cb_c) { t = N; N = M; M = t; if (transA == cb_no_trans) { transA = cb_trans; } else { transA = cb_no_trans; } } cuda_enter(ctx); cuda_wait(A, CUDA_WAIT_READ); cuda_wait(X, CUDA_WAIT_READ); cuda_wait(Y, CUDA_WAIT_READ|CUDA_WAIT_WRITE); err = cublasDgemv(((blas_handle *)ctx->blas_handle)->h, convT(transA), M, N, &alpha, ((double *)A->ptr) + offA, lda, ((double *)X->ptr) + offX, incX, &beta, ((double *)Y->ptr) + offY, incY); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR; return GA_BLAS_ERROR; } cuda_record(A, CUDA_WAIT_READ); cuda_record(X, CUDA_WAIT_READ); cuda_record(Y, CUDA_WAIT_READ|CUDA_WAIT_WRITE); cuda_exit(ctx); return GA_NO_ERROR; }
static int cuda_callkernel(gpukernel *k, unsigned int n, const size_t *bs, const size_t *gs, size_t shared, void **args) { cuda_context *ctx = k->ctx; unsigned int i; ASSERT_KER(k); cuda_enter(ctx); for (i = 0; i < k->argcount; i++) { if (k->types[i] == GA_BUFFER) { /* We don't have any better info for now */ cuda_wait((gpudata *)args[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); } } switch (n) { case 1: ctx->err = cuLaunchKernel(k->k, gs[0], 1, 1, bs[0], 1, 1, shared, ctx->s, args, NULL); break; case 2: ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, bs[0], bs[1], 1, shared, ctx->s, args, NULL); break; case 3: ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], gs[2], bs[0], bs[1], bs[2], shared, ctx->s, args, NULL); break; default: cuda_exit(ctx); return GA_VALUE_ERROR; } if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } for (i = 0; i < k->argcount; i++) { if (k->types[i] == GA_BUFFER) { /* We don't have any better info for now */ cuda_record((gpudata *)args[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); } } cuda_exit(ctx); return GA_NO_ERROR; }
/** * \brief NCCL implementation of \ref gpucomm_new. */ static int comm_new(gpucomm **comm_ptr, gpucontext *ctx, gpucommCliqueId comm_id, int ndev, int rank) { gpucomm *comm; ncclResult_t err; ASSERT_CTX(ctx); GA_CHECK(setup_lib(ctx->err)); comm = calloc(1, sizeof(*comm)); // Allocate memory if (comm == NULL) { *comm_ptr = NULL; // Set to NULL if failed return error_sys(ctx->err, "calloc"); } comm->ctx = (cuda_context *)ctx; // convert to underlying cuda context // So that context would not be destroyed before communicator comm->ctx->refcnt++; cuda_enter(comm->ctx); // Use device err = ncclCommInitRank(&comm->c, ndev, *((ncclUniqueId *)&comm_id), rank); cuda_exit(comm->ctx); TAG_COMM(comm); if (err != ncclSuccess) { *comm_ptr = NULL; // Set to NULL if failed comm_clear(comm); return error_nccl(ctx->err, "ncclCommInitRank", err); } *comm_ptr = comm; return GA_NO_ERROR; }
/** * \brief NCCL implementation of \ref gpucomm_broadcast. */ static int broadcast(gpudata *array, size_t offset, size_t count, int typecode, int root, gpucomm *comm) { // need dummy init so that compiler shuts up ncclDataType_t datatype = ncclNumTypes; int rank = 0; cuda_context *ctx; ASSERT_BUF(array); ASSERT_COMM(comm); GA_CHECK(check_restrictions(array, offset, NULL, 0, count, typecode, 0, comm, &datatype, NULL)); GA_CHECK(get_rank(comm, &rank)); ctx = comm->ctx; cuda_enter(ctx); // sync: wait till a write has finished (out of concurrent kernels) if (rank == root) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(array, CUDA_WAIT_READ)); else GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(array, CUDA_WAIT_WRITE)); // change stream of nccl ops to enable concurrency NCCL_EXIT_ON_ERROR(ctx, ncclBcast((void *)(array->ptr + offset), count, datatype, root, comm->c, ctx->s)); if (rank == root) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(array, CUDA_WAIT_READ)); else GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(array, CUDA_WAIT_WRITE)); cuda_exit(ctx); return GA_NO_ERROR; }
/** * \brief NCCL implementation of \ref gpucomm_all_reduce. */ static int all_reduce(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, int opcode, gpucomm *comm) { // need dummy init so that compiler shuts up ncclRedOp_t op = ncclNumOps; ncclDataType_t datatype = ncclNumTypes; cuda_context *ctx; ASSERT_BUF(src); ASSERT_COMM(comm); ASSERT_BUF(dest); GA_CHECK(check_restrictions(src, offsrc, dest, offdest, count, typecode, opcode, comm, &datatype, &op)); ctx = comm->ctx; cuda_enter(ctx); // sync: wait till a write has finished (out of concurrent kernels) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(src, CUDA_WAIT_READ)); // sync: wait till a read/write has finished (out of concurrent kernels) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE)); // change stream of nccl ops to enable concurrency NCCL_EXIT_ON_ERROR(ctx, ncclAllReduce((void *)(src->ptr + offsrc), (void *)(dest->ptr + offdest), count, datatype, op, comm->c, ctx->s)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE)); cuda_exit(ctx); return GA_NO_ERROR; }
/** * \brief NCCL implementation of \ref gpucomm_free. */ static void comm_free(gpucomm *comm) { ASSERT_COMM(comm); cuda_enter(comm->ctx); ncclCommDestroy(comm->c); cuda_exit(comm->ctx); comm_clear(comm); }
static void deallocate(gpudata *d) { cuda_enter(d->ctx); cuEventDestroy(d->rev); cuEventDestroy(d->wev); cuda_exit(d->ctx); CLEAR(d); free(d); }
static int cuda_records(gpudata *a, int flags, CUstream s) { ASSERT_BUF(a); cuda_enter(a->ctx); if (flags & CUDA_WAIT_READ) a->ctx->err = cuEventRecord(a->rev, s); if (flags & CUDA_WAIT_WRITE) a->ctx->err = cuEventRecord(a->wev, s); cuda_exit(a->ctx); return GA_NO_ERROR; }
/* * Allocate a new block and place in on the freelist. Will allocate * the bigger of the requested size and BLOCK_SIZE to avoid allocating * multiple small blocks. */ static int allocate(cuda_context *ctx, gpudata **res, gpudata **prev, size_t size) { CUdeviceptr ptr; gpudata *next; *prev = NULL; if (!(ctx->flags & GA_CTX_DISABLE_ALLOCATION_CACHE)) if (size < BLOCK_SIZE) size = BLOCK_SIZE; cuda_enter(ctx); ctx->err = cuMemAlloc(&ptr, size); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } *res = new_gpudata(ctx, ptr, size); cuda_exit(ctx); if (*res == NULL) { cuMemFree(ptr); return GA_MEMORY_ERROR; } (*res)->flags |= CUDA_HEAD_ALLOC; /* Now that the block is allocated, enter it in the freelist */ next = ctx->freeblocks; for (; next && next->ptr < (*res)->ptr; next = next->next) { *prev = next; } (*res)->next = next; if (*prev) (*prev)->next = *res; else ctx->freeblocks = *res; return GA_NO_ERROR; }
static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size) { gpudata *res; int fl = CU_EVENT_DISABLE_TIMING; res = malloc(sizeof(*res)); if (res == NULL) return NULL; res->refcnt = 0; res->sz = size; res->flags = 0; cuda_enter(ctx); if (ctx->flags & GA_CTX_MULTI_THREAD) fl |= CU_EVENT_BLOCKING_SYNC; ctx->err = cuEventCreate(&res->rev, fl); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); free(res); return NULL; } ctx->err = cuEventCreate(&res->wev, fl); if (ctx->err != CUDA_SUCCESS) { cuEventDestroy(res->rev); cuda_exit(ctx); free(res); return NULL; } cuda_exit(ctx); res->ptr = ptr; res->next = NULL; res->ctx = ctx; TAG_BUF(res); return res; }
static int cuda_memset(gpudata *dst, size_t dstoff, int data) { cuda_context *ctx = dst->ctx; ASSERT_BUF(dst); if ((dst->sz - dstoff) == 0) return GA_NO_ERROR; cuda_enter(ctx); cuda_wait(dst, CUDA_WAIT_WRITE); ctx->err = cuMemsetD8Async(dst->ptr + dstoff, data, dst->sz - dstoff, ctx->s); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } cuda_record(dst, CUDA_WAIT_WRITE); cuda_exit(ctx); return GA_NO_ERROR; }
static int cuda_waits(gpudata *a, int flags, CUstream s) { ASSERT_BUF(a); /* If others are only reads, no need to wait */ cuda_enter(a->ctx); if (flags & CUDA_WAIT_READ) { /* We wait for writes that happened before since multiple reads at * the same time are fine */ a->ctx->err = cuStreamWaitEvent(s, a->wev, 0); if (a->ctx->err != CUDA_SUCCESS) { cuda_exit(a->ctx); return GA_IMPL_ERROR; } } if (flags & CUDA_WAIT_WRITE) { /* Make sure to not disturb previous reads */ a->ctx->err = cuStreamWaitEvent(s, a->rev, 0); if (a->ctx->err != CUDA_SUCCESS) { cuda_exit(a->ctx); return GA_IMPL_ERROR; } } cuda_exit(a->ctx); return GA_NO_ERROR; }
static int cuda_sync(gpudata *b) { cuda_context *ctx = (cuda_context *)b->ctx; int err = GA_NO_ERROR; ASSERT_BUF(b); cuda_enter(ctx); ctx->err = cuEventSynchronize(b->wev); if (ctx->err != CUDA_SUCCESS) err = GA_IMPL_ERROR; ctx->err = cuEventSynchronize(b->rev); if (ctx->err != CUDA_SUCCESS) err = GA_IMPL_ERROR; cuda_exit(ctx); return err; }
static void _cuda_freekernel(gpukernel *k) { k->refcnt--; if (k->refcnt == 0) { if (k->ctx != NULL) { cuda_enter(k->ctx); cuModuleUnload(k->m); cuda_exit(k->ctx); cuda_free_ctx(k->ctx); } CLEAR(k); free(k->args); free(k->bin); free(k->types); free(k); } }
/** * \brief NCCL implementation of \ref gpucomm_reduce_scatter. */ static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, int opcode, gpucomm *comm) { // need dummy init so that compiler shuts up ncclRedOp_t op = ncclNumOps; ncclDataType_t datatype = ncclNumTypes; int ndev = 0; size_t resc_size; cuda_context *ctx; ASSERT_BUF(src); ASSERT_COMM(comm); ASSERT_BUF(dest); GA_CHECK(get_count(comm, &ndev)); GA_CHECK(check_restrictions(src, offsrc, NULL, 0, count * ndev, typecode, opcode, comm, &datatype, &op)); if (dest->ctx != comm->ctx) return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination and comm context differ"); resc_size = count * gpuarray_get_elsize(typecode); if ((dest->sz - offdest) < resc_size) return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination too small for operation"); assert(!(offdest > dest->sz)); ctx = comm->ctx; cuda_enter(ctx); // sync: wait till a write has finished (out of concurrent kernels) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(src, CUDA_WAIT_READ)); // sync: wait till a read/write has finished (out of concurrent kernels) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE)); // change stream of nccl ops to enable concurrency NCCL_EXIT_ON_ERROR(ctx, ncclReduceScatter((void *)(src->ptr + offsrc), (void *)(dest->ptr + offdest), count, datatype, op, comm->c, ctx->s)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE)); cuda_exit(ctx); return GA_NO_ERROR; }
static void teardown(void *c) { cuda_context *ctx = (cuda_context *)c; blas_handle *handle = (blas_handle *)ctx->blas_handle; if (ctx->blas_handle == NULL) return; cuda_enter(ctx); cublasDestroy(handle->h); GpuKernel_clear(&handle->sgemvBH_N_a1_b1_small); GpuKernel_clear(&handle->sgemvBH_T_a1_b1_small); GpuKernel_clear(&handle->dgemvBH_N_a1_b1_small); GpuKernel_clear(&handle->dgemvBH_T_a1_b1_small); GpuKernel_clear(&handle->sgerBH_gen_small); GpuKernel_clear(&handle->dgerBH_gen_small); cuda_exit(ctx); free(ctx->blas_handle); ctx->blas_handle = NULL; }
int APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, PyGpuArrayObject *km, cudnnConvolutionDescriptor_t desc, double alpha, double beta, PyGpuArrayObject **kerns, PyGpuContextObject *c) { cudnnStatus_t err = CUDNN_STATUS_SUCCESS; float af = alpha, bf = beta; void *alpha_p; void *beta_p; if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) { PyErr_SetString(PyExc_ValueError, "GpuDnnConv images and kernel must have the same stack size"); return 1; } if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1) return 1; if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1) return 1; switch (input->ga.typecode) { case GA_DOUBLE: alpha_p = (void *)α beta_p = (void *)β break; case GA_FLOAT: case GA_HALF: alpha_p = (void *)⁡ beta_p = (void *)&bf; break; default: PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution"); return 1; } #ifdef CONV_INPLACE Py_XDECREF(*kerns); *kerns = km; Py_INCREF(*kerns); #else if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km), km->ga.typecode, GA_C_ORDER, c) != 0) return 1; if (beta != 0.0 && pygpu_move(*kerns, km)) return 1; #endif if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) return 1; cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO; cuda_enter(c->ctx); #ifdef CHOOSE_ALGO static int reuse_algo = 0; static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO; #ifndef CHOOSE_ONCE static size_t prev_img_dims[5] = {0}; static size_t prev_top_dims[5] = {0}; reuse_algo = 1; for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]); reuse_algo = (reuse_algo && PyGpuArray_DIM(output, i) == prev_top_dims[i]); } #endif if (!reuse_algo) { #ifdef CHOOSE_TIME int count; cudnnConvolutionBwdFilterAlgoPerf_t choice; err = cudnnFindConvolutionBackwardFilterAlgorithm( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), 1, &count, &choice); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } algo = choice.algo; #else size_t free = 0, total = 0; cudaError_t err2 = cudaMemGetInfo(&free, &total); if (err2 != cudaSuccess){ cudaGetLastError(); PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory " "information on the GPU: %s\n", cudaGetErrorString(err2)); cuda_exit(c->ctx); return 1; } err = cudnnGetConvolutionBackwardFilterAlgorithm( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } #endif prev_algo = algo; } else { algo = prev_algo; } #ifdef CHOOSE_ONCE reuse_algo = 1; #else for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { prev_img_dims[i] = PyGpuArray_DIM(input, i); prev_top_dims[i] = PyGpuArray_DIM(output, i); } #endif #endif // The FFT implementation does not support strides, 1x1 filters or inputs // with a spatial dimension larger than 1024. // If the chosen implementation is FFT, validate that it can // be used on the current data and default to a safe implementation if it // can't. // The following code is 2d-specific but it is fine as FFT and tiled-FFT are // defined only for 2d filters if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT && PyGpuArray_NDIM(input) == 4) { // Extract the properties of the convolution descriptor int nd; int pad[2]; int stride[2]; int upscale[2]; cudnnConvolutionMode_t mode; cudnnDataType_t data_type; err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride, upscale, &mode, &data_type); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } if (stride[0] != 1 || stride[1] != 1 || PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 || (PyGpuArray_DIM(*kerns, 2) == 1 && PyGpuArray_DIM(*kerns, 3) == 1)) { algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; } } size_t worksize; gpudata *workspace; err = cudnnGetConvolutionBackwardFilterWorkspaceSize( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), algo, &worksize); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } if (worksize != 0) { workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL); if (workspace == NULL) { PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory"); cuda_exit(c->ctx); return 1; } } cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); err = cudnnConvolutionBackwardFilter_v3( APPLY_SPECIFIC(_handle), alpha_p, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize, beta_p, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns)); if (worksize != 0) c->ops->buffer_release(workspace); cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_exit(c->ctx); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", cudnnGetErrorString(err)); return 1; } return 0; }
int APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, PyGpuArrayObject *om, cudnnConvolutionDescriptor_t desc, double alpha, double beta, PyGpuArrayObject **output, PARAMS_TYPE* params) { PyGpuContextObject *c = input->context; void *alpha_p; void *beta_p; float af = alpha, bf = beta; cudnnStatus_t err = CUDNN_STATUS_SUCCESS; if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) { PyErr_SetString(PyExc_ValueError, "images and kernel must have the same stack size"); return 1; } if ((PyGpuArray_DIMS(kerns)[0] % params->num_groups) != 0) { PyErr_SetString(PyExc_ValueError, "Number of filters must be divisible by number of groups"); return 1; } switch (input->ga.typecode) { case GA_DOUBLE: alpha_p = (void *)α beta_p = (void *)β break; case GA_FLOAT: case GA_HALF: alpha_p = (void *)⁡ beta_p = (void *)&bf; break; default: PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution"); return 1; } if (params->inplace) { Py_XDECREF(*output); *output = om; Py_INCREF(*output); } else { if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om), om->ga.typecode, GA_C_ORDER, c) != 0) return 1; if (beta != 0.0 && pygpu_move(*output, om)) return 1; } if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) { int err2 = GpuArray_memset(&(*output)->ga, 0); if (err2 != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuDnnConv could not fill the output with zeros: %d", err2); return 1; } return 0; } if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), params->num_groups) == -1) return 1; if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1) return 1; if (c_set_tensor_for_conv(*output, APPLY_SPECIFIC(output), params->num_groups) == -1) return 1; size_t input_offset = PyGpuArray_STRIDE(input, 0) / params->num_groups; size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / params->num_groups; size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups; cudnnConvolutionFwdAlgo_t algo = params->conv_algo; #ifdef DEBUG char algorithm_name[128]; #endif cuda_enter(c->ctx); if (params->choose_algo) { if (!params->choose_once) { reuse_algo = 1; for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]); reuse_algo = (reuse_algo && PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]); } } if (!reuse_algo) { size_t free; int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free); if (err2 != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "Error when trying to find the " "memory information on the GPU"); cuda_exit(c->ctx); return 1; } // Guess 4Mb if the info is not available if (free == 0) free = 4 * 1024 * 1024; if (params->choose_time) { int count; cudnnConvolutionFwdAlgoPerf_t choice; gpudata *tmpmem; tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL); if (tmpmem == NULL) { PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory"); return -1; } // We don't sync the buffer as we don't care about the values. err = cudnnFindConvolutionForwardAlgorithmEx( params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output), 1, &count, &choice, *(void **)tmpmem, free); gpudata_release(tmpmem); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } algo = choice.algo; #ifdef DEBUG if (count == 0) { PyErr_SetString(PyExc_RuntimeError, "No best-timed conv fwd algorithm found"); return 1; } else if (choice.status != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting best-timed FWD algo: %s", cudnnGetErrorString(choice.status)); return 1; } // Else, count is necessarly 1 for current implementation. #endif } else { err = cudnnGetConvolutionForwardAlgorithm( params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } } prev_algo = algo; } else { algo = prev_algo; } #ifdef DEBUG if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) return 1; // NB: This is printed only when algorithm is chosen at runtime. if (reuse_algo) fprintf(stderr, "(reused %s)\n", algorithm_name); else fprintf(stderr, "(using %s)\n", algorithm_name); #endif if (params->choose_once) { reuse_algo = 1; } else { for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { prev_img_dims[i] = PyGpuArray_DIM(input, i); prev_kern_dims[i] = PyGpuArray_DIM(kerns, i); } } } /* Only these algos are supported for 3d conv with cuDNN >= V5.1. */ if (PyGpuArray_NDIM(input) == 5 && !(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM || algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM || algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING)) { #ifdef DEBUG if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) return 1; fprintf(stderr, "(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name); #endif algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } // Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1. // Issue should be resolved for cuDNN > V6.0. if (cudnnGetVersion() < 6100 && algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM && PyGpuArray_DIM(input, 0) > 65536) { #ifdef DEBUG fprintf(stderr, "(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM " "will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n"); #endif algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } // The FFT implementation does not support strides, 1x1 filters or inputs // with a spatial dimension larger than 1024. The tiled-FFT implementation // does not support strides. // If the chosen implementation is FFT or tiled-FFT, validate that it can // be used on the current data and default to a safe implementation if it // can't. // The following code is 2d-specific but it is fine as FFT and tiled-FFT are // defined only for 2d filters /* NB: TODO: These checkings seems outdated for FFT algorithms with cuDNN >= 5.1. New conditions apply and may depend on number of dimensions (2D or 3D) e.g. for FFT_TILING. TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms? */ if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT || algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) { // Extract the properties of the convolution descriptor int nd; int pad[2]; int stride[2]; int dilation[2]; cudnnConvolutionMode_t mode; cudnnDataType_t data_type; err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, dilation, &mode, &data_type); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) { if (stride[0] != 1 || stride[1] != 1 || PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 || (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) { algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } } else { // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING if (stride[0] != 1 || stride[1] != 1) { algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } } } { size_t worksize; gpudata *workspace; err = cudnnGetConvolutionForwardWorkspaceSize(params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), algo, &worksize); if (err == CUDNN_STATUS_NOT_SUPPORTED) { // Fallback to none algo if not supported #ifdef DEBUG if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) return 1; fprintf(stderr, "(%s error getting worksize: " "fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name); #endif algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; err = cudnnGetConvolutionForwardWorkspaceSize(params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), algo, &worksize); } if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } /* * This is less than ideal since we need to free it after (which * introduces a synchronization point. But we don't have a module * to place a nice get_work_mem() function in. */ if (worksize != 0) { workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); if (workspace == NULL) { PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory"); cuda_exit(c->ctx); return 1; } } cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); for ( int g = 0; g < params->num_groups; g++) { err = cudnnConvolutionForward( params->handle, alpha_p, APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(input)) + input_offset * g, APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(kerns)) + kern_offset * g, desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize, beta_p, APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(*output)) + output_offset * g); } if (worksize != 0) gpudata_release(workspace); cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); } cuda_exit(c->ctx); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", cudnnGetErrorString(err)); return 1; } return 0; }
static gpukernel *cuda_newkernel(void *c, unsigned int count, const char **strings, const size_t *lengths, const char *fname, unsigned int argcount, const int *types, int flags, int *ret, char **err_str) { cuda_context *ctx = (cuda_context *)c; strb sb = STRB_STATIC_INIT; char *bin, *log = NULL; srckey k, *ak; binval *av; gpukernel *res; size_t bin_len = 0, log_len = 0; CUdevice dev; unsigned int i; int ptx_mode = 0; int binary_mode = 0; int major, minor; if (count == 0) FAIL(NULL, GA_VALUE_ERROR); if (flags & GA_USE_OPENCL) FAIL(NULL, GA_DEVSUP_ERROR); if (flags & GA_USE_BINARY) { // GA_USE_BINARY is exclusive if (flags & ~GA_USE_BINARY) FAIL(NULL, GA_INVALID_ERROR); // We need the length for binary data and there is only one blob. if (count != 1 || lengths == NULL || lengths[0] == 0) FAIL(NULL, GA_VALUE_ERROR); } cuda_enter(ctx); ctx->err = cuCtxGetDevice(&dev); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } ctx->err = cuDeviceComputeCapability(&major, &minor, dev); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } // GA_USE_CLUDA is done later // GA_USE_SMALL will always work if (flags & GA_USE_DOUBLE) { if (major < 1 || (major == 1 && minor < 3)) { cuda_exit(ctx); FAIL(NULL, GA_DEVSUP_ERROR); } } if (flags & GA_USE_COMPLEX) { // just for now since it is most likely broken cuda_exit(ctx); FAIL(NULL, GA_DEVSUP_ERROR); } // GA_USE_HALF should always work if (flags & GA_USE_PTX) { ptx_mode = 1; } else if (flags & GA_USE_BINARY) { binary_mode = 1; } if (binary_mode) { bin = memdup(strings[0], lengths[0]); bin_len = lengths[0]; if (bin == NULL) { cuda_exit(ctx); FAIL(NULL, GA_MEMORY_ERROR); } } else { if (flags & GA_USE_CLUDA) { strb_appends(&sb, CUDA_PREAMBLE); } if (lengths == NULL) { for (i = 0; i < count; i++) strb_appends(&sb, strings[i]); } else { for (i = 0; i < count; i++) { if (lengths[i] == 0) strb_appends(&sb, strings[i]); else strb_appendn(&sb, strings[i], lengths[i]); } } strb_append0(&sb); if (strb_error(&sb)) { strb_clear(&sb); cuda_exit(ctx); return NULL; } if (ptx_mode) { bin = sb.s; bin_len = sb.l; } else { bin = NULL; if (compile_cache != NULL) { k.src = sb.s; k.len = sb.l; memcpy(k.arch, ctx->bin_id, BIN_ID_LEN); av = cache_get(compile_cache, &k); if (av != NULL) { bin = memdup(av->bin, av->len); bin_len = av->len; } } if (bin == NULL) { bin = call_compiler(sb.s, sb.l, ctx->bin_id, &bin_len, &log, &log_len, ret); } if (bin == NULL) { if (err_str != NULL) { strb debug_msg = STRB_STATIC_INIT; // We're substituting debug_msg for a string with this first line: strb_appends(&debug_msg, "CUDA kernel build failure ::\n"); /* Delete the final NUL */ sb.l--; gpukernel_source_with_line_numbers(1, (const char **)&sb.s, &sb.l, &debug_msg); if (log != NULL) { strb_appends(&debug_msg, "\nCompiler log:\n"); strb_appendn(&debug_msg, log, log_len); free(log); } *err_str = strb_cstr(&debug_msg); // *err_str will be free()d by the caller (see docs in kernel.h) } strb_clear(&sb); cuda_exit(ctx); return NULL; } if (compile_cache == NULL) compile_cache = cache_twoq(16, 16, 16, 8, src_eq, src_hash, src_free, bin_free); if (compile_cache != NULL) { ak = malloc(sizeof(*ak)); av = malloc(sizeof(*av)); if (ak == NULL || av == NULL) { free(ak); free(av); goto done; } ak->src = memdup(sb.s, sb.l); if (ak->src == NULL) { free(ak); free(av); goto done; } ak->len = sb.l; memmove(ak->arch, ctx->bin_id, BIN_ID_LEN); av->len = bin_len; av->bin = memdup(bin, bin_len); if (av->bin == NULL) { src_free(ak); free(av); goto done; } cache_add(compile_cache, ak, av); } done: strb_clear(&sb); } } res = calloc(1, sizeof(*res)); if (res == NULL) { free(bin); cuda_exit(ctx); FAIL(NULL, GA_SYS_ERROR); } res->bin_sz = bin_len; res->bin = bin; res->refcnt = 1; res->argcount = argcount; res->types = calloc(argcount, sizeof(int)); if (res->types == NULL) { _cuda_freekernel(res); cuda_exit(ctx); FAIL(NULL, GA_MEMORY_ERROR); } memcpy(res->types, types, argcount*sizeof(int)); res->args = calloc(argcount, sizeof(void *)); if (res->args == NULL) { _cuda_freekernel(res); cuda_exit(ctx); FAIL(NULL, GA_MEMORY_ERROR); } ctx->err = cuModuleLoadData(&res->m, bin); if (ctx->err != CUDA_SUCCESS) { _cuda_freekernel(res); cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } ctx->err = cuModuleGetFunction(&res->k, res->m, fname); if (ctx->err != CUDA_SUCCESS) { _cuda_freekernel(res); cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } res->ctx = ctx; ctx->refcnt++; cuda_exit(ctx); TAG_KER(res); return res; }
static int cuda_property(void *c, gpudata *buf, gpukernel *k, int prop_id, void *res) { cuda_context *ctx = NULL; if (c != NULL) { ctx = (cuda_context *)c; ASSERT_CTX(ctx); } else if (buf != NULL) { ASSERT_BUF(buf); ctx = buf->ctx; } else if (k != NULL) { ASSERT_KER(k); ctx = k->ctx; } /* I know that 512 and 1024 are magic numbers. There is an indication in buffer.h, though. */ if (prop_id < 512) { if (ctx == NULL) return GA_VALUE_ERROR; } else if (prop_id < 1024) { if (buf == NULL) return GA_VALUE_ERROR; } else { if (k == NULL) return GA_VALUE_ERROR; } switch (prop_id) { char *s; CUdevice id; int i; size_t sz; case GA_CTX_PROP_DEVNAME: cuda_enter(ctx); ctx->err = cuCtxGetDevice(&id); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } /* 256 is what the CUDA API uses so it's good enough for me */ s = malloc(256); if (s == NULL) { cuda_exit(ctx); return GA_MEMORY_ERROR; } ctx->err = cuDeviceGetName(s, 256, id); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } *((char **)res) = s; cuda_exit(ctx); return GA_NO_ERROR; case GA_CTX_PROP_MAXLSIZE: cuda_enter(ctx); ctx->err = cuCtxGetDevice(&id); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, id); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } *((size_t *)res) = i; cuda_exit(ctx); return GA_NO_ERROR; case GA_CTX_PROP_LMEMSIZE: cuda_enter(ctx); ctx->err = cuCtxGetDevice(&id); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, id); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } *((size_t *)res) = i; cuda_exit(ctx); return GA_NO_ERROR; case GA_CTX_PROP_NUMPROCS: cuda_enter(ctx); ctx->err = cuCtxGetDevice(&id); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, id); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } *((unsigned int *)res) = i; cuda_exit(ctx); return GA_NO_ERROR; case GA_CTX_PROP_MAXGSIZE: cuda_enter(ctx); ctx->err = cuCtxGetDevice(&id); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, id); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } *((size_t *)res) = i; cuda_exit(ctx); return GA_NO_ERROR; case GA_CTX_PROP_BLAS_OPS: #ifdef WITH_CUDA_CUBLAS *((gpuarray_blas_ops **)res) = &cublas_ops; return GA_NO_ERROR; #else *((void **)res) = NULL; return GA_DEVSUP_ERROR; #endif case GA_CTX_PROP_BIN_ID: *((const char **)res) = ctx->bin_id; return GA_NO_ERROR; case GA_CTX_PROP_ERRBUF: *((gpudata **)res) = ctx->errbuf; return GA_NO_ERROR; case GA_CTX_PROP_TOTAL_GMEM: cuda_enter(ctx); ctx->err = cuMemGetInfo(&sz, (size_t *)res); cuda_exit(ctx); return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR; case GA_CTX_PROP_FREE_GMEM: cuda_enter(ctx); ctx->err = cuMemGetInfo((size_t *)res, &sz); cuda_exit(ctx); return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR; case GA_BUFFER_PROP_REFCNT: *((unsigned int *)res) = buf->refcnt; return GA_NO_ERROR; case GA_BUFFER_PROP_SIZE: *((size_t *)res) = buf->sz; return GA_NO_ERROR; case GA_BUFFER_PROP_CTX: case GA_KERNEL_PROP_CTX: *((void **)res) = (void *)ctx; return GA_NO_ERROR; case GA_KERNEL_PROP_MAXLSIZE: cuda_enter(ctx); ctx->err = cuFuncGetAttribute(&i, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, k->k); cuda_exit(ctx); if (ctx->err != CUDA_SUCCESS) return GA_IMPL_ERROR; *((size_t *)res) = i; return GA_NO_ERROR; case GA_KERNEL_PROP_PREFLSIZE: cuda_enter(ctx); ctx->err = cuCtxGetDevice(&id); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_WARP_SIZE, id); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } cuda_exit(ctx); *((size_t *)res) = i; return GA_NO_ERROR; case GA_KERNEL_PROP_NUMARGS: *((unsigned int *)res) = k->argcount; return GA_NO_ERROR; case GA_KERNEL_PROP_TYPES: *((const int **)res) = k->types; return GA_NO_ERROR; default: return GA_INVALID_ERROR; } }