static int cuda_share(gpudata *a, gpudata *b, int *ret) {
  ASSERT_BUF(a);
  ASSERT_BUF(b);
  return (a->ctx == b->ctx && a->sz != 0 && b->sz != 0 &&
          ((a->ptr <= b->ptr && a->ptr + a->sz > b->ptr) ||
           (b->ptr <= a->ptr && b->ptr + b->sz > a->ptr)));
}
static int cuda_move(gpudata *dst, size_t dstoff, gpudata *src,
                     size_t srcoff, size_t sz) {
  cuda_context *ctx = dst->ctx;
  int res = GA_NO_ERROR;

  ASSERT_BUF(dst);
  ASSERT_BUF(src);

  if (src->ctx != dst->ctx) return GA_VALUE_ERROR;

  if (sz == 0) return GA_NO_ERROR;

  if ((dst->sz - dstoff) < sz || (src->sz - srcoff) < sz)
    return GA_VALUE_ERROR;

  cuda_enter(ctx);

  cuda_wait(src, CUDA_WAIT_READ);
  cuda_wait(dst, CUDA_WAIT_WRITE);

  ctx->err = cuMemcpyDtoDAsync(dst->ptr + dstoff, src->ptr + srcoff, sz,
                               ctx->s);
  if (ctx->err != CUDA_SUCCESS) {
    cuda_exit(ctx);
    return GA_IMPL_ERROR;
  }

  cuda_record(src, CUDA_WAIT_READ);
  cuda_record(dst, CUDA_WAIT_WRITE);

  cuda_exit(ctx);
  return res;
}
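/*
 * Usage sketch (illustrative only, not part of the library): copying a
 * region between two buffers that live in the same context.  The buffer
 * names, sizes and offsets below are assumptions for the example.
 *
 *   gpudata *a = cuda_alloc(ctx, 1024, NULL, 0, NULL);
 *   gpudata *b = cuda_alloc(ctx, 1024, NULL, 0, NULL);
 *   // copy 256 bytes from the start of a to offset 128 of b
 *   int err = cuda_move(b, 128, a, 0, 256);
 *   // err == GA_VALUE_ERROR if either range would overflow its buffer
 */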
/**
 * \brief NCCL implementation of \ref gpucomm_all_reduce.
 */
static int all_reduce(gpudata *src, size_t offsrc, gpudata *dest,
                      size_t offdest, size_t count, int typecode,
                      int opcode, gpucomm *comm) {
  // need dummy init so that compiler shuts up
  ncclRedOp_t op = ncclNumOps;
  ncclDataType_t datatype = ncclNumTypes;
  cuda_context *ctx;

  ASSERT_BUF(src);
  ASSERT_COMM(comm);
  ASSERT_BUF(dest);
  GA_CHECK(check_restrictions(src, offsrc, dest, offdest, count, typecode,
                              opcode, comm, &datatype, &op));

  ctx = comm->ctx;
  cuda_enter(ctx);

  // sync: wait till a write has finished (out of concurrent kernels)
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(src, CUDA_WAIT_READ));
  // sync: wait till a read/write has finished (out of concurrent kernels)
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE));

  // change stream of nccl ops to enable concurrency
  NCCL_EXIT_ON_ERROR(ctx, ncclAllReduce((void *)(src->ptr + offsrc),
                                        (void *)(dest->ptr + offdest), count,
                                        datatype, op, comm->c, ctx->s));

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE));

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
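/*
 * Usage sketch (illustrative): every rank participating in `comm` makes
 * the same call with matching count/typecode/opcode.  GA_FLOAT and GA_SUM
 * standing for the element typecode and the generic reduction opcode are
 * assumptions here; check_restrictions() maps them to NCCL equivalents.
 *
 *   all_reduce(src, 0, dst, 0, n, GA_FLOAT, GA_SUM, comm);
 *
 * Once the stream catches up, each rank's dst holds the elementwise
 * reduction over the src buffers of all ranks.
 */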
static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X,
                size_t offX, int incX, gpudata *Y, size_t offY, int incY,
                gpudata *A, size_t offA, size_t lda) {
  cuda_context *ctx = X->ctx;
  blas_handle *h = (blas_handle *)ctx->blas_handle;
  gpudata *td;
  size_t t;

  ASSERT_BUF(X);
  ASSERT_BUF(Y);
  ASSERT_BUF(A);

  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) ||
      LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY))
    return GA_XLARGE_ERROR;

  if (order == cb_c) {
    t = M;
    M = N;
    N = t;
    t = offX;
    offX = offY;
    offY = t;
    t = incX;
    incX = incY;
    incY = t;
    td = X;
    X = Y;
    Y = td;
  }

  cuda_enter(ctx);

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_ALL));

  h->err = cublasDger(h->h, M, N, &alpha,
                      ((double *)X->ptr) + offX, incX,
                      ((double *)Y->ptr) + offY, incY,
                      ((double *)A->ptr) + offA, lda);
  if (h->err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
      return GA_DEVSUP_ERROR;
    return GA_BLAS_ERROR;
  }

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_ALL));

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N,
                 double alpha, gpudata *A, size_t offA, size_t lda,
                 gpudata *X, size_t offX, int incX, double beta,
                 gpudata *Y, size_t offY, int incY) {
  cuda_context *ctx = A->ctx;
  blas_handle *h = (blas_handle *)ctx->blas_handle;
  size_t t;

  ASSERT_BUF(A);
  ASSERT_BUF(X);
  ASSERT_BUF(Y);

  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) ||
      LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY))
    return GA_XLARGE_ERROR;

  if (order == cb_c) {
    t = N;
    N = M;
    M = t;
    if (transA == cb_no_trans) {
      transA = cb_trans;
    } else {
      transA = cb_no_trans;
    }
  }

  cuda_enter(ctx);

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_ALL));

  h->err = cublasDgemv(h->h, convT(transA), M, N, &alpha,
                       ((double *)A->ptr) + offA, lda,
                       ((double *)X->ptr) + offX, incX, &beta,
                       ((double *)Y->ptr) + offY, incY);
  if (h->err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
      return GA_DEVSUP_ERROR;
    return GA_BLAS_ERROR;
  }

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_ALL));

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
static int cuda_read(void *dst, gpudata *src, size_t srcoff, size_t sz) {
  cuda_context *ctx = src->ctx;

  ASSERT_BUF(src);

  if (sz == 0) return GA_NO_ERROR;

  if ((src->sz - srcoff) < sz)
    return GA_VALUE_ERROR;

  cuda_enter(ctx);

  if (src->flags & CUDA_MAPPED_PTR) {
    ctx->err = cuEventSynchronize(src->wev);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    memcpy(dst, (void *)(src->ptr + srcoff), sz);
  } else {
    cuda_waits(src, CUDA_WAIT_READ, ctx->mem_s);
    ctx->err = cuMemcpyDtoHAsync(dst, src->ptr + srcoff, sz, ctx->mem_s);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    cuda_records(src, CUDA_WAIT_READ, ctx->mem_s);
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
static int cuda_write(gpudata *dst, size_t dstoff, const void *src,
                      size_t sz) {
  cuda_context *ctx = dst->ctx;

  ASSERT_BUF(dst);

  if (sz == 0) return GA_NO_ERROR;

  if ((dst->sz - dstoff) < sz)
    return GA_VALUE_ERROR;

  cuda_enter(ctx);

  if (dst->flags & CUDA_MAPPED_PTR) {
    ctx->err = cuEventSynchronize(dst->rev);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    memcpy((void *)(dst->ptr + dstoff), src, sz);
  } else {
    cuda_waits(dst, CUDA_WAIT_WRITE, ctx->mem_s);
    ctx->err = cuMemcpyHtoDAsync(dst->ptr + dstoff, src, sz, ctx->mem_s);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    cuda_records(dst, CUDA_WAIT_WRITE, ctx->mem_s);
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
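/*
 * Usage sketch (illustrative): a host <-> device round trip.  Both copies
 * are asynchronous on ctx->mem_s (unless the buffer is mapped), so the
 * host must synchronize on the buffer before using the result.  `buf` is
 * an assumed, already-allocated buffer of at least 64 bytes.
 *
 *   int32_t in[16] = {0}, out[16];
 *   cuda_write(buf, 0, in, sizeof(in));    // host -> device
 *   cuda_read(out, buf, 0, sizeof(out));   // device -> host
 *   cuda_sync(buf);                        // wait for both events
 */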
/**
 * \brief NCCL implementation of \ref gpucomm_broadcast.
 */
static int broadcast(gpudata *array, size_t offset, size_t count,
                     int typecode, int root, gpucomm *comm) {
  // need dummy init so that compiler shuts up
  ncclDataType_t datatype = ncclNumTypes;
  int rank = 0;
  cuda_context *ctx;

  ASSERT_BUF(array);
  ASSERT_COMM(comm);
  GA_CHECK(check_restrictions(array, offset, NULL, 0, count, typecode, 0,
                              comm, &datatype, NULL));
  GA_CHECK(get_rank(comm, &rank));

  ctx = comm->ctx;
  cuda_enter(ctx);

  // sync: wait till a write has finished (out of concurrent kernels)
  if (rank == root)
    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(array, CUDA_WAIT_READ));
  else
    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(array, CUDA_WAIT_WRITE));

  // change stream of nccl ops to enable concurrency
  NCCL_EXIT_ON_ERROR(ctx, ncclBcast((void *)(array->ptr + offset), count,
                                    datatype, root, comm->c, ctx->s));

  if (rank == root)
    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(array, CUDA_WAIT_READ));
  else
    GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(array, CUDA_WAIT_WRITE));

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
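/*
 * Usage sketch (illustrative): all ranks call broadcast() with the same
 * count, typecode and root.  The root's buffer is only read; every other
 * rank's buffer is overwritten with the root's contents.  GA_FLOAT and
 * the values below are assumptions for the example.
 *
 *   int root = 0;                                // assumed root rank
 *   broadcast(buf, 0, n, GA_FLOAT, root, comm);  // collective call
 */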
static void cuda_free(gpudata *d) {
  /* We ignore errors on free */
  ASSERT_BUF(d);
  d->refcnt--;
  if (d->refcnt == 0) {
    /* Keep a reference to the context since we deallocate the gpudata
     * object */
    cuda_context *ctx = d->ctx;
    if (d->flags & DONTFREE) {
      /* This is the path for "external" buffers */
      deallocate(d);
    } else if (ctx->flags & GA_CTX_DISABLE_ALLOCATION_CACHE) {
      /* Just free the pointer */
      cuMemFree(d->ptr);
      deallocate(d);
    } else {
      /* Find the position in the freelist.  The freelist is kept in
         order of allocation address */
      gpudata *next = d->ctx->freeblocks, *prev = NULL;
      for (; next && next->ptr < d->ptr; next = next->next) {
        prev = next;
      }
      next = prev != NULL ? prev->next : d->ctx->freeblocks;

      /* See if we can merge the block with the previous one */
      if (!(d->flags & CUDA_HEAD_ALLOC) && prev != NULL &&
          prev->ptr + prev->sz == d->ptr) {
        prev->sz = prev->sz + d->sz;
        cuda_wait(d, CUDA_WAIT_ALL);
        cuda_record(prev, CUDA_WAIT_ALL);
        deallocate(d);
        d = prev;
      } else if (prev != NULL) {
        prev->next = d;
      } else {
        d->ctx->freeblocks = d;
      }

      /* See if we can merge with next */
      if (next && !(next->flags & CUDA_HEAD_ALLOC) &&
          d->ptr + d->sz == next->ptr) {
        d->sz = d->sz + next->sz;
        d->next = next->next;
        cuda_wait(next, CUDA_WAIT_ALL);
        cuda_record(d, CUDA_WAIT_ALL);
        deallocate(next);
      } else {
        d->next = next;
      }
    }
    /* We keep this at the end since the freed buffer could be the last
     * reference to the context, and clearing that reference could
     * trigger the freeing of the whole context, including the freelist,
     * which we manipulate. */
    cuda_free_ctx(ctx);
  }
}
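/*
 * Refcounting sketch (illustrative): cuda_retain()/cuda_free() calls must
 * pair up; the block is only released (or merged back into the freelist)
 * once the count drops to zero.
 *
 *   cuda_retain(buf);  // refcnt 1 -> 2, e.g. when handing out a view
 *   cuda_free(buf);    // refcnt 2 -> 1, buffer stays alive
 *   cuda_free(buf);    // refcnt 1 -> 0, block actually freed or cached
 */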
/**
 * \brief NCCL implementation of \ref gpucomm_reduce_scatter.
 */
static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest,
                          size_t offdest, size_t count, int typecode,
                          int opcode, gpucomm *comm) {
  // need dummy init so that compiler shuts up
  ncclRedOp_t op = ncclNumOps;
  ncclDataType_t datatype = ncclNumTypes;
  int ndev = 0;
  size_t resc_size;
  cuda_context *ctx;

  ASSERT_BUF(src);
  ASSERT_COMM(comm);
  ASSERT_BUF(dest);
  GA_CHECK(get_count(comm, &ndev));
  GA_CHECK(check_restrictions(src, offsrc, NULL, 0, count * ndev, typecode,
                              opcode, comm, &datatype, &op));
  if (dest->ctx != comm->ctx)
    return error_set(comm->ctx->err, GA_VALUE_ERROR,
                     "destination and comm context differ");
  resc_size = count * gpuarray_get_elsize(typecode);
  if ((dest->sz - offdest) < resc_size)
    return error_set(comm->ctx->err, GA_VALUE_ERROR,
                     "destination too small for operation");
  assert(!(offdest > dest->sz));

  ctx = comm->ctx;
  cuda_enter(ctx);

  // sync: wait till a write has finished (out of concurrent kernels)
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(src, CUDA_WAIT_READ));
  // sync: wait till a read/write has finished (out of concurrent kernels)
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE));

  // change stream of nccl ops to enable concurrency
  NCCL_EXIT_ON_ERROR(ctx, ncclReduceScatter((void *)(src->ptr + offsrc),
                                            (void *)(dest->ptr + offdest),
                                            count, datatype, op, comm->c,
                                            ctx->s));

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE));

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
static int cuda_records(gpudata *a, int flags, CUstream s) {
  ASSERT_BUF(a);
  cuda_enter(a->ctx);
  if (flags & CUDA_WAIT_READ)
    a->ctx->err = cuEventRecord(a->rev, s);
  if (flags & CUDA_WAIT_WRITE)
    a->ctx->err = cuEventRecord(a->wev, s);
  cuda_exit(a->ctx);
  return GA_NO_ERROR;
}
static gpudata *cuda_transfer(gpudata *src, size_t offset, size_t sz,
                              void *dst_c, int may_share) {
  cuda_context *ctx = src->ctx;
  cuda_context *dst_ctx = (cuda_context *)dst_c;
  gpudata *dst;

  ASSERT_BUF(src);
  ASSERT_CTX(ctx);
  ASSERT_CTX(dst_ctx);

  if (ctx == dst_ctx) {
    if (may_share && offset == 0) {
      cuda_retain(src);
      return src;
    }
    dst = cuda_alloc(ctx, sz, NULL, 0, NULL);
    if (dst == NULL) return NULL;
    cuda_enter(ctx);
    cuda_wait(src, CUDA_WAIT_READ);
    cuda_wait(dst, CUDA_WAIT_WRITE);

    ctx->err = cuMemcpyDtoDAsync(dst->ptr, src->ptr + offset, sz, ctx->s);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      cuda_free(dst);
      return NULL;
    }

    cuda_record(src, CUDA_WAIT_READ);
    cuda_record(dst, CUDA_WAIT_WRITE);
    cuda_exit(ctx);
    return dst;
  }

  dst = cuda_alloc(dst_ctx, sz, NULL, 0, NULL);
  if (dst == NULL) return NULL;
  cuda_enter(ctx);
  cuda_waits(src, CUDA_WAIT_READ, dst_ctx->mem_s);
  cuda_waits(dst, CUDA_WAIT_WRITE, dst_ctx->mem_s);

  ctx->err = cuMemcpyPeerAsync(dst->ptr, dst->ctx->ctx, src->ptr + offset,
                               src->ctx->ctx, sz, dst_ctx->mem_s);
  if (ctx->err != CUDA_SUCCESS) {
    cuda_free(dst);
    cuda_exit(ctx);
    return NULL;
  }

  cuda_records(dst, CUDA_WAIT_WRITE, dst_ctx->mem_s);
  cuda_records(src, CUDA_WAIT_READ, dst_ctx->mem_s);

  cuda_exit(ctx);
  return dst;
}
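/*
 * Usage sketch (illustrative): moving data to another context.  Within a
 * context the call may simply retain and return src (when may_share is
 * set and offset is 0); across contexts it allocates in the destination
 * and issues a peer copy on the destination's memory stream.  `other_ctx`
 * is an assumed second context for the example.
 *
 *   gpudata *remote = cuda_transfer(buf, 0, buf->sz, other_ctx, 0);
 *   if (remote == NULL)
 *     ...;  // allocation or copy failed
 */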
static int cuda_sync(gpudata *b) {
  cuda_context *ctx = (cuda_context *)b->ctx;
  int err = GA_NO_ERROR;

  ASSERT_BUF(b);
  cuda_enter(ctx);
  ctx->err = cuEventSynchronize(b->wev);
  if (ctx->err != CUDA_SUCCESS)
    err = GA_IMPL_ERROR;
  ctx->err = cuEventSynchronize(b->rev);
  if (ctx->err != CUDA_SUCCESS)
    err = GA_IMPL_ERROR;
  cuda_exit(ctx);
  return err;
}
static int cuda_memset(gpudata *dst, size_t dstoff, int data) {
  cuda_context *ctx = dst->ctx;

  ASSERT_BUF(dst);

  if ((dst->sz - dstoff) == 0) return GA_NO_ERROR;

  cuda_enter(ctx);

  cuda_wait(dst, CUDA_WAIT_WRITE);

  ctx->err = cuMemsetD8Async(dst->ptr + dstoff, data, dst->sz - dstoff,
                             ctx->s);
  if (ctx->err != CUDA_SUCCESS) {
    cuda_exit(ctx);
    return GA_IMPL_ERROR;
  }

  cuda_record(dst, CUDA_WAIT_WRITE);
  cuda_exit(ctx);
  return GA_NO_ERROR;
}
static int cuda_waits(gpudata *a, int flags, CUstream s) {
  ASSERT_BUF(a);
  /* If others are only reads, no need to wait */
  cuda_enter(a->ctx);
  if (flags & CUDA_WAIT_READ) {
    /* We wait for writes that happened before since multiple reads at
     * the same time are fine */
    a->ctx->err = cuStreamWaitEvent(s, a->wev, 0);
    if (a->ctx->err != CUDA_SUCCESS) {
      cuda_exit(a->ctx);
      return GA_IMPL_ERROR;
    }
  }
  if (flags & CUDA_WAIT_WRITE) {
    /* Make sure to not disturb previous reads */
    a->ctx->err = cuStreamWaitEvent(s, a->rev, 0);
    if (a->ctx->err != CUDA_SUCCESS) {
      cuda_exit(a->ctx);
      return GA_IMPL_ERROR;
    }
  }
  cuda_exit(a->ctx);
  return GA_NO_ERROR;
}
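/*
 * Synchronization note (illustrative): each buffer carries two events,
 * rev (last read) and wev (last write).  A reader only waits on wev, so
 * concurrent reads may overlap; a writer waits on rev as well so that it
 * does not disturb in-flight reads.  A typical pairing around an
 * operation that reads `buf` on stream s:
 *
 *   cuda_waits(buf, CUDA_WAIT_READ, s);    // wait for pending writes
 *   // ... enqueue the kernel or copy that reads buf on s ...
 *   cuda_records(buf, CUDA_WAIT_READ, s);  // mark the read on buf->rev
 */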
static int dgemvBatch(cb_order order, cb_transpose transA,
                      size_t M, size_t N, double alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **x, size_t *offX, size_t incX,
                      double beta, gpudata **y, size_t *offY, size_t incY,
                      size_t batchCount, int flags) {
  cuda_context *ctx;
  size_t t, i;
  size_t ls[2], gs[2];
  void *args[9];
  gpudata *Aa, *xa, *ya;
  int err;

  if (flags != 0) return GA_INVALID_ERROR;
  if (batchCount == 0) return GA_NO_ERROR;

  if (alpha != 1.0 || beta != 1.0) return GA_UNSUPPORTED_ERROR;
  if (M < 512) {
    ls[0] = 32;
    if (batchCount > 16)
      ls[1] = 16;
    else
      ls[1] = batchCount;
  } else {
    ls[0] = 512;
    ls[1] = 1;
  }
  gs[0] = (M + ls[0] - 1) / ls[0];
  gs[1] = (batchCount + ls[1] - 1) / ls[1];
  /* clamp the grid to the 65535 limit on the y dimension */
  if (gs[0] * gs[1] > 65535)
    gs[1] = (65535 / gs[0]);

  if (order == cb_c) {
    t = N;
    N = M;
    M = t;
    if (transA == cb_no_trans) {
      transA = cb_trans;
    } else {
      transA = cb_no_trans;
    }
  }

  ASSERT_BUF(A[0]);
  ctx = A[0]->ctx;
  cuda_enter(ctx);

  {
    double **T_l = alloca(sizeof(double *) * batchCount * 3);
    const double **A_l = (const double **)T_l;
    const double **x_l = (const double **)T_l + batchCount;
    double **y_l = T_l + (batchCount * 2);

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(x[i]);
      ASSERT_BUF(y[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(x[i], CUDA_WAIT_READ);
      cuda_wait(y[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      A_l[i] = (double *)(A[i]->ptr + offA[i]);
      x_l[i] = (double *)(x[i]->ptr + offX[i]);
      y_l[i] = (double *)(y[i]->ptr + offY[i]);
    }

    Aa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, A_l,
                               GA_BUFFER_INIT, &err);
    if (Aa == NULL) {
      cuda_exit(ctx);
      return err;
    }
    xa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, x_l,
                               GA_BUFFER_INIT, &err);
    if (xa == NULL) {
      cuda_ops.buffer_release(Aa);
      cuda_exit(ctx);
      return err;
    }
    ya = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, y_l,
                               GA_BUFFER_INIT, &err);
    if (ya == NULL) {
      cuda_ops.buffer_release(Aa);
      cuda_ops.buffer_release(xa);
      cuda_exit(ctx);
      return err;
    }
  }

  args[0] = Aa;
  args[1] = &lda;
  args[2] = xa;
  args[3] = &incX;
  args[4] = ya;
  args[5] = &incY;
  args[6] = &batchCount;
  args[7] = &M;
  args[8] = &N;

  if (transA == cb_no_trans) {
    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small,
                         2, ls, gs, 0, args);
  } else {
    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small,
                         2, ls, gs, 0, args);
  }

  cuda_ops.buffer_release(Aa);
  cuda_ops.buffer_release(xa);
  cuda_ops.buffer_release(ya);

  if (err != GA_NO_ERROR) {
    cuda_exit(ctx);
    return err;
  }

  for (i = 0; i < batchCount; i++) {
    cuda_record(A[i], CUDA_WAIT_READ);
    cuda_record(x[i], CUDA_WAIT_READ);
    cuda_record(y[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, double alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb, double beta,
                 gpudata *C, size_t offC, size_t ldc) {
  cuda_context *ctx = A->ctx;
  blas_handle *h = (blas_handle *)ctx->blas_handle;
  gpudata *T;
  size_t t;
  cb_transpose transT;

  ASSERT_BUF(A);
  ASSERT_BUF(B);
  ASSERT_BUF(C);

  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) ||
      LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) ||
      LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N))
    return GA_XLARGE_ERROR;

  if (order == cb_c) {
    /* swap A and B */
    t = N;
    N = M;
    M = t;
    T = A;
    A = B;
    B = T;
    t = lda;
    lda = ldb;
    ldb = t;
    transT = transA;
    transA = transB;
    transB = transT;
    t = offA;
    offA = offB;
    offB = t;
  }

  cuda_enter(ctx);

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL));

  h->err = cublasDgemm(h->h, convT(transA), convT(transB), M, N, K, &alpha,
                       ((double *)A->ptr) + offA, lda,
                       ((double *)B->ptr) + offB, ldb, &beta,
                       ((double *)C->ptr) + offC, ldc);
  if (h->err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
      return GA_DEVSUP_ERROR;
    return GA_BLAS_ERROR;
  }

  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C, CUDA_WAIT_ALL));

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
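/*
 * Note on the cb_c (row-major) path above: an M x N row-major matrix
 * occupies the same memory as its N x M column-major transpose.  Since
 * C = op(A) . op(B) implies C^T = op(B)^T . op(A)^T, and the column-major
 * view of each buffer is already that transpose, swapping the operands
 * and the M/N extents (while keeping the transpose flags) makes
 * column-major cuBLAS write the row-major result directly into C.
 */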
static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, float alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb, float beta,
                      gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cuda_context *ctx;
  blas_handle *h;
  size_t *lt, t;
  gpudata **T;
  size_t i;
  const size_t threshold = 650;
  cb_transpose transT;

  if (batchCount == 0) return GA_NO_ERROR;

  if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) ||
      LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) ||
      LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N))
    return GA_XLARGE_ERROR;

  ASSERT_BUF(A[0]);
  ctx = A[0]->ctx;
  h = (blas_handle *)ctx->blas_handle;
  cuda_enter(ctx);

  if (order == cb_c) {
    /* swap A and B */
    t = N;
    N = M;
    M = t;
    T = A;
    A = B;
    B = T;
    t = lda;
    lda = ldb;
    ldb = t;
    transT = transA;
    transA = transB;
    transB = transT;
    lt = offA;
    offA = offB;
    offB = lt;
  }

  /* use parallel cublasSgemm calls rather than cublasSgemmBatched for
   * large products */
  if (M * N * K > threshold * threshold * threshold) {
    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C[i], CUDA_WAIT_ALL));

      h->err = cublasSgemm(h->h, convT(transA), convT(transB), M, N, K,
                           &alpha, (float *)A[i]->ptr + offA[i], lda,
                           (float *)B[i]->ptr + offB[i], ldb, &beta,
                           (float *)C[i]->ptr + offC[i], ldc);
      if (h->err != CUBLAS_STATUS_SUCCESS) {
        cuda_exit(ctx);
        if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
          return GA_DEVSUP_ERROR;
        return GA_BLAS_ERROR;
      }

      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C[i], CUDA_WAIT_ALL));
    }
  } else {
    float **T_l = alloca(sizeof(float *) * batchCount * 3);
    const float **A_l = (const float **)T_l;
    const float **B_l = (const float **)T_l + batchCount;
    float **C_l = T_l + (batchCount * 2);
    CUdeviceptr Ta, Aa, Ba, Ca;

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C[i], CUDA_WAIT_ALL));
      A_l[i] = ((float *)A[i]->ptr) + offA[i];
      B_l[i] = ((float *)B[i]->ptr) + offB[i];
      C_l[i] = ((float *)C[i]->ptr) + offC[i];
    }

    /* copy the three pointer arrays to the device in one allocation */
    ctx->err = cuMemAlloc(&Ta, sizeof(float *) * batchCount * 3);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    Aa = Ta;
    Ba = Ta + (batchCount * sizeof(float *));
    Ca = Ta + (batchCount * sizeof(float *) * 2);
    ctx->err = cuMemcpyHtoD(Ta, T_l, sizeof(float *) * batchCount * 3);
    if (ctx->err != CUDA_SUCCESS) {
      cuMemFree(Ta);
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }

    h->err = cublasSgemmBatched(h->h, convT(transA), convT(transB), M, N, K,
                                &alpha, (const float **)Aa, lda,
                                (const float **)Ba, ldb, &beta,
                                (float **)Ca, ldc, batchCount);
    cuMemFree(Ta);
    if (h->err != CUBLAS_STATUS_SUCCESS) {
      cuda_exit(ctx);
      if (h->err == CUBLAS_STATUS_ARCH_MISMATCH)
        return GA_DEVSUP_ERROR;
      return GA_BLAS_ERROR;
    }

    for (i = 0; i < batchCount; i++) {
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B[i], CUDA_WAIT_READ));
      GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C[i], CUDA_WAIT_ALL));
    }
  }
  cuda_exit(ctx);
  return GA_NO_ERROR;
}
CUdeviceptr cuda_get_ptr(gpudata *g) { ASSERT_BUF(g); return g->ptr; }
static int dgerBatch(cb_order order, size_t M, size_t N, double alpha,
                     gpudata **x, size_t *offX, size_t incX,
                     gpudata **y, size_t *offY, size_t incY,
                     gpudata **A, size_t *offA, size_t lda,
                     size_t batchCount, int flags) {
  cuda_context *ctx;
  size_t t, *tp, i;
  size_t ls[3] = {M, N, 1}, gs[3] = {1, 1, batchCount};
  void *args[10];
  gpudata **T;
  gpudata *Aa, *xa, *ya;
  int err;

  if (flags != 0) return GA_INVALID_ERROR;
  if (batchCount == 0) return GA_NO_ERROR;

  if (incX == 1) {
    if (ls[0] > 32) {
      gs[0] = (ls[0] + 31) / 32;
      ls[0] = 32;
    }
    if (ls[0] * ls[1] > 512) {
      gs[1] = (ls[1] + 15) / 16;
      ls[1] = 16;
    }
  } else {
    if (ls[1] > 32) {
      gs[1] = (ls[1] + 31) / 32;
      ls[1] = 32;
    }
    if (ls[0] * ls[1] > 512) {
      gs[0] = (ls[0] + 15) / 16;
      ls[0] = 16;
    }
  }
  if (gs[0] * gs[1] * gs[2] > 65535) {
    if (gs[0] * gs[1] > 65535)
      return GA_VALUE_ERROR;
    gs[2] = (65535 / (gs[0] * gs[1]));
  }

  if (order == cb_c) {
    t = M;
    M = N;
    N = t;
    tp = offX;
    offX = offY;
    offY = tp;
    t = incX;
    incX = incY;
    incY = t;
    T = x;
    x = y;
    y = T;
  }

  ASSERT_BUF(x[0]);
  ctx = x[0]->ctx;
  cuda_enter(ctx);

  {
    double **T_l = alloca(sizeof(double *) * batchCount * 3);
    const double **A_l = (const double **)T_l;
    const double **x_l = (const double **)T_l + batchCount;
    double **y_l = T_l + (batchCount * 2);

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(x[i]);
      ASSERT_BUF(y[i]);
      cuda_wait(A[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      cuda_wait(x[i], CUDA_WAIT_READ);
      cuda_wait(y[i], CUDA_WAIT_READ);
      A_l[i] = (double *)(A[i]->ptr + offA[i]);
      x_l[i] = (double *)(x[i]->ptr + offX[i]);
      y_l[i] = (double *)(y[i]->ptr + offY[i]);
    }

    Aa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, A_l,
                               GA_BUFFER_INIT, &err);
    if (Aa == NULL) {
      cuda_exit(ctx);
      return err;
    }
    xa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, x_l,
                               GA_BUFFER_INIT, &err);
    if (xa == NULL) {
      cuda_ops.buffer_release(Aa);
      cuda_exit(ctx);
      return err;
    }
    ya = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, y_l,
                               GA_BUFFER_INIT, &err);
    if (ya == NULL) {
      cuda_ops.buffer_release(Aa);
      cuda_ops.buffer_release(xa);
      cuda_exit(ctx);
      return err;
    }
  }

  args[0] = xa;
  args[1] = &incX;
  args[2] = ya;
  args[3] = &incY;
  args[4] = &alpha;
  args[5] = Aa;
  args[6] = &lda;
  args[7] = &batchCount;
  args[8] = &M;
  args[9] = &N;

  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgerBH_gen_small,
                       3, ls, gs, 0, args);

  cuda_ops.buffer_release(Aa);
  cuda_ops.buffer_release(xa);
  cuda_ops.buffer_release(ya);

  if (err != GA_NO_ERROR) {
    cuda_exit(ctx);
    return err;
  }

  for (i = 0; i < batchCount; i++) {
    cuda_record(x[i], CUDA_WAIT_READ);
    cuda_record(y[i], CUDA_WAIT_READ);
    cuda_record(A[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB,
                 size_t M, size_t N, size_t K, float alpha,
                 gpudata *A, size_t offA, size_t lda,
                 gpudata *B, size_t offB, size_t ldb, float beta,
                 gpudata *C, size_t offC, size_t ldc) {
#ifdef HAVE_CUBLAS_SGEMMEX
  /* This will use float32 for computation as it's the best we can have
   * right now.  In the future, when native float16 support is available,
   * we will switch to that. */
  cuda_context *ctx = A->ctx;
  gpudata *T;
  size_t t;
  cublasStatus_t err;
  cb_transpose transT;

  ASSERT_BUF(A);
  ASSERT_BUF(B);
  ASSERT_BUF(C);

  if (order == cb_c) {
    /* swap A and B */
    t = N;
    N = M;
    M = t;
    T = A;
    A = B;
    B = T;
    t = lda;
    lda = ldb;
    ldb = t;
    transT = transA;
    transA = transB;
    transB = transT;
    t = offA;
    offA = offB;
    offB = t;
  }

  cuda_enter(ctx);

  cuda_wait(A, CUDA_WAIT_READ);
  cuda_wait(B, CUDA_WAIT_READ);
  cuda_wait(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  err = cublasSgemmEx(((blas_handle *)ctx->blas_handle)->h,
                      convT(transA), convT(transB), M, N, K, &alpha,
                      ((uint16_t *)A->ptr) + offA, CUBLAS_DATA_HALF, lda,
                      ((uint16_t *)B->ptr) + offB, CUBLAS_DATA_HALF, ldb,
                      &beta,
                      ((uint16_t *)C->ptr) + offC, CUBLAS_DATA_HALF, ldc);
  if (err != CUBLAS_STATUS_SUCCESS) {
    cuda_exit(ctx);
    if (err == CUBLAS_STATUS_ARCH_MISMATCH)
      return GA_DEVSUP_ERROR;
    return GA_BLAS_ERROR;
  }

  cuda_record(A, CUDA_WAIT_READ);
  cuda_record(B, CUDA_WAIT_READ);
  cuda_record(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE);

  cuda_exit(ctx);
  return GA_NO_ERROR;
#else
  return GA_DEVSUP_ERROR;
#endif
}
static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB,
                      size_t M, size_t N, size_t K, double alpha,
                      gpudata **A, size_t *offA, size_t lda,
                      gpudata **B, size_t *offB, size_t ldb, double beta,
                      gpudata **C, size_t *offC, size_t ldc,
                      size_t batchCount) {
  cuda_context *ctx;
  size_t *lt, t;
  gpudata **T;
  size_t i;
  cb_transpose transT;
  cublasStatus_t err;

  if (batchCount == 0) return GA_NO_ERROR;

  ASSERT_BUF(A[0]);
  ctx = A[0]->ctx;
  cuda_enter(ctx);

  if (order == cb_c) {
    /* swap A and B */
    t = N;
    N = M;
    M = t;
    T = A;
    A = B;
    B = T;
    t = lda;
    lda = ldb;
    ldb = t;
    transT = transA;
    transA = transB;
    transB = transT;
    lt = offA;
    offA = offB;
    offB = lt;
  }

  // use parallel cublasDgemm calls rather than cublasDgemmBatched for
  // large products
  const size_t threshold = 650;
  const int multiple_dispatch = M * N * K > threshold * threshold * threshold;
  if (multiple_dispatch) {
    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(B[i], CUDA_WAIT_READ);
      cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);

      err = cublasDgemm(((blas_handle *)ctx->blas_handle)->h,
                        convT(transA), convT(transB), M, N, K, &alpha,
                        (double *)A[i]->ptr + offA[i], lda,
                        (double *)B[i]->ptr + offB[i], ldb, &beta,
                        (double *)C[i]->ptr + offC[i], ldc);
      if (err != CUBLAS_STATUS_SUCCESS) {
        cuda_exit(ctx);
        if (err == CUBLAS_STATUS_ARCH_MISMATCH)
          return GA_DEVSUP_ERROR;
        return GA_BLAS_ERROR;
      }

      cuda_record(A[i], CUDA_WAIT_READ);
      cuda_record(B[i], CUDA_WAIT_READ);
      cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    }
  } else {
    double **T_l = alloca(sizeof(double *) * batchCount * 3);
    const double **A_l = (const double **)T_l;
    const double **B_l = (const double **)T_l + batchCount;
    double **C_l = T_l + (batchCount * 2);
    CUdeviceptr Ta, Aa, Ba, Ca;

    for (i = 0; i < batchCount; i++) {
      ASSERT_BUF(A[i]);
      ASSERT_BUF(B[i]);
      ASSERT_BUF(C[i]);
      cuda_wait(A[i], CUDA_WAIT_READ);
      cuda_wait(B[i], CUDA_WAIT_READ);
      cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
      A_l[i] = ((double *)A[i]->ptr) + offA[i];
      B_l[i] = ((double *)B[i]->ptr) + offB[i];
      C_l[i] = ((double *)C[i]->ptr) + offC[i];
    }

    // copy the three pointer arrays to the device in one allocation
    ctx->err = cuMemAlloc(&Ta, sizeof(double *) * batchCount * 3);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    Aa = Ta;
    Ba = Ta + (batchCount * sizeof(double *));
    Ca = Ta + (batchCount * sizeof(double *) * 2);
    ctx->err = cuMemcpyHtoD(Ta, T_l, sizeof(double *) * batchCount * 3);
    if (ctx->err != CUDA_SUCCESS) {
      cuMemFree(Ta);
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }

    err = cublasDgemmBatched(((blas_handle *)ctx->blas_handle)->h,
                             convT(transA), convT(transB), M, N, K, &alpha,
                             (const double **)Aa, lda,
                             (const double **)Ba, ldb, &beta,
                             (double **)Ca, ldc, batchCount);
    cuMemFree(Ta);
    if (err != CUBLAS_STATUS_SUCCESS) {
      cuda_exit(ctx);
      if (err == CUBLAS_STATUS_ARCH_MISMATCH)
        return GA_DEVSUP_ERROR;
      return GA_BLAS_ERROR;
    }

    for (i = 0; i < batchCount; i++) {
      cuda_record(A[i], CUDA_WAIT_READ);
      cuda_record(B[i], CUDA_WAIT_READ);
      cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE);
    }
  }
  cuda_exit(ctx);
  return GA_NO_ERROR;
}
static void cuda_retain(gpudata *d) { ASSERT_BUF(d); d->refcnt++; }
size_t cuda_get_sz(gpudata *g) { ASSERT_BUF(g); return g->sz; }
static int cuda_property(void *c, gpudata *buf, gpukernel *k, int prop_id,
                         void *res) {
  cuda_context *ctx = NULL;
  if (c != NULL) {
    ctx = (cuda_context *)c;
    ASSERT_CTX(ctx);
  } else if (buf != NULL) {
    ASSERT_BUF(buf);
    ctx = buf->ctx;
  } else if (k != NULL) {
    ASSERT_KER(k);
    ctx = k->ctx;
  }
  /* I know that 512 and 1024 are magic numbers.
     There is an indication in buffer.h, though. */
  if (prop_id < 512) {
    if (ctx == NULL)
      return GA_VALUE_ERROR;
  } else if (prop_id < 1024) {
    if (buf == NULL)
      return GA_VALUE_ERROR;
  } else {
    if (k == NULL)
      return GA_VALUE_ERROR;
  }

  switch (prop_id) {
    char *s;
    CUdevice id;
    int i;
    size_t sz;

  case GA_CTX_PROP_DEVNAME:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    /* 256 is what the CUDA API uses so it's good enough for me */
    s = malloc(256);
    if (s == NULL) {
      cuda_exit(ctx);
      return GA_MEMORY_ERROR;
    }
    ctx->err = cuDeviceGetName(s, 256, id);
    if (ctx->err != CUDA_SUCCESS) {
      free(s);
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((char **)res) = s;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_MAXLSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_LMEMSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i,
                                    CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_NUMPROCS:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i,
                                    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((unsigned int *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_MAXGSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
                                    id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    *((size_t *)res) = i;
    cuda_exit(ctx);
    return GA_NO_ERROR;

  case GA_CTX_PROP_BLAS_OPS:
#ifdef WITH_CUDA_CUBLAS
    *((gpuarray_blas_ops **)res) = &cublas_ops;
    return GA_NO_ERROR;
#else
    *((void **)res) = NULL;
    return GA_DEVSUP_ERROR;
#endif

  case GA_CTX_PROP_BIN_ID:
    *((const char **)res) = ctx->bin_id;
    return GA_NO_ERROR;

  case GA_CTX_PROP_ERRBUF:
    *((gpudata **)res) = ctx->errbuf;
    return GA_NO_ERROR;

  case GA_CTX_PROP_TOTAL_GMEM:
    cuda_enter(ctx);
    ctx->err = cuMemGetInfo(&sz, (size_t *)res);
    cuda_exit(ctx);
    return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR;

  case GA_CTX_PROP_FREE_GMEM:
    cuda_enter(ctx);
    ctx->err = cuMemGetInfo((size_t *)res, &sz);
    cuda_exit(ctx);
    return ctx->err == CUDA_SUCCESS ? GA_NO_ERROR : GA_IMPL_ERROR;

  case GA_BUFFER_PROP_REFCNT:
    *((unsigned int *)res) = buf->refcnt;
    return GA_NO_ERROR;

  case GA_BUFFER_PROP_SIZE:
    *((size_t *)res) = buf->sz;
    return GA_NO_ERROR;

  case GA_BUFFER_PROP_CTX:
  case GA_KERNEL_PROP_CTX:
    *((void **)res) = (void *)ctx;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_MAXLSIZE:
    cuda_enter(ctx);
    ctx->err = cuFuncGetAttribute(&i, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
                                  k->k);
    cuda_exit(ctx);
    if (ctx->err != CUDA_SUCCESS)
      return GA_IMPL_ERROR;
    *((size_t *)res) = i;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_PREFLSIZE:
    cuda_enter(ctx);
    ctx->err = cuCtxGetDevice(&id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    ctx->err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_WARP_SIZE, id);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    cuda_exit(ctx);
    *((size_t *)res) = i;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_NUMARGS:
    *((unsigned int *)res) = k->argcount;
    return GA_NO_ERROR;

  case GA_KERNEL_PROP_TYPES:
    *((const int **)res) = k->types;
    return GA_NO_ERROR;

  default:
    return GA_INVALID_ERROR;
  }
}
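/*
 * Usage sketch (illustrative): querying a context-level property.
 *
 *   size_t maxls;
 *   if (cuda_property(ctx, NULL, NULL, GA_CTX_PROP_MAXLSIZE, &maxls) ==
 *       GA_NO_ERROR)
 *     ...;  // maxls now holds the device's max block size in x
 */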
static int cuda_extcopy(gpudata *input, size_t ioff, gpudata *output,
                        size_t ooff, int intype, int outtype,
                        unsigned int a_nd, const size_t *a_dims,
                        const ssize_t *a_str, unsigned int b_nd,
                        const size_t *b_dims, const ssize_t *b_str) {
  cuda_context *ctx = input->ctx;
  void *args[2];
  int res = GA_SYS_ERROR;
  unsigned int i;
  size_t nEls = 1, ls, gs;
  gpukernel *k;
  extcopy_args a, *aa;

  ASSERT_BUF(input);
  ASSERT_BUF(output);

  if (input->ctx != output->ctx)
    return GA_INVALID_ERROR;

  for (i = 0; i < a_nd; i++) {
    nEls *= a_dims[i];
  }
  if (nEls == 0) return GA_NO_ERROR;

  a.ind = a_nd;
  a.ond = b_nd;
  a.itype = intype;
  a.otype = outtype;
  a.ioff = ioff;
  a.ooff = ooff;
  a.idims = a_dims;
  a.odims = b_dims;
  a.istr = a_str;
  a.ostr = b_str;

  k = cache_get(ctx->extcopy_cache, &a);
  if (k == NULL) {
    res = gen_extcopy_kernel(&a, input->ctx, &k, nEls);
    if (res != GA_NO_ERROR)
      return res;

    /* Cache the kernel */
    aa = memdup(&a, sizeof(a));
    if (aa == NULL)
      goto done;
    aa->idims = memdup(a_dims, a_nd * sizeof(size_t));
    aa->odims = memdup(b_dims, b_nd * sizeof(size_t));
    aa->istr = memdup(a_str, a_nd * sizeof(ssize_t));
    aa->ostr = memdup(b_str, b_nd * sizeof(ssize_t));
    if (aa->idims == NULL || aa->odims == NULL ||
        aa->istr == NULL || aa->ostr == NULL) {
      extcopy_free(aa);
      goto done;
    }
    /* One ref is given to the cache, we manage the other */
    cuda_retainkernel(k);
    cache_add(ctx->extcopy_cache, aa, k);
  } else {
    /* This is our reference */
    cuda_retainkernel(k);
  }

done:
  /* Cheap kernel scheduling */
  res = cuda_property(NULL, NULL, k, GA_KERNEL_PROP_MAXLSIZE, &ls);
  if (res != GA_NO_ERROR) goto fail;
  gs = ((nEls - 1) / ls) + 1;

  args[0] = input;
  args[1] = output;
  res = cuda_callkernel(k, 1, &ls, &gs, 0, args);

fail:
  /* We free our reference here */
  cuda_freekernel(k);
  return res;
}