static int cuda_move(gpudata *dst, size_t dstoff, gpudata *src, size_t srcoff, size_t sz) { cuda_context *ctx = dst->ctx; int res = GA_NO_ERROR; ASSERT_BUF(dst); ASSERT_BUF(src); if (src->ctx != dst->ctx) return GA_VALUE_ERROR; if (sz == 0) return GA_NO_ERROR; if ((dst->sz - dstoff) < sz || (src->sz - srcoff) < sz) return GA_VALUE_ERROR; cuda_enter(ctx); cuda_wait(src, CUDA_WAIT_READ); cuda_wait(dst, CUDA_WAIT_WRITE); ctx->err = cuMemcpyDtoDAsync(dst->ptr + dstoff, src->ptr + srcoff, sz, ctx->s); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } cuda_record(src, CUDA_WAIT_READ); cuda_record(dst, CUDA_WAIT_WRITE); cuda_exit(ctx); return res; }
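/* The enter/wait/queue/record/exit sequence in cuda_move() above is the synchronization idiom used by every buffer operation in this file: bind the context, make the context stream wait on the buffers' outstanding events, queue the asynchronous work on ctx->s, then record fresh events on the buffers before leaving. The sketch below only restates that idiom for a hypothetical pass-through copy (it is not part of the library), with the per-step intent spelled out. */
static int cuda_op_sketch(gpudata *dst, gpudata *src, size_t sz) {
  cuda_context *ctx = dst->ctx;
  cuda_enter(ctx);                  /* make ctx current on this thread */
  cuda_wait(src, CUDA_WAIT_READ);   /* stream waits for pending writers of src */
  cuda_wait(dst, CUDA_WAIT_WRITE);  /* stream waits for pending readers/writers of dst */
  ctx->err = cuMemcpyDtoDAsync(dst->ptr, src->ptr, sz, ctx->s); /* queue async work */
  if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; }
  cuda_record(src, CUDA_WAIT_READ);   /* publish "src was read" for later ops */
  cuda_record(dst, CUDA_WAIT_WRITE);  /* publish "dst was written" for later ops */
  cuda_exit(ctx);
  return GA_NO_ERROR;
}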
/** * \brief NCCL implementation of \ref gpucomm_broadcast. */ static int broadcast(gpudata *array, size_t offset, size_t count, int typecode, int root, gpucomm *comm) { // need dummy init so that compiler shuts up ncclDataType_t datatype = ncclNumTypes; int rank = 0; cuda_context *ctx; ASSERT_BUF(array); ASSERT_COMM(comm); GA_CHECK(check_restrictions(array, offset, NULL, 0, count, typecode, 0, comm, &datatype, NULL)); GA_CHECK(get_rank(comm, &rank)); ctx = comm->ctx; cuda_enter(ctx); // sync: wait till a write has finished (out of concurrent kernels) if (rank == root) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(array, CUDA_WAIT_READ)); else GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(array, CUDA_WAIT_WRITE)); // change stream of nccl ops to enable concurrency NCCL_EXIT_ON_ERROR(ctx, ncclBcast((void *)(array->ptr + offset), count, datatype, root, comm->c, ctx->s)); if (rank == root) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(array, CUDA_WAIT_READ)); else GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(array, CUDA_WAIT_WRITE)); cuda_exit(ctx); return GA_NO_ERROR; }
/** * \brief NCCL implementation of \ref gpucomm_all_reduce. */ static int all_reduce(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, int opcode, gpucomm *comm) { // need dummy init so that compiler shuts up ncclRedOp_t op = ncclNumOps; ncclDataType_t datatype = ncclNumTypes; cuda_context *ctx; ASSERT_BUF(src); ASSERT_COMM(comm); ASSERT_BUF(dest); GA_CHECK(check_restrictions(src, offsrc, dest, offdest, count, typecode, opcode, comm, &datatype, &op)); ctx = comm->ctx; cuda_enter(ctx); // sync: wait till a write has finished (out of concurrent kernels) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(src, CUDA_WAIT_READ)); // sync: wait till a read/write has finished (out of concurrent kernels) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE)); // change stream of nccl ops to enable concurrency NCCL_EXIT_ON_ERROR(ctx, ncclAllReduce((void *)(src->ptr + offsrc), (void *)(dest->ptr + offdest), count, datatype, op, comm->c, ctx->s)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE)); cuda_exit(ctx); return GA_NO_ERROR; }
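/* A reduce-to-root collective would follow the same shape as broadcast() and all_reduce() above. The function below is only a hedged sketch (ncclReduce is the standard NCCL call, but this wrapper is not the library's implementation): it assumes every rank passes a dest buffer and reuses the helpers shown above (check_restrictions, GA_CHECK, GA_CUDA_EXIT_ON_ERROR, NCCL_EXIT_ON_ERROR). */
static int reduce_sketch(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, int opcode, int root, gpucomm *comm) {
  ncclRedOp_t op = ncclNumOps;            /* dummy init, as in the functions above */
  ncclDataType_t datatype = ncclNumTypes;
  cuda_context *ctx = comm->ctx;
  GA_CHECK(check_restrictions(src, offsrc, dest, offdest, count, typecode, opcode, comm, &datatype, &op));
  cuda_enter(ctx);
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(src, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE));
  NCCL_EXIT_ON_ERROR(ctx, ncclReduce((void *)(src->ptr + offsrc), (void *)(dest->ptr + offdest), count, datatype, op, root, comm->c, ctx->s));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
  GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE));
  cuda_exit(ctx);
  return GA_NO_ERROR;
}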
static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, double beta, gpudata *C, size_t offC, size_t ldc) { cuda_context *ctx = A->ctx; gpudata *T; size_t t; cublasStatus_t err; cb_transpose transT; ASSERT_BUF(A); ASSERT_BUF(B); ASSERT_BUF(C); if (order == cb_c) { /* swap A and B */ t = N; N = M; M = t; T = A; A = B; B = T; t = lda; lda = ldb; ldb = t; transT = transA; transA = transB; transB = transT; t = offA; offA = offB; offB = t; } cuda_enter(ctx); cuda_wait(A, CUDA_WAIT_READ); cuda_wait(B, CUDA_WAIT_READ); cuda_wait(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE); err = cublasDgemm(((blas_handle *)ctx->blas_handle)->h, convT(transA), convT(transB), M, N, K, &alpha, ((double *)A->ptr) + offA, lda, ((double *)B->ptr) + offB, ldb, &beta, ((double *)C->ptr) + offC, ldc); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR; return GA_BLAS_ERROR; } cuda_record(A, CUDA_WAIT_READ); cuda_record(B, CUDA_WAIT_READ); cuda_record(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE); cuda_exit(ctx); return GA_NO_ERROR; }
static void cuda_free(gpudata *d) { /* We ignore errors on free */ ASSERT_BUF(d); d->refcnt--; if (d->refcnt == 0) { /* Keep a reference to the context since we deallocate the gpudata * object */ cuda_context *ctx = d->ctx; if (d->flags & DONTFREE) { /* This is the path for "external" buffers */ deallocate(d); } else if (ctx->flags & GA_CTX_DISABLE_ALLOCATION_CACHE) { /* Just free the pointer */ cuMemFree(d->ptr); deallocate(d); } else { /* Find the position in the freelist. Freelist is kept in order of allocation address */ gpudata *next = d->ctx->freeblocks, *prev = NULL; for (; next && next->ptr < d->ptr; next = next->next) { prev = next; } next = prev != NULL ? prev->next : d->ctx->freeblocks; /* See if we can merge the block with the previous one */ if (!(d->flags & CUDA_HEAD_ALLOC) && prev != NULL && prev->ptr + prev->sz == d->ptr) { prev->sz = prev->sz + d->sz; cuda_wait(d, CUDA_WAIT_ALL); cuda_record(prev, CUDA_WAIT_ALL); deallocate(d); d = prev; } else if (prev != NULL) { prev->next = d; } else { d->ctx->freeblocks = d; } /* See if we can merge with next */ if (next && !(next->flags & CUDA_HEAD_ALLOC) && d->ptr + d->sz == next->ptr) { d->sz = d->sz + next->sz; d->next = next->next; cuda_wait(next, CUDA_WAIT_ALL); cuda_record(d, CUDA_WAIT_ALL); deallocate(next); } else { d->next = next; } } /* We keep this at the end since the freed buffer could be the * last reference to the context and therefore clearing the * reference could trigger the freeing of the whole context, * including the freelist, which we manipulate. */ cuda_free_ctx(ctx); } }
static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cuda_context *ctx = X->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; gpudata *td; size_t t; ASSERT_BUF(X); ASSERT_BUF(Y); ASSERT_BUF(A); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) || LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) return GA_XLARGE_ERROR; if (order == cb_c) { t = M; M = N; N = t; t = offX; offX = offY; offY = t; t = incX; incX = incY; incY = t; td = X; X = Y; Y = td; } cuda_enter(ctx); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_ALL)); h->err = cublasDger(h->h, M, N, &alpha, ((double *)X->ptr) + offX, incX, ((double *)Y->ptr) + offY, incY, ((double *)A->ptr) + offA, lda); if (h->err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR; return GA_BLAS_ERROR; } GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_ALL)); cuda_exit(ctx); return GA_NO_ERROR; }
static gpudata *cuda_transfer(gpudata *src, size_t offset, size_t sz, void *dst_c, int may_share) { cuda_context *ctx = src->ctx; cuda_context *dst_ctx = (cuda_context *)dst_c; gpudata *dst; ASSERT_BUF(src); ASSERT_CTX(ctx); ASSERT_CTX(dst_ctx); if (ctx == dst_ctx) { if (may_share && offset == 0) { cuda_retain(src); return src; } dst = cuda_alloc(ctx, sz, NULL, 0, NULL); if (dst == NULL) return NULL; cuda_enter(ctx); cuda_wait(src, CUDA_WAIT_READ); cuda_wait(dst, CUDA_WAIT_WRITE); ctx->err = cuMemcpyDtoDAsync(dst->ptr, src->ptr+offset, sz, ctx->s); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); cuda_free(dst); return NULL; } cuda_record(src, CUDA_WAIT_READ); cuda_record(dst, CUDA_WAIT_WRITE); cuda_exit(ctx); return dst; } dst = cuda_alloc(dst_ctx, sz, NULL, 0, NULL); if (dst == NULL) return NULL; cuda_enter(ctx); cuda_waits(src, CUDA_WAIT_READ, dst_ctx->mem_s); cuda_waits(dst, CUDA_WAIT_WRITE, dst_ctx->mem_s); ctx->err = cuMemcpyPeerAsync(dst->ptr, dst->ctx->ctx, src->ptr+offset, src->ctx->ctx, sz, dst_ctx->mem_s); if (ctx->err != CUDA_SUCCESS) { cuda_free(dst); cuda_exit(ctx); return NULL; } cuda_records(dst, CUDA_WAIT_WRITE, dst_ctx->mem_s); cuda_records(src, CUDA_WAIT_READ, dst_ctx->mem_s); cuda_exit(ctx); return dst; }
static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, double beta, gpudata *Y, size_t offY, int incY) { cuda_context *ctx = A->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; size_t t; ASSERT_BUF(A); ASSERT_BUF(X); ASSERT_BUF(Y); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(M * N) || LARGE_VAL(lda) || LARGE_VAL(incX) || LARGE_VAL(incY)) return GA_XLARGE_ERROR; if (order == cb_c) { t = N; N = M; M = t; if (transA == cb_no_trans) { transA = cb_trans; } else { transA = cb_no_trans; } } cuda_enter(ctx); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(Y, CUDA_WAIT_ALL)); h->err = cublasDgemv(h->h, convT(transA), M, N, &alpha, ((double *)A->ptr) + offA, lda, ((double *)X->ptr) + offX, incX, &beta, ((double *)Y->ptr) + offY, incY); if (h->err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR; return GA_BLAS_ERROR; } GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(X, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(Y, CUDA_WAIT_ALL)); cuda_exit(ctx); return GA_NO_ERROR; }
static int dger(cb_order order, size_t M, size_t N, double alpha, gpudata *X, size_t offX, int incX, gpudata *Y, size_t offY, int incY, gpudata *A, size_t offA, size_t lda) { cuda_context *ctx = X->ctx; gpudata *td; size_t t; cublasStatus_t err; ASSERT_BUF(X); ASSERT_BUF(Y); ASSERT_BUF(A); if (order == cb_c) { t = M; M = N; N = t; t = offX; offX = offY; offY = t; t = incX; incX = incY; incY = t; td = X; X = Y; Y = td; } cuda_enter(ctx); cuda_wait(X, CUDA_WAIT_READ); cuda_wait(Y, CUDA_WAIT_READ); cuda_wait(A, CUDA_WAIT_READ|CUDA_WAIT_WRITE); err = cublasDger(((blas_handle *)ctx->blas_handle)->h, M, N, &alpha, ((double *)X->ptr) + offX, incX, ((double *)Y->ptr) + offY, incY, ((double *)A->ptr) + offA, lda); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR; return GA_BLAS_ERROR; } cuda_record(X, CUDA_WAIT_READ); cuda_record(Y, CUDA_WAIT_READ); cuda_record(A, CUDA_WAIT_READ|CUDA_WAIT_WRITE); cuda_exit(ctx); return GA_NO_ERROR; }
static int dgemv(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *X, size_t offX, int incX, double beta, gpudata *Y, size_t offY, int incY) { cuda_context *ctx = A->ctx; cublasStatus_t err; size_t t; ASSERT_BUF(A); ASSERT_BUF(X); ASSERT_BUF(Y); if (order == cb_c) { t = N; N = M; M = t; if (transA == cb_no_trans) { transA = cb_trans; } else { transA = cb_no_trans; } } cuda_enter(ctx); cuda_wait(A, CUDA_WAIT_READ); cuda_wait(X, CUDA_WAIT_READ); cuda_wait(Y, CUDA_WAIT_READ|CUDA_WAIT_WRITE); err = cublasDgemv(((blas_handle *)ctx->blas_handle)->h, convT(transA), M, N, &alpha, ((double *)A->ptr) + offA, lda, ((double *)X->ptr) + offX, incX, &beta, ((double *)Y->ptr) + offY, incY); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR; return GA_BLAS_ERROR; } cuda_record(A, CUDA_WAIT_READ); cuda_record(X, CUDA_WAIT_READ); cuda_record(Y, CUDA_WAIT_READ|CUDA_WAIT_WRITE); cuda_exit(ctx); return GA_NO_ERROR; }
/* * Extract the `curr` block from the freelist, possibly splitting it * if it's too big for the requested size. The remaining block will * stay on the freelist if there is a split. `prev` is only to * facilitate the extraction so we don't have to go through the list * again. */ static int extract(gpudata *curr, gpudata *prev, size_t size) { gpudata *next, *split; size_t remaining = curr->sz - size; if (remaining < FRAG_SIZE) { /* No need to split, the remaining block would be too small */ next = curr->next; } else { split = new_gpudata(curr->ctx, curr->ptr + size, remaining); if (split == NULL) return GA_MEMORY_ERROR; /* Make sure the chain keeps going */ split->next = curr->next; curr->next = NULL; /* Make sure we don't start using the split buffer too soon */ cuda_wait(curr, CUDA_WAIT_ALL); cuda_record(split, CUDA_WAIT_ALL); next = split; curr->sz = size; } if (prev != NULL) prev->next = next; else curr->ctx->freeblocks = next; return GA_NO_ERROR; }
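/* Illustrative only: how an allocation path might reuse a cached block through extract(). The first-fit scan below is a sketch and not necessarily the allocator's real policy; it only shows the intended calling convention: walk ctx->freeblocks in address order, pick a block that is large enough, and let extract() unlink it (splitting off the tail when the leftover is at least FRAG_SIZE). */
static gpudata *find_cached_block_sketch(cuda_context *ctx, size_t size) {
  gpudata *curr, *prev = NULL;
  for (curr = ctx->freeblocks; curr != NULL; prev = curr, curr = curr->next) {
    if (curr->sz >= size) {
      if (extract(curr, prev, size) != GA_NO_ERROR)
        return NULL;
      return curr;  /* unlinked block, sized `size` (or slightly larger) */
    }
  }
  return NULL;  /* nothing big enough is cached; caller falls back to a fresh allocation */
}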
/** * \brief NCCL implementation of \ref gpucomm_reduce_scatter. */ static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest, size_t count, int typecode, int opcode, gpucomm *comm) { // need dummy init so that compiler shuts up ncclRedOp_t op = ncclNumOps; ncclDataType_t datatype = ncclNumTypes; int ndev = 0; size_t resc_size; cuda_context *ctx; ASSERT_BUF(src); ASSERT_COMM(comm); ASSERT_BUF(dest); GA_CHECK(get_count(comm, &ndev)); GA_CHECK(check_restrictions(src, offsrc, NULL, 0, count * ndev, typecode, opcode, comm, &datatype, &op)); if (dest->ctx != comm->ctx) return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination and comm context differ"); resc_size = count * gpuarray_get_elsize(typecode); if ((dest->sz - offdest) < resc_size) return error_set(comm->ctx->err, GA_VALUE_ERROR, "destination too small for operation"); assert(!(offdest > dest->sz)); ctx = comm->ctx; cuda_enter(ctx); // sync: wait till a write has finished (out of concurrent kernels) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(src, CUDA_WAIT_READ)); // sync: wait till a read/write has finished (out of concurrent kernels) GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(dest, CUDA_WAIT_WRITE)); // change stream of nccl ops to enable concurrency NCCL_EXIT_ON_ERROR(ctx, ncclReduceScatter((void *)(src->ptr + offsrc), (void *)(dest->ptr + offdest), count, datatype, op, comm->c, ctx->s)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE)); cuda_exit(ctx); return GA_NO_ERROR; }
static int cuda_callkernel(gpukernel *k, unsigned int n, const size_t *bs, const size_t *gs, size_t shared, void **args) { cuda_context *ctx = k->ctx; unsigned int i; ASSERT_KER(k); cuda_enter(ctx); for (i = 0; i < k->argcount; i++) { if (k->types[i] == GA_BUFFER) { /* We don't have any better info for now */ cuda_wait((gpudata *)args[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); } } switch (n) { case 1: ctx->err = cuLaunchKernel(k->k, gs[0], 1, 1, bs[0], 1, 1, shared, ctx->s, args, NULL); break; case 2: ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, bs[0], bs[1], 1, shared, ctx->s, args, NULL); break; case 3: ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], gs[2], bs[0], bs[1], bs[2], shared, ctx->s, args, NULL); break; default: cuda_exit(ctx); return GA_VALUE_ERROR; } if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } for (i = 0; i < k->argcount; i++) { if (k->types[i] == GA_BUFFER) { /* We don't have any better info for now */ cuda_record((gpudata *)args[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); } } cuda_exit(ctx); return GA_NO_ERROR; }
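/* Hypothetical caller of cuda_callkernel() for a 1-D launch. As in dgerBatch()/dgemvBatch() below, GA_BUFFER arguments are passed as the gpudata handle itself and scalar arguments as pointers to their values; bs/gs hold the block and grid sizes per dimension. The kernel and its argument layout here are assumptions for illustration only. */
static int launch_scale_sketch(gpukernel *k, gpudata *buf, float alpha, size_t n) {
  size_t bs = 256;                 /* threads per block */
  size_t gs = (n + bs - 1) / bs;   /* blocks needed to cover n elements */
  void *args[3];
  args[0] = buf;     /* GA_BUFFER argument: the gpudata handle */
  args[1] = &alpha;  /* scalar arguments: pointer to the value */
  args[2] = &n;
  return cuda_callkernel(k, 1, &bs, &gs, 0, args);
}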
static int cuda_memset(gpudata *dst, size_t dstoff, int data) { cuda_context *ctx = dst->ctx; ASSERT_BUF(dst); if ((dst->sz - dstoff) == 0) return GA_NO_ERROR; cuda_enter(ctx); cuda_wait(dst, CUDA_WAIT_WRITE); ctx->err = cuMemsetD8Async(dst->ptr + dstoff, data, dst->sz - dstoff, ctx->s); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); return GA_IMPL_ERROR; } cuda_record(dst, CUDA_WAIT_WRITE); cuda_exit(ctx); return GA_NO_ERROR; }
static int hgemm(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, float beta, gpudata *C, size_t offC, size_t ldc) { #ifdef HAVE_CUBLAS_SGEMMEX /* This will use float32 for computation as it's the best we can * have right now. In the future, when native float16 support is * available, we will switch to that. */ cuda_context *ctx = A->ctx; gpudata *T; size_t t; cublasStatus_t err; cb_transpose transT; ASSERT_BUF(A); ASSERT_BUF(B); ASSERT_BUF(C); if (order == cb_c) { /* swap A and B */ t = N; N = M; M = t; T = A; A = B; B = T; t = lda; lda = ldb; ldb = t; transT = transA; transA = transB; transB = transT; t = offA; offA = offB; offB = t; } cuda_enter(ctx); cuda_wait(A, CUDA_WAIT_READ); cuda_wait(B, CUDA_WAIT_READ); cuda_wait(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE); err = cublasSgemmEx(((blas_handle *)ctx->blas_handle)->h, convT(transA), convT(transB), M, N, K, &alpha, ((uint16_t *)A->ptr) + offA, CUBLAS_DATA_HALF, lda, ((uint16_t *)B->ptr) + offB, CUBLAS_DATA_HALF, ldb, &beta, ((uint16_t *)C->ptr) + offC, CUBLAS_DATA_HALF, ldc); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR; return GA_BLAS_ERROR; } cuda_record(A, CUDA_WAIT_READ); cuda_record(B, CUDA_WAIT_READ); cuda_record(C, CUDA_WAIT_READ|CUDA_WAIT_WRITE); cuda_exit(ctx); return GA_NO_ERROR; #else return GA_DEVSUP_ERROR; #endif }
int APPLY_SPECIFIC(ctc_cost_gpu)(PyGpuArrayObject * in_activations, PyArrayObject * in_labels, PyArrayObject * in_input_lengths, PyGpuArrayObject ** out_costs, PyGpuArrayObject ** out_gradients, PyGpuContextObject * gpu_context) { ctc_context_t ctc_object; ctc_context_t * context = &ctc_object; size_t gpu_workspace_size; int ctc_error = 0; const size_t num_activations = PyGpuArray_DIMS( in_activations )[0]; const size_t minibatch_size = PyGpuArray_DIMS( in_activations )[1]; const size_t alphabet_size = PyGpuArray_DIMS( in_activations )[2]; const size_t cost_size = minibatch_size; const size_t grad_dims[3] = { num_activations, minibatch_size, alphabet_size }; float * costs = NULL, * activations = NULL, * gradients = NULL; cuda_enter( gpu_context->ctx ); ctc_context_init( context, gpu_context ); switch (in_activations->ga.typecode) { case GA_FLOAT: activations = (float *) PyGpuArray_DEV_DATA( in_activations ); break; default: ctc_context_destroy( context ); cuda_exit( gpu_context->ctx ); PyErr_SetString( PyExc_TypeError, "GpuConnectionistTemporalClassification: Unsupported type for activations." ); return 1; } create_contiguous_input_lengths( in_input_lengths, &(context->input_lengths) ); if ( NULL == context->input_lengths ) { // Destroy previous CTC context before returning exception ctc_context_destroy( context ); cuda_exit( gpu_context->ctx ); PyErr_Format( PyExc_MemoryError, "GpuConnectionistTemporalClassification: Could not allocate memory for input lengths." ); return 1; } // flatten labels to conform with library memory layout create_flat_labels( in_labels, &(context->flat_labels), &(context->label_lengths) ); if ( ( NULL == context->label_lengths ) || ( NULL == context->flat_labels ) ) { // Destroy previous CTC context before returning exception ctc_context_destroy( context ); cuda_exit( gpu_context->ctx ); PyErr_Format( PyExc_MemoryError, "GpuConnectionistTemporalClassification: Could not allocate memory for labels and their lengths." ); return 1; } if ( theano_prep_output( out_costs, 1, &cost_size, in_activations->ga.typecode, GA_C_ORDER, gpu_context ) != 0 ) { ctc_context_destroy( context ); cuda_exit( gpu_context->ctx ); return 1; } GpuArray_memset( &((*out_costs)->ga), 0 ); costs = (float *) PyGpuArray_DEV_DATA( *out_costs ); if ( NULL != out_gradients ) // if gradient computation is not disabled { if ( theano_prep_output( out_gradients, 3, grad_dims, in_activations->ga.typecode, GA_C_ORDER, gpu_context ) != 0 ) { ctc_context_destroy( context ); cuda_exit( gpu_context->ctx ); return 1; } GpuArray_memset( &((*out_gradients)->ga), 0 ); gradients = (float *) PyGpuArray_DEV_DATA( *out_gradients ); } ctc_error = ctc_check_result( get_workspace_size( context->label_lengths, context->input_lengths, alphabet_size, minibatch_size, context->options, &gpu_workspace_size ), "Failed to obtain CTC workspace size." ); if ( ctc_error ) // Exception is set by ctc_check_result, return error here { // Destroy previous CTC context before returning exception ctc_context_destroy( context ); cuda_exit( gpu_context->ctx ); return 1; } context->workspace = gpudata_alloc( gpu_context->ctx, gpu_workspace_size, NULL, 0, NULL ); if ( NULL == context->workspace ) { ctc_context_destroy( context ); cuda_exit( gpu_context->ctx ); PyErr_Format( PyExc_MemoryError, "GpuConnectionistTemporalClassification: Failed to allocate memory for CTC workspace." 
); return 1; } cuda_wait( in_activations->ga.data, GPUARRAY_CUDA_WAIT_READ ); cuda_wait( (*out_costs)->ga.data, GPUARRAY_CUDA_WAIT_WRITE ); if ( out_gradients != NULL ) cuda_wait( (*out_gradients)->ga.data, GPUARRAY_CUDA_WAIT_WRITE ); ctc_error = ctc_check_result( compute_ctc_loss( activations, gradients, context->flat_labels, context->label_lengths, context->input_lengths, alphabet_size, minibatch_size, costs, *(void **)context->workspace, context->options ), "Failed to compute CTC loss function." ); cuda_record( in_activations->ga.data, GPUARRAY_CUDA_WAIT_READ ); cuda_record( (*out_costs)->ga.data, GPUARRAY_CUDA_WAIT_WRITE ); if ( out_gradients != NULL ) cuda_record( (*out_gradients)->ga.data, GPUARRAY_CUDA_WAIT_WRITE ); if ( ctc_error ) // Exception is set by ctc_check_result, return error here { ctc_context_destroy( context ); cuda_exit( gpu_context->ctx ); return 1; } ctc_context_destroy( context ); cuda_exit( gpu_context->ctx ); return 0; }
int APPLY_SPECIFIC(conv_gi)(PyGpuArrayObject *kerns, PyGpuArrayObject *output, PyGpuArrayObject *im, cudnnConvolutionDescriptor_t desc, double alpha, double beta, PyGpuArrayObject **input, PyGpuContextObject *c) { cudnnStatus_t err = CUDNN_STATUS_SUCCESS; float af = alpha, bf = beta; void *alpha_p; void *beta_p; if (PyGpuArray_DIMS(im)[1] != PyGpuArray_DIMS(kerns)[1]) { PyErr_SetString(PyExc_ValueError, "images and kernel must have the same " "stack size"); return 1; } if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1) return 1; if (c_set_filter(kerns, APPLY_SPECIFIC(kerns)) == -1) return 1; switch (im->ga.typecode) { case GA_DOUBLE: alpha_p = (void *)&alpha; beta_p = (void *)&beta; break; case GA_FLOAT: case GA_HALF: alpha_p = (void *)&af; beta_p = (void *)&bf; break; default: PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution"); return 1; } #ifdef CONV_INPLACE Py_XDECREF(*input); *input = im; Py_INCREF(*input); #else if (theano_prep_output(input, PyGpuArray_NDIM(im), PyGpuArray_DIMS(im), im->ga.typecode, GA_C_ORDER, c) != 0) return 1; if (beta != 0.0 && pygpu_move(*input, im)) return 1; #endif if (c_set_tensorNd(*input, APPLY_SPECIFIC(input)) == -1) return 1; cudnnConvolutionBwdDataAlgo_t algo = CONV_ALGO; cuda_enter(c->ctx); #ifdef CHOOSE_ALGO static int reuse_algo = 0; static cudnnConvolutionBwdDataAlgo_t prev_algo = CONV_ALGO; #ifndef CHOOSE_ONCE static size_t prev_kern_dims[5] = {0}; static size_t prev_top_dims[5] = {0}; reuse_algo = 1; for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) { reuse_algo = (reuse_algo && PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]); reuse_algo = (reuse_algo && PyGpuArray_DIM(output, i) == prev_top_dims[i]); } #endif if (!reuse_algo) { #ifdef CHOOSE_TIME int count; cudnnConvolutionBwdDataAlgoPerf_t choice; err = cudnnFindConvolutionBackwardDataAlgorithm( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), 1, &count, &choice); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } algo = choice.algo; #else size_t free = 0, total = 0; cudaError_t err2 = cudaMemGetInfo(&free, &total); if (err2 != cudaSuccess) { cudaGetLastError(); PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory " "information on the GPU: %s\n", cudaGetErrorString(err2)); cuda_exit(c->ctx); return 1; } err = cudnnGetConvolutionBackwardDataAlgorithm( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, free, &algo); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } #endif prev_algo = algo; } else { algo = prev_algo; } #ifdef CHOOSE_ONCE reuse_algo = 1; #else for (unsigned int i = 0; i < PyGpuArray_NDIM(kerns); i++) { prev_kern_dims[i] = PyGpuArray_DIM(kerns, i); prev_top_dims[i] = PyGpuArray_DIM(output, i); } #endif #endif #if CUDNN_VERSION > 3000 if (algo == CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT) { int nd; int pad[2]; int stride[2]; int upscale[2]; cudnnConvolutionMode_t mode; err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, upscale, &mode); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } if (stride[0] != 1 || stride[1] != 1 || PyGpuArray_DIM(*input,
2) > 1024 || PyGpuArray_DIM(*input, 3) > 1024 || (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) { algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_0; } } #endif size_t worksize; gpudata *workspace; err = cudnnGetConvolutionBackwardDataWorkspaceSize( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(kerns), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(input), algo, &worksize); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } if (worksize != 0) { workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL); if (workspace == NULL) { PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory"); cuda_exit(c->ctx); return 1; } } cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); err = cudnnConvolutionBackwardData_v3( APPLY_SPECIFIC(_handle), alpha_p, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize, beta_p, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(*input)); if (worksize != 0) c->ops->buffer_release(workspace); cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record((*input)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_exit(c->ctx); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", cudnnGetErrorString(err)); return 1; } return 0; }
static int sgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, float alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, float beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount) { cuda_context *ctx; blas_handle *h; size_t *lt, t; gpudata **T; size_t i; const size_t threshold = 650; cb_transpose transT; if (batchCount == 0) return GA_NO_ERROR; if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) return GA_XLARGE_ERROR; ASSERT_BUF(A[0]); ctx = A[0]->ctx; h = (blas_handle *)ctx->blas_handle; cuda_enter(ctx); if (order == cb_c) { /* swap A and B */ t = N; N = M; M = t; T = A; A = B; B = T; t = lda; lda = ldb; ldb = t; transT = transA; transA = transB; transB = transT; lt = offA; offA = offB; offB = lt; } /* use parallel cublasSgemm calls rather than cublasSgemmBatched for * large products */ if (M * N * K > threshold * threshold * threshold) { for (i = 0; i < batchCount; i++) { ASSERT_BUF(A[i]); ASSERT_BUF(B[i]); ASSERT_BUF(C[i]); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C[i], CUDA_WAIT_ALL)); h->err = cublasSgemm(h->h, convT(transA), convT(transB), M, N, K, &alpha, (float*)A[i]->ptr + offA[i], lda, (float*)B[i]->ptr + offB[i], ldb, &beta, (float*)C[i]->ptr + offC[i], ldc); if (h->err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR; return GA_BLAS_ERROR; } GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C[i], CUDA_WAIT_ALL)); } } else { float **T_l = alloca(sizeof(float *) * batchCount * 3); const float **A_l = (const float **)T_l; const float **B_l = (const float **)T_l + batchCount; float **C_l = T_l + (batchCount * 2); CUdeviceptr Ta, Aa, Ba, Ca; for (i = 0; i < batchCount; i++) { ASSERT_BUF(A[i]); ASSERT_BUF(B[i]); ASSERT_BUF(C[i]); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C[i], CUDA_WAIT_ALL)); A_l[i] = ((float *)A[i]->ptr) + offA[i]; B_l[i] = ((float *)B[i]->ptr) + offB[i]; C_l[i] = ((float *)C[i]->ptr) + offC[i]; } cuMemAlloc(&Ta, sizeof(float *) * batchCount * 3); Aa = Ta; Ba = Ta + (batchCount * sizeof(float *)); Ca = Ta + (batchCount * sizeof(float *) * 2); cuMemcpyHtoD(Ta, T_l, sizeof(float *) * batchCount * 3); h->err = cublasSgemmBatched(h->h, convT(transA), convT(transB), M, N, K, &alpha, (const float **)Aa, lda, (const float **)Ba, ldb, &beta, (float **)Ca, ldc, batchCount); cuMemFree(Ta); if (h->err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR; return GA_BLAS_ERROR; } for (i = 0; i < batchCount; i++) { GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B[i], CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C[i], CUDA_WAIT_ALL)); } } cuda_exit(ctx); return GA_NO_ERROR; }
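/* For reference, the dispatch cutoff used in sgemmBatch() above (and dgemmBatch() below) works out to 650 * 650 * 650 = 274,625,000 multiply-accumulates per matrix product; larger problems are issued as individual GEMM calls instead of one batched call. Sketch of the check in isolation: */
static int use_individual_gemms_sketch(size_t M, size_t N, size_t K) {
  const size_t threshold = 650;
  return M * N * K > threshold * threshold * threshold;  /* ~2.7e8 */
}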
static int dgemm(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata *A, size_t offA, size_t lda, gpudata *B, size_t offB, size_t ldb, double beta, gpudata *C, size_t offC, size_t ldc) { cuda_context *ctx = A->ctx; blas_handle *h = (blas_handle *)ctx->blas_handle; gpudata *T; size_t t; cb_transpose transT; ASSERT_BUF(A); ASSERT_BUF(B); ASSERT_BUF(C); if (LARGE_VAL(M) || LARGE_VAL(N) || LARGE_VAL(K) || LARGE_VAL(lda) || LARGE_VAL(ldb) || LARGE_VAL(ldc) || LARGE_VAL(M * N) || LARGE_VAL(M * K) || LARGE_VAL(K * N)) return GA_XLARGE_ERROR; if (order == cb_c) { /* swap A and B */ t = N; N = M; M = t; T = A; A = B; B = T; t = lda; lda = ldb; ldb = t; transT = transA; transA = transB; transB = transT; t = offA; offA = offB; offB = t; } cuda_enter(ctx); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(B, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_wait(C, CUDA_WAIT_ALL)); h->err = cublasDgemm(h->h, convT(transA), convT(transB), M, N, K, &alpha, ((double *)A->ptr) + offA, lda, ((double *)B->ptr) + offB, ldb, &beta, ((double *)C->ptr) + offC, ldc); if (h->err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (h->err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR; return GA_BLAS_ERROR; } GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(A, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(B, CUDA_WAIT_READ)); GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(C, CUDA_WAIT_ALL)); cuda_exit(ctx); return GA_NO_ERROR; }
static int dgerBatch(cb_order order, size_t M, size_t N, double alpha, gpudata **x, size_t *offX, size_t incX, gpudata **y, size_t *offY, size_t incY, gpudata **A, size_t *offA, size_t lda, size_t batchCount, int flags) { cuda_context *ctx; size_t t, *tp, i; size_t ls[3] = {M, N, 1}, gs[3] = {1, 1, batchCount}; void *args[10]; gpudata **T; gpudata *Aa, *xa, *ya; int err; if (flags != 0) return GA_INVALID_ERROR; if (batchCount == 0) return GA_NO_ERROR; if (incX == 1) { if (ls[0] > 32) { gs[0] = (ls[0] + 31) / 32; ls[0] = 32; } if (ls[0] * ls[1] > 512) { gs[1] = (ls[1] + 15) / 16; ls[1] = 16; } } else { if (ls[1] > 32) { gs[1] = (ls[1] + 31) / 32; ls[1] = 32; } if (ls[0] * ls[1] > 512) { gs[0] = (ls[0] + 15) / 16; ls[0] = 16; } } if (gs[0] * gs[1] * gs[2] > 65535) { if (gs[0] * gs[1] > 65535) return GA_VALUE_ERROR; gs[2] = (65535 / (gs[0] * gs[1])); } if (order == cb_c) { t = M; M = N; N = t; tp = offX; offX = offY; offY = tp; t = incX; incX = incY; incY = t; T = x; x = y; y = T; } ASSERT_BUF(x[0]); ctx = x[0]->ctx; cuda_enter(ctx); { double **T_l = alloca(sizeof(double *) * batchCount * 3); const double **A_l = (const double **)T_l; const double **x_l = (const double **)T_l + batchCount; double **y_l = T_l + (batchCount * 2); for (i = 0; i < batchCount; i++) { ASSERT_BUF(A[i]); ASSERT_BUF(x[i]); ASSERT_BUF(y[i]); cuda_wait(A[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); cuda_wait(x[i], CUDA_WAIT_READ); cuda_wait(y[i], CUDA_WAIT_READ); A_l[i] = (double *)(A[i]->ptr + offA[i]); x_l[i] = (double *)(x[i]->ptr + offX[i]); y_l[i] = (double *)(y[i]->ptr + offY[i]); } Aa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, A_l, GA_BUFFER_INIT, &err); if (Aa == NULL) return err; xa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, x_l, GA_BUFFER_INIT, &err); if (xa == NULL) { cuda_ops.buffer_release(Aa); return err; } ya = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, y_l, GA_BUFFER_INIT, &err); if (ya == NULL) { cuda_ops.buffer_release(Aa); cuda_ops.buffer_release(xa); return err; } } args[0] = xa; args[1] = &incX; args[2] = ya; args[3] = &incY; args[4] = &alpha; args[5] = Aa; args[6] = &lda; args[7] = &batchCount; args[8] = &M; args[9] = &N; err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgerBH_gen_small, 3, ls, gs, 0, args); cuda_ops.buffer_release(Aa); cuda_ops.buffer_release(xa); cuda_ops.buffer_release(ya); if (err != GA_NO_ERROR) { cuda_exit(ctx); return err; } for (i = 0; i < batchCount; i++) { cuda_record(A[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); cuda_record(x[i], CUDA_WAIT_READ); cuda_record(y[i], CUDA_WAIT_READ); } cuda_exit(ctx); return GA_NO_ERROR; }
int APPLY_SPECIFIC(conv_fwd)(PyGpuArrayObject *input, PyGpuArrayObject *kerns, PyGpuArrayObject *om, cudnnConvolutionDescriptor_t desc, double alpha, double beta, PyGpuArrayObject **output, PARAMS_TYPE* params) { PyGpuContextObject *c = input->context; void *alpha_p; void *beta_p; float af = alpha, bf = beta; cudnnStatus_t err = CUDNN_STATUS_SUCCESS; if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(kerns)[1] * params->num_groups) { PyErr_SetString(PyExc_ValueError, "images and kernel must have the same stack size"); return 1; } if ((PyGpuArray_DIMS(kerns)[0] % params->num_groups) != 0) { PyErr_SetString(PyExc_ValueError, "Number of filters must be divisible by number of groups"); return 1; } switch (input->ga.typecode) { case GA_DOUBLE: alpha_p = (void *)&alpha; beta_p = (void *)&beta; break; case GA_FLOAT: case GA_HALF: alpha_p = (void *)&af; beta_p = (void *)&bf; break; default: PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution"); return 1; } if (params->inplace) { Py_XDECREF(*output); *output = om; Py_INCREF(*output); } else { if (theano_prep_output(output, PyGpuArray_NDIM(om), PyGpuArray_DIMS(om), om->ga.typecode, GA_C_ORDER, c) != 0) return 1; if (beta != 0.0 && pygpu_move(*output, om)) return 1; } if (PyGpuArray_DIMS(input)[0] == 0 || PyGpuArray_DIMS(kerns)[0] == 0 || PyGpuArray_DIMS(kerns)[1] == 0) { int err2 = GpuArray_memset(&(*output)->ga, 0); if (err2 != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "GpuDnnConv could not fill the output with zeros: %d", err2); return 1; } return 0; } if (c_set_tensor_for_conv(input, APPLY_SPECIFIC(input), params->num_groups) == -1) return 1; if (c_set_filter(kerns, APPLY_SPECIFIC(kerns), params->num_groups) == -1) return 1; if (c_set_tensor_for_conv(*output, APPLY_SPECIFIC(output), params->num_groups) == -1) return 1; size_t input_offset = PyGpuArray_STRIDE(input, 0) / params->num_groups; size_t kern_offset = PyGpuArray_STRIDE(kerns, 0) * PyGpuArray_DIM(kerns, 0) / params->num_groups; size_t output_offset = PyGpuArray_STRIDE(*output, 0) / params->num_groups; cudnnConvolutionFwdAlgo_t algo = params->conv_algo; #ifdef DEBUG char algorithm_name[128]; #endif cuda_enter(c->ctx); if (params->choose_algo) { if (!params->choose_once) { reuse_algo = 1; for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]); reuse_algo = (reuse_algo && PyGpuArray_DIM(kerns, i) == prev_kern_dims[i]); } } if (!reuse_algo) { size_t free; int err2 = gpucontext_property(c->ctx, GA_CTX_PROP_LARGEST_MEMBLOCK, &free); if (err2 != GA_NO_ERROR) { PyErr_Format(PyExc_RuntimeError, "Error when trying to find the " "memory information on the GPU"); cuda_exit(c->ctx); return 1; } // Guess 4Mb if the info is not available if (free == 0) free = 4 * 1024 * 1024; if (params->choose_time) { int count; cudnnConvolutionFwdAlgoPerf_t choice; gpudata *tmpmem; tmpmem = gpudata_alloc(c->ctx, free, NULL, 0, NULL); if (tmpmem == NULL) { PyErr_SetString(PyExc_MemoryError, "Could not allocate working GPU memory"); return -1; } // We don't sync the buffer as we don't care about the values.
err = cudnnFindConvolutionForwardAlgorithmEx( params->handle, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(kerns), desc, APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(*output), 1, &count, &choice, *(void **)tmpmem, free); gpudata_release(tmpmem); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } algo = choice.algo; #ifdef DEBUG if (count == 0) { PyErr_SetString(PyExc_RuntimeError, "No best-timed conv fwd algorithm found"); return 1; } else if (choice.status != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting best-timed FWD algo: %s", cudnnGetErrorString(choice.status)); return 1; } // Else, count is necessarily 1 for current implementation. #endif } else { err = cudnnGetConvolutionForwardAlgorithm( params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, free, &algo); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } } prev_algo = algo; } else { algo = prev_algo; } #ifdef DEBUG if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) return 1; // NB: This is printed only when algorithm is chosen at runtime. if (reuse_algo) fprintf(stderr, "(reused %s)\n", algorithm_name); else fprintf(stderr, "(using %s)\n", algorithm_name); #endif if (params->choose_once) { reuse_algo = 1; } else { for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { prev_img_dims[i] = PyGpuArray_DIM(input, i); prev_kern_dims[i] = PyGpuArray_DIM(kerns, i); } } } /* Only these algos are supported for 3d conv with cuDNN >= V5.1. */ if (PyGpuArray_NDIM(input) == 5 && !(algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM || algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM || algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING)) { #ifdef DEBUG if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) return 1; fprintf(stderr, "(%s unsupported for 3D: fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name); #endif algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } // Algo `small` does not work for a batch size > 2^16, with cuDNN >= V5.1. // Issue should be resolved for cuDNN > V6.0. if (cudnnGetVersion() < 6100 && algo == CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM && PyGpuArray_DIM(input, 0) > 65536) { #ifdef DEBUG fprintf(stderr, "(CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM " "will fail with batch size > 2^16, fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n"); #endif algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } // The FFT implementation does not support strides, 1x1 filters or inputs // with a spatial dimension larger than 1024. The tiled-FFT implementation // does not support strides. // If the chosen implementation is FFT or tiled-FFT, validate that it can // be used on the current data and default to a safe implementation if it // can't. // The following code is 2d-specific but it is fine as FFT and tiled-FFT are // defined only for 2d filters /* NB: TODO: These checks seem outdated for FFT algorithms with cuDNN >= 5.1. New conditions apply and may depend on number of dimensions (2D or 3D) e.g. for FFT_TILING. TODO: More globally, how to handle CUDNN_STATUS_NOT_SUPPORTED with unsupported algorithms?
*/ if ((algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT || algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) && PyGpuArray_NDIM(input) == 4) { // Extract the properties of the convolution descriptor int nd; int pad[2]; int stride[2]; int dilation[2]; cudnnConvolutionMode_t mode; cudnnDataType_t data_type; err = cudnnGetConvolutionNdDescriptor(desc, 2, &nd, pad, stride, dilation, &mode, &data_type); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } if (algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT) { if (stride[0] != 1 || stride[1] != 1 || PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 || (PyGpuArray_DIM(kerns, 2) == 1 && PyGpuArray_DIM(kerns, 3) == 1)) { algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } } else { // algo == CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING if (stride[0] != 1 || stride[1] != 1) { algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; } } } { size_t worksize; gpudata *workspace; err = cudnnGetConvolutionForwardWorkspaceSize(params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), algo, &worksize); if (err == CUDNN_STATUS_NOT_SUPPORTED) { // Fallback to none algo if not supported #ifdef DEBUG if (0 != theano_enum_to_string_cudnnConvolutionFwdAlgo_t(algo, algorithm_name)) return 1; fprintf(stderr, "(%s error getting worksize: " "fallback to CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM)\n", algorithm_name); #endif algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM; err = cudnnGetConvolutionForwardWorkspaceSize(params->handle, APPLY_SPECIFIC(input), APPLY_SPECIFIC(kerns), desc, APPLY_SPECIFIC(output), algo, &worksize); } if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } /* * This is less than ideal since we need to free it after (which * introduces a synchronization point. But we don't have a module * to place a nice get_work_mem() function in. */ if (worksize != 0) { workspace = gpudata_alloc(c->ctx, worksize, NULL, 0, NULL); if (workspace == NULL) { PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory"); cuda_exit(c->ctx); return 1; } } cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); for ( int g = 0; g < params->num_groups; g++) { err = cudnnConvolutionForward( params->handle, alpha_p, APPLY_SPECIFIC(input), ((char *)PyGpuArray_DEV_DATA(input)) + input_offset * g, APPLY_SPECIFIC(kerns), ((char *)PyGpuArray_DEV_DATA(kerns)) + kern_offset * g, desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize, beta_p, APPLY_SPECIFIC(output), ((char *)PyGpuArray_DEV_DATA(*output)) + output_offset * g); } if (worksize != 0) gpudata_release(workspace); cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(kerns->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record((*output)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); } cuda_exit(c->ctx); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", cudnnGetErrorString(err)); return 1; } return 0; }
int APPLY_SPECIFIC(conv_gw)(PyGpuArrayObject *input, PyGpuArrayObject *output, PyGpuArrayObject *km, cudnnConvolutionDescriptor_t desc, double alpha, double beta, PyGpuArrayObject **kerns, PyGpuContextObject *c) { cudnnStatus_t err = CUDNN_STATUS_SUCCESS; float af = alpha, bf = beta; void *alpha_p; void *beta_p; if (PyGpuArray_DIMS(input)[1] != PyGpuArray_DIMS(km)[1]) { PyErr_SetString(PyExc_ValueError, "GpuDnnConv images and kernel must have the same stack size"); return 1; } if (c_set_tensorNd(input, APPLY_SPECIFIC(input)) == -1) return 1; if (c_set_tensorNd(output, APPLY_SPECIFIC(output)) == -1) return 1; switch (input->ga.typecode) { case GA_DOUBLE: alpha_p = (void *)&alpha; beta_p = (void *)&beta; break; case GA_FLOAT: case GA_HALF: alpha_p = (void *)&af; beta_p = (void *)&bf; break; default: PyErr_SetString(PyExc_TypeError, "Unsupported type in convolution"); return 1; } #ifdef CONV_INPLACE Py_XDECREF(*kerns); *kerns = km; Py_INCREF(*kerns); #else if (theano_prep_output(kerns, PyGpuArray_NDIM(km), PyGpuArray_DIMS(km), km->ga.typecode, GA_C_ORDER, c) != 0) return 1; if (beta != 0.0 && pygpu_move(*kerns, km)) return 1; #endif if (c_set_filter(*kerns, APPLY_SPECIFIC(kerns)) == -1) return 1; cudnnConvolutionBwdFilterAlgo_t algo = CONV_ALGO; cuda_enter(c->ctx); #ifdef CHOOSE_ALGO static int reuse_algo = 0; static cudnnConvolutionBwdFilterAlgo_t prev_algo = CONV_ALGO; #ifndef CHOOSE_ONCE static size_t prev_img_dims[5] = {0}; static size_t prev_top_dims[5] = {0}; reuse_algo = 1; for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { reuse_algo = (reuse_algo && PyGpuArray_DIM(input, i) == prev_img_dims[i]); reuse_algo = (reuse_algo && PyGpuArray_DIM(output, i) == prev_top_dims[i]); } #endif if (!reuse_algo) { #ifdef CHOOSE_TIME int count; cudnnConvolutionBwdFilterAlgoPerf_t choice; err = cudnnFindConvolutionBackwardFilterAlgorithm( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), 1, &count, &choice); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } algo = choice.algo; #else size_t free = 0, total = 0; cudaError_t err2 = cudaMemGetInfo(&free, &total); if (err2 != cudaSuccess){ cudaGetLastError(); PyErr_Format(PyExc_RuntimeError, "Error when trying to find the memory " "information on the GPU: %s\n", cudaGetErrorString(err2)); cuda_exit(c->ctx); return 1; } err = cudnnGetConvolutionBackwardFilterAlgorithm( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, free, &algo); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error selecting convolution algo: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } #endif prev_algo = algo; } else { algo = prev_algo; } #ifdef CHOOSE_ONCE reuse_algo = 1; #else for (unsigned int i = 0; i < PyGpuArray_NDIM(input); i++) { prev_img_dims[i] = PyGpuArray_DIM(input, i); prev_top_dims[i] = PyGpuArray_DIM(output, i); } #endif #endif // The FFT implementation does not support strides, 1x1 filters or inputs // with a spatial dimension larger than 1024. // If the chosen implementation is FFT, validate that it can // be used on the current data and default to a safe implementation if it // can't.
// The following code is 2d-specific but it is fine as FFT and tiled-FFT are // defined only for 2d filters if (algo == CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT && PyGpuArray_NDIM(input) == 4) { // Extract the properties of the convolution descriptor int nd; int pad[2]; int stride[2]; int upscale[2]; cudnnConvolutionMode_t mode; cudnnDataType_t data_type; err = cudnnGetConvolutionNdDescriptor_v3(desc, 2, &nd, pad, stride, upscale, &mode, &data_type); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting convolution properties: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } if (stride[0] != 1 || stride[1] != 1 || PyGpuArray_DIM(input, 2) > 1024 || PyGpuArray_DIM(input, 3) > 1024 || (PyGpuArray_DIM(*kerns, 2) == 1 && PyGpuArray_DIM(*kerns, 3) == 1)) { algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0; } } size_t worksize; gpudata *workspace; err = cudnnGetConvolutionBackwardFilterWorkspaceSize( APPLY_SPECIFIC(_handle), APPLY_SPECIFIC(input), APPLY_SPECIFIC(output), desc, APPLY_SPECIFIC(kerns), algo, &worksize); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error getting worksize: %s", cudnnGetErrorString(err)); cuda_exit(c->ctx); return 1; } if (worksize != 0) { workspace = c->ops->buffer_alloc(c->ctx, worksize, NULL, 0, NULL); if (workspace == NULL) { PyErr_SetString(PyExc_RuntimeError, "Could not allocate working memory"); cuda_exit(c->ctx); return 1; } } cuda_wait(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_wait((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); err = cudnnConvolutionBackwardFilter_v3( APPLY_SPECIFIC(_handle), alpha_p, APPLY_SPECIFIC(input), PyGpuArray_DEV_DATA(input), APPLY_SPECIFIC(output), PyGpuArray_DEV_DATA(output), desc, algo, worksize == 0 ? NULL : *(void **)workspace, worksize, beta_p, APPLY_SPECIFIC(kerns), PyGpuArray_DEV_DATA(*kerns)); if (worksize != 0) c->ops->buffer_release(workspace); cuda_record(input->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record(output->ga.data, GPUARRAY_CUDA_WAIT_READ); cuda_record((*kerns)->ga.data, GPUARRAY_CUDA_WAIT_WRITE); cuda_exit(c->ctx); if (err != CUDNN_STATUS_SUCCESS) { PyErr_Format(PyExc_RuntimeError, "error doing operation: %s", cudnnGetErrorString(err)); return 1; } return 0; }
static int dgemvBatch(cb_order order, cb_transpose transA, size_t M, size_t N, double alpha, gpudata **A, size_t *offA, size_t lda, gpudata **x, size_t *offX, size_t incX, double beta, gpudata **y, size_t *offY, size_t incY, size_t batchCount, int flags) { cuda_context *ctx; size_t t, i; size_t ls[2], gs[2]; void *args[9]; gpudata *Aa, *xa, *ya; int err; if (flags != 0) return GA_INVALID_ERROR; if (batchCount == 0) return GA_NO_ERROR; if (alpha != 1.0 || beta != 1.0) return GA_UNSUPPORTED_ERROR; if (M < 512) { ls[0] = 32; if (batchCount > 16) ls[1] = 16; else ls[1] = batchCount; } else { ls[0] = 512; ls[1] = 1; } gs[0] = (M + ls[0] - 1) / ls[0]; gs[1] = (batchCount + ls[1] - 1) / ls[1]; if (gs[0] * gs[1] / 65535) { gs[1] = (65535 / gs[0]); } if (order == cb_c) { t = N; N = M; M = t; if (transA == cb_no_trans) { transA = cb_trans; } else { transA = cb_no_trans; } } ASSERT_BUF(A[0]); ctx = A[0]->ctx; cuda_enter(ctx); { double **T_l = alloca(sizeof(double *) * batchCount * 3); const double **A_l = (const double **)T_l; const double **x_l = (const double **)T_l + batchCount; double **y_l = T_l + (batchCount * 2); for (i = 0; i < batchCount; i++) { ASSERT_BUF(A[i]); ASSERT_BUF(x[i]); ASSERT_BUF(y[i]); cuda_wait(A[i], CUDA_WAIT_READ); cuda_wait(x[i], CUDA_WAIT_READ); cuda_wait(y[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); A_l[i] = (double *)(A[i]->ptr + offA[i]); x_l[i] = (double *)(x[i]->ptr + offX[i]); y_l[i] = (double *)(y[i]->ptr + offY[i]); } Aa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, A_l, GA_BUFFER_INIT, &err); if (Aa == NULL) return err; xa = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, x_l, GA_BUFFER_INIT, &err); if (xa == NULL) { cuda_ops.buffer_release(Aa); return err; } ya = cuda_ops.buffer_alloc(ctx, sizeof(double *) * batchCount, y_l, GA_BUFFER_INIT, &err); if (ya == NULL) { cuda_ops.buffer_release(Aa); cuda_ops.buffer_release(xa); return err; } } args[0] = Aa; args[1] = &lda; args[2] = xa; args[3] = &incX; args[4] = ya; args[5] = &incY; args[6] = &batchCount; args[7] = &M; args[8] = &N; if (transA == cb_no_trans) { err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small, 2, ls, gs, 0, args); } else { err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small, 2, ls, gs, 0, args); } cuda_ops.buffer_release(Aa); cuda_ops.buffer_release(xa); cuda_ops.buffer_release(ya); if (err != GA_NO_ERROR) { cuda_exit(ctx); return err; } for (i = 0; i < batchCount; i++) { cuda_record(A[i], CUDA_WAIT_READ); cuda_record(x[i], CUDA_WAIT_READ); cuda_record(y[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); } cuda_exit(ctx); return GA_NO_ERROR; }
static int dgemmBatch(cb_order order, cb_transpose transA, cb_transpose transB, size_t M, size_t N, size_t K, double alpha, gpudata **A, size_t *offA, size_t lda, gpudata **B, size_t *offB, size_t ldb, double beta, gpudata **C, size_t *offC, size_t ldc, size_t batchCount) { cuda_context *ctx; size_t *lt, t; gpudata **T; size_t i; cb_transpose transT; cublasStatus_t err; if (batchCount == 0) return GA_NO_ERROR; ASSERT_BUF(A[0]); ctx = A[0]->ctx; cuda_enter(ctx); if (order == cb_c) { /* swap A and B */ t = N; N = M; M = t; T = A; A = B; B = T; t = lda; lda = ldb; ldb = t; transT = transA; transA = transB; transB = transT; lt = offA; offA = offB; offB = lt; } // use parallel cublasSgemm calls rather than cublasSgemmBatched for large products const size_t threshold = 650; const int multiple_dispatch = M * N * K > threshold * threshold * threshold; if (multiple_dispatch) { for (i = 0; i < batchCount; i++) { ASSERT_BUF(A[i]); ASSERT_BUF(B[i]); ASSERT_BUF(C[i]); cuda_wait(A[i], CUDA_WAIT_READ); cuda_wait(B[i], CUDA_WAIT_READ); cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); err = cublasDgemm(((blas_handle *)ctx->blas_handle)->h, convT(transA), convT(transB), M, N, K, &alpha, (double*)A[i]->ptr + offA[i], lda, (double*)B[i]->ptr + offB[i], ldb, &beta, (double*)C[i]->ptr + offC[i], ldc); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR; return GA_BLAS_ERROR; } cuda_record(A[i], CUDA_WAIT_READ); cuda_record(B[i], CUDA_WAIT_READ); cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); } } else { double **T_l = alloca(sizeof(double *) * batchCount * 3); const double **A_l = (const double **)T_l; const double **B_l = (const double **)T_l + batchCount; double **C_l = T_l + (batchCount * 2); CUdeviceptr Ta, Aa, Ba, Ca; for (i = 0; i < batchCount; i++) { ASSERT_BUF(A[i]); ASSERT_BUF(B[i]); ASSERT_BUF(C[i]); cuda_wait(A[i], CUDA_WAIT_READ); cuda_wait(B[i], CUDA_WAIT_READ); cuda_wait(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); A_l[i] = ((double *)A[i]->ptr) + offA[i]; B_l[i] = ((double *)B[i]->ptr) + offB[i]; C_l[i] = ((double *)C[i]->ptr) + offC[i]; } cuMemAlloc(&Ta, sizeof(double *) * batchCount * 3); Aa = Ta; Ba = Ta + (batchCount * sizeof(double *)); Ca = Ta + (batchCount * sizeof(double *) * 2); cuMemcpyHtoD(Ta, T_l, sizeof(double *) * batchCount * 3); err = cublasDgemmBatched(((blas_handle *)ctx->blas_handle)->h, convT(transA), convT(transB), M, N, K, &alpha, (const double **)Aa, lda, (const double **)Ba, ldb, &beta, (double **)Ca, ldc, batchCount); cuMemFree(Ta); if (err != CUBLAS_STATUS_SUCCESS) { cuda_exit(ctx); if (err == CUBLAS_STATUS_ARCH_MISMATCH) return GA_DEVSUP_ERROR; return GA_BLAS_ERROR; } for (i = 0; i < batchCount; i++) { cuda_record(A[i], CUDA_WAIT_READ); cuda_record(B[i], CUDA_WAIT_READ); cuda_record(C[i], CUDA_WAIT_READ|CUDA_WAIT_WRITE); } } cuda_exit(ctx); return GA_NO_ERROR; }