/* Copy `sz` bytes starting at `offset` in `src` into a buffer owned by
 * the context `dst_c`.  Within a single context this is an asynchronous
 * device-to-device copy (or a shared reference when permitted); across
 * contexts it is a peer-to-peer copy on the destination's memory stream. */
static gpudata *cuda_transfer(gpudata *src, size_t offset, size_t sz,
                              void *dst_c, int may_share) {
  cuda_context *ctx = src->ctx;
  cuda_context *dst_ctx = (cuda_context *)dst_c;
  gpudata *dst;

  ASSERT_BUF(src);
  ASSERT_CTX(ctx);
  ASSERT_CTX(dst_ctx);

  if (ctx == dst_ctx) {
    /* Same context: if sharing is allowed and the requested view starts
     * at the beginning of the buffer, hand back a new reference instead
     * of copying. */
    if (may_share && offset == 0) {
      cuda_retain(src);
      return src;
    }
    dst = cuda_alloc(ctx, sz, NULL, 0, NULL);
    if (dst == NULL)
      return NULL;
    cuda_enter(ctx);

    /* Order the copy after pending work: src must be safe to read,
     * dst must be safe to write. */
    cuda_wait(src, CUDA_WAIT_READ);
    cuda_wait(dst, CUDA_WAIT_WRITE);

    ctx->err = cuMemcpyDtoDAsync(dst->ptr, src->ptr + offset, sz, ctx->s);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      cuda_free(dst);
      return NULL;
    }

    cuda_record(src, CUDA_WAIT_READ);
    cuda_record(dst, CUDA_WAIT_WRITE);

    cuda_exit(ctx);
    return dst;
  }

  /* Cross-context: allocate in the destination context and perform a
   * peer copy, synchronized on the destination's memory stream. */
  dst = cuda_alloc(dst_ctx, sz, NULL, 0, NULL);
  if (dst == NULL)
    return NULL;
  cuda_enter(ctx);

  cuda_waits(src, CUDA_WAIT_READ, dst_ctx->mem_s);
  cuda_waits(dst, CUDA_WAIT_WRITE, dst_ctx->mem_s);

  ctx->err = cuMemcpyPeerAsync(dst->ptr, dst->ctx->ctx, src->ptr + offset,
                               src->ctx->ctx, sz, dst_ctx->mem_s);
  if (ctx->err != CUDA_SUCCESS) {
    cuda_free(dst);
    cuda_exit(ctx);
    return NULL;
  }

  cuda_records(dst, CUDA_WAIT_WRITE, dst_ctx->mem_s);
  cuda_records(src, CUDA_WAIT_READ, dst_ctx->mem_s);

  cuda_exit(ctx);
  return dst;
}
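/* A minimal usage sketch for cuda_transfer().  The helper copy_to_peer()
 * is hypothetical (not part of the backend), and it assumes `src` and
 * `peer_ctx` are valid handles obtained through the backend's usual
 * allocation and initialization paths; only cuda_transfer() and
 * cuda_free() from the code above are relied on. */
static int copy_to_peer(gpudata *src, cuda_context *peer_ctx, size_t n) {
  /* may_share = 0 forces a real copy even when src already lives in
   * peer_ctx; offset 0 copies from the start of the buffer. */
  gpudata *dst = cuda_transfer(src, 0, n, peer_ctx, 0);
  if (dst == NULL)
    return -1;  /* allocation failed or the async copy could not be issued */
  /* ... use dst in peer_ctx ... */
  cuda_free(dst);
  return 0;
}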
/* R-level wrapper around cuMemcpyPeerAsync(): unpacks the R arguments,
 * issues the asynchronous peer copy, and returns the CUresult status
 * converted to an R enum value. */
SEXP R_auto_cuMemcpyPeerAsync(SEXP r_dstDevice, SEXP r_dstContext,
                              SEXP r_srcDevice, SEXP r_srcContext,
                              SEXP r_ByteCount, SEXP r_hStream) {
  SEXP r_ans = R_NilValue;
  /* Device pointers and the byte count arrive as R doubles; contexts
   * and streams arrive as external-pointer references. */
  CUdeviceptr dstDevice = REAL(r_dstDevice)[0];
  CUcontext dstContext = (CUcontext) getRReference(r_dstContext);
  CUdeviceptr srcDevice = REAL(r_srcDevice)[0];
  CUcontext srcContext = (CUcontext) getRReference(r_srcContext);
  size_t ByteCount = REAL(r_ByteCount)[0];
  CUstream hStream = (CUstream) getRReference(r_hStream);
  CUresult ans;

  ans = cuMemcpyPeerAsync(dstDevice, dstContext, srcDevice, srcContext,
                          ByteCount, hStream);
  r_ans = Renum_convert_CUresult(ans);
  return r_ans;
}
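/* A sketch of how a wrapper like this is typically exposed to R's
 * .Call() interface, using R's standard dynamic-loading API
 * (R_ext/Rdynload.h).  The package name "RCUDA" in R_init_RCUDA is an
 * assumption for illustration; the argument count (6) matches the
 * wrapper's signature above. */
#include <R_ext/Rdynload.h>

static const R_CallMethodDef callMethods[] = {
  {"R_auto_cuMemcpyPeerAsync", (DL_FUNC) &R_auto_cuMemcpyPeerAsync, 6},
  {NULL, NULL, 0}
};

/* Called by R when the shared library is loaded; registers the .Call
 * entry points and disables dynamic symbol lookup. */
void R_init_RCUDA(DllInfo *info) {
  R_registerRoutines(info, NULL, callMethods, NULL, NULL);
  R_useDynamicSymbols(info, FALSE);
}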