/* Insert @vas into its parent device's VAS list.
   The device's vas_lock is held (IRQ-save style) around the list insertion. */
static void __gdev_vas_list_add(struct gdev_vas *vas)
{
	unsigned long flags;
	struct gdev_device *dev = vas->gdev;

	gdev_lock_save(&dev->vas_lock, &flags);
	gdev_list_add(&vas->list_entry, &dev->vas_list);
	gdev_unlock_restore(&dev->vas_lock, &flags);
}
/**
 * Invokes the kernel f on a grid_width x grid_height grid of blocks. Each
 * block contains the number of threads specified by a previous call to
 * cuFuncSetBlockShape().
 *
 * Parameters:
 * f - Kernel to launch
 * grid_width - Width of grid in blocks
 * grid_height - Height of grid in blocks
 *
 * Returns:
 * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED,
 * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE,
 * CUDA_ERROR_LAUNCH_FAILED, CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
 * CUDA_ERROR_LAUNCH_TIMEOUT, CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING
 */
CUresult cuLaunchGrid(CUfunction f, int grid_width, int grid_height)
{
	struct CUfunc_st *func = f;
	struct CUmod_st *mod;
	struct CUctx_st *ctx;
	struct gdev_kernel *k;
	struct gdev_cuda_fence *fence;
	Ghandle handle;

	if (!gdev_initialized)
		return CUDA_ERROR_NOT_INITIALIZED;
	/* validate @f before touching it: the previous code dereferenced
	   func->mod and mod->ctx ahead of the NULL check, crashing on f == NULL. */
	if (!func || grid_width <= 0 || grid_height <= 0)
		return CUDA_ERROR_INVALID_VALUE;
	mod = func->mod;
	ctx = mod->ctx;
	if (!ctx || ctx != gdev_ctx_current)
		return CUDA_ERROR_INVALID_CONTEXT;

	if (!(fence = (struct gdev_cuda_fence *)MALLOC(sizeof(*fence))))
		return CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES;

	/* fill in the per-launch grid geometry; the block shape and other
	   kernel state were set by earlier cuFuncSet* / cuParamSet* calls. */
	k = &func->kernel;
	k->grid_x = grid_width;
	k->grid_y = grid_height;
	k->grid_z = 1;
	k->grid_id = ++ctx->launch_id;
	/* shared memory sits at the aligned base; local memory follows it. */
	k->smem_base = gdev_cuda_align_base(0);
	k->lmem_base = k->smem_base + gdev_cuda_align_base(k->smem_size);

	handle = gdev_ctx_current->gdev_handle;

	if (glaunch(handle, k, &fence->id)) {
		FREE(fence); /* don't leak the fence when the launch fails. */
		return CUDA_ERROR_LAUNCH_FAILED;
	}

	fence->addr_ref = 0; /* no address to unreference later. */
	gdev_list_init(&fence->list_entry, fence);
	gdev_list_add(&fence->list_entry, &ctx->sync_list);

	return CUDA_SUCCESS;
}
/**
 * Copies from device to host memory. dstHost and srcDevice specify the base
 * pointers of the destination and source, respectively. ByteCount specifies
 * the number of bytes to copy.
 *
 * cuMemcpyDtoHAsync() is asynchronous and can optionally be associated to a
 * stream by passing a non-zero hStream argument. It only works on page-locked
 * memory and returns an error if a pointer to pageable memory is passed as
 * input.
 *
 * Parameters:
 * dstHost - Destination host pointer
 * srcDevice - Source device pointer
 * ByteCount - Size of memory copy in bytes
 * hStream - Stream identifier
 *
 * Returns:
 * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED,
 * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE
 */
CUresult cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream)
{
	CUresult res;
	struct CUctx_st *ctx;
	Ghandle handle, handle_r;
	struct CUstream_st *stream = hStream;
	void *dst_buf = dstHost;
	uint64_t src_addr = srcDevice;
	uint64_t src_addr_r, dst_addr_r, dst_addr;
	uint32_t size = ByteCount;
	struct gdev_cuda_fence *fence;
	uint32_t id;

	/* a zero stream falls back to the synchronous copy path. */
	if (!stream)
		return cuMemcpyDtoH(dst_buf, src_addr, size);

	if (!gdev_initialized)
		return CUDA_ERROR_NOT_INITIALIZED;
	if (!dst_buf || !src_addr || !size)
		return CUDA_ERROR_INVALID_VALUE;
	res = cuCtxGetCurrent(&ctx);
	if (res != CUDA_SUCCESS)
		return res;
	/* the stream must belong to the current context. */
	if (ctx != stream->ctx)
		return CUDA_ERROR_INVALID_CONTEXT;

	/* fence tracks the async copy so a later sync can wait on it. */
	fence = (struct CUcontext_st *)0 == (struct CUcontext_st *)0 ? (struct gdev_cuda_fence *)MALLOC(sizeof(*fence)) : 0;
	if (!fence)
		return CUDA_ERROR_OUT_OF_MEMORY; /* this API shouldn't return it... */

	handle = ctx->gdev_handle;
	handle_r = stream->gdev_handle;

	/* reference the device memory address. */
	if (!(src_addr_r = gref(handle, src_addr, size, handle_r)))
		goto fail_gref;
	/* translate from buffer to address. */
	if (!(dst_addr = gvirtget(handle, dst_buf)))
		goto fail_gvirtget;
	/* reference the host memory address. */
	if (!(dst_addr_r = gref(handle, dst_addr, size, handle_r)))
		goto fail_gref_dma;
	/* now we can just copy data in the global address space. */
	if (gmemcpy_async(handle_r, dst_addr_r, src_addr_r, size, &id))
		goto fail_gmemcpy;

	fence->id = id;
	/* NOTE(review): only src_addr_r is recorded for a later unreference,
	   yet dst_addr_r was also taken via gref() above — confirm that the
	   consumer of sync_list unreferences the destination too, otherwise
	   that reference leaks on the success path. */
	fence->addr_ref = src_addr_r;
	gdev_list_init(&fence->list_entry, fence);
	gdev_list_add(&fence->list_entry, &stream->sync_list);

	return CUDA_SUCCESS;

	/* unwind in reverse acquisition order; fail_gvirtget shares the
	   src unreference because only src_addr_r is held at that point. */
fail_gmemcpy:
	gunref(handle_r, dst_addr_r);
fail_gref_dma:
fail_gvirtget:
	gunref(handle_r, src_addr_r);
fail_gref:
	FREE(fence);
	return CUDA_ERROR_UNKNOWN;
}