/* Allocate and register a new virtual address space (VAS) object.
   Returns the new VAS on success, or NULL if the raw allocation fails.
   @gdev: device the VAS belongs to.
   @size: size of the address space to allocate.
   @handle: opaque owner handle stored on the VAS. */
struct gdev_vas *gdev_vas_new(struct gdev_device *gdev, uint64_t size, void *handle)
{
	struct gdev_vas *vas = gdev_raw_vas_new(gdev, size);

	if (!vas)
		return NULL;

	vas->handle = handle;
	vas->gdev = gdev;
	vas->prio = GDEV_PRIO_DEFAULT;

	gdev_list_init(&vas->list_entry, (void *)vas); /* entry to VAS list. */
	gdev_list_init(&vas->mem_list, NULL);          /* device memory list. */
	gdev_list_init(&vas->dma_mem_list, NULL);      /* host dma memory list. */
	gdev_lock_init(&vas->lock);

	/* make the new VAS visible on the global VAS list. */
	__gdev_vas_list_add(vas);

	return vas;
}
void __gdev_init_device(struct gdev_device *gdev, int id) { gdev->id = id; gdev->users = 0; gdev->accessed = 0; gdev->blocked = 0; gdev->mem_size = 0; gdev->mem_used = 0; gdev->dma_mem_size = 0; gdev->dma_mem_used = 0; gdev->chipset = 0; gdev->com_bw = 100; gdev->mem_bw = 100; gdev->mem_sh = 100; gdev->com_bw_used = 0; gdev->mem_bw_used = 0; gdev->period = 0; gdev->com_time = 0; gdev->mem_time = 0; gdev->swap = NULL; gdev->sched_com_thread = NULL; gdev->sched_mem_thread = NULL; gdev->credit_com_thread = NULL; gdev->credit_mem_thread = NULL; gdev->current_com = NULL; gdev->current_mem = NULL; gdev->parent = NULL; gdev->priv = NULL; gdev_time_us(&gdev->credit_com, 0); gdev_time_us(&gdev->credit_mem, 0); gdev_list_init(&gdev->sched_com_list, NULL); gdev_list_init(&gdev->sched_mem_list, NULL); gdev_list_init(&gdev->vas_list, NULL); gdev_list_init(&gdev->shm_list, NULL); gdev_lock_init(&gdev->sched_com_lock); gdev_lock_init(&gdev->sched_mem_lock); gdev_lock_init(&gdev->vas_lock); gdev_lock_init(&gdev->global_lock); gdev_mutex_init(&gdev->shm_mutex); }
/**
 * Invokes the kernel f on a grid_width x grid_height grid of blocks. Each
 * block contains the number of threads specified by a previous call to
 * cuFuncSetBlockShape().
 *
 * Parameters:
 * f - Kernel to launch
 * grid_width - Width of grid in blocks
 * grid_height - Height of grid in blocks
 *
 * Returns:
 * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED,
 * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE,
 * CUDA_ERROR_LAUNCH_FAILED, CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES,
 * CUDA_ERROR_LAUNCH_TIMEOUT, CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING
 */
CUresult cuLaunchGrid(CUfunction f, int grid_width, int grid_height)
{
	struct CUfunc_st *func = f;
	struct CUmod_st *mod;
	struct CUctx_st *ctx;
	struct gdev_kernel *k;
	struct gdev_cuda_fence *fence;
	Ghandle handle;

	if (!gdev_initialized)
		return CUDA_ERROR_NOT_INITIALIZED;
	/* validate func BEFORE dereferencing it: the original code read
	   func->mod / mod->ctx first, crashing on a NULL handle. */
	if (!func || grid_width <= 0 || grid_height <= 0)
		return CUDA_ERROR_INVALID_VALUE;
	mod = func->mod;
	ctx = mod->ctx;
	if (!ctx || ctx != gdev_ctx_current)
		return CUDA_ERROR_INVALID_CONTEXT;
	if (!(fence = (struct gdev_cuda_fence *)MALLOC(sizeof(*fence))))
		return CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES;

	/* fill in the launch geometry; block shape/smem were set earlier. */
	k = &func->kernel;
	k->grid_x = grid_width;
	k->grid_y = grid_height;
	k->grid_z = 1;
	k->grid_id = ++ctx->launch_id;
	k->smem_base = gdev_cuda_align_base(0);
	k->lmem_base = k->smem_base + gdev_cuda_align_base(k->smem_size);

	handle = gdev_ctx_current->gdev_handle;

	if (glaunch(handle, k, &fence->id)) {
		FREE(fence); /* don't leak the fence on launch failure. */
		return CUDA_ERROR_LAUNCH_FAILED;
	}
	fence->addr_ref = 0; /* no address to unreference later. */
	gdev_list_init(&fence->list_entry, fence);
	gdev_list_add(&fence->list_entry, &ctx->sync_list);

	return CUDA_SUCCESS;
}
/**
 * Copies from device to host memory. dstHost and srcDevice specify the base
 * pointers of the destination and source, respectively. ByteCount specifies
 * the number of bytes to copy.
 *
 * cuMemcpyDtoHAsync() is asynchronous and can optionally be associated to a
 * stream by passing a non-zero hStream argument. It only works on page-locked
 * memory and returns an error if a pointer to pageable memory is passed as
 * input.
 *
 * Parameters:
 * dstHost - Destination host pointer
 * srcDevice - Source device pointer
 * ByteCount - Size of memory copy in bytes
 * hStream - Stream identifier
 *
 * Returns:
 * CUDA_SUCCESS, CUDA_ERROR_DEINITIALIZED, CUDA_ERROR_NOT_INITIALIZED,
 * CUDA_ERROR_INVALID_CONTEXT, CUDA_ERROR_INVALID_VALUE
 */
CUresult cuMemcpyDtoHAsync_v2(void *dstHost, CUdeviceptr srcDevice, unsigned int ByteCount, CUstream hStream)
{
	CUresult res;
	struct CUctx_st *ctx;
	Ghandle handle, handle_r;
	struct CUstream_st *stream = hStream;
	void *dst_buf = dstHost;
	uint64_t src_addr = srcDevice;
	uint64_t src_addr_r, dst_addr_r, dst_addr;
	uint32_t size = ByteCount;
	struct gdev_cuda_fence *fence;
	uint32_t id;

	/* with no stream given, fall back to the synchronous copy. */
	if (!stream)
		return cuMemcpyDtoH(dst_buf, src_addr, size);
	if (!gdev_initialized)
		return CUDA_ERROR_NOT_INITIALIZED;
	if (!dst_buf || !src_addr || !size)
		return CUDA_ERROR_INVALID_VALUE;
	res = cuCtxGetCurrent(&ctx);
	if (res != CUDA_SUCCESS)
		return res;
	/* the stream must belong to the current context. */
	if (ctx != stream->ctx)
		return CUDA_ERROR_INVALID_CONTEXT;

	/* allocate the fence up front so all later failures share one cleanup. */
	fence = (struct gdev_cuda_fence *)MALLOC(sizeof(*fence));
	if (!fence)
		return CUDA_ERROR_OUT_OF_MEMORY; /* this API shouldn't return it... */

	handle = ctx->gdev_handle;
	handle_r = stream->gdev_handle; /* the stream's handle performs the copy. */

	/* reference the device memory address. */
	if (!(src_addr_r = gref(handle, src_addr, size, handle_r)))
		goto fail_gref;
	/* translate from buffer to address. */
	if (!(dst_addr = gvirtget(handle, dst_buf)))
		goto fail_gvirtget;
	/* reference the host memory address. */
	if (!(dst_addr_r = gref(handle, dst_addr, size, handle_r)))
		goto fail_gref_dma;
	/* now we can just copy data in the global address space. */
	if (gmemcpy_async(handle_r, dst_addr_r, src_addr_r, size, &id))
		goto fail_gmemcpy;

	/* record the fence on the stream so a later sync can wait on `id`. */
	fence->id = id;
	/* NOTE(review): fence->addr_ref holds src_addr_r, presumably so the sync
	   path can gunref it later; dst_addr_r has no visible gunref on the
	   success path — confirm against the stream-synchronize code whether
	   that reference is released elsewhere or leaked. */
	fence->addr_ref = src_addr_r;
	gdev_list_init(&fence->list_entry, fence);
	gdev_list_add(&fence->list_entry, &stream->sync_list);

	return CUDA_SUCCESS;

	/* unwind in reverse acquisition order; fail_gref_dma/fail_gvirtget
	   intentionally fall through to release the source reference. */
fail_gmemcpy:
	gunref(handle_r, dst_addr_r);
fail_gref_dma:
fail_gvirtget:
	gunref(handle_r, src_addr_r);
fail_gref:
	FREE(fence);
	return CUDA_ERROR_UNKNOWN;
}