/* Copy `sz` bytes from host memory `src` into device buffer `dst` at byte
 * offset `dstoff`.
 *
 * Returns GA_NO_ERROR on success, GA_VALUE_ERROR if the write would run past
 * the end of the buffer, or GA_IMPL_ERROR when the underlying driver call
 * fails (the CUresult is left in ctx->err).
 *
 * NOTE(review): assumes cuda_enter/cuda_exit bracket use of the context and
 * that cuda_waits/cuda_records implement the buffer's stream-ordering
 * protocol — confirm against their definitions elsewhere in the project. */
static int cuda_write(gpudata *dst, size_t dstoff, const void *src, size_t sz) {
  cuda_context *ctx = dst->ctx;
  ASSERT_BUF(dst);
  /* Zero-length writes succeed trivially, even with an out-of-range offset. */
  if (sz == 0) return GA_NO_ERROR;
  /* Bounds check: the region [dstoff, dstoff+sz) must fit in the buffer. */
  if ((dst->sz - dstoff) < sz) return GA_VALUE_ERROR;
  cuda_enter(ctx);
  if (dst->flags & CUDA_MAPPED_PTR) {
    /* Host-mapped buffer: wait for the buffer's read event before touching
     * the mapping from the CPU, then copy with plain memcpy. */
    ctx->err = cuEventSynchronize(dst->rev);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    memcpy((void *)(dst->ptr + dstoff), src, sz);
  } else {
    /* Device buffer: order the copy after outstanding work on the buffer,
     * issue the async H2D transfer on the context's memory stream, then
     * record the write so later operations wait on it. */
    cuda_waits(dst, CUDA_WAIT_WRITE, ctx->mem_s);
    ctx->err = cuMemcpyHtoDAsync(dst->ptr + dstoff, src, sz, ctx->mem_s);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    cuda_records(dst, CUDA_WAIT_WRITE, ctx->mem_s);
  }
  cuda_exit(ctx);
  return GA_NO_ERROR;
}
/* Run a SAXPY (y = alpha * x) smoke test on the given device.
 *
 * Allocates N-element device and pinned host buffers, uploads random input,
 * launches the "saxpy" kernel from saxpy.ptx, copies the result back and
 * compares it against a host-computed reference with a 1e-5 tolerance.
 *
 * Returns CUDA_SUCCESS on success, CUDA_ERROR_NOT_FOUND if the module is
 * missing, CUDA_ERROR_UNKNOWN on a numeric mismatch, or the failing
 * driver-call status otherwise.
 *
 * NOTE(review): assumes CUDA_CHECK assigns the call's result to `status` and
 * jumps to Error on failure (chLib convention) — confirm the macro; `status`
 * would otherwise be read uninitialized at Error. */
CUresult TestSAXPY( chCUDADevice *chDevice, size_t N, float alpha )
{
    CUresult status;
    CUdeviceptr dptrOut = 0;
    CUdeviceptr dptrIn = 0;
    float *hostOut = 0;
    float *hostIn = 0;

    CUDA_CHECK( cuCtxPushCurrent( chDevice->context() ) );

    /* Device buffers; output is zero-filled (N 32-bit words). */
    CUDA_CHECK( cuMemAlloc( &dptrOut, N*sizeof(float) ) );
    CUDA_CHECK( cuMemsetD32( dptrOut, 0, N ) );
    CUDA_CHECK( cuMemAlloc( &dptrIn, N*sizeof(float) ) );

    /* Pinned host buffers so the async copies below can run asynchronously. */
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostOut, N*sizeof(float), 0 ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostIn, N*sizeof(float), 0 ) );

    for ( size_t i = 0; i < N; i++ ) {
        hostIn[i] = (float) rand() / (float) RAND_MAX;
    }
    /* NULL stream: default-stream semantics order this before the launch. */
    CUDA_CHECK( cuMemcpyHtoDAsync( dptrIn, hostIn, N*sizeof(float ), NULL ) );

    {
        CUmodule moduleSAXPY;
        CUfunction kernelSAXPY;
        void *params[] = { &dptrOut, &dptrIn, &N, &alpha };

        moduleSAXPY = chDevice->module( "saxpy.ptx" );
        if ( ! moduleSAXPY ) {
            status = CUDA_ERROR_NOT_FOUND;
            goto Error;
        }
        CUDA_CHECK( cuModuleGetFunction( &kernelSAXPY, moduleSAXPY, "saxpy" ) );

        /* Fixed 1500x512 launch; presumably the kernel uses a grid-stride
         * loop to cover arbitrary N — TODO confirm against saxpy.ptx. */
        CUDA_CHECK( cuLaunchKernel( kernelSAXPY, 1500, 1, 1, 512, 1, 1, 0, NULL, params, NULL ) );
    }
    CUDA_CHECK( cuMemcpyDtoHAsync( hostOut, dptrOut, N*sizeof(float), NULL ) );
    /* Block until the copy (and everything before it) has finished before
     * reading hostOut on the CPU. */
    CUDA_CHECK( cuCtxSynchronize() );

    /* Verify against the host reference with an absolute tolerance. */
    for ( size_t i = 0; i < N; i++ ) {
        if ( fabsf( hostOut[i] - alpha*hostIn[i] ) > 1e-5f ) {
            status = CUDA_ERROR_UNKNOWN;
            goto Error;
        }
    }
    status = CUDA_SUCCESS;
    printf( "Well it worked!\n" );
Error:
    /* Cleanup path; freeing 0 handles is benign for these driver calls. */
    cuCtxPopCurrent( NULL );
    cuMemFreeHost( hostOut );
    cuMemFreeHost( hostIn );
    cuMemFree( dptrOut );
    cuMemFree( dptrIn );
    return status;
}
unsigned int Encode(unsigned char* pFrame, bool bKeyframe) { unsigned int nSize = 0; if (m_NvidiaNvencCodecContext.GetUseSwscaleInsteadOfCuda()) { uint8_t * pRgb32Planes[1] = { pFrame }; int pRgb32Linesizes[1] = { m_NvidiaNvencCodecContext.GetWidth() * 4 }; if (sws_scale(m_SwscaleContext, pRgb32Planes, pRgb32Linesizes, 0, m_NvidiaNvencCodecContext.GetHeight(), m_pNv12Planes, m_pNv12Strides) != m_NvidiaNvencCodecContext.GetHeight()) { throw std::runtime_error("sws_scale failed!"); } CHECK_CUDA_DRV_STATUS(cuMemcpyHtoD(m_pNv12Buffer, m_pPageLockedNv12Buffer, m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight() * 3 / 2)); } else { if (m_NvidiaNvencCodecContext.GetUsePageLockedIntermediateBuffer()) { memcpy(m_pPageLockedRgb32Buffer, pFrame, m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight() * 4); CHECK_CUDA_DRV_STATUS(cuMemcpyHtoDAsync(m_pRgb32Buffer, m_pPageLockedRgb32Buffer, m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight() * 4, m_Stream)); CudaRgba32ToNv12(m_Stream, m_pRgb32Buffer, m_pNv12Buffer, m_nNv12BufferPitch, m_NvidiaNvencCodecContext.GetWidth(), m_NvidiaNvencCodecContext.GetHeight()); } else { CHECK_CUDA_DRV_STATUS(cuMemcpyHtoD(m_pRgb32Buffer, pFrame, m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight() * 4)); } cudaStreamSynchronize(m_Stream); } m_PictureParameters.encodePicFlags = bKeyframe ? 
NV_ENC_PIC_FLAG_FORCEIDR | NV_ENC_PIC_FLAG_OUTPUT_SPSPPS : 0; #ifdef ASYNCHRONOUS // Sanity check _ASSERT(WaitForSingleObject(m_PictureParameters.completionEvent, 0) == WAIT_TIMEOUT); #endif CHECK_NVENC_STATUS(m_FunctionList.nvEncEncodePicture(m_pEncoder, &m_PictureParameters)); #ifdef ASYNCHRONOUS DWORD nWaitResult = WaitForSingleObject(m_PictureParameters.completionEvent, INFINITE); // Sanity check _ASSERT(nWaitResult == WAIT_OBJECT_0); #endif NV_ENC_LOCK_BITSTREAM LockBitstream = { NV_ENC_LOCK_BITSTREAM_VER, 0 }; LockBitstream.sliceOffsets = NULL; LockBitstream.outputBitstream = m_PictureParameters.outputBitstream; CHECK_NVENC_STATUS(m_FunctionList.nvEncLockBitstream(m_pEncoder, &LockBitstream)); nSize = LockBitstream.bitstreamSizeInBytes; if (m_NvidiaNvencCodecContext.GetSaveOutputToFile()) { fwrite(LockBitstream.bitstreamBufferPtr, 1, LockBitstream.bitstreamSizeInBytes, m_pOutputFile); } CHECK_NVENC_STATUS(m_FunctionList.nvEncUnlockBitstream(m_pEncoder, LockBitstream.outputBitstream)); return nSize; }
/* Asynchronously copy host data into this device allocation.
 *
 * source - host pointer to read from
 * bytes  - number of bytes to copy; 0 means "the whole allocation"
 * offset - destination byte offset within the allocation
 *
 * Uses the device's current stream; linear memory goes through
 * cuMemcpyHtoDAsync, texture-backed memory through cuMemcpyHtoAAsync. */
void memory_t<CUDA>::asyncCopyFrom(const void *source, const uintptr_t bytes, const uintptr_t offset){
  const CUstream &stream = *((CUstream*) dev->currentStream);

  // A zero byte count is shorthand for copying the entire allocation.
  const uintptr_t copyBytes = bytes ? bytes : size;

  OCCA_CHECK((copyBytes + offset) <= size);

  if(isTexture){
    OCCA_CUDA_CHECK("Texture Memory: Asynchronous Copy From",
                    cuMemcpyHtoAAsync(((CUDATextureData_t*) handle)->array,
                                      offset,
                                      source,
                                      copyBytes,
                                      stream) );
  }
  else{
    OCCA_CUDA_CHECK("Memory: Asynchronous Copy From",
                    cuMemcpyHtoDAsync(*((CUdeviceptr*) handle) + offset,
                                      source,
                                      copyBytes,
                                      stream) );
  }
}
/* // Getting feature map for the selected subimage in GPU // // API //int getFeatureMapsGPUStream(const int numStep, const int k, CvLSVMFeatureMapGPU **devs_img, CvLSVMFeatureMapGPU **devs_map, CUstream *streams) // INPUT // numStep // k // devs_img // streams // OUTPUT // devs_map // RESULT // Error status */ int getFeatureMapsGPUStream(const int numStep, const int k, CvLSVMFeatureMapGPU **devs_img, CvLSVMFeatureMapGPU **devs_map, CUstream *streams) { int sizeX, sizeY; int p, px; int height, width; int i, j; int *nearest; float *w, a_x, b_x; int size_r, size_alfa, size_nearest, size_w, size_map; CUresult res; CvLSVMFeatureMapGPU **devs_r, **devs_alfa; CUdeviceptr dev_nearest, dev_w; px = 3 * NUM_SECTOR; p = px; size_nearest = k; size_w = k * 2; devs_r = (CvLSVMFeatureMapGPU **) malloc( sizeof(CvLSVMFeatureMapGPU*) * numStep); devs_alfa = (CvLSVMFeatureMapGPU **) malloc( sizeof(CvLSVMFeatureMapGPU*) * numStep); nearest = (int *) malloc(sizeof(int) * size_nearest); w = (float *) malloc(sizeof(float) * size_w); // initialize "nearest" and "w" for (i = 0; i < k / 2; i++) { nearest[i] = -1; }/*for(i = 0; i < k / 2; i++)*/ for (i = k / 2; i < k; i++) { nearest[i] = 1; }/*for(i = k / 2; i < k; i++)*/ for (j = 0; j < k / 2; j++) { b_x = k / 2 + j + 0.5f; a_x = k / 2 - j - 0.5f; w[j * 2] = 1.0f / a_x * ((a_x * b_x) / (a_x + b_x)); w[j * 2 + 1] = 1.0f / b_x * ((a_x * b_x) / (a_x + b_x)); }/*for(j = 0; j < k / 2; j++)*/ for (j = k / 2; j < k; j++) { a_x = j - k / 2 + 0.5f; b_x = -j + k / 2 - 0.5f + k; w[j * 2] = 1.0f / a_x * ((a_x * b_x) / (a_x + b_x)); w[j * 2 + 1] = 1.0f / b_x * ((a_x * b_x) / (a_x + b_x)); }/*for(j = k / 2; j < k; j++)*/ res = cuMemAlloc(&dev_nearest, sizeof(int) * size_nearest); CUDA_CHECK(res, "cuMemAlloc(dev_nearest)"); res = cuMemAlloc(&dev_w, sizeof(float) * size_w); CUDA_CHECK(res, "cuMemAlloc(dev_w)"); res = cuMemcpyHtoDAsync(dev_nearest, nearest, sizeof(int) * size_nearest, streams[numStep - 1]); res = cuMemcpyHtoDAsync(dev_w, w, sizeof(float) * 
size_w, streams[numStep - 1]); // allocate device memory for (i = 0; i < numStep; i++) { width = devs_img[i]->sizeX; height = devs_img[i]->sizeY; allocFeatureMapObjectGPU<float>(&devs_r[i], width, height, 1); allocFeatureMapObjectGPU<int>(&devs_alfa[i], width, height, 2); } // excute async for (i = 0; i < numStep; i++) { // initialize "map", "r" and "alfa" width = devs_img[i]->sizeX; height = devs_img[i]->sizeY; sizeX = width / k; sizeY = height / k; size_map = sizeX * sizeY * p; size_r = width * height; size_alfa = width * height * 2; // initilize device memory value of 0 res = cuMemsetD32Async(devs_map[i]->map, 0, size_map, streams[i]); CUDA_CHECK(res, "cuMemset(dev_map)"); res = cuMemsetD32Async(devs_r[i]->map, 0, size_r, streams[i]); CUDA_CHECK(res, "cuMemset(dev_r)"); res = cuMemsetD32Async(devs_alfa[i]->map, 0, size_alfa, streams[i]); CUDA_CHECK(res, "cuMemset(dev_alfa)"); // launch kernel calculateHistogramGPULaunch(k, devs_img[i], devs_r[i], devs_alfa[i], streams[i]); } for (i = 0; i < numStep; i++) { getFeatureMapsGPULaunch(k, devs_r[i], devs_alfa[i], &dev_nearest, &dev_w, devs_map[i], streams[i]); } // free device memory res = cuMemFree(dev_nearest); CUDA_CHECK(res, "cuMemFree(dev_nearest)"); res = cuMemFree(dev_w); CUDA_CHECK(res, "cuMemFree(dev_w)"); for (i = 0; i < numStep; i++) { freeFeatureMapObjectGPU(&devs_r[i]); freeFeatureMapObjectGPU(&devs_alfa[i]); } free(nearest); free(w); free(devs_r); free(devs_alfa); return LATENT_SVM_OK; }
int cuda_test_memcpy_async(unsigned int size) { int i; CUresult res; CUdevice dev; CUcontext ctx; CUstream stream; CUdeviceptr data_addr; unsigned int *in, *out; struct timeval tv; struct timeval tv_total_start, tv_total_end; unsigned long total; struct timeval tv_h2d_start, tv_h2d_end; float h2d; struct timeval tv_d2h_start, tv_d2h_end; float d2h; gettimeofday(&tv_total_start, NULL); res = cuInit(0); if (res != CUDA_SUCCESS) { printf("cuInit failed: res = %u\n", (unsigned int)res); return -1; } res = cuDeviceGet(&dev, 0); if (res != CUDA_SUCCESS) { printf("cuDeviceGet failed: res = %u\n", (unsigned int)res); return -1; } res = cuCtxCreate(&ctx, 0, dev); if (res != CUDA_SUCCESS) { printf("cuCtxCreate failed: res = %u\n", (unsigned int)res); return -1; } res = cuStreamCreate(&stream, 0); if (res != CUDA_SUCCESS) { printf("cuStreamCreate failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemAlloc(&data_addr, size); if (res != CUDA_SUCCESS) { printf("cuMemAlloc failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemAllocHost((void **)&in, size); if (res != CUDA_SUCCESS) { printf("cuMemAllocHost(in) failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemAllocHost((void **)&out, size); if (res != CUDA_SUCCESS) { printf("cuMemAllocHost(out) failed: res = %u\n", (unsigned int)res); return -1; } for (i = 0; i < size / 4; i++) { in[i] = i+1; out[i] = 0; } gettimeofday(&tv_h2d_start, NULL); res = cuMemcpyHtoDAsync(data_addr, in, size, stream); if (res != CUDA_SUCCESS) { printf("cuMemcpyHtoDAsync failed: res = %u\n", (unsigned int)res); return -1; } res = cuStreamSynchronize(stream); if (res != CUDA_SUCCESS) { printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res); return -1; } gettimeofday(&tv_h2d_end, NULL); gettimeofday(&tv_d2h_start, NULL); res = cuMemcpyDtoHAsync(out, data_addr, size, stream); if (res != CUDA_SUCCESS) { printf("cuMemcpyDtoHAsync failed: res = %u\n", (unsigned int)res); return -1; } res = 
cuStreamSynchronize(stream); if (res != CUDA_SUCCESS) { printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res); return -1; } gettimeofday(&tv_d2h_end, NULL); for (i = 0; i < size / 4; i++) { if (in[i] != out[i]) { printf("in[%d] = %u, out[%d] = %u\n", i, in[i], i, out[i]); } } res = cuMemFreeHost(out); if (res != CUDA_SUCCESS) { printf("cuMemFreeHost(out) failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemFreeHost(in); if (res != CUDA_SUCCESS) { printf("cuMemFreeHost(in) failed: res = %u\n", (unsigned int)res); return -1; } res = cuMemFree(data_addr); if (res != CUDA_SUCCESS) { printf("cuMemFree failed: res = %u\n", (unsigned int)res); return -1; } res = cuStreamDestroy(stream); if (res != CUDA_SUCCESS) { printf("cuStreamDestroy failed: res = %u\n", (unsigned int)res); return -1; } res = cuCtxDestroy(ctx); if (res != CUDA_SUCCESS) { printf("cuCtxDestroy failed: res = %u\n", (unsigned int)res); return -1; } gettimeofday(&tv_total_end, NULL); tvsub(&tv_h2d_end, &tv_h2d_start, &tv); h2d = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; tvsub(&tv_d2h_end, &tv_d2h_start, &tv); d2h = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0; tvsub(&tv_total_end, &tv_total_start, &tv); total = tv.tv_sec * 1000 + tv.tv_usec / 1000; printf("HtoD: %f\n", h2d); printf("DtoH: %f\n", d2h); return 0; end: return -1; }
/* Copy `s` bytes from host address `h` to device address `d` (libgomp nvptx
 * plugin host->device hook).
 *
 * Validates both pointers (including that `d` lies inside a live device
 * allocation, via cuMemGetAddressRange) and aborts the program through
 * GOMP_PLUGIN_fatal on any error.  Returns 0 (NULL) always; the return
 * value exists only to satisfy the plugin interface.
 *
 * NOTE: `d + s` is pointer arithmetic on void*, a GCC extension (byte
 * granularity) — fine here because libgomp is built with GCC. */
static void *
nvptx_host2dev (void *d, const void *h, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  /* Zero-length copies succeed trivially.  */
  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  /* Resolve the allocation containing d so we can bounds-check the copy.  */
  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      /* Async path: issue the copy on the thread's current stream and record
	 an event so the host buffer is kept alive until the copy completes
	 (event_add ties the event to h; event_gc reaps finished ones).  */
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (false);

      r = cuMemcpyHtoDAsync ((CUdeviceptr)d, h, s,
			     nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoDAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      /* Null stream (or async disabled): plain synchronous copy.  */
      r = cuMemcpyHtoD ((CUdeviceptr)d, h, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyHtoD error: %s", cuda_error (r));
    }

  return 0;
}
/* Virtqueue handler for the virtio-CUDA passthrough device.
 *
 * Pops each queued element, interprets out_sg[0] as a struct param request,
 * executes the corresponding CUDA driver-API call on the host, stores the
 * status/outputs back into the param block, pushes the element, and finally
 * notifies the guest.
 *
 * Conventions visible in this code: p->result holds the CUresult (or the
 * sentinel 999/9999 for host-side failures); large guest buffers arrive as
 * (data, length) scatter-gather pairs and are copied chunk by chunk; the
 * request's CUDA context is made current before memory/module/launch ops
 * because each context has its own virtual memory space. */
static void vq_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtQueueElement elem;

    while (virtqueue_pop(vq, &elem)) {
        struct param *p = elem.out_sg[0].iov_base;

        /* For all library routines: get required arguments from the buffer,
         * execute, and push results back into the virtqueue. */
        switch (p->syscall_type) {
        case CUINIT: {
            p->result = cuInit(p->flags);
            break;
        }
        case CUDRIVERGETVERSION: {
            p->result = cuDriverGetVersion(&p->val1);
            break;
        }
        case CUDEVICEGETCOUNT: {
            p->result = cuDeviceGetCount(&p->val1);
            break;
        }
        case CUDEVICEGET: {
            p->result = cuDeviceGet(&p->device, p->val1);
            break;
        }
        case CUDEVICECOMPUTECAPABILITY: {
            p->result = cuDeviceComputeCapability(&p->val1, &p->val2, p->device);
            break;
        }
        case CUDEVICEGETNAME: {
            p->result = cuDeviceGetName(elem.in_sg[0].iov_base, p->val1, p->device);
            break;
        }
        case CUDEVICEGETATTRIBUTE: {
            p->result = cuDeviceGetAttribute(&p->val1, p->attrib, p->device);
            break;
        }
        case CUCTXCREATE: {
            p->result = cuCtxCreate(&p->ctx, p->flags, p->device);
            break;
        }
        case CUCTXDESTROY: {
            p->result = cuCtxDestroy(p->ctx);
            break;
        }
        case CUCTXGETCURRENT: {
            p->result = cuCtxGetCurrent(&p->ctx);
            break;
        }
        case CUCTXGETDEVICE: {
            p->result = cuCtxGetDevice(&p->device);
            break;
        }
        case CUCTXPOPCURRENT: {
            p->result = cuCtxPopCurrent(&p->ctx);
            break;
        }
        case CUCTXSETCURRENT: {
            p->result = cuCtxSetCurrent(p->ctx);
            break;
        }
        case CUCTXSYNCHRONIZE: {
            p->result = cuCtxSynchronize();
            break;
        }
        case CUMODULELOAD: {
            /* Hardcoded path - needs improvement.  All .cubin files should be
             * stored in $QEMU_NFS_PATH - currently $QEMU_NFS_PATH is shared
             * between host and guest with NFS. */
            const char *nfs_path = getenv("QEMU_NFS_PATH");
            char *binname;

            /* FIX: guard against an unset environment variable (getenv may
             * return NULL, which strlen/strcpy would crash on). */
            if (!nfs_path) {
                p->result = 0;
                break;
            }
            /* FIX: the allocation size was
             *   strlen(name) + strlen(getenv("QEMU_NFS_PATH") + 1)
             * — the misplaced parenthesis skipped the path's first byte and
             * dropped the +1 for the terminating NUL, so the buffer was two
             * bytes too small and strcpy/strcat overflowed the heap. */
            binname = malloc(strlen((char *)elem.out_sg[1].iov_base) + strlen(nfs_path) + 1);
            if (!binname) {
                /* FIX: do not virtqueue_push here; the common push after the
                 * switch already does it, and pushing the same element twice
                 * corrupts the queue. */
                p->result = 0;
                break;
            }
            strcpy(binname, nfs_path);
            strcat(binname, (char *)elem.out_sg[1].iov_base);
            /* Change current CUDA context: each CUDA context has its own
             * virtual memory space - isolation is ensured by switching
             * contexts. */
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                free(binname);
                break;
            }
            p->result = cuModuleLoad(&p->module, binname);
            free(binname);
            break;
        }
        case CUMODULEGETGLOBAL: {
            char *name = malloc(100 * sizeof(char));
            if (!name) {
                p->result = 999;
                break;
            }
            /* NOTE(review): unbounded strcpy into a 100-byte buffer —
             * guest-controlled names longer than 99 bytes overflow; consider
             * bounding the copy against the iov length. */
            strcpy(name, (char *)elem.out_sg[1].iov_base);
            p->result = cuModuleGetGlobal(&p->dptr, &p->size1, p->module, (const char *)name);
            free(name); /* FIX: was leaked on every call */
            break;
        }
        case CUMODULEUNLOAD: {
            p->result = cuModuleUnload(p->module);
            break;
        }
        case CUMEMALLOC: {
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAlloc(&p->dptr, p->bytesize);
            break;
        }
        case CUMEMALLOCPITCH: {
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAllocPitch(&p->dptr, &p->size3, p->size1, p->size2, p->bytesize);
            break;
        }
        /* Large buffers are allocated in smaller chunks in guest kernel
         * space; get each chunk separately and copy it to device memory.
         * Layout: out_sg[1+2i] = data, out_sg[1+2i+1] = chunk length. */
        case CUMEMCPYHTOD: {
            int i;
            size_t offset;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.out_sg[1 + 2 * i + 1].iov_base;
                p->result = cuMemcpyHtoD(p->dptr + offset, elem.out_sg[1 + 2 * i].iov_base, s);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYHTODASYNC: {
            int i;
            size_t offset;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.out_sg[1 + 2 * i + 1].iov_base;
                p->result = cuMemcpyHtoDAsync(p->dptr + offset, elem.out_sg[1 + 2 * i].iov_base, s, p->stream);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTODASYNC: {
            p->result = cuMemcpyDtoDAsync(p->dptr, p->dptr1, p->size1, p->stream);
            break;
        }
        case CUMEMCPYDTOH: {
            int i;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            size_t offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.in_sg[0 + 2 * i + 1].iov_base;
                p->result = cuMemcpyDtoH(elem.in_sg[0 + 2 * i].iov_base, p->dptr + offset, s);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTOHASYNC: {
            int i;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            size_t offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.in_sg[0 + 2 * i + 1].iov_base;
                p->result = cuMemcpyDtoHAsync(elem.in_sg[0 + 2 * i].iov_base, p->dptr + offset, s, p->stream);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMSETD32: {
            p->result = cuMemsetD32(p->dptr, p->bytecount, p->bytesize);
            break;
        }
        case CUMEMFREE: {
            p->result = cuMemFree(p->dptr);
            break;
        }
        case CUMODULEGETFUNCTION: {
            char *name = (char *)elem.out_sg[1].iov_base;
            name[p->length] = '\0';
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuModuleGetFunction(&p->function, p->module, name);
            break;
        }
        case CULAUNCHKERNEL: {
            /* Kernel arguments arrive one per scatter-gather entry,
             * p->val1 of them in total. */
            void **args = malloc(p->val1 * sizeof(void *));
            if (!args) {
                p->result = 9999;
                break;
            }
            int i;
            for (i = 0; i < p->val1; i++) {
                args[i] = elem.out_sg[1 + i].iov_base;
            }
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                free(args); /* FIX: was leaked on this error path */
                break;
            }
            p->result = cuLaunchKernel(p->function,
                                       p->gridDimX, p->gridDimY, p->gridDimZ,
                                       p->blockDimX, p->blockDimY, p->blockDimZ,
                                       p->bytecount, 0, args, 0);
            free(args);
            break;
        }
        case CUEVENTCREATE: {
            p->result = cuEventCreate(&p->event1, p->flags);
            break;
        }
        case CUEVENTDESTROY: {
            p->result = cuEventDestroy(p->event1);
            break;
        }
        case CUEVENTRECORD: {
            p->result = cuEventRecord(p->event1, p->stream);
            break;
        }
        case CUEVENTSYNCHRONIZE: {
            p->result = cuEventSynchronize(p->event1);
            break;
        }
        case CUEVENTELAPSEDTIME: {
            p->result = cuEventElapsedTime(&p->pMilliseconds, p->event1, p->event2);
            break;
        }
        case CUSTREAMCREATE: {
            p->result = cuStreamCreate(&p->stream, 0);
            break;
        }
        case CUSTREAMSYNCHRONIZE: {
            p->result = cuStreamSynchronize(p->stream);
            break;
        }
        case CUSTREAMQUERY: {
            p->result = cuStreamQuery(p->stream);
            break;
        }
        case CUSTREAMDESTROY: {
            p->result = cuStreamDestroy(p->stream);
            break;
        }
        default:
            printf("Unknown syscall_type\n");
        }
        virtqueue_push(vq, &elem, 0);
    }

    /* Notify frontend - trigger virtual interrupt. */
    virtio_notify(vdev, vq);
    return;
}