static void nvptx_wait_all (void) { CUresult r; struct ptx_stream *s; pthread_t self = pthread_self (); struct nvptx_thread *nvthd = nvptx_thread (); pthread_mutex_lock (&nvthd->ptx_dev->stream_lock); /* Wait for active streams initiated by this thread (or by multiple threads) to complete. */ for (s = nvthd->ptx_dev->active_streams; s != NULL; s = s->next) { if (s->multithreaded || pthread_equal (s->host_thread, self)) { r = cuStreamQuery (s->stream); if (r == CUDA_SUCCESS) continue; else if (r != CUDA_ERROR_NOT_READY) GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r)); r = cuStreamSynchronize (s->stream); if (r != CUDA_SUCCESS) GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r)); } } pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock); event_gc (true); }
/* Return 1 if every stream relevant to the calling thread has finished
   all queued work, 0 if any of them is still busy.  Only collects
   completed events when everything is idle.  */
static int
nvptx_async_test_all (void)
{
  pthread_t self = pthread_self ();
  struct nvptx_thread *nvthd = nvptx_thread ();
  struct ptx_stream *stream;

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (stream = nvthd->ptx_dev->active_streams; stream; stream = stream->next)
    {
      /* Skip streams that belong solely to a different host thread.  */
      if (!stream->multithreaded
	  && !pthread_equal (stream->host_thread, self))
	continue;

      if (cuStreamQuery (stream->stream) == CUDA_ERROR_NOT_READY)
	{
	  /* At least one stream still has pending work.  */
	  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
	  return 0;
	}
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);

  event_gc (true);
  return 1;
}
static int nvptx_async_test (int async) { CUresult r; struct ptx_stream *s; s = select_stream_for_async (async, pthread_self (), false, NULL); if (!s) GOMP_PLUGIN_fatal ("unknown async %d", async); r = cuStreamQuery (s->stream); if (r == CUDA_SUCCESS) { /* The oacc-parallel.c:goacc_wait function calls this hook to determine whether all work has completed on this stream, and if so omits the call to the wait hook. If that happens, event_gc might not get called (which prevents variables from getting unmapped and their associated device storage freed), so call it here. */ event_gc (true); return 1; } else if (r == CUDA_ERROR_NOT_READY) return 0; GOMP_PLUGIN_fatal ("cuStreamQuery error: %s", cuda_error (r)); return 0; }
// Upload a planar YUV 4:2:0 host frame into the device-side NV12 surface:
// the luma plane is copied directly, the U and V planes are staged into
// m_ChromaDevPtr[] and interleaved by the m_cuInterleaveUVFunction kernel.
// Assumes yuv[0..2] are tightly packed (srcPitch == width).  The chroma
// half of the NV12 surface begins dstPitch * maxHeight bytes into
// dNV12devPtr.  Returns NV_ENC_ERR_GENERIC if the stream reports an error.
NVENCSTATUS CNvEncoderLowLatency::ConvertYUVToNV12(CUdeviceptr dNV12devPtr, int dstPitch,
                                                   unsigned char *yuv[3],
                                                   int width, int height,
                                                   int maxWidth, int maxHeight)
{
    CCudaAutoLock cuLock(m_cuContext);

    // Copy the luma plane host->device.
    // Fix: "memset(©Param, ...)" / "cuMemcpy2D(©Param)" were mojibake for
    // "&copyParam" (the "&copy" prefix had been decoded as the © entity),
    // which does not compile; restored the address-of expression.
    CUDA_MEMCPY2D copyParam;
    memset(&copyParam, 0, sizeof(copyParam));
    copyParam.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    copyParam.dstDevice     = dNV12devPtr;
    copyParam.dstPitch      = dstPitch;
    copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
    copyParam.srcHost       = yuv[0];
    copyParam.srcPitch      = width;
    copyParam.WidthInBytes  = width;
    copyParam.Height        = height;
    __cu(cuMemcpy2D(&copyParam));

    // Stage the planar U and V planes (each width/2 x height/2 bytes).
    __cu(cuMemcpyHtoD(m_ChromaDevPtr[0], yuv[1], width * height / 4));
    __cu(cuMemcpyHtoD(m_ChromaDevPtr[1], yuv[2], width * height / 4));

    const int BLOCK_X = 32;
    const int BLOCK_Y = 16;
    int chromaHeight = height / 2;
    int chromaWidth  = width / 2;
    dim3 block(BLOCK_X, BLOCK_Y, 1);
    // Ceil-divide so partial tiles at the right/bottom edges are covered.
    dim3 grid((chromaWidth  + BLOCK_X - 1) / BLOCK_X,
              (chromaHeight + BLOCK_Y - 1) / BLOCK_Y, 1);

    // Interleave U and V into the NV12 chroma plane on the device.
    CUdeviceptr dNV12Chroma =
        (CUdeviceptr)((unsigned char *)dNV12devPtr + dstPitch * maxHeight);
    void *args[8] = { &m_ChromaDevPtr[0], &m_ChromaDevPtr[1], &dNV12Chroma,
                      &chromaWidth, &chromaHeight, &chromaWidth, &chromaWidth,
                      &dstPitch };
    __cu(cuLaunchKernel(m_cuInterleaveUVFunction, grid.x, grid.y, grid.z,
                        block.x, block.y, block.z, 0, NULL, args, NULL));

    // CUDA_ERROR_NOT_READY only means the async work is still in flight,
    // which is fine here; anything else is a real failure.
    CUresult cuResult = cuStreamQuery(NULL);
    if (cuResult != CUDA_SUCCESS && cuResult != CUDA_ERROR_NOT_READY) {
        return NV_ENC_ERR_GENERIC;
    }
    return NV_ENC_SUCCESS;
}
// Scale an NV12 image (dInput, srcWidth x srcHeight, pitch srcPitch) into
// dOutput (dstWidth x dstHeight, pitch dstPitch) using bilinear texture
// filtering via the m_cuScaleNV12Function kernel.  The chroma plane of each
// surface starts maxHeight rows into the allocation.  Returns the first
// failing CUresult, or CUDA_SUCCESS.
CUresult CNvEncoderLowLatency::ScaleNV12Image(CUdeviceptr dInput, CUdeviceptr dOutput,
                                              int srcWidth, int srcPitch, int srcHeight,
                                              int dstWidth, int dstPitch, int dstHeight,
                                              int maxWidth, int maxHeight)
{
    CCudaAutoLock cuLock(m_cuContext);
    CUDA_ARRAY_DESCRIPTOR desc;
    CUresult result;
    float left, right;
    float xOffset, yOffset, xScale, yScale;
    int srcLeft, srcTop, srcRight, srcBottom;
    int dstLeft, dstTop, dstRight, dstBottom;

    // Full-frame rectangles (no cropping).
    srcLeft = 0; srcTop = 0; srcRight = srcWidth;  srcBottom = srcHeight;
    dstLeft = 0; dstTop = 0; dstRight = dstWidth;  dstBottom = dstHeight;

    if ((!dInput) || (!dOutput)) {
        PRINTERR("NULL surface pointer!\n");
        return CUDA_ERROR_INVALID_VALUE;
    }

    // Map destination pixel centers back onto the source grid; the half-texel
    // offset keeps bilinear sampling centered (clamped to at most 0.5).
    xScale = (float)(srcRight - srcLeft) / (float)(dstRight - dstLeft);
    xOffset = 0.5f * xScale - 0.5f;
    if (xOffset > 0.5f) {
        xOffset = 0.5f;
    }
    yScale = (float)(srcBottom - srcTop) / (float)(dstBottom - dstTop);
    yOffset = 0.5f * yScale - 0.5f;
    if (yOffset > 0.5f) {
        yOffset = 0.5f;
    }
    left  = (float)srcLeft;
    right = (float)(srcRight - 1);
    xOffset += left;

    // Bind the luma plane as a 1-channel 8-bit 2D texture.
    desc.NumChannels = 1;
    desc.Width  = srcPitch / desc.NumChannels;
    desc.Height = srcBottom - srcTop;
    desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
    result = cuTexRefSetFilterMode(m_texLuma2D, CU_TR_FILTER_MODE_LINEAR);
    if (result != CUDA_SUCCESS) {
        PRINTERR("cuTexRefSetFilterMode: %d\n", result);
        return result;
    }
    result = cuTexRefSetAddress2D(m_texLuma2D, &desc,
                                  dInput + srcTop * srcPitch, srcPitch);
    if (result != CUDA_SUCCESS) {
        PRINTERR("BindTexture2D(luma): %d\n", result);
        return result;
    }

    // Bind the interleaved UV plane (half height, 2 channels per texel);
    // it starts maxHeight rows into the NV12 surface.
    desc.NumChannels = 2;
    desc.Width  = srcPitch / desc.NumChannels;
    desc.Height = (srcBottom - srcTop) >> 1;
    desc.Format = CU_AD_FORMAT_UNSIGNED_INT8;
    result = cuTexRefSetFilterMode(m_texChroma2D, CU_TR_FILTER_MODE_LINEAR);
    if (result != CUDA_SUCCESS) {
        PRINTERR("cuTexRefSetFilterMode: %d\n", result);
        return result;
    }
    result = cuTexRefSetAddress2D(m_texChroma2D, &desc,
                                  dInput + (maxHeight + srcTop / 2) * srcPitch,
                                  srcPitch);
    if (result != CUDA_SUCCESS) {
        PRINTERR("BindTexture2D(chroma): %d\n", result);
        return result;
    }

    int dstUVOffset  = maxHeight * srcPitch;
    float x_Offset   = xOffset - dstLeft * xScale;
    float y_Offset   = yOffset + 0.5f - dstTop * yScale;
    float xc_offset  = xOffset - dstLeft * xScale * 0.5f;
    float yc_offset  = yOffset + 0.5f - dstTop * yScale * 0.5f;

    void *args[13] = { &dOutput, &dstUVOffset, &dstWidth, &dstHeight, &dstPitch,
                       &left, &right, &x_Offset, &y_Offset, &xc_offset,
                       &yc_offset, &xScale, &yScale };
    // 256-wide blocks; each thread row covers two output lines (NV12 chroma
    // is subsampled vertically), hence the (dstBottom + 1) >> 1 grid height.
    dim3 block(256, 1, 1);
    dim3 grid((dstRight + 255) >> 8, (dstBottom + 1) >> 1, 1);
    result = cuLaunchKernel(m_cuScaleNV12Function, grid.x, grid.y, grid.z,
                            block.x, block.y, block.z, 0, NULL, args, NULL);
    if (result != CUDA_SUCCESS) {
        PRINTERR("cuLaunchKernel: %d\n", result);
        return result;
    }

    // Fix: the original check was inverted — it returned CUDA_SUCCESS when
    // cuStreamQuery reported a genuine error, and could return
    // CUDA_ERROR_NOT_READY to the caller on the happy path.  NOT_READY only
    // means the async launch is still in flight, which is not a failure.
    result = cuStreamQuery(NULL);
    if (result != CUDA_SUCCESS && result != CUDA_ERROR_NOT_READY) {
        return result;
    }
    return CUDA_SUCCESS;
}
/* Virtqueue output handler: drains the queue, decodes each marshalled CUDA
   driver-API request (struct param in out_sg[0]), executes it on the host,
   stores the CUresult (or the 999/9999 local error codes) back into the same
   param block, pushes the element, and finally raises one virtual interrupt.

   Fixes relative to the previous version:
     - CUMODULELOAD: the allocation size used strlen(getenv(...) + 1), i.e.
       the +1 for the NUL terminator was mis-parenthesized inside strlen's
       argument, under-allocating by two bytes and overflowing in strcat;
       getenv() is now also checked for NULL, binname is freed on the
       cuCtxSetCurrent failure path, and the element is no longer pushed
       twice (once in the malloc-failure path and again after the switch).
     - CUMODULEGETGLOBAL: the unbounded strcpy into a fixed 100-byte buffer
       (overflow) that was also never freed is gone; the guest's
       NUL-terminated name in out_sg[1] is passed directly.
     - CULAUNCHKERNEL: args is freed when cuCtxSetCurrent fails.  */
static void vq_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtQueueElement elem;

    while (virtqueue_pop(vq, &elem)) {
        struct param *p = elem.out_sg[0].iov_base;

        /* For all library routines: get required arguments from the buffer,
           execute, and push results back into the virtqueue.  */
        switch (p->syscall_type) {
        case CUINIT:
            p->result = cuInit(p->flags);
            break;
        case CUDRIVERGETVERSION:
            p->result = cuDriverGetVersion(&p->val1);
            break;
        case CUDEVICEGETCOUNT:
            p->result = cuDeviceGetCount(&p->val1);
            break;
        case CUDEVICEGET:
            p->result = cuDeviceGet(&p->device, p->val1);
            break;
        case CUDEVICECOMPUTECAPABILITY:
            p->result = cuDeviceComputeCapability(&p->val1, &p->val2, p->device);
            break;
        case CUDEVICEGETNAME:
            p->result = cuDeviceGetName(elem.in_sg[0].iov_base, p->val1, p->device);
            break;
        case CUDEVICEGETATTRIBUTE:
            p->result = cuDeviceGetAttribute(&p->val1, p->attrib, p->device);
            break;
        case CUCTXCREATE:
            p->result = cuCtxCreate(&p->ctx, p->flags, p->device);
            break;
        case CUCTXDESTROY:
            p->result = cuCtxDestroy(p->ctx);
            break;
        case CUCTXGETCURRENT:
            p->result = cuCtxGetCurrent(&p->ctx);
            break;
        case CUCTXGETDEVICE:
            p->result = cuCtxGetDevice(&p->device);
            break;
        case CUCTXPOPCURRENT:
            p->result = cuCtxPopCurrent(&p->ctx);
            break;
        case CUCTXSETCURRENT:
            p->result = cuCtxSetCurrent(p->ctx);
            break;
        case CUCTXSYNCHRONIZE:
            p->result = cuCtxSynchronize();
            break;
        case CUMODULELOAD: {
            /* Hardcoded path - needs improvement.  All .cubin files should be
               stored in $QEMU_NFS_PATH - currently $QEMU_NFS_PATH is shared
               between host and guest with NFS.  */
            const char *nfs_path = getenv("QEMU_NFS_PATH");
            const char *guest_name = (const char *)elem.out_sg[1].iov_base;
            char *binname;

            if (nfs_path == NULL) {
                p->result = 999;
                break;
            }
            /* +1 for the terminating NUL.  */
            binname = malloc(strlen(nfs_path) + strlen(guest_name) + 1);
            if (!binname) {
                /* Report failure; the single push after the switch covers
                   every case, so do not push here as well.  */
                p->result = 999;
                break;
            }
            strcpy(binname, nfs_path);
            strcat(binname, guest_name);
            /* Change current CUDA context.  Each CUDA context has its own
               virtual memory space - isolation is ensured by switching
               contexts.  */
            if (cuCtxSetCurrent(p->ctx) != 0) {
                free(binname);
                p->result = 999;
                break;
            }
            p->result = cuModuleLoad(&p->module, binname);
            free(binname);
            break;
        }
        case CUMODULEGETGLOBAL: {
            /* The guest places the NUL-terminated symbol name in out_sg[1];
               pass it through directly.  */
            const char *name = (const char *)elem.out_sg[1].iov_base;
            p->result = cuModuleGetGlobal(&p->dptr, &p->size1, p->module, name);
            break;
        }
        case CUMODULEUNLOAD:
            p->result = cuModuleUnload(p->module);
            break;
        case CUMEMALLOC:
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAlloc(&p->dptr, p->bytesize);
            break;
        case CUMEMALLOCPITCH:
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAllocPitch(&p->dptr, &p->size3, p->size1,
                                        p->size2, p->bytesize);
            break;
        /* Large buffers are allocated in smaller chunks in guest kernel
           space; get each chunk separately and copy it to device memory.
           Chunk i's data is in out_sg[1 + 2i] and its length (a long) in
           out_sg[1 + 2i + 1].  */
        case CUMEMCPYHTOD: {
            int i;
            size_t offset = 0;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.out_sg[1 + 2 * i + 1].iov_base;
                p->result = cuMemcpyHtoD(p->dptr + offset,
                                         elem.out_sg[1 + 2 * i].iov_base, s);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYHTODASYNC: {
            int i;
            size_t offset = 0;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.out_sg[1 + 2 * i + 1].iov_base;
                p->result = cuMemcpyHtoDAsync(p->dptr + offset,
                                              elem.out_sg[1 + 2 * i].iov_base,
                                              s, p->stream);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTODASYNC:
            p->result = cuMemcpyDtoDAsync(p->dptr, p->dptr1, p->size1, p->stream);
            break;
        case CUMEMCPYDTOH: {
            /* Device-to-host: chunk i lands in in_sg[2i], length in
               in_sg[2i + 1].  */
            int i;
            size_t offset = 0;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.in_sg[2 * i + 1].iov_base;
                p->result = cuMemcpyDtoH(elem.in_sg[2 * i].iov_base,
                                         p->dptr + offset, s);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTOHASYNC: {
            int i;
            size_t offset = 0;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.in_sg[2 * i + 1].iov_base;
                p->result = cuMemcpyDtoHAsync(elem.in_sg[2 * i].iov_base,
                                              p->dptr + offset, s, p->stream);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMSETD32:
            p->result = cuMemsetD32(p->dptr, p->bytecount, p->bytesize);
            break;
        case CUMEMFREE:
            p->result = cuMemFree(p->dptr);
            break;
        case CUMODULEGETFUNCTION: {
            char *name = (char *)elem.out_sg[1].iov_base;
            /* The guest string may lack a terminator; p->length bounds it.  */
            name[p->length] = '\0';
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuModuleGetFunction(&p->function, p->module, name);
            break;
        }
        case CULAUNCHKERNEL: {
            /* p->val1 kernel arguments follow in out_sg[1..val1].  */
            int i;
            void **args = malloc(p->val1 * sizeof(void *));
            if (!args) {
                p->result = 9999;
                break;
            }
            for (i = 0; i < p->val1; i++) {
                args[i] = elem.out_sg[1 + i].iov_base;
            }
            if (cuCtxSetCurrent(p->ctx) != 0) {
                free(args);
                p->result = 999;
                break;
            }
            p->result = cuLaunchKernel(p->function,
                                       p->gridDimX, p->gridDimY, p->gridDimZ,
                                       p->blockDimX, p->blockDimY, p->blockDimZ,
                                       p->bytecount, 0, args, 0);
            free(args);
            break;
        }
        case CUEVENTCREATE:
            p->result = cuEventCreate(&p->event1, p->flags);
            break;
        case CUEVENTDESTROY:
            p->result = cuEventDestroy(p->event1);
            break;
        case CUEVENTRECORD:
            p->result = cuEventRecord(p->event1, p->stream);
            break;
        case CUEVENTSYNCHRONIZE:
            p->result = cuEventSynchronize(p->event1);
            break;
        case CUEVENTELAPSEDTIME:
            p->result = cuEventElapsedTime(&p->pMilliseconds, p->event1, p->event2);
            break;
        case CUSTREAMCREATE:
            p->result = cuStreamCreate(&p->stream, 0);
            break;
        case CUSTREAMSYNCHRONIZE:
            p->result = cuStreamSynchronize(p->stream);
            break;
        case CUSTREAMQUERY:
            p->result = cuStreamQuery(p->stream);
            break;
        case CUSTREAMDESTROY:
            p->result = cuStreamDestroy(p->stream);
            break;
        default:
            printf("Unknown syscall_type\n");
        }
        virtqueue_push(vq, &elem, 0);
    }

    /* Notify frontend - trigger virtual interrupt.  */
    virtio_notify(vdev, vq);
    return;
}