Example #1
0
static int cuda_read(void *dst, gpudata *src, size_t srcoff, size_t sz) {
    cuda_context *ctx = src->ctx;

    ASSERT_BUF(src);

    if (sz == 0) return GA_NO_ERROR;

    if ((src->sz - srcoff) < sz)
        return GA_VALUE_ERROR;

    cuda_enter(ctx);

    if (src->flags & CUDA_MAPPED_PTR) {
      ctx->err = cuEventSynchronize(src->wev);
      if (ctx->err != CUDA_SUCCESS) {
        cuda_exit(ctx);
        return GA_IMPL_ERROR;
      }
      memcpy(dst, (void *)(src->ptr + srcoff), sz);
    } else {
      cuda_waits(src, CUDA_WAIT_READ, ctx->mem_s);

      ctx->err = cuMemcpyDtoHAsync(dst, src->ptr + srcoff, sz, ctx->mem_s);
      if (ctx->err != CUDA_SUCCESS) {
        cuda_exit(ctx);
        return GA_IMPL_ERROR;
      }
      cuda_records(src, CUDA_WAIT_READ, ctx->mem_s);
    }
    cuda_exit(ctx);
    return GA_NO_ERROR;
}
CUresult
TestSAXPY( chCUDADevice *chDevice, size_t N, float alpha )
{
    CUresult status;
    CUdeviceptr dptrOut = 0;
    CUdeviceptr dptrIn = 0;
    float *hostOut = 0;
    float *hostIn = 0;

    CUDA_CHECK( cuCtxPushCurrent( chDevice->context() ) );

    CUDA_CHECK( cuMemAlloc( &dptrOut, N*sizeof(float) ) );
    CUDA_CHECK( cuMemsetD32( dptrOut, 0, N ) );
    CUDA_CHECK( cuMemAlloc( &dptrIn, N*sizeof(float) ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostOut, N*sizeof(float), 0 ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostIn, N*sizeof(float), 0 ) );
    for ( size_t i = 0; i < N; i++ ) {
        hostIn[i] = (float) rand() / (float) RAND_MAX;
    }
    CUDA_CHECK( cuMemcpyHtoDAsync( dptrIn, hostIn, N*sizeof(float ), NULL ) );

    {
        CUmodule moduleSAXPY;
        CUfunction kernelSAXPY;
        void *params[] = { &dptrOut, &dptrIn, &N, &alpha };
        
        moduleSAXPY = chDevice->module( "saxpy.ptx" );
        if ( ! moduleSAXPY ) {
            status = CUDA_ERROR_NOT_FOUND;
            goto Error;
        }
        CUDA_CHECK( cuModuleGetFunction( &kernelSAXPY, moduleSAXPY, "saxpy" ) );

        CUDA_CHECK( cuLaunchKernel( kernelSAXPY, 1500, 1, 1, 512, 1, 1, 0, NULL, params, NULL ) );

    }

    CUDA_CHECK( cuMemcpyDtoHAsync( hostOut, dptrOut, N*sizeof(float), NULL ) );
    CUDA_CHECK( cuCtxSynchronize() );
    for ( size_t i = 0; i < N; i++ ) {
        if ( fabsf( hostOut[i] - alpha*hostIn[i] ) > 1e-5f ) {
            status = CUDA_ERROR_UNKNOWN;
            goto Error;
        }
    }
    status = CUDA_SUCCESS;
    printf( "Well it worked!\n" );

Error:
    cuCtxPopCurrent( NULL );
    cuMemFreeHost( hostOut );
    cuMemFreeHost( hostIn );
    cuMemFree( dptrOut );
    cuMemFree( dptrIn );
    return status;
}
void GPUInterface::MemcpyDeviceToHost(void* dest,
                                      const GPUPtr src,
                                      size_t memSize) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tEntering GPUInterface::MemcpyDeviceToHost\n");
#endif

    SAFE_CUPP(cuMemcpyDtoHAsync(dest, src, memSize, cudaStreams[0]));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr, "\t\t\tLeaving  GPUInterface::MemcpyDeviceToHost\n");
#endif

}
Example #4
0
  void memory_t<CUDA>::asyncCopyTo(void *dest,
                                   const uintptr_t bytes,
                                   const uintptr_t offset){
    const CUstream &stream = *((CUstream*) dev->currentStream);
    const uintptr_t bytes_ = (bytes == 0) ? size : bytes;

    OCCA_CHECK((bytes_ + offset) <= size);

    if(!isTexture)
      OCCA_CUDA_CHECK("Memory: Asynchronous Copy To",
                      cuMemcpyDtoHAsync(dest, *((CUdeviceptr*) handle) + offset, bytes_, stream) );
    else
      OCCA_CUDA_CHECK("Texture Memory: Asynchronous Copy To",
                      cuMemcpyAtoHAsync(dest,((CUDATextureData_t*) handle)->array, offset, bytes_, stream) );
  }
Example #5
0
bool VideoDecoderCUDAPrivate::processDecodedData(CUVIDPARSERDISPINFO *cuviddisp, VideoFrame* outFrame) {
    int num_fields = cuviddisp->progressive_frame ? 1 : 2+cuviddisp->repeat_first_field;

    for (int active_field = 0; active_field < num_fields; ++active_field) {
        CUVIDPROCPARAMS proc_params;
        memset(&proc_params, 0, sizeof(CUVIDPROCPARAMS));
        proc_params.progressive_frame = cuviddisp->progressive_frame; //check user config
        proc_params.second_field = active_field == 1; //check user config
        proc_params.top_field_first = cuviddisp->top_field_first;
        proc_params.unpaired_field = cuviddisp->progressive_frame == 1;

        CUdeviceptr devptr;
        unsigned int pitch;
        cuvidCtxLock(vid_ctx_lock, 0);
        CUresult cuStatus = cuvidMapVideoFrame(dec, cuviddisp->picture_index, &devptr, &pitch, &proc_params);
        if (cuStatus != CUDA_SUCCESS) {
            qWarning("cuvidMapVideoFrame failed on index %d (%#x, %s)", cuviddisp->picture_index, cuStatus, _cudaGetErrorEnum(cuStatus));
            cuvidUnmapVideoFrame(dec, devptr);
            cuvidCtxUnlock(vid_ctx_lock, 0);
            return false;
        }
#define PAD_ALIGN(x,mask) ( (x + mask) & ~mask )
        //uint w = dec_create_info.ulWidth;//PAD_ALIGN(dec_create_info.ulWidth, 0x3F);
        uint h = dec_create_info.ulHeight;//PAD_ALIGN(dec_create_info.ulHeight, 0x0F); //?
#undef PAD_ALIGN
        int size = pitch*h*3/2;
        if (size > host_data_size && host_data) {
            cuMemFreeHost(host_data);
            host_data = 0;
            host_data_size = 0;
        }
        if (!host_data) {
            cuStatus = cuMemAllocHost((void**)&host_data, size);
            if (cuStatus != CUDA_SUCCESS) {
                qWarning("cuMemAllocHost failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus));
                cuvidUnmapVideoFrame(dec, devptr);
                cuvidCtxUnlock(vid_ctx_lock, 0);
                return false;
            }
            host_data_size = size;
        }
        if (!host_data) {
            qWarning("No valid staging memory!");
            cuvidUnmapVideoFrame(dec, devptr);
            cuvidCtxUnlock(vid_ctx_lock, 0);
            return false;
        }
        cuStatus = cuMemcpyDtoHAsync(host_data, devptr, size, stream);
        if (cuStatus != CUDA_SUCCESS) {
            qWarning("cuMemcpyDtoHAsync failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus));
            cuvidUnmapVideoFrame(dec, devptr);
            cuvidCtxUnlock(vid_ctx_lock, 0);
            return false;
        }
        cuStatus = cuCtxSynchronize();
        if (cuStatus != CUDA_SUCCESS) {
            qWarning("cuCtxSynchronize failed (%#x, %s)", cuStatus, _cudaGetErrorEnum(cuStatus));
        }
        cuvidUnmapVideoFrame(dec, devptr);
        cuvidCtxUnlock(vid_ctx_lock, 0);
        //qDebug("mark not in use pic_index: %d", cuviddisp->picture_index);
        surface_in_use[cuviddisp->picture_index] = false;

        uchar *planes[] = {
            host_data,
            host_data + pitch * h
        };
        int pitches[] = { (int)pitch, (int)pitch };
        VideoFrame frame(codec_ctx->width, codec_ctx->height, VideoFormat::Format_NV12);
        frame.setBits(planes);
        frame.setBytesPerLine(pitches);
        //TODO: is clone required? may crash on clone, I should review clone()
        //frame = frame.clone();
        if (outFrame) {
            *outFrame = frame.clone();
        }
#if COPY_ON_DECODE
        frame_queue.put(frame.clone());
#endif
        //qDebug("frame queue size: %d", frame_queue.size());
    }
    return true;
}
/*
// Feature map reduction in GPU
// In each cell we reduce dimension of the feature vector
// according to original paper special procedure
//
// API
//int PCAFeatureMapsGPUStream(const int numStep, const int bx, const int by,
          CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMap **feature_maps,
          CUstream *streams)
// INPUT
// numStep
// bx
// by
// devs_map_in
// streams
// OUTPUT
// feature_maps
// RESULT
// Error status
*/
int PCAFeatureMapsGPUStream(const int numStep, const int bx, const int by,
        CvLSVMFeatureMapGPU **devs_map_in, CvLSVMFeatureMap **feature_maps,
        CUstream *streams)
{

    int sizeX, sizeY, pp;
    int size_map_pca;
    int i;
    CUresult res;
    CvLSVMFeatureMapGPU **devs_map_pca;

    pp = NUM_SECTOR * 3 + 4;

    devs_map_pca = (CvLSVMFeatureMapGPU **) malloc(
            sizeof(CvLSVMFeatureMapGPU*) * (numStep));

    // allocate memory
    for (i = 0; i < numStep; i++)
    {
        sizeX = devs_map_in[i]->sizeX + 2 * bx;
        sizeY = devs_map_in[i]->sizeY + 2 * by;

        size_map_pca = sizeX * sizeY * pp;

        allocFeatureMapObject(&feature_maps[i], sizeX, sizeY, pp);
        allocFeatureMapObjectGPU<float>(&devs_map_pca[i], sizeX, sizeY, pp);
    }

    // exucute async
    for (i = 0; i < numStep; i++)
    {
        sizeX = devs_map_pca[i]->sizeX;
        sizeY = devs_map_pca[i]->sizeY;
        size_map_pca = sizeX * sizeY * pp;

        // initilize device memory value of 0
        res = cuMemsetD32Async(devs_map_pca[i]->map, 0, size_map_pca,
                streams[i]);
        CUDA_CHECK(res, "cuMemset(dev_map_pca)");

        // launch kernel
        PCAFeatureMapsAddNullableBorderGPULaunch(devs_map_in[i],
                devs_map_pca[i], bx, by, streams[i]);
    }

    for (i = 0; i < numStep; i++)
    {
        sizeX = devs_map_pca[i]->sizeX;
        sizeY = devs_map_pca[i]->sizeY;
        size_map_pca = sizeX * sizeY * pp;

        // copy memory from device to host
        res = cuMemcpyDtoHAsync(feature_maps[i]->map, devs_map_pca[i]->map,
                sizeof(float) * size_map_pca, streams[i]);
        CUDA_CHECK(res, "cuMemcpyDtoH(dev_map_pca)");
    }

    // free device memory
    for (i = 0; i < numStep; i++)
    {
        freeFeatureMapObjectGPU(&devs_map_pca[i]);
    }

    free(devs_map_pca);

    return LATENT_SVM_OK;
}
Example #7
0
int cuda_test_memcpy_async(unsigned int size)
{
	int i;
	CUresult res;
	CUdevice dev;
	CUcontext ctx;
	CUstream stream;
	CUdeviceptr data_addr;
	unsigned int *in, *out;
	struct timeval tv;
	struct timeval tv_total_start, tv_total_end;
	unsigned long total;
	struct timeval tv_h2d_start, tv_h2d_end;
	float h2d;
	struct timeval tv_d2h_start, tv_d2h_end;
	float d2h;

	gettimeofday(&tv_total_start, NULL);

	res = cuInit(0);
	if (res != CUDA_SUCCESS) {
		printf("cuInit failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuDeviceGet(&dev, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuDeviceGet failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuCtxCreate(&ctx, 0, dev);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxCreate failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuStreamCreate(&stream, 0);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamCreate failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemAlloc(&data_addr, size);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAlloc failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemAllocHost((void **)&in, size);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAllocHost(in) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemAllocHost((void **)&out, size);
	if (res != CUDA_SUCCESS) {
		printf("cuMemAllocHost(out) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	for (i = 0; i < size / 4; i++) {
		in[i] = i+1;
		out[i] = 0;
	}

	gettimeofday(&tv_h2d_start, NULL);
	res = cuMemcpyHtoDAsync(data_addr, in, size, stream);
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyHtoDAsync failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	res = cuStreamSynchronize(stream);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	gettimeofday(&tv_h2d_end, NULL);

	gettimeofday(&tv_d2h_start, NULL);
	res = cuMemcpyDtoHAsync(out, data_addr, size, stream);
	if (res != CUDA_SUCCESS) {
		printf("cuMemcpyDtoHAsync failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	res = cuStreamSynchronize(stream);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamSynchronize() failed: res = %u\n", (unsigned int)res);
		return -1;
	}
	gettimeofday(&tv_d2h_end, NULL);

	for (i = 0; i < size / 4; i++) {
		if (in[i] != out[i]) {
			printf("in[%d] = %u, out[%d] = %u\n",
				   i, in[i], i, out[i]);
		}
	}

	res = cuMemFreeHost(out);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(out) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemFreeHost(in);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFreeHost(in) failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuMemFree(data_addr);
	if (res != CUDA_SUCCESS) {
		printf("cuMemFree failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuStreamDestroy(stream);
	if (res != CUDA_SUCCESS) {
		printf("cuStreamDestroy failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	res = cuCtxDestroy(ctx);
	if (res != CUDA_SUCCESS) {
		printf("cuCtxDestroy failed: res = %u\n", (unsigned int)res);
		return -1;
	}

	gettimeofday(&tv_total_end, NULL);

	tvsub(&tv_h2d_end, &tv_h2d_start, &tv);
	h2d = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	tvsub(&tv_d2h_end, &tv_d2h_start, &tv);
	d2h = tv.tv_sec * 1000.0 + (float)tv.tv_usec / 1000.0;
	tvsub(&tv_total_end, &tv_total_start, &tv);
	total = tv.tv_sec * 1000 + tv.tv_usec / 1000;

	printf("HtoD: %f\n", h2d);
	printf("DtoH: %f\n", d2h);

	return 0;

end:

	return -1;
}
Example #8
0
static void *
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *)GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));

      event_gc (false);

      r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s,
			     nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
      if (r != CUDA_SUCCESS)
	GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  return 0;
}
Example #9
0
static void vq_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
	VirtQueueElement elem;
	
	while(virtqueue_pop(vq, &elem)) {
		struct param *p = elem.out_sg[0].iov_base;
	
		//for all library routines: get required arguments from buffer, execute, and push results back in virtqueue
		switch (p->syscall_type) {
		case CUINIT: {
			p->result = cuInit(p->flags);
			break;
		}
		case CUDRIVERGETVERSION: {
			p->result = cuDriverGetVersion(&p->val1);
			break;
		}
		case CUDEVICEGETCOUNT: {
			p->result = cuDeviceGetCount(&p->val1);
			break;
		}
		case CUDEVICEGET: {
			p->result = cuDeviceGet(&p->device, p->val1);
			break;
		}
		case CUDEVICECOMPUTECAPABILITY: {
			p->result = cuDeviceComputeCapability(&p->val1, &p->val2, p->device);
			break;
		}
		case CUDEVICEGETNAME: {
			p->result = cuDeviceGetName(elem.in_sg[0].iov_base, p->val1, p->device);
			break;
		}
		case CUDEVICEGETATTRIBUTE: {
			p->result = cuDeviceGetAttribute(&p->val1, p->attrib, p->device);
			break;
		}
		case CUCTXCREATE: {
                        p->result = cuCtxCreate(&p->ctx, p->flags, p->device);				
			break;
		}
		case CUCTXDESTROY: {
			p->result = cuCtxDestroy(p->ctx);
			break;
		}
		case CUCTXGETCURRENT: {
			p->result = cuCtxGetCurrent(&p->ctx);
			break;
		}
		case CUCTXGETDEVICE: {
			p->result = cuCtxGetDevice(&p->device);
			break;
		}
		case CUCTXPOPCURRENT: {
			p->result = cuCtxPopCurrent(&p->ctx);
			break;
		}
		case CUCTXSETCURRENT: {
			p->result = cuCtxSetCurrent(p->ctx);
	                break;
		}
	        case CUCTXSYNCHRONIZE: {
		        p->result = cuCtxSynchronize();
	                break;
	        }
		case CUMODULELOAD: {
			//hardcoded path - needs improvement
			//all .cubin files should be stored in $QEMU_NFS_PATH - currently $QEMU_NFS_PATH is shared between host and guest with NFS
			char *binname = malloc((strlen((char *)elem.out_sg[1].iov_base)+strlen(getenv("QEMU_NFS_PATH")+1))*sizeof(char));
			if (!binname) {
				p->result = 0;
		                virtqueue_push(vq, &elem, 0);
				break;
			}
		        strcpy(binname, getenv("QEMU_NFS_PATH"));
		        strcat(binname, (char *)elem.out_sg[1].iov_base);
			//change current CUDA context
			//each CUDA contets has its own virtual memory space - isolation is ensured by switching contexes
                        if (cuCtxSetCurrent(p->ctx) != 0) {
				p->result = 999;
                                break;
			}
			p->result = cuModuleLoad(&p->module, binname);
			free(binname);
			break;
		}
                case CUMODULEGETGLOBAL: {
                        char *name = malloc(100*sizeof(char));
                        if (!name) {
                                p->result = 999;
                                break;
                        }
                        strcpy(name, (char *)elem.out_sg[1].iov_base);
                        p->result = cuModuleGetGlobal(&p->dptr,&p->size1,p->module,(const char *)name);
                        break;
                }
		case CUMODULEUNLOAD: {
			p->result = cuModuleUnload(p->module);
			break;			
		}
		case CUMEMALLOC: {
			if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			p->result = cuMemAlloc(&p->dptr, p->bytesize);
			break;
		}
                case CUMEMALLOCPITCH: {
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			p->result = cuMemAllocPitch(&p->dptr, &p->size3, p->size1, p->size2, p->bytesize);
			break;
		}
		//large buffers are alocated in smaller chuncks in guest kernel space
		//gets each chunck seperately and copies it to device memory
	        case CUMEMCPYHTOD: {
			int i;
			size_t offset;
                        unsigned long s, nr_pages = p->nr_pages;
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			offset = 0;
			for (i=0; i<nr_pages; i++) {
				s = *(long *)elem.out_sg[1+2*i+1].iov_base;
				p->result = cuMemcpyHtoD(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s);
				if (p->result != 0) break;
				offset += s;
			}
	                break;
		}
		case CUMEMCPYHTODASYNC: {
			int i;
                        size_t offset;
                        unsigned long s, nr_pages = p->nr_pages;
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
                        offset = 0;
			for (i=0; i<nr_pages; i++) {
                                s = *(long *)elem.out_sg[1+2*i+1].iov_base;
                                p->result = cuMemcpyHtoDAsync(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s, p->stream);
                                if (p->result != 0) break;
                                offset += s;
                        }
                        break;
		}
		case CUMEMCPYDTODASYNC: {
			p->result = cuMemcpyDtoDAsync(p->dptr, p->dptr1, p->size1, p->stream);
                        break;		
		}
	        case CUMEMCPYDTOH: {
			int i;
                        unsigned long s, nr_pages = p->nr_pages;
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			size_t offset = 0;
			for (i=0; i<nr_pages; i++) {
				s = *(long *)elem.in_sg[0+2*i+1].iov_base;
				p->result = cuMemcpyDtoH(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s);
				if (p->result != 0) break;
				offset += s;
			}
			break;
		}
		case CUMEMCPYDTOHASYNC: {
			int i;
                        unsigned long s, nr_pages = p->nr_pages;
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
                        size_t offset = 0;
			for (i=0; i<nr_pages; i++) {
                                s = *(long *)elem.in_sg[0+2*i+1].iov_base;
                                p->result = cuMemcpyDtoHAsync(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s, p->stream);
                                if (p->result != 0) break;
                                offset += s;
                        }
			break;
		}
		case CUMEMSETD32: {
			p->result = cuMemsetD32(p->dptr, p->bytecount, p->bytesize);
			break;
		}
	        case CUMEMFREE: {
	                p->result = cuMemFree(p->dptr);
	                break;
	        }
		case CUMODULEGETFUNCTION: {
			char *name = (char *)elem.out_sg[1].iov_base;
			name[p->length] = '\0';
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			p->result = cuModuleGetFunction(&p->function, p->module, name);
			break;	
		}
		case CULAUNCHKERNEL: {
			void **args = malloc(p->val1*sizeof(void *));
	                if (!args) {
				p->result = 9999;
	                        break;
        	        }
			int i;
			for (i=0; i<p->val1; i++) {
				args[i] = elem.out_sg[1+i].iov_base;
			}
                        if (cuCtxSetCurrent(p->ctx) != 0) {
                                p->result = 999;
                                break;
                        }
			p->result = cuLaunchKernel(p->function,
					p->gridDimX, p->gridDimY, p->gridDimZ,
			                p->blockDimX, p->blockDimY, p->blockDimZ,
					p->bytecount, 0, args, 0);
			free(args);
			break;
		}
		case CUEVENTCREATE: {
			p->result = cuEventCreate(&p->event1, p->flags);
			break;
		}
		case CUEVENTDESTROY: {
			p->result = cuEventDestroy(p->event1);
			break;
		}
		case CUEVENTRECORD: {
			p->result = cuEventRecord(p->event1, p->stream);
			break;
		}
		case CUEVENTSYNCHRONIZE: {
			p->result = cuEventSynchronize(p->event1);
			break;
		}
		case CUEVENTELAPSEDTIME: {
			p->result = cuEventElapsedTime(&p->pMilliseconds, p->event1, p->event2);
			break;
		}
		case CUSTREAMCREATE: {
			p->result =  cuStreamCreate(&p->stream, 0);
			break;
		}		
                case CUSTREAMSYNCHRONIZE: {
                        p->result = cuStreamSynchronize(p->stream);
                        break;
                }
                case CUSTREAMQUERY: {
                        p->result = cuStreamQuery(p->stream);
                        break;
                }
		case CUSTREAMDESTROY: {
                        p->result = cuStreamDestroy(p->stream);
                        break;
                }

		default: 
			printf("Unknown syscall_type\n");
		}
		virtqueue_push(vq, &elem, 0);
	}
	//notify frontend - trigger virtual interrupt
	virtio_notify(vdev, vq);
	return;
}
Example #10
0
// Run the Cuda part of the computation
bool copyDecodedFrameToTexture(unsigned int &nRepeats, int bUseInterop, int *pbIsProgressive)
{
    CUVIDPARSERDISPINFO oDisplayInfo;

    if (g_pFrameQueue->dequeue(&oDisplayInfo))
    {
        CCtxAutoLock lck(g_CtxLock);
        // Push the current CUDA context (only if we are using CUDA decoding path)
        CUresult result = cuCtxPushCurrent(g_oContext);

        CUdeviceptr  pDecodedFrame[2] = { 0, 0 };
        CUdeviceptr  pInteropFrame[2] = { 0, 0 };

        int num_fields = (oDisplayInfo.progressive_frame ? (1) : (2+oDisplayInfo.repeat_first_field));
        *pbIsProgressive = oDisplayInfo.progressive_frame;
        g_bIsProgressive = oDisplayInfo.progressive_frame ? true : false;

        for (int active_field=0; active_field<num_fields; active_field++)
        {
            nRepeats = oDisplayInfo.repeat_first_field;
            CUVIDPROCPARAMS oVideoProcessingParameters;
            memset(&oVideoProcessingParameters, 0, sizeof(CUVIDPROCPARAMS));

            oVideoProcessingParameters.progressive_frame = oDisplayInfo.progressive_frame;
            oVideoProcessingParameters.second_field      = active_field;
            oVideoProcessingParameters.top_field_first   = oDisplayInfo.top_field_first;
            oVideoProcessingParameters.unpaired_field    = (num_fields == 1);

            unsigned int nDecodedPitch = 0;
            unsigned int nWidth = 0;
            unsigned int nHeight = 0;

            // map decoded video frame to CUDA surfae
            g_pVideoDecoder->mapFrame(oDisplayInfo.picture_index, &pDecodedFrame[active_field], &nDecodedPitch, &oVideoProcessingParameters);
            nWidth  = g_pVideoDecoder->targetWidth();
            nHeight = g_pVideoDecoder->targetHeight();
            // map DirectX texture to CUDA surface
            size_t nTexturePitch = 0;

            // If we are Encoding and this is the 1st Frame, we make sure we allocate system memory for readbacks
            if (g_bReadback && g_bFirstFrame && g_ReadbackSID)
            {
                CUresult result;
                checkCudaErrors(result = cuMemAllocHost((void **)&g_bFrameData[0], (nDecodedPitch * nHeight * 3 / 2)));
                checkCudaErrors(result = cuMemAllocHost((void **)&g_bFrameData[1], (nDecodedPitch * nHeight * 3 / 2)));
                g_bFirstFrame = false;

                if (result != CUDA_SUCCESS)
                {
                    printf("cuMemAllocHost returned %d\n", (int)result);
                }
            }

            // If streams are enabled, we can perform the readback to the host while the kernel is executing
            if (g_bReadback && g_ReadbackSID)
            {
                CUresult result = cuMemcpyDtoHAsync(g_bFrameData[active_field], pDecodedFrame[active_field], (nDecodedPitch * nHeight * 3 / 2), g_ReadbackSID);

                if (result != CUDA_SUCCESS)
                {
                    printf("cuMemAllocHost returned %d\n", (int)result);
                }
            }

#if ENABLE_DEBUG_OUT
            printf("%s = %02d, PicIndex = %02d, OutputPTS = %08d\n",
                   (oDisplayInfo.progressive_frame ? "Frame" : "Field"),
                   g_DecodeFrameCount, oDisplayInfo.picture_index, oDisplayInfo.timestamp);
#endif

            if (g_pImageDX)
            {
                // map the texture surface
                g_pImageDX->map(&pInteropFrame[active_field], &nTexturePitch, active_field);
            }
            else
            {
                pInteropFrame[active_field] = g_pInteropFrame[active_field];
                nTexturePitch = g_pVideoDecoder->targetWidth() * 2;
            }

            // perform post processing on the CUDA surface (performs colors space conversion and post processing)
            // comment this out if we inclue the line of code seen above
            cudaPostProcessFrame(&pDecodedFrame[active_field], nDecodedPitch, &pInteropFrame[active_field], nTexturePitch, g_pCudaModule->getModule(), gfpNV12toARGB, g_KernelSID);

            if (g_pImageDX)
            {
                // unmap the texture surface
                g_pImageDX->unmap(active_field);
            }

            // unmap video frame
            // unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding)
            g_pVideoDecoder->unmapFrame(pDecodedFrame[active_field]);
            // release the frame, so it can be re-used in decoder
            g_pFrameQueue->releaseFrame(&oDisplayInfo);
            g_DecodeFrameCount++;
        }

        // Detach from the Current thread
        checkCudaErrors(cuCtxPopCurrent(NULL));
    }
    else
    {
        return false;
    }

    // check if decoding has come to an end.
    // if yes, signal the app to shut down.
    if (!g_pVideoSource->isStarted() || g_pFrameQueue->isEndOfDecode())
    {
        // Let's free the Frame Data
        if (g_ReadbackSID && g_bFrameData)
        {
            cuMemFreeHost((void *)g_bFrameData[0]);
            cuMemFreeHost((void *)g_bFrameData[1]);
            g_bFrameData[0] = NULL;
            g_bFrameData[1] = NULL;
        }

        // Let's just stop, and allow the user to quit, so they can at least see the results
        g_pVideoSource->stop();

        // If we want to loop reload the video file and restart
        if (g_bLoop && !g_bAutoQuit)
        {
            reinitCudaResources();
            g_FrameCount = 0;
            g_DecodeFrameCount = 0;
            g_pVideoSource->start();
        }

        if (g_bAutoQuit)
        {
            g_bDone = true;
        }
    }

    return true;
}