コード例 #1
0
ファイル: SurfaceInteropCUDA.cpp プロジェクト: ntibor22/QtAV
// (Re)create the CUDA graphics resource bound to OpenGL texture `tex` for
// the given NV12 plane. Returns true when res[plane] already matches
// (w, h, H, tex) or after a successful re-registration; false on any
// CUDA_ENSURE failure.
// NOTE(review): H appears to be the padded/allocated texture height while
// h is the visible height — confirm against callers.
bool GLInteropResource::ensureResource(int w, int h, int H, GLuint tex, int plane)
{
    Q_ASSERT(plane < 2 && "plane number must be 0 or 1 for NV12");
    TexRes &r = res[plane];
    // Fast path: the cached registration still matches this texture/size.
    if (r.texture == tex && r.w == w && r.h == h && r.H == H && r.cuRes)
        return true;
    if (!ctx) {
        // TODO: how to use pop/push decoder's context without the context in opengl context
        CUDA_ENSURE(cuCtxCreate(&ctx, CU_CTX_SCHED_BLOCKING_SYNC, dev), false);
        if (USE_STREAM) {
            CUDA_WARN(cuStreamCreate(&res[0].stream, CU_STREAM_DEFAULT));
            CUDA_WARN(cuStreamCreate(&res[1].stream, CU_STREAM_DEFAULT));
        }
        qDebug("cuda contex on gl thread: %p", ctx);
        CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // TODO: why cuMemcpy2D need this
    }
    // Drop any stale registration before registering the new texture.
    if (r.cuRes) {
        CUDA_ENSURE(cuGraphicsUnregisterResource(r.cuRes), false);
        r.cuRes = NULL;
    }
    // CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD works too for opengl, but not d3d
    CUDA_ENSURE(cuGraphicsGLRegisterImage(&r.cuRes, tex, GL_TEXTURE_2D, CU_GRAPHICS_REGISTER_FLAGS_NONE), false);
    // Cache the parameters so the fast path above can detect reuse.
    r.texture = tex;
    r.w = w;
    r.h = h;
    r.H = H;
    return true;
}
コード例 #2
0
    // Return a host-side pointer whose first read_byte_size bytes mirror the
    // most recent write, copying back from the device that holds it if the
    // host copy is stale. The returned buffer is 64-byte aligned.
    void *get_read_ptr_host(ComputeEnv *env, size_t read_byte_size) {
        if (host_valid) {
            return host_ptr;
        }

        if (host_ptr == nullptr) {
            // 64-byte alignment for SIMD consumers.
            host_ptr = _mm_malloc(byte_size, 64);
            if (host_ptr == nullptr) {
                // Bug fix: previously an OOM here fell through into the copy
                // below and dereferenced nullptr. Fail hard, matching the
                // error style of the unknown-processor branch.
                abort();
            }
        }

        if (last_write.type == Processor::OpenCL) {
            OpenCLDev *dev = &env->cl_dev_list[last_write.devid];
            // Blocking read (CL_TRUE): host_ptr is ready once this returns.
            // NOTE(review): the return status is ignored — a failed read
            // would leave host_ptr stale while host_valid becomes true.
            clEnqueueReadBuffer(dev->queue, cl_ptr_list[last_write.devid],
                                CL_TRUE, 0, read_byte_size, host_ptr, 0, nullptr, nullptr);
        } else if (last_write.type == Processor::CUDA) {
            CUDADev *dev = &env->cuda_dev_list[last_write.devid];
            cuCtxPushCurrent(dev->context);
            cuMemcpyDtoH(host_ptr, cuda_ptr_list[last_write.devid], read_byte_size);

            CUcontext old;
            cuCtxPopCurrent(&old);
        } else {
            abort();
        }

        host_valid = true;
        return host_ptr;
    }
コード例 #3
0
/* Leave a region previously entered on `ctx`: decrement the nesting
 * count and, once it drops to zero, pop this context off the calling
 * thread's context stack (the popped handle is discarded). */
void cuda_exit(cuda_context *ctx) {
  ASSERT_CTX(ctx);
  assert(ctx->enter > 0);
  if (--ctx->enter == 0)
    cuCtxPopCurrent(NULL);
}
コード例 #4
0
    // Return a device pointer on `devid` whose first read_byte_size bytes
    // mirror the host buffer, allocating and uploading lazily on first use.
    // Requires a valid host copy when the device copy is stale.
    CUdeviceptr get_read_ptr_cuda(ComputeEnv *env,int devid, size_t read_byte_size) {
        if (cuda_valid_list[devid]) {
            return cuda_ptr_list[devid];
        }

        if (host_valid == false) {
            // No valid copy anywhere to upload from: programming error.
            // (The old unreachable `return 0` after abort() was removed.)
            abort();
        }

        CUDADev *dev = &env->cuda_dev_list[devid];
        cuCtxPushCurrent(dev->context);

        if (cuda_ptr_list[devid] == 0) {
            CUresult err;
            err = cuMemAlloc(&cuda_ptr_list[devid], byte_size);
            if (err != CUDA_SUCCESS) {
                abort();
            }
        }

        // Bug fix: check the upload too, consistent with the allocation
        // check above; silently continuing would mark stale device data
        // as valid.
        if (cuMemcpyHtoD(cuda_ptr_list[devid], host_ptr, read_byte_size) != CUDA_SUCCESS) {
            abort();
        }

        cuda_valid_list[devid] = true;

        CUcontext old;
        cuCtxPopCurrent(&old);

        return cuda_ptr_list[devid];
    }
コード例 #5
0
/* Create a CUDA context on `dev` (scheduling policy selected from the
 * GA_CTX_* flags) and wrap it in a cuda_context. Returns NULL on failure
 * with the error code delivered through *ret via the FAIL/CHKFAIL macros.
 * Unified addressing support is a hard requirement. The new context is
 * popped before returning so it does not stay current on this thread. */
static void *do_init(CUdevice dev, int flags, int *ret) {
    cuda_context *res;
    CUcontext ctx;
    unsigned int fl = CU_CTX_SCHED_AUTO;
    int i;

    /* NOTE(review): this checks the module-level `err` as set before
       entry — confirm callers establish it. */
    CHKFAIL(NULL);
    if (flags & GA_CTX_SINGLE_THREAD)
      fl = CU_CTX_SCHED_SPIN;
    if (flags & GA_CTX_MULTI_THREAD)
      fl = CU_CTX_SCHED_YIELD;
    err = cuDeviceGetAttribute(&i, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
    CHKFAIL(NULL);
    if (i != 1)
      FAIL(NULL, GA_UNSUPPORTED_ERROR);
    err = cuCtxCreate(&ctx, fl, dev);
    CHKFAIL(NULL);
    res = cuda_make_ctx(ctx, 0);
    if (res == NULL) {
      cuCtxDestroy(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }
    res->flags |= flags;
    /* Don't leave the context on the thread stack */
    cuCtxPopCurrent(NULL);

    return res;
}
コード例 #6
0
    // Ensure host (and, for the configured target processor, device)
    // backing buffers exist for this buffer object.
    // Returns false if any allocation fails, true otherwise.
    bool prealloc(ComputeEnv *env) {
        int devid;
        if (host_ptr == nullptr) {
            // 64-byte alignment for SIMD consumers.
            host_ptr = _mm_malloc(byte_size, 64);
            if (host_ptr == nullptr) {
                return false;
            }
        }

        switch (env->target_processor.type) {
        case W2XCONV_PROC_HOST:
            break;

        case W2XCONV_PROC_OPENCL:
            devid = env->target_processor.devid;
            if (cl_ptr_list[devid] == nullptr) {
                cl_int err;
                OpenCLDev *dev = &env->cl_dev_list[devid];
                cl_ptr_list[devid] = clCreateBuffer(dev->context,
                                                    CL_MEM_READ_WRITE,
                                                    byte_size, nullptr, &err);
                if (cl_ptr_list[devid] == nullptr) {
                    return false;
                }

                /* touch memory to force allocation */
                char data = 0;
                err = clEnqueueWriteBuffer(dev->queue, cl_ptr_list[devid],
                                           CL_TRUE, 0, 1, &data, 0, nullptr, nullptr);
                if (err != CL_SUCCESS) {
                    clReleaseMemObject(cl_ptr_list[devid]);
                    cl_ptr_list[devid] = nullptr;
                    return false;
                }

            }
            break;

        case W2XCONV_PROC_CUDA:
            devid = env->target_processor.devid;

            if (cuda_ptr_list[devid] == 0) {
                CUresult err;
                CUDADev *dev = &env->cuda_dev_list[devid];
                cuCtxPushCurrent(dev->context);
                err = cuMemAlloc(&cuda_ptr_list[devid], byte_size);
                CUcontext old;
                cuCtxPopCurrent(&old);

                if (err != CUDA_SUCCESS) {
                    // Bug fix: don't leave a possibly-garbage handle behind —
                    // a later call must still see 0 and retry the allocation.
                    cuda_ptr_list[devid] = 0;
                    return false;
                }
            }
            break;

        }

        return true;
    }
コード例 #7
0
// Destroy all existing CUDA streams/events and recreate them for
// `newStreamCount` streams (capped at BEAGLE_STREAM_COUNT). One extra
// event beyond the per-stream events is always allocated and created
// (cudaEvents has numStreams + 1 entries).
void GPUInterface::ResizeStreamCount(int newStreamCount) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::ResizeStreamCount\n");
#endif
    SAFE_CUDA(cuCtxPushCurrent(cudaContext));

    // Drain all outstanding work before tearing streams/events down.
    SAFE_CUDA(cuCtxSynchronize());

    if (cudaStreams != NULL) {
        for(int i=0; i<numStreams; i++) {
            if (cudaStreams[i] != NULL)
                SAFE_CUDA(cuStreamDestroy(cudaStreams[i]));
        }
        free(cudaStreams);
    }

    if (cudaEvents != NULL) {
        // Bug fix: numStreams + 1 events are allocated and created by the
        // branches below, but the old loop destroyed only numStreams of
        // them, leaking the last event on every resize.
        for(int i=0; i<numStreams + 1; i++) {
            if (cudaEvents[i] != NULL)
                SAFE_CUDA(cuEventDestroy(cudaEvents[i]));
        }
        free(cudaEvents);
    }

    if (newStreamCount == 1) {
        // Single-stream mode: stream 0 is the NULL (default) stream.
        numStreams = 1;
        cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams);
        cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1));
        cudaStreams[0] = NULL;
        CUevent event;
        for(int i=0; i<2; i++) {
            SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
            cudaEvents[i] = event;
        }
    } else {
        numStreams = newStreamCount;
        if (numStreams > BEAGLE_STREAM_COUNT) {
            numStreams = BEAGLE_STREAM_COUNT;
        }
        cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams);
        CUstream stream;
        cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1));
        CUevent event;
        for(int i=0; i<numStreams; i++) {
            SAFE_CUDA(cuStreamCreate(&stream, CU_STREAM_DEFAULT));
            cudaStreams[i] = stream;
            SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
            cudaEvents[i] = event;
        }
        // The extra event stored past the per-stream slots.
        SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
        cudaEvents[numStreams] = event;
    }

    SAFE_CUDA(cuCtxPopCurrent(&cudaContext));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving  GPUInterface::ResizeStreamCount\n");
#endif
}
コード例 #8
0
ファイル: ImageGL.cpp プロジェクト: KHeresy/openvr
// Unregister the GL pixel-buffer object for `field_num` from CUDA and
// clear the interop flag. The object's CUDA context is made current for
// the duration of the call.
void
ImageGL::unregisterAsCudaResource(int field_num)
{
    // Consistency fix: check the context push/pop results too, matching
    // the checked unregister call between them.
    checkCudaErrors(cuCtxPushCurrent(oContext_));
    checkCudaErrors(cuGLUnregisterBufferObject(gl_pbo_[field_num]));
    bIsCudaResource_ = false;
    checkCudaErrors(cuCtxPopCurrent(NULL));
}
コード例 #9
0
/*
 * Self-test: run SAXPY (out = alpha * in) on chDevice over N floats and
 * verify the result on the host. Returns CUDA_SUCCESS on success,
 * CUDA_ERROR_NOT_FOUND if saxpy.ptx cannot be loaded, CUDA_ERROR_UNKNOWN
 * on a numeric mismatch, or the first failing driver-API status
 * (CUDA_CHECK is assumed to set `status` and jump to Error — confirm).
 */
CUresult
TestSAXPY( chCUDADevice *chDevice, size_t N, float alpha )
{
    CUresult status;
    CUdeviceptr dptrOut = 0;
    CUdeviceptr dptrIn = 0;
    float *hostOut = 0;
    float *hostIn = 0;

    /* Make this device's context current for all following calls. */
    CUDA_CHECK( cuCtxPushCurrent( chDevice->context() ) );

    CUDA_CHECK( cuMemAlloc( &dptrOut, N*sizeof(float) ) );
    CUDA_CHECK( cuMemsetD32( dptrOut, 0, N ) );
    CUDA_CHECK( cuMemAlloc( &dptrIn, N*sizeof(float) ) );
    /* Pinned host buffers so the async copies below can overlap. */
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostOut, N*sizeof(float), 0 ) );
    CUDA_CHECK( cuMemHostAlloc( (void **) &hostIn, N*sizeof(float), 0 ) );
    for ( size_t i = 0; i < N; i++ ) {
        hostIn[i] = (float) rand() / (float) RAND_MAX;
    }
    CUDA_CHECK( cuMemcpyHtoDAsync( dptrIn, hostIn, N*sizeof(float ), NULL ) );

    {
        CUmodule moduleSAXPY;
        CUfunction kernelSAXPY;
        /* Driver-API convention: pass the ADDRESS of each argument. */
        void *params[] = { &dptrOut, &dptrIn, &N, &alpha };
        
        moduleSAXPY = chDevice->module( "saxpy.ptx" );
        if ( ! moduleSAXPY ) {
            status = CUDA_ERROR_NOT_FOUND;
            goto Error;
        }
        CUDA_CHECK( cuModuleGetFunction( &kernelSAXPY, moduleSAXPY, "saxpy" ) );

        /* Fixed 1500x512 launch geometry independent of N — the kernel
           presumably grid-strides over N; confirm against saxpy.ptx. */
        CUDA_CHECK( cuLaunchKernel( kernelSAXPY, 1500, 1, 1, 512, 1, 1, 0, NULL, params, NULL ) );

    }

    CUDA_CHECK( cuMemcpyDtoHAsync( hostOut, dptrOut, N*sizeof(float), NULL ) );
    /* Wait for the copy-back before validating on the host. */
    CUDA_CHECK( cuCtxSynchronize() );
    for ( size_t i = 0; i < N; i++ ) {
        if ( fabsf( hostOut[i] - alpha*hostIn[i] ) > 1e-5f ) {
            status = CUDA_ERROR_UNKNOWN;
            goto Error;
        }
    }
    status = CUDA_SUCCESS;
    printf( "Well it worked!\n" );

Error:
    /* Unconditional cleanup; handles may still be 0/NULL on early exit. */
    cuCtxPopCurrent( NULL );
    cuMemFreeHost( hostOut );
    cuMemFreeHost( hostIn );
    cuMemFree( dptrOut );
    cuMemFree( dptrIn );
    return status;
}
コード例 #10
0
/* Allocate one pinned host buffer holding (n + m) shares of buf_size
 * bytes each, inside the GPU context carried by `c`. With GIB_USE_MMAP
 * the buffer is also mapped into the device address space. *ld receives
 * the leading dimension (buf_size). Returns GIB_SUC on success; failures
 * are handled by the ERROR_CHECK_FAIL macros. */
int gib_alloc ( void **buffers, int buf_size, int *ld, gib_context c ) {
  ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));
#if GIB_USE_MMAP
  ERROR_CHECK_FAIL(cuMemHostAlloc(buffers, (c->n+c->m)*buf_size, 
				  CU_MEMHOSTALLOC_DEVICEMAP));
#else
  ERROR_CHECK_FAIL(cuMemAllocHost(buffers, (c->n+c->m)*buf_size));
#endif
  *ld = buf_size;
  ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
  return GIB_SUC;
}
コード例 #11
0
// Launch `deviceFunction` with the given geometry using the legacy
// (pre-CUDA-4) driver parameter API (cuParamSetv/cuParamSeti +
// cuLaunchGrid). The first parameterCountV variadic arguments are GPUPtr
// device pointers; the remaining (totalParameterCount - parameterCountV)
// are unsigned ints. Each is appended to the kernel's parameter block at
// a suitably aligned offset.
void GPUInterface::LaunchKernel(GPUFunction deviceFunction,
                                Dim3Int block,
                                Dim3Int grid,
                                int parameterCountV,
                                int totalParameterCount,
                                ...) { // parameters
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::LaunchKernel\n");
#endif


    SAFE_CUDA(cuCtxPushCurrent(cudaContext));

    SAFE_CUDA(cuFuncSetBlockShape(deviceFunction, block.x, block.y, block.z));

    int offset = 0;
    va_list parameters;
    va_start(parameters, totalParameterCount);
    for(int i = 0; i < parameterCountV; i++) {
        void* param = (void*)(size_t)va_arg(parameters, GPUPtr);

        // adjust offset alignment requirements
        offset = (offset + __alignof(param) - 1) & ~(__alignof(param) - 1);

        // The pointer VALUE is the kernel argument, so pass its address.
        SAFE_CUDA(cuParamSetv(deviceFunction, offset, &param, sizeof(param)));

        offset += sizeof(void*);
    }
    for(int i = parameterCountV; i < totalParameterCount; i++) {
        unsigned int param = va_arg(parameters, unsigned int);

        // adjust offset alignment requirements
        offset = (offset + __alignof(param) - 1) & ~(__alignof(param) - 1);

        SAFE_CUDA(cuParamSeti(deviceFunction, offset, param));

        offset += sizeof(param);

    }

    va_end(parameters);

    // Declare the final parameter-block size, then launch the grid.
    SAFE_CUDA(cuParamSetSize(deviceFunction, offset));

    SAFE_CUDA(cuLaunchGrid(deviceFunction, grid.x, grid.y));

    SAFE_CUDA(cuCtxPopCurrent(&cudaContext));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving  GPUInterface::LaunchKernel\n");
#endif

}
コード例 #12
0
ファイル: context.c プロジェクト: PachoAlvarez/RCUDA
/*
 * R entry point: pop the current CUDA context off this thread's stack
 * and hand it back to R as an external reference of class "CUcontext".
 * On a driver error, returns the error wrapped by R_cudaErrorInfo.
 */
SEXP
R_cuCtxPopCurrent()
{
    CUcontext popped;
    CUresult status = cuCtxPopCurrent(&popped);
    if(status)
       return(R_cudaErrorInfo(status));
    return(R_createRef(popped, "CUcontext"));
}
コード例 #13
0
/*
 * Finalizes CUPTI device.
 * (Stale "@param ptid" removed — this variant takes only cleanExit.)
 *
 * @param cleanExit 1 to cleanup CUPTI event group, otherwise 0
 */
void vt_cuptievt_finalize_device(uint8_t cleanExit){
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  vt_cupti_ctx_t *vtcuptiCtx = NULL;

  vt_cntl_msg(2, "[CUPTI Events] Finalize device ... ");

  {
    CUcontext cuCtx;
    
    /* Before CUDA 4.0 there was no cuCtxGetCurrent: pop the current
       context to read its handle, then push it straight back. */
#if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
    VT_CUDRV_CALL(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
    VT_CUDRV_CALL(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
#else
    VT_CUDRV_CALL(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
#endif

    vtcuptiCtx = vt_cupti_removeCtx(&cuCtx);
    if(vtcuptiCtx == NULL) 
      return;
  }
  
  if(vtcuptiCtx->events == NULL)
    return;

  if(cleanExit && vt_gpu_debug != 0){
    /*uint64_t time = vt_pform_wtime();

    vt_cupti_resetCounter(vtcuptiCtx, 0, &time);*/

    /* stop CUPTI counter capturing */
    vt_cuptievt_stop(vtcuptiCtx->events);

    /* destroy all CUPTI event groups, which have been created */
    {
      vt_cupti_evtgrp_t *vtcuptiGrp = vtcuptiCtx->events->vtGrpList;

      while(vtcuptiGrp != NULL){
        cuptiErr = cuptiEventGroupRemoveAllEvents(vtcuptiGrp->evtGrp);
        VT_CUPTI_CALL(cuptiErr, "cuptiEventGroupRemoveAllEvents");

        cuptiErr = cuptiEventGroupDestroy(vtcuptiGrp->evtGrp);
        VT_CUPTI_CALL(cuptiErr, "cuptiEventGroupDestroy");

        vtcuptiGrp = vtcuptiGrp->next;
      }
    }
  }

  /* free VampirTrace CUPTI event context */
  vt_cuptievt_freeEventCtx(vtcuptiCtx->events);
}
コード例 #14
0
ファイル: vt_cudacupti.c プロジェクト: bringhurst/ompi
/*
 * Finalizes CUPTI device.
 * 
 * @param ptid VampirTrace process/thread id (tracing is suspended while
 *             the current CUDA context is queried)
 * @param cleanExit 1 to cleanup CUPTI event group, otherwise 0
 */
void vt_cupti_finalize_device(uint32_t ptid, uint8_t cleanExit){
  CUptiResult cuptiErr = CUPTI_SUCCESS;
  vt_cupti_ctx_t *vtcuptiCtx = NULL;

  vt_cntl_msg(2, "[CUPTI] Finalize device ... ");

  {
    CUcontext cuCtx = NULL;

    VT_SUSPEND_CUDA_TRACING(ptid);
    
    /* Before CUDA 4.0 there was no cuCtxGetCurrent: pop the current
       context to read its handle, then push it straight back. */
#if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
    CHECK_CU_ERROR(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
    CHECK_CU_ERROR(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
#else
    CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
#endif
    
    VT_RESUME_CUDA_TRACING(ptid);

    vtcuptiCtx = vt_cupti_takeCtxFromList(cuCtx);
    if(vtcuptiCtx == NULL) return;
  }

  if(cleanExit && vt_gpu_debug != 0){
    /*uint64_t time = vt_pform_wtime();

    vt_cupti_resetCounter(vtcuptiCtx, 0, &time);*/

    /* stop CUPTI counter capturing */
    vt_cupti_stop(vtcuptiCtx);

    /* destroy all CUPTI event groups, which have been created */
    {
      vt_cupti_grp_t *vtcuptiGrp = vtcuptiCtx->vtGrpList;

      while(vtcuptiGrp != NULL){
        cuptiErr = cuptiEventGroupRemoveAllEvents(vtcuptiGrp->evtGrp);
        CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupRemoveAllEvents");

        cuptiErr = cuptiEventGroupDestroy(vtcuptiGrp->evtGrp);
        CHECK_CUPTI_ERROR(cuptiErr, "cuptiEventGroupDestroy");

        vtcuptiGrp = vtcuptiGrp->next;
      }
    }
  }

  /* free VampirTrace CUPTI context */
  vt_cupti_freeCtx(vtcuptiCtx);
}
コード例 #15
0
// Launch `deviceFunction` with the given block/grid geometry using the
// modern driver API (cuLaunchKernel) on cudaStreams[0]. The first
// parameterCountV variadic arguments are GPUPtr device pointers; the
// remaining (totalParameterCount - parameterCountV) are unsigned ints.
void GPUInterface::LaunchKernel(GPUFunction deviceFunction,
                                         Dim3Int block,
                                         Dim3Int grid,
                                         int parameterCountV,
                                         int totalParameterCount,
                                         ...) { // parameters
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::LaunchKernel\n");
#endif

    SAFE_CUDA(cuCtxPushCurrent(cudaContext));

    // cuLaunchKernel takes an array of ADDRESSES of the arguments, so the
    // values must stay alive in paramPtrs/paramInts until the launch call.
    void** params;
    GPUPtr* paramPtrs;
    unsigned int* paramInts;

    // NOTE(review): malloc results are unchecked — an OOM here would
    // crash in the loops below.
    params = (void**)malloc(sizeof(void*) * totalParameterCount);
    paramPtrs = (GPUPtr*)malloc(sizeof(GPUPtr) * totalParameterCount);
    paramInts = (unsigned int*)malloc(sizeof(unsigned int) * totalParameterCount);

    va_list parameters;
    va_start(parameters, totalParameterCount);
    for(int i = 0; i < parameterCountV; i++) {
       paramPtrs[i] = (GPUPtr)(size_t)va_arg(parameters, GPUPtr);
       params[i] = (void*)&paramPtrs[i];
    }
    for(int i = parameterCountV; i < totalParameterCount; i++) {
       paramInts[i-parameterCountV] = va_arg(parameters, unsigned int);
       params[i] = (void*)&paramInts[i-parameterCountV];
    }

    va_end(parameters);

    SAFE_CUDA(cuLaunchKernel(deviceFunction, grid.x, grid.y, grid.z,
                             block.x, block.y, block.z, 0,
                             cudaStreams[0], params, NULL));

    // Safe to free once the launch is issued: the driver copies the
    // parameter values at launch time.
    free(params);
    free(paramPtrs);
    free(paramInts);

    SAFE_CUDA(cuCtxPopCurrent(&cudaContext));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving  GPUInterface::LaunchKernel\n");
#endif

}
コード例 #16
0
	// Release all previously-initialized interop frame buffers and detach
	// the CUDA context from this thread; optionally tear down the video
	// pipeline/context entirely. Always returns S_OK.
	HRESULT CudaVideoRender::cleanup(bool bDestroyContext)
	{
		// Make the CUDA context current so device memory can be freed.
		cutilDrvSafeCallNoSync( cuCtxPushCurrent(m_cuContext) );

		for (int field = 0; field < 2; ++field) {
			if (m_pInteropFrame[field]) {
				cutilDrvSafeCallNoSync( cuMemFree(m_pInteropFrame[field]) );
			}
		}

		// Detach from the current thread.
		cutilDrvSafeCallNoSync( cuCtxPopCurrent(NULL) );
		terminateCudaVideo(bDestroyContext);
		return S_OK;
	}
コード例 #17
0
ファイル: SurfaceInteropCUDA.cpp プロジェクト: ntibor22/QtAV
// Ensure the D3D9/CUDA interop chain for the single NV12 texture exists
// and matches the requested geometry; (re)creates the CUDA context, the
// D3D9 texture/surface and the CUDA graphics registration as needed.
// NOTE(review): W/H appear to be padded (allocated) sizes while w/h are
// the visible ones — confirm against callers.
bool EGLInteropResource::ensureD3D9CUDA(int w, int h, int W, int H)
{
    TexRes &r = res[0];// 1 NV12 texture
    // Fast path: cached registration still matches the requested geometry.
    if (r.w == w && r.h == h && r.W == W && r.H == H && r.cuRes)
        return true;
    if (!ctx) {
        // TODO: how to use pop/push decoder's context without the context in opengl context
        if (!ensureD3DDevice())
            return false;
        // CUdevice is different from decoder's
        CUDA_ENSURE(cuD3D9CtxCreate(&ctx, &dev, CU_CTX_SCHED_BLOCKING_SYNC, device9), false);
#if USE_STREAM
        CUDA_WARN(cuStreamCreate(&res[0].stream, CU_STREAM_DEFAULT));
        CUDA_WARN(cuStreamCreate(&res[1].stream, CU_STREAM_DEFAULT));
#endif //USE_STREAM
        qDebug("cuda contex on gl thread: %p", ctx);
        CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // TODO: why cuMemcpy2D need this
    }
    // Drop any stale registration before (re)creating resources.
    if (r.cuRes) {
        CUDA_ENSURE(cuGraphicsUnregisterResource(r.cuRes), false);
        r.cuRes = NULL;
    }

    // create d3d resource for interop
    if (!surface9_nv12) {
        // TODO: need pitch from cuvid to ensure cuMemcpy2D can copy the whole pitch
        // An L8 texture of height H*3/2 packs the NV12 luma + chroma planes.
        DX_ENSURE(device9->CreateTexture(W
                                         //, H
                                         , H*3/2
                                         , 1
                                         , D3DUSAGE_DYNAMIC //D3DUSAGE_DYNAMIC is lockable // 0 is from NV example. cudaD3D9.h says The primary rendertarget may not be registered with CUDA. So can not be D3DUSAGE_RENDERTARGET?
                                         //, D3DUSAGE_RENDERTARGET
                                         , D3DFMT_L8
                                         //, (D3DFORMAT)MAKEFOURCC('N','V','1','2') // can not create nv12. use 2 textures L8+A8L8?
                                         , D3DPOOL_DEFAULT // must be D3DPOOL_DEFAULT for cuda?
                                         , &texture9_nv12
                                         , NULL) // - Resources allocated as shared may not be registered with CUDA.
                  , false);
        DX_ENSURE(device9->CreateOffscreenPlainSurface(W, H, (D3DFORMAT)MAKEFOURCC('N','V','1','2'), D3DPOOL_DEFAULT, &surface9_nv12, NULL), false);
    }

    // TODO: cudaD3D9.h says NV12 is not supported
    // CUDA_ERROR_INVALID_HANDLE if register D3D9 surface
    // TODO: why flag CU_GRAPHICS_REGISTER_FLAGS_WRITE_DISCARD is invalid while it's fine for opengl
    CUDA_ENSURE(cuGraphicsD3D9RegisterResource(&r.cuRes, texture9_nv12, CU_GRAPHICS_REGISTER_FLAGS_NONE), false);
    return true;
}
コード例 #18
0
// Bind this interface to CUDA device `deviceNumber` (resolved through
// resourceMap), create its context, then select and load the kernel
// module matching paddedStateCount (id negated for double precision).
// Exits the process if no kernel code exists for the requested state
// count.
void GPUInterface::SetDevice(int deviceNumber, int paddedStateCount, int categoryCount, int paddedPatternCount,
                             long flags) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::SetDevice\n");
#endif

    SAFE_CUDA(cuDeviceGet(&cudaDevice, (*resourceMap)[deviceNumber]));

    // CU_CTX_MAP_HOST enables host-mapped memory, used by dynamic scaling.
    if (flags & BEAGLE_FLAG_SCALING_DYNAMIC) {
        SAFE_CUDA(cuCtxCreate(&cudaContext, CU_CTX_SCHED_AUTO | CU_CTX_MAP_HOST, cudaDevice));
    } else {
        SAFE_CUDA(cuCtxCreate(&cudaContext, CU_CTX_SCHED_AUTO, cudaDevice));
    }


    if (kernelMap == NULL) {
        // kernels have not yet been initialized; do so now.  Hopefully, this only occurs once per library load.
        InitializeKernelMap();
    }

    // Kernel id is the padded state count, negated for double precision.
    int id = paddedStateCount;
    if (flags & BEAGLE_FLAG_PRECISION_DOUBLE) {
        id *= -1;
    }

    if (kernelMap->count(id) == 0) {
        fprintf(stderr,"Critical error: unable to find kernel code for %d states.\n",paddedStateCount);
        exit(-1);
    }

    kernelResource = (*kernelMap)[id].copy();
    kernelResource->categoryCount = categoryCount;
    kernelResource->patternCount = paddedPatternCount;
    kernelResource->flags = flags;

    SAFE_CUDA(cuModuleLoadData(&cudaModule, kernelResource->kernelCode));

    // cuCtxCreate left the new context current; pop it off this thread
    // (the handle stays in cudaContext for later push/pop pairs).
    SAFE_CUDA(cuCtxPopCurrent(&cudaContext));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving  GPUInterface::SetDevice\n");
#endif

}
コード例 #19
0
ファイル: vf_scale_cuda.c プロジェクト: jfiguinha/Regards
/* Filter callback: allocate an output frame, scale `in` into it via
 * cudascale_scale() with the device's CUDA context current, fix up the
 * sample aspect ratio for the new dimensions, and forward the result
 * downstream. Frees `in` on every path; returns the ff_filter_frame
 * status on success or a negative AVERROR on failure. */
static int cudascale_filter_frame(AVFilterLink *link, AVFrame *in)
{
    AVFilterContext              *ctx = link->dst;
    CUDAScaleContext               *s = ctx->priv;
    AVFilterLink             *outlink = ctx->outputs[0];
    AVHWFramesContext     *frames_ctx = (AVHWFramesContext*)s->frames_ctx->data;
    AVCUDADeviceContext *device_hwctx = frames_ctx->device_ctx->hwctx;

    AVFrame *out = NULL;
    CUresult err;
    CUcontext dummy;
    int ret = 0;

    out = av_frame_alloc();
    if (!out) {
        ret = AVERROR(ENOMEM);
        goto fail;
    }

    /* All CUDA work must run with the device's context current. */
    err = cuCtxPushCurrent(device_hwctx->cuda_ctx);
    if (err != CUDA_SUCCESS) {
        ret = AVERROR_UNKNOWN;
        goto fail;
    }

    ret = cudascale_scale(ctx, out, in);

    /* Pop unconditionally before checking ret so the context never
       leaks onto this thread. */
    cuCtxPopCurrent(&dummy);
    if (ret < 0)
        goto fail;

    /* Rescale SAR so the display aspect ratio survives the resize. */
    av_reduce(&out->sample_aspect_ratio.num, &out->sample_aspect_ratio.den,
              (int64_t)in->sample_aspect_ratio.num * outlink->h * link->w,
              (int64_t)in->sample_aspect_ratio.den * outlink->w * link->h,
              INT_MAX);

    av_frame_free(&in);
    return ff_filter_frame(outlink, out);
fail:
    av_frame_free(&in);
    av_frame_free(&out);
    return ret;
}
コード例 #20
0
/* Return the CUdevice backing `ctx`. There is no direct query for a
 * context's device, so the context is temporarily made current (only if
 * it is not already), queried with cuCtxGetDevice, and then popped again
 * to restore the caller's state. Feels hacky, possibly subject to
 * future change. */
static CUdevice get_device_from_ctx(CUcontext ctx)
{
    CUcontext current = 0;
    CUdevice device = 0;

    cuCtxGetCurrent(&current);
    int pushed = (current != ctx);
    if (pushed) {
        cuCtxPushCurrent(ctx);
    }
    cuCtxGetDevice(&device);
    if (pushed) {
        cuCtxPopCurrent(NULL);
    }

    return device;
}
コード例 #21
0
/* Initialize a CUDA context for device ordinal `ord`, or adopt the
 * thread's ambient context when ord == -1. Scheduling policy is chosen
 * from the GA_CTX_* flags. Returns a cuda_context*, or NULL on failure
 * with the error delivered through *ret (via the FAIL/CHKFAIL macros). */
static void *cuda_init(int ord, int flags, int *ret) {
    CUdevice dev;
    CUcontext ctx;
    cuda_context *res;
    static int init_done = 0;
    unsigned int fl = CU_CTX_SCHED_AUTO;

    if (ord == -1) {
      /* Grab the ambient context */
      err = cuCtxGetCurrent(&ctx);
      CHKFAIL(NULL);
      res = cuda_make_ctx(ctx, DONTFREE);
      if (res == NULL) {
        FAIL(NULL, GA_IMPL_ERROR);
      }
      res->flags |= flags;
      return res;
    }

    if (!init_done) {
      err = cuInit(0);
      CHKFAIL(NULL);
      init_done = 1;
    }
    err = cuDeviceGet(&dev, ord);
    CHKFAIL(NULL);
    if (flags & GA_CTX_SINGLE_THREAD)
      fl = CU_CTX_SCHED_SPIN;
    if (flags & GA_CTX_MULTI_THREAD)
      fl = CU_CTX_SCHED_YIELD;
    err = cuCtxCreate(&ctx, fl, dev);
    CHKFAIL(NULL);
    res = cuda_make_ctx(ctx, 0);
    if (res == NULL) {
      cuCtxDestroy(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }
    /* Bug fix: only touch res->flags after the NULL check above — the old
       code dereferenced res before validating it (the sibling do_init()
       has the correct order). */
    res->flags |= flags;
    /* Don't leave the context on the thread stack */
    cuCtxPopCurrent(NULL);
    return res;
}
コード例 #22
0
/*
 * Retrieves the VampirTrace CUPTI context for the CUDA context associated with
 * the calling host thread. Initiates context creation, if it is not available 
 * yet.
 *
 * @param ptid the VampirTrace thread id of the calling host thread
 * 
 * @return VampirTrace CUPTI context, or NULL if no CUDA context is bound
 *         to the calling thread
 */
vt_cupti_ctx_t* vt_cuptievt_getOrCreateCurrentCtx(uint32_t ptid)
{
  CUcontext cuCtx = NULL;
  
  if(!vt_cuptievt_initialized) vt_cupti_events_init();

  /* Before CUDA 4.0 there was no cuCtxGetCurrent: pop the current context
     to read its handle, then push it straight back. */
# if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
  VT_CUDRV_CALL(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
  VT_CUDRV_CALL(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
# else
  VT_CUDRV_CALL(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
# endif
  
  if(cuCtx == NULL){
    vt_cntl_msg(2, "[CUPTI Events] No context is bound to the calling CPU thread!");
    return NULL;
  }
  
  return vt_cuptievt_getOrCreateCtx(cuCtx, ptid);
}
コード例 #23
0
ファイル: cudacompil.cpp プロジェクト: flintforge/baracuda
/*
 * returns the cubin = gpu machine code from the PTX ISA assembly;
 * sets up JIT compilation options and performs the compilation.
 * If `gpu` is non-null its context is pushed for the duration of the
 * call and popped again before returning; otherwise the ambient
 * (current) context is used and left untouched.
 */
CUmodule * CudaCompiler::compilePTX(uchar * KernelPTXDump, GPU * gpu=0)
{
    // consider the default context is in run if gpu==0
    // TODO : make it mandatory
    if(gpu!=0) CUDCHK( cuCtxPushCurrent(gpu->context) );

    // JIT options: log-buffer size, log buffer, max registers per thread.
    const unsigned int jitNumOptions = 3;
    int jitLogBufferSize = 1024;
    int jitRegCount = 32;

    CUjit_option *  jitOptions   = new CUjit_option[jitNumOptions];
    void **         jitOptVals   = new void*[jitNumOptions];
    char *          jitLogBuffer = new char[jitLogBufferSize];

    jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES;// set up size of compilation log buffer
    jitOptions[1] = CU_JIT_INFO_LOG_BUFFER;// set up pointer to the compilation log buffer
    jitOptions[2] = CU_JIT_MAX_REGISTERS;  // set up pointer to set the Maximum # of registers for a particular kernel

    jitOptVals[0] = (void *)(size_t)jitLogBufferSize;
    jitOptVals[1] = jitLogBuffer;
    jitOptVals[2] = (void *)(size_t)jitRegCount;

    // compile with set parameters
    qDebug()<<"> Compiling PTX module";
    CUmodule * cuModule = new CUmodule();

    CUDCHK( cuModuleLoadDataEx( cuModule, KernelPTXDump, jitNumOptions, jitOptions, (void **)jitOptVals) );

    qDebug()<< "PTX JIT log: \n [" <<  jitLogBuffer <<"]" ;

    delete [] jitOptions;
    delete [] jitOptVals;
    delete [] jitLogBuffer;

    // Bug fix: only pop the context if one was pushed above; the old code
    // popped unconditionally, stripping the caller's ambient context off
    // the stack whenever gpu == 0.
    if(gpu!=0) CUDCHK( cuCtxPopCurrent(0) );

    return cuModule;
}
コード例 #24
0
ファイル: videoDecodeD3D9.cpp プロジェクト: huoyao/cudasdk
// Tear down the interop frames, the D3D image wrapper, the CUDA
// resources, and finally the D3D9 device. CUDA-context-owned memory is
// only freed when bDestroyContext is set. Always returns S_OK.
HRESULT cleanup(bool bDestroyContext)
{
    if (bDestroyContext)
    {
        // Attach the CUDA context so its device allocations can be freed.
        checkCudaErrors(cuCtxPushCurrent(g_oContext));

        for (int field = 0; field < 2; ++field)
        {
            if (g_pInteropFrame[field])
            {
                checkCudaErrors(cuMemFree(g_pInteropFrame[field]));
            }
        }

        // Detach from the current thread.
        checkCudaErrors(cuCtxPopCurrent(NULL));
    }

    if (g_pImageDX)
    {
        delete g_pImageDX;
        g_pImageDX = NULL;
    }

    freeCudaResources(bDestroyContext);

    // Destroy the D3D device last.
    if (g_pD3DDevice)
    {
        g_pD3DDevice->Release();
        g_pD3DDevice = NULL;
    }


    return S_OK;
}
コード例 #25
0
ファイル: plugin-nvptx.c プロジェクト: chinabin/gcc-tiny
/* Make device n's CUDA context current on the calling host thread.
   If the thread's current context already belongs to device n this is a
   no-op; otherwise any existing current context is popped and the
   device's plugin context is pushed. Fatal on driver errors.  */
static void
nvptx_attach_host_thread_to_device (int n)
{
  CUdevice dev;
  CUresult r;
  struct ptx_device *ptx_dev;
  CUcontext thd_ctx;

  r = cuCtxGetDevice (&dev);
  /* CUDA_ERROR_INVALID_CONTEXT just means no context is current.  */
  if (r != CUDA_SUCCESS && r != CUDA_ERROR_INVALID_CONTEXT)
    GOMP_PLUGIN_fatal ("cuCtxGetDevice error: %s", cuda_error (r));

  if (r != CUDA_ERROR_INVALID_CONTEXT && dev == n)
    return;
  else
    {
      CUcontext old_ctx;

      ptx_dev = ptx_devices[n];
      assert (ptx_dev);

      r = cuCtxGetCurrent (&thd_ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxGetCurrent error: %s", cuda_error (r));

      /* We don't necessarily have a current context (e.g. if it has been
         destroyed.  Pop it if we do though.  */
      if (thd_ctx != NULL)
	{
	  r = cuCtxPopCurrent (&old_ctx);
	  if (r != CUDA_SUCCESS)
            GOMP_PLUGIN_fatal ("cuCtxPopCurrent error: %s", cuda_error (r));
	}

      r = cuCtxPushCurrent (ptx_dev->ctx);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuCtxPushCurrent error: %s", cuda_error (r));
    }
}
コード例 #26
0
ファイル: SurfaceInteropCUDA.cpp プロジェクト: ntibor22/QtAV
// Release the CUDA mapping of the plane whose GL texture is `tex`.
// Returns false when the texture does not belong to this resource or
// unmapping fails; true otherwise (including the workaround no-op path).
bool GLInteropResource::unmap(GLuint tex)
{
    // Bug fix: removed the misleading Q_UNUSED(tex) — tex is used below.
    if (WORKAROUND_UNMAP_CONTEXT_SWITCH)
        return true;
    int plane = -1;
    if (res[0].texture == tex)
        plane = 0;
    else if (res[1].texture == tex)
        plane = 1;
    else
        return false;
    // FIXME: why cuCtxPushCurrent gives CUDA_ERROR_INVALID_CONTEXT if opengl viewport changed?
    CUDA_WARN(cuCtxPushCurrent(ctx));
    CUDA_WARN(cuStreamSynchronize(res[plane].stream));
    // FIXME: need a correct context. But why we have to push context even though map/unmap are called in the same thread
    // Because the decoder switch the context in another thread so we have to switch the context back?
    // to workaround the context issue, we must pop the context that valid in map() and push it here
    CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false);
    CUDA_ENSURE(cuCtxPopCurrent(&ctx), false);
    return true;
}
コード例 #27
0
ファイル: VideoDecoderCUDA.cpp プロジェクト: NickD2039/QtAV
// Initialize the CUDA driver, pick the highest-GFLOPS device, create the
// decoding context and video context lock, and create the copy stream.
// Returns false on any failure; clears `available` if cuInit itself fails.
bool VideoDecoderCUDAPrivate::initCuda()
{
    CUresult result = cuInit(0);
    if (result != CUDA_SUCCESS) {
        available = false;
        // Bug fix: corrected the typo "faile" in the warning message.
        qWarning("cuInit(0) failed (%d)", result);
        return false;
    }
    cudev = GetMaxGflopsGraphicsDeviceId();

    int clockRate;
    cuDeviceGetAttribute(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, cudev);
    int major, minor;
    cuDeviceComputeCapability(&major, &minor, cudev);
    char devname[256];
    cuDeviceGetName(devname, 256, cudev);
    description = QString("CUDA device: %1 %2.%3 %4 MHz").arg(devname).arg(major).arg(minor).arg(clockRate/1000);

    //TODO: cuD3DCtxCreate > cuGLCtxCreate > cuCtxCreate
    checkCudaErrors(cuCtxCreate(&cuctx, CU_CTX_SCHED_BLOCKING_SYNC, cudev)); //CU_CTX_SCHED_AUTO?
    // cuCtxCreate leaves the new context current on this thread; pop it
    // so currency is managed explicitly via the ctx lock below.
    CUcontext cuCurrent = NULL;
    result = cuCtxPopCurrent(&cuCurrent);
    if (result != CUDA_SUCCESS) {
        qWarning("cuCtxPopCurrent: %d\n", result);
        return false;
    }
    checkCudaErrors(cuvidCtxLockCreate(&vid_ctx_lock, cuctx));
    {
        AutoCtxLock lock(this, vid_ctx_lock);
        Q_UNUSED(lock);
        //Flags- Parameters for stream creation (must be 0 (CU_STREAM_DEFAULT=0 in cuda5) in cuda 4.2, no CU_STREAM_NON_BLOCKING)
        checkCudaErrors(cuStreamCreate(&stream, 0));//CU_STREAM_NON_BLOCKING)); //CU_STREAM_DEFAULT
        //require compute capability >= 1.1
        //flag: Reserved for future use, must be 0
        //cuStreamAddCallback(stream, CUstreamCallback, this, 0);
    }
    return true;
}
コード例 #28
0
    // Return a device pointer on `devid` suitable for writing: all other
    // copies are invalidated, this device becomes the last writer, and the
    // buffer is allocated lazily on first use (abort on allocation failure).
    CUdeviceptr get_write_ptr_cuda(ComputeEnv *env,int devid) {
        invalidate(env);

        CUDADev *dev = &env->cuda_dev_list[devid];
        cuCtxPushCurrent(dev->context);

        if (cuda_ptr_list[devid] == 0) {
            if (cuMemAlloc(&cuda_ptr_list[devid], byte_size) != CUDA_SUCCESS) {
                abort();
            }
        }

        // Record this device as the authoritative copy.
        last_write.type = Processor::CUDA;
        last_write.devid = devid;
        cuda_valid_list[devid] = true;

        CUcontext prev;
        cuCtxPopCurrent(&prev);

        return cuda_ptr_list[devid];
    }
コード例 #29
0
ファイル: vt_cudacupti.c プロジェクト: bringhurst/ompi
/*
 * Returns the VampirTrace CUPTI context for the CUDA context associated with
 * the calling host thread.
 *
 * @param ptid the VampirTrace thread id of the calling host thread
 *
 * @return the VampirTrace CUPTI context, or NULL if no CUDA context is
 *         bound to the calling thread
 */
vt_cupti_ctx_t* vt_cupti_getCurrentContext(uint32_t ptid)
{
  CUcontext cuCtx = NULL;
  
  if(!vt_cupti_initialized) vt_cupti_init();

  VT_SUSPEND_CUDA_TRACING(ptid);

  /* Before CUDA 4.0 there was no cuCtxGetCurrent: pop the current context
     to read its handle, then push it straight back. */
# if (defined(CUDA_VERSION) && (CUDA_VERSION < 4000))
  CHECK_CU_ERROR(cuCtxPopCurrent(&cuCtx), "cuCtxPopCurrent");
  CHECK_CU_ERROR(cuCtxPushCurrent(cuCtx), "cuCtxPushCurrent");
# else
  CHECK_CU_ERROR(cuCtxGetCurrent(&cuCtx), "cuCtxGetCurrent");
# endif

  VT_RESUME_CUDA_TRACING(ptid);
  
  if(cuCtx == NULL) {
    /* Bug fix: dropped the stray cuCtx argument — the format string has
       no conversion specifier for it (the CUPTI-events sibling of this
       function omits it as well). */
    vt_cntl_msg(2, "[CUPTI] No context is bound to the calling CPU thread");
    return NULL;
  }
  
  return vt_cupti_getCtx(cuCtx, ptid);
}
コード例 #30
0
ファイル: cuda.cpp プロジェクト: bnascimento/Halide
    // Undo the context push performed when this guard was constructed,
    // then release our reference on the shared CUDA context.
    ~CudaContext() {
        CUcontext popped;
        cuCtxPopCurrent(&popped);

        halide_release_cuda_context(user_context);
    }