// Launch 'deviceFunction' with a variadic argument list.
// The first parameterCountV variadic arguments are device pointers (GPUPtr);
// the remaining (totalParameterCount - parameterCountV) are unsigned ints.
// streamIndex >= 0 selects a stream (index mod numStreams) and records an
// event on it after the launch; waitIndex >= 0 additionally makes that stream
// wait on the event previously recorded at (waitIndex mod numStreams).
// streamIndex < 0 launches on stream 0 with no event bookkeeping.
void GPUInterface::LaunchKernelConcurrent(GPUFunction deviceFunction, Dim3Int block, Dim3Int grid, int streamIndex, int waitIndex, int parameterCountV, int totalParameterCount, ...) { // parameters
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::LaunchKernelConcurrent\n");
#endif

    SAFE_CUDA(cuCtxPushCurrent(cudaContext));

    // cuLaunchKernel takes an array of void* where each element points at the
    // storage holding one kernel argument; paramPtrs/paramInts keep the actual
    // argument values alive until the launch call has consumed them.
    void** params;
    GPUPtr* paramPtrs;
    unsigned int* paramInts;
    params = (void**)malloc(sizeof(void*) * totalParameterCount);
    paramPtrs = (GPUPtr*)malloc(sizeof(GPUPtr) * totalParameterCount);
    paramInts = (unsigned int*)malloc(sizeof(unsigned int) * totalParameterCount);
    // NOTE(review): malloc results are unchecked; on allocation failure the
    // loops below dereference NULL.

    va_list parameters;
    va_start(parameters, totalParameterCount);
    // First the pointer-valued kernel arguments...
    for(int i = 0; i < parameterCountV; i++) {
        paramPtrs[i] = (GPUPtr)(size_t)va_arg(parameters, GPUPtr);
        params[i] = (void*)&paramPtrs[i];
    }
    // ...then the unsigned-int kernel arguments.
    for(int i = parameterCountV; i < totalParameterCount; i++) {
        paramInts[i-parameterCountV] = va_arg(parameters, unsigned int);
        params[i] = (void*)&paramInts[i-parameterCountV];
    }
    va_end(parameters);

    if (streamIndex >= 0) {
        // Map the logical indices onto the fixed pool of streams/events.
        int streamIndexMod = streamIndex % numStreams;
        if (waitIndex >= 0) {
            int waitIndexMod = waitIndex % numStreams;
            // Order this launch after the event recorded by an earlier launch.
            SAFE_CUDA(cuStreamWaitEvent(cudaStreams[streamIndexMod], cudaEvents[waitIndexMod], 0));
        }
        SAFE_CUDA(cuLaunchKernel(deviceFunction, grid.x, grid.y, grid.z, block.x, block.y, block.z, 0, cudaStreams[streamIndexMod], params, NULL));
        // Record completion so subsequent launches can wait on this one.
        SAFE_CUDA(cuEventRecord(cudaEvents[streamIndexMod], cudaStreams[streamIndexMod]));
    } else {
        SAFE_CUDA(cuLaunchKernel(deviceFunction, grid.x, grid.y, grid.z, block.x, block.y, block.z, 0, cudaStreams[0], params, NULL));
    }

    // Freeing immediately is safe: cuLaunchKernel captures the parameter data
    // before it returns, even though the kernel runs asynchronously.
    free(params);
    free(paramPtrs);
    free(paramInts);

    SAFE_CUDA(cuCtxPopCurrent(&cudaContext));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving GPUInterface::LaunchKernelConcurrent\n");
#endif
}
/* Recover failed data buffers on the GPU.
 *
 * buffers      - the n surviving data buffers, followed by space for the
 *                recover_last buffers being reconstructed
 * buf_size     - size in bytes of each buffer
 * buf_ids      - stripe indices of the buffers present in 'buffers';
 *                entries [n, n+recover_last) name the buffers to reconstruct
 *                and must refer to data buffers (id < n), never parity
 * recover_last - number of buffers to reconstruct
 *
 * Returns GIB_SUC on success, GIB_ERR when asked to recover a parity buffer.
 */
int gib_recover ( void *buffers, int buf_size, int *buf_ids, int recover_last,
		  gib_context c ) {
  ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));
#if !GIB_USE_MMAP
  /* Too large for the preallocated GPU staging buffer: fall back to the
   * CPU implementation. */
  if (buf_size > gib_buf_size) {
    int rc = gib_cpu_recover(buffers, buf_size, buf_ids, recover_last, c);
    ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
    return rc;
  }
#endif
  int i, j;
  int n = c->n;
  int m = c->m;
  unsigned char A[128*128], inv[128*128], modA[128*128];
  for (i = n; i < n+recover_last; i++)
    if (buf_ids[i] >= n) {
      fprintf(stderr, "Attempting to recover a parity buffer, not allowed\n");
      /* Fix: pop the context pushed above before bailing out; previously this
       * error path returned with the context still on the stack. */
      ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
      return GIB_ERR;
    }
  gib_galois_gen_A(A, m+n, n);

  /* Modify the matrix to have the failed drives reflected */
  for (i = 0; i < n; i++)
    for (j = 0; j < n; j++)
      modA[i*n+j] = A[buf_ids[i]*n+j];

  gib_galois_gaussian_elim(modA, inv, n, n);

  /* Copy row buf_ids[i] into row i */
  for (i = n; i < n+recover_last; i++)
    for (j = 0; j < n; j++)
      modA[i*n+j] = inv[buf_ids[i]*n+j];

  int nthreads_per_block = 128;
  int fetch_size = sizeof(int)*nthreads_per_block;
  int nblocks = (buf_size + fetch_size - 1)/fetch_size;
  gpu_context gpu_c = (gpu_context) c->acc_context;

  /* Upload the recovery coefficient rows to the device constant F_d. */
  CUdeviceptr F_d;
  ERROR_CHECK_FAIL(cuModuleGetGlobal(&F_d, NULL, gpu_c->module, "F_d"));
  ERROR_CHECK_FAIL(cuMemcpyHtoD(F_d, modA+n*n, (c->m)*(c->n)));
#if !GIB_USE_MMAP
  /* Stage the surviving data buffers in device memory. */
  ERROR_CHECK_FAIL(cuMemcpyHtoD(gpu_c->buffers, buffers, (c->n)*buf_size));
#endif
  /* Configure and launch the recovery kernel (legacy driver-API style). */
  ERROR_CHECK_FAIL(cuFuncSetBlockShape(gpu_c->recover, nthreads_per_block,
				       1, 1));
  int offset = 0;
  void *ptr;
#if GIB_USE_MMAP
  /* Zero-copy path: the kernel reads the host buffers directly. */
  CUdeviceptr cpu_buffers;
  ERROR_CHECK_FAIL(cuMemHostGetDevicePointer(&cpu_buffers, buffers, 0));
  ptr = (void *)cpu_buffers;
#else
  ptr = (void *)gpu_c->buffers;
#endif
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &ptr, sizeof(ptr)));
  offset += sizeof(ptr);
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &buf_size,
			       sizeof(buf_size)));
  offset += sizeof(buf_size);
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->recover, offset, &recover_last,
			       sizeof(recover_last)));
  offset += sizeof(recover_last);
  ERROR_CHECK_FAIL(cuParamSetSize(gpu_c->recover, offset));
  ERROR_CHECK_FAIL(cuLaunchGrid(gpu_c->recover, nblocks, 1));

  /* Get the results back */
#if !GIB_USE_MMAP
  CUdeviceptr tmp_d = gpu_c->buffers + c->n*buf_size;
  void *tmp_h = (void *)((unsigned char *)(buffers) + c->n*buf_size);
  ERROR_CHECK_FAIL(cuMemcpyDtoH(tmp_h, tmp_d, recover_last*buf_size));
#else
  /* Fix: check the synchronize result, consistent with gib_generate. */
  ERROR_CHECK_FAIL(cuCtxSynchronize());
#endif
  ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
  return GIB_SUC;
}
// Run the Cuda part of the computation bool CudaVideoRender::copyDecodedFrameToTexture(unsigned int &nRepeats, int bUseInterop, int *pbIsProgressive) { CUVIDPARSERDISPINFO oDisplayInfo; if (m_pFrameQueue->dequeue(&oDisplayInfo)) { CCtxAutoLock lck ( m_CtxLock ); // Push the current CUDA context (only if we are using CUDA decoding path) CUresult result = cuCtxPushCurrent(m_cuContext); CUdeviceptr pDecodedFrame[2] = { 0, 0 }; CUdeviceptr pInteropFrame[2] = { 0, 0 }; int num_fields = (oDisplayInfo.progressive_frame ? (1) : (2+oDisplayInfo.repeat_first_field)); *pbIsProgressive = oDisplayInfo.progressive_frame; m_bIsProgressive = oDisplayInfo.progressive_frame ? true : false; for (int active_field=0; active_field<num_fields; active_field++) { nRepeats = oDisplayInfo.repeat_first_field; CUVIDPROCPARAMS oVideoProcessingParameters; memset(&oVideoProcessingParameters, 0, sizeof(CUVIDPROCPARAMS)); oVideoProcessingParameters.progressive_frame = oDisplayInfo.progressive_frame; oVideoProcessingParameters.second_field = active_field; oVideoProcessingParameters.top_field_first = oDisplayInfo.top_field_first; oVideoProcessingParameters.unpaired_field = (num_fields == 1); unsigned int nDecodedPitch = 0; unsigned int nWidth = 0; unsigned int nHeight = 0; // map decoded video frame to CUDA surfae m_pVideoDecoder->mapFrame(oDisplayInfo.picture_index, (unsigned int*)&pDecodedFrame[active_field], &nDecodedPitch, &oVideoProcessingParameters); nWidth = m_pVideoDecoder->targetWidth(); nHeight = m_pVideoDecoder->targetHeight(); // map DirectX texture to CUDA surface unsigned int nTexturePitch = 0; // If we are Encoding and this is the 1st Frame, we make sure we allocate system memory for readbacks if (m_bReadback && m_bFirstFrame && m_ReadbackSID) { CUresult result; cutilDrvSafeCallNoSync( result = cuMemAllocHost( (void **)&m_bFrameData[0], (nDecodedPitch * nHeight * 3 / 2) ) ); cutilDrvSafeCallNoSync( result = cuMemAllocHost( (void **)&m_bFrameData[1], (nDecodedPitch * nHeight * 3 / 2) ) 
); m_bFirstFrame = false; if (result != CUDA_SUCCESS) { printf("cuMemAllocHost returned %d\n", (int)result); } } // If streams are enabled, we can perform the readback to the host while the kernel is executing if (m_bReadback && m_ReadbackSID) { //TODO: test if &m_bFrameData[active_field] is the correct void* CUresult result = cuMemcpyDtoHAsync(&m_bFrameData[active_field], pDecodedFrame[active_field], (nDecodedPitch * nHeight * 3 / 2), m_ReadbackSID); if (result != CUDA_SUCCESS) { printf("cuMemAllocHost returned %d\n", (int)result); } } #if ENABLE_DEBUG_OUT printf("%s = %02d, PicIndex = %02d, OutputPTS = %08d\n", (oDisplayInfo.progressive_frame ? "Frame" : "Field"), m_nDecodeFrameCount, oDisplayInfo.picture_index, oDisplayInfo.timestamp); #endif if (true) { // map the texture surface //m_pImageDX->map(&pInteropFrame[active_field], &nTexturePitch, active_field); //TODO: map interop frames to d3d9surface map(&pInteropFrame[active_field], &nTexturePitch, active_field); } else { pInteropFrame[active_field] = m_pInteropFrame[active_field]; nTexturePitch = m_pVideoDecoder->targetWidth() * 2; } // perform post processing on the CUDA surface (performs colors space conversion and post processing) // comment this out if we inclue the line of code seen above cudaPostProcessFrame(&pDecodedFrame[active_field], nDecodedPitch, &pInteropFrame[active_field], nTexturePitch, m_pCudaModule->getModule(), m_fpNV12toARGB, m_KernelSID); if (true) { // unmap the texture surface //m_pImageDX->unmap(active_field); //TODO: map interop frames to d3d9surface unmap(active_field); } // unmap video frame // unmapFrame() synchronizes with the VideoDecode API (ensures the frame has finished decoding) m_pVideoDecoder->unmapFrame((unsigned int*)&pDecodedFrame[active_field]); // release the frame, so it can be re-used in decoder m_pFrameQueue->releaseFrame(&oDisplayInfo); m_nDecodeFrameCount++; } // Detach from the Current thread cutilDrvSafeCallNoSync( cuCtxPopCurrent(NULL) ); } else { return false; } 
// check if decoding has come to an end. // if yes, signal the app to shut down. if (!m_pVideoSource->isStarted() || m_pFrameQueue->isEndOfDecode()) { // Let's free the Frame Data if (m_ReadbackSID && m_bFrameData) { cuMemFreeHost((void *)m_bFrameData[0]); cuMemFreeHost((void *)m_bFrameData[1]); m_bFrameData[0] = NULL; m_bFrameData[1] = NULL; } // Let's just stop, and allow the user to quit, so they can at least see the results m_pVideoSource->stop(); // If we want to loop reload the video file and restart if (m_bLoop && !m_bAutoQuit) { reinitCudaResources(); m_nFrameCount = 0; m_nDecodeFrameCount = 0; m_pVideoSource->start(); } if (m_bAutoQuit) { m_bDone = true; } } return true; }
/* Generate the m parity buffers for n data buffers on the GPU.
 *
 * buffers  - n data buffers followed by space for m parity buffers,
 *            each buf_size bytes
 * buf_size - size in bytes of each buffer
 *
 * Returns GIB_SUC on success (ERROR_CHECK_FAIL handles CUDA failures). */
int gib_generate ( void *buffers, int buf_size, gib_context c ) {
  ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));
  /* Do it all at once if the buffers are small enough */
#if !GIB_USE_MMAP
  /* This is too large to do at once in the GPU memory we have allocated.
   * Split it into several noncontiguous jobs. */
  if (buf_size > gib_buf_size) {
    int rc = gib_generate_nc(buffers, buf_size, buf_size, c);
    ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
    return rc;
  }
#endif

  int nthreads_per_block = 128;
  /* Each thread fetches an int at a time, so a block covers
   * fetch_size bytes of each buffer per pass. */
  int fetch_size = sizeof(int)*nthreads_per_block;
  int nblocks = (buf_size + fetch_size - 1)/fetch_size;
  gpu_context gpu_c = (gpu_context) c->acc_context;

  /* Build the Reed-Solomon coefficient matrix and upload it to the
   * device constant F_d. */
  unsigned char F[256*256];
  gib_galois_gen_F(F, c->m, c->n);
  CUdeviceptr F_d;
  ERROR_CHECK_FAIL(cuModuleGetGlobal(&F_d, NULL, gpu_c->module, "F_d"));
  ERROR_CHECK_FAIL(cuMemcpyHtoD(F_d, F, (c->m)*(c->n)));

#if !GIB_USE_MMAP
  /* Copy the buffers to memory */
  ERROR_CHECK_FAIL(cuMemcpyHtoD(gpu_c->buffers, buffers, (c->n)*buf_size));
#endif
  /* Configure and launch */
  ERROR_CHECK_FAIL(cuFuncSetBlockShape(gpu_c->checksum, nthreads_per_block,
				       1, 1));
  /* Legacy driver-API argument passing: pack each argument at its byte
   * offset, then set the total size before launching. */
  int offset = 0;
  void *ptr;
#if GIB_USE_MMAP
  /* Zero-copy path: the kernel reads/writes the host buffers directly. */
  CUdeviceptr cpu_buffers;
  ERROR_CHECK_FAIL(cuMemHostGetDevicePointer(&cpu_buffers, buffers, 0));
  ptr = (void *)cpu_buffers;
#else
  ptr = (void *)(gpu_c->buffers);
#endif
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->checksum, offset, &ptr, sizeof(ptr)));
  offset += sizeof(ptr);
  ERROR_CHECK_FAIL(cuParamSetv(gpu_c->checksum, offset, &buf_size,
			       sizeof(buf_size)));
  offset += sizeof(buf_size);
  ERROR_CHECK_FAIL(cuParamSetSize(gpu_c->checksum, offset));
  ERROR_CHECK_FAIL(cuLaunchGrid(gpu_c->checksum, nblocks, 1));

  /* Get the results back */
#if !GIB_USE_MMAP
  /* Parity lives after the n data buffers on both device and host. */
  CUdeviceptr tmp_d = gpu_c->buffers + c->n*buf_size;
  void *tmp_h = (void *)((unsigned char *)(buffers) + c->n*buf_size);
  ERROR_CHECK_FAIL(cuMemcpyDtoH(tmp_h, tmp_d, (c->m)*buf_size));
#else
  /* Zero-copy writes land in host memory; just wait for the kernel. */
  ERROR_CHECK_FAIL(cuCtxSynchronize());
#endif
  ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
  return GIB_SUC;
}
/* Release a pinned host buffer previously allocated for this context
 * (cuMemFreeHost requires the owning context to be current, hence the
 * push/pop around the free). Returns GIB_SUC on success. */
int gib_free ( void *buffers, gib_context c ) {
  ERROR_CHECK_FAIL(cuCtxPushCurrent(((gpu_context)(c->acc_context))->pCtx));
  ERROR_CHECK_FAIL(cuMemFreeHost(buffers));
  ERROR_CHECK_FAIL(cuCtxPopCurrent(&((gpu_context)(c->acc_context))->pCtx));
  return GIB_SUC;
}
/**
 * Configure the scale_cuda output link: load the PTX module, resolve the
 * per-format kernels and texture references, evaluate the output dimensions
 * and set up the processing chain.
 *
 * Returns 0 on success, a negative AVERROR code on failure.
 */
static av_cold int cudascale_config_props(AVFilterLink *outlink)
{
    AVFilterContext *ctx = outlink->src;
    AVFilterLink *inlink = outlink->src->inputs[0];
    CUDAScaleContext *s  = ctx->priv;
    AVHWFramesContext     *frames_ctx   = (AVHWFramesContext*)inlink->hw_frames_ctx->data;
    AVCUDADeviceContext   *device_hwctx = frames_ctx->device_ctx->hwctx;
    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
    int w, h;
    int ret;

    extern char vf_scale_cuda_ptx[];

    ret = CHECK_CU(cuCtxPushCurrent(cuda_ctx));
    if (ret < 0)
        goto fail;

    // NOTE(review): if a call between here and the pop fails, the context is
    // left pushed on this thread's stack — pre-existing behavior, unchanged.
    ret = CHECK_CU(cuModuleLoadData(&s->cu_module, vf_scale_cuda_ptx));
    if (ret < 0)
        goto fail;

    // Resolve one bilinear subsampling kernel per supported pixel format.
    // NOTE(review): these CHECK_CU results are logged but not propagated;
    // a missing symbol only surfaces when the kernel is first used.
    CHECK_CU(cuModuleGetFunction(&s->cu_func_uchar,   s->cu_module, "Subsample_Bilinear_uchar"));
    CHECK_CU(cuModuleGetFunction(&s->cu_func_uchar2,  s->cu_module, "Subsample_Bilinear_uchar2"));
    CHECK_CU(cuModuleGetFunction(&s->cu_func_uchar4,  s->cu_module, "Subsample_Bilinear_uchar4"));
    CHECK_CU(cuModuleGetFunction(&s->cu_func_ushort,  s->cu_module, "Subsample_Bilinear_ushort"));
    CHECK_CU(cuModuleGetFunction(&s->cu_func_ushort2, s->cu_module, "Subsample_Bilinear_ushort2"));
    CHECK_CU(cuModuleGetFunction(&s->cu_func_ushort4, s->cu_module, "Subsample_Bilinear_ushort4"));

    CHECK_CU(cuModuleGetTexRef(&s->cu_tex_uchar,   s->cu_module, "uchar_tex"));
    CHECK_CU(cuModuleGetTexRef(&s->cu_tex_uchar2,  s->cu_module, "uchar2_tex"));
    CHECK_CU(cuModuleGetTexRef(&s->cu_tex_uchar4,  s->cu_module, "uchar4_tex"));
    CHECK_CU(cuModuleGetTexRef(&s->cu_tex_ushort,  s->cu_module, "ushort_tex"));
    CHECK_CU(cuModuleGetTexRef(&s->cu_tex_ushort2, s->cu_module, "ushort2_tex"));
    CHECK_CU(cuModuleGetTexRef(&s->cu_tex_ushort4, s->cu_module, "ushort4_tex"));

    // Read texels as integers (no normalization) with linear filtering.
    CHECK_CU(cuTexRefSetFlags(s->cu_tex_uchar,   CU_TRSF_READ_AS_INTEGER));
    CHECK_CU(cuTexRefSetFlags(s->cu_tex_uchar2,  CU_TRSF_READ_AS_INTEGER));
    CHECK_CU(cuTexRefSetFlags(s->cu_tex_uchar4,  CU_TRSF_READ_AS_INTEGER));
    CHECK_CU(cuTexRefSetFlags(s->cu_tex_ushort,  CU_TRSF_READ_AS_INTEGER));
    CHECK_CU(cuTexRefSetFlags(s->cu_tex_ushort2, CU_TRSF_READ_AS_INTEGER));
    CHECK_CU(cuTexRefSetFlags(s->cu_tex_ushort4, CU_TRSF_READ_AS_INTEGER));

    CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_uchar,   CU_TR_FILTER_MODE_LINEAR));
    CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_uchar2,  CU_TR_FILTER_MODE_LINEAR));
    CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_uchar4,  CU_TR_FILTER_MODE_LINEAR));
    CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_ushort,  CU_TR_FILTER_MODE_LINEAR));
    CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_ushort2, CU_TR_FILTER_MODE_LINEAR));
    CHECK_CU(cuTexRefSetFilterMode(s->cu_tex_ushort4, CU_TR_FILTER_MODE_LINEAR));

    CHECK_CU(cuCtxPopCurrent(&dummy));

    if ((ret = ff_scale_eval_dimensions(s,
                                        s->w_expr, s->h_expr,
                                        inlink, outlink,
                                        &w, &h)) < 0)
        goto fail;

    if (((int64_t)h * inlink->w) > INT_MAX  ||
        ((int64_t)w * inlink->h) > INT_MAX) {
        av_log(ctx, AV_LOG_ERROR, "Rescaled value for width or height is too big.\n");
        // Fix: previously this was only logged and configuration continued
        // with overflowing dimensions; fail the configuration instead.
        ret = AVERROR(EINVAL);
        goto fail;
    }

    outlink->w = w;
    outlink->h = h;

    ret = init_processing_chain(ctx, inlink->w, inlink->h, w, h);
    if (ret < 0)
        return ret;

    av_log(ctx, AV_LOG_VERBOSE, "w:%d h:%d -> w:%d h:%d\n",
           inlink->w, inlink->h, outlink->w, outlink->h);

    if (inlink->sample_aspect_ratio.num) {
        outlink->sample_aspect_ratio = av_mul_q((AVRational){outlink->h*inlink->w,
                                                             outlink->w*inlink->h},
                                                inlink->sample_aspect_ratio);
    } else {
        outlink->sample_aspect_ratio = inlink->sample_aspect_ratio;
    }

    return 0;

fail:
    return ret;
}
/* Enter a CUDA-calling region for 'ctx'.
 * On the first (outermost) entry the context is pushed onto the calling
 * thread's context stack; nested entries only increment the depth counter,
 * which the matching exit path decrements. */
void cuda_enter(cuda_context *ctx) {
  ASSERT_CTX(ctx);
  if (ctx->enter == 0)
    cuCtxPushCurrent(ctx->ctx);
  ctx->enter += 1;
}
static int cuvid_decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *avpkt) { CuvidContext *ctx = avctx->priv_data; AVHWDeviceContext *device_ctx = (AVHWDeviceContext*)ctx->hwdevice->data; AVCUDADeviceContext *device_hwctx = device_ctx->hwctx; CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx; AVFrame *frame = data; CUVIDSOURCEDATAPACKET cupkt; AVPacket filter_packet = { 0 }; AVPacket filtered_packet = { 0 }; CUdeviceptr mapped_frame = 0; int ret = 0, eret = 0; if (ctx->bsf && avpkt->size) { if ((ret = av_packet_ref(&filter_packet, avpkt)) < 0) { av_log(avctx, AV_LOG_ERROR, "av_packet_ref failed\n"); return ret; } if ((ret = av_bsf_send_packet(ctx->bsf, &filter_packet)) < 0) { av_log(avctx, AV_LOG_ERROR, "av_bsf_send_packet failed\n"); av_packet_unref(&filter_packet); return ret; } if ((ret = av_bsf_receive_packet(ctx->bsf, &filtered_packet)) < 0) { av_log(avctx, AV_LOG_ERROR, "av_bsf_receive_packet failed\n"); return ret; } avpkt = &filtered_packet; } ret = CHECK_CU(cuCtxPushCurrent(cuda_ctx)); if (ret < 0) { av_packet_unref(&filtered_packet); return ret; } memset(&cupkt, 0, sizeof(cupkt)); if (avpkt->size) { cupkt.payload_size = avpkt->size; cupkt.payload = avpkt->data; if (avpkt->pts != AV_NOPTS_VALUE) { cupkt.flags = CUVID_PKT_TIMESTAMP; if (avctx->pkt_timebase.num && avctx->pkt_timebase.den) cupkt.timestamp = av_rescale_q(avpkt->pts, avctx->pkt_timebase, (AVRational){1, 10000000}); else cupkt.timestamp = avpkt->pts; } } else { cupkt.flags = CUVID_PKT_ENDOFSTREAM; } ret = CHECK_CU(cuvidParseVideoData(ctx->cuparser, &cupkt)); av_packet_unref(&filtered_packet); if (ret < 0) { goto error; } // cuvidParseVideoData doesn't return an error just because stuff failed... 
if (ctx->internal_error) { av_log(avctx, AV_LOG_ERROR, "cuvid decode callback error\n"); ret = ctx->internal_error; goto error; } if (av_fifo_size(ctx->frame_queue)) { CUVIDPARSERDISPINFO dispinfo; CUVIDPROCPARAMS params; unsigned int pitch = 0; int offset = 0; int i; av_fifo_generic_read(ctx->frame_queue, &dispinfo, sizeof(CUVIDPARSERDISPINFO), NULL); memset(¶ms, 0, sizeof(params)); params.progressive_frame = dispinfo.progressive_frame; params.second_field = 0; params.top_field_first = dispinfo.top_field_first; ret = CHECK_CU(cuvidMapVideoFrame(ctx->cudecoder, dispinfo.picture_index, &mapped_frame, &pitch, ¶ms)); if (ret < 0) goto error; if (avctx->pix_fmt == AV_PIX_FMT_CUDA) { ret = av_hwframe_get_buffer(ctx->hwframe, frame, 0); if (ret < 0) { av_log(avctx, AV_LOG_ERROR, "av_hwframe_get_buffer failed\n"); goto error; } ret = ff_decode_frame_props(avctx, frame); if (ret < 0) { av_log(avctx, AV_LOG_ERROR, "ff_decode_frame_props failed\n"); goto error; } for (i = 0; i < 2; i++) { CUDA_MEMCPY2D cpy = { .srcMemoryType = CU_MEMORYTYPE_DEVICE, .dstMemoryType = CU_MEMORYTYPE_DEVICE, .srcDevice = mapped_frame, .dstDevice = (CUdeviceptr)frame->data[i], .srcPitch = pitch, .dstPitch = frame->linesize[i], .srcY = offset, .WidthInBytes = FFMIN(pitch, frame->linesize[i]), .Height = avctx->coded_height >> (i ? 1 : 0), }; ret = CHECK_CU(cuMemcpy2D(&cpy)); if (ret < 0) goto error; offset += avctx->coded_height; } } else if (avctx->pix_fmt == AV_PIX_FMT_NV12) {