// YV12/IYUV are both 4:2:0 planar formats (12bpc) // Luma, U, V chroma planar (12bpc), chroma is subsampled (w/2,h/2) void VideoEncoder::CopyYV12orIYUVFrame(NVVE_EncodeFrameParams &sFrameParams, CUdeviceptr dptr_VideoFrame, CUvideoctxlock ctxLock) { // Source is YV12/IYUV, this native format is converted to NV12 format by the video encoder // (1) luma copy setup CUDA_MEMCPY2D stCopyLuma; memset((void *)&stCopyLuma, 0, sizeof(stCopyLuma)); stCopyLuma.srcXInBytes = 0; stCopyLuma.srcY = 0; stCopyLuma.srcMemoryType = CU_MEMORYTYPE_HOST; stCopyLuma.srcHost = sFrameParams.picBuf; stCopyLuma.srcDevice = 0; stCopyLuma.srcArray = 0; stCopyLuma.srcPitch = sFrameParams.Width; stCopyLuma.dstXInBytes = 0; stCopyLuma.dstY = 0; stCopyLuma.dstMemoryType = CU_MEMORYTYPE_DEVICE; stCopyLuma.dstHost = 0; stCopyLuma.dstDevice = dptr_VideoFrame; stCopyLuma.dstArray = 0; stCopyLuma.dstPitch = m_pEncoderParams->nDeviceMemPitch; stCopyLuma.WidthInBytes = m_pEncoderParams->iInputSize[0]; stCopyLuma.Height = m_pEncoderParams->iInputSize[1]; // (2) chroma copy setup, U/V can be done together CUDA_MEMCPY2D stCopyChroma; memset((void *)&stCopyChroma, 0, sizeof(stCopyChroma)); stCopyChroma.srcXInBytes = 0; stCopyChroma.srcY = m_pEncoderParams->iInputSize[1]<<1; // U/V chroma offset stCopyChroma.srcMemoryType = CU_MEMORYTYPE_HOST; stCopyChroma.srcHost = sFrameParams.picBuf; stCopyChroma.srcDevice = 0; stCopyChroma.srcArray = 0; stCopyChroma.srcPitch = sFrameParams.Width>>1; // chroma is subsampled by 2 (but it has U/V are next to each other) stCopyChroma.dstXInBytes = 0; stCopyChroma.dstY = m_pEncoderParams->iInputSize[1]<<1; // chroma offset (srcY*srcPitch now points to the chroma planes) stCopyChroma.dstMemoryType = CU_MEMORYTYPE_DEVICE; stCopyChroma.dstHost = 0; stCopyChroma.dstDevice = dptr_VideoFrame; stCopyChroma.dstArray = 0; stCopyChroma.dstPitch = m_pEncoderParams->nDeviceMemPitch>>1; stCopyChroma.WidthInBytes = m_pEncoderParams->iInputSize[0]>>1; stCopyChroma.Height = 
m_pEncoderParams->iInputSize[1]; // U/V are sent together // Don't forget we need to lock/unlock between memcopies checkCudaErrors(cuvidCtxLock(ctxLock, 0)); checkCudaErrors(cuMemcpy2D(&stCopyLuma)); // Now DMA Luma checkCudaErrors(cuMemcpy2D(&stCopyChroma)); // Now DMA Chroma channels (UV side by side) checkCudaErrors(cuvidCtxUnlock(ctxLock, 0)); }
// UYVY/YUY2 are both 4:2:2 formats (16bpc) // Luma, U, V are interleaved, chroma is subsampled (w/2,h) void VideoEncoder::CopyUYVYorYUY2Frame(NVVE_EncodeFrameParams &sFrameParams, CUdeviceptr dptr_VideoFrame, CUvideoctxlock ctxLock) { // Source is YUVY/YUY2 4:2:2, the YUV data in a packed and interleaved // YUV Copy setup CUDA_MEMCPY2D stCopyYUV422; memset((void *)&stCopyYUV422, 0, sizeof(stCopyYUV422)); stCopyYUV422.srcXInBytes = 0; stCopyYUV422.srcY = 0; stCopyYUV422.srcMemoryType = CU_MEMORYTYPE_HOST; stCopyYUV422.srcHost = sFrameParams.picBuf; stCopyYUV422.srcDevice = 0; stCopyYUV422.srcArray = 0; stCopyYUV422.srcPitch = sFrameParams.Width * 2; stCopyYUV422.dstXInBytes = 0; stCopyYUV422.dstY = 0; stCopyYUV422.dstMemoryType = CU_MEMORYTYPE_DEVICE; stCopyYUV422.dstHost = 0; stCopyYUV422.dstDevice = dptr_VideoFrame; stCopyYUV422.dstArray = 0; stCopyYUV422.dstPitch = m_pEncoderParams->nDeviceMemPitch; stCopyYUV422.WidthInBytes = m_pEncoderParams->iInputSize[0]*2; stCopyYUV422.Height = m_pEncoderParams->iInputSize[1]; // Don't forget we need to lock/unlock between memcopies checkCudaErrors(cuvidCtxLock(ctxLock, 0)); checkCudaErrors(cuMemcpy2D(&stCopyYUV422)); // Now DMA Luma/Chroma checkCudaErrors(cuvidCtxUnlock(ctxLock, 0)); }
// NV12 is 4:2:0 format (12bpc) // Luma followed by U/V chroma interleaved (12bpc), chroma is subsampled (w/2,h/2) void VideoEncoder::CopyNV12Frame(NVVE_EncodeFrameParams &sFrameParams, CUdeviceptr dptr_VideoFrame, CUvideoctxlock ctxLock) { // Source is NV12 in pitch linear memory // Because we are assume input is NV12 (if we take input in the native format), the encoder handles NV12 as a native format in pitch linear memory // Luma/Chroma can be done in a single transfer CUDA_MEMCPY2D stCopyNV12; memset((void *)&stCopyNV12, 0, sizeof(stCopyNV12)); stCopyNV12.srcXInBytes = 0; stCopyNV12.srcY = 0; stCopyNV12.srcMemoryType = CU_MEMORYTYPE_HOST; stCopyNV12.srcHost = sFrameParams.picBuf; stCopyNV12.srcDevice = 0; stCopyNV12.srcArray = 0; stCopyNV12.srcPitch = sFrameParams.Width; stCopyNV12.dstXInBytes = 0; stCopyNV12.dstY = 0; stCopyNV12.dstMemoryType = CU_MEMORYTYPE_DEVICE; stCopyNV12.dstHost = 0; stCopyNV12.dstDevice = dptr_VideoFrame; stCopyNV12.dstArray = 0; stCopyNV12.dstPitch = m_pEncoderParams->nDeviceMemPitch; stCopyNV12.WidthInBytes = m_pEncoderParams->iInputSize[0]; stCopyNV12.Height =(m_pEncoderParams->iInputSize[1] * 3) >> 1; // Don't forget we need to lock/unlock between memcopies checkCudaErrors(cuvidCtxLock(ctxLock, 0)); checkCudaErrors(cuMemcpy2D(&stCopyNV12)); // Now DMA Luma/Chroma checkCudaErrors(cuvidCtxUnlock(ctxLock, 0)); }
// Submits one frame to the hardware encoder (or flushes it when bFlush is
// set). The frame's pixels are expected in device memory at
// pEncodeFrame->dptr in NV12 layout; they are copied device-to-device into a
// registered NVENC input buffer, which is then mapped and handed to the
// encoder. Returns NV_ENC_SUCCESS, or the mapping error status on failure.
NVENCSTATUS VideoEncoder::EncodeFrame(EncodeFrameConfig *pEncodeFrame, NV_ENC_PIC_STRUCT picType, bool bFlush)
{
    NVENCSTATUS nvStatus = NV_ENC_SUCCESS;

    // A flush request drains the encoder and returns immediately.
    if (bFlush)
    {
        FlushEncoder();
        return NV_ENC_SUCCESS;
    }

    assert(pEncodeFrame);

    // Grab a free buffer; if none is free, retire the oldest pending buffer
    // (collecting its output) to recycle it.
    EncodeBuffer *pEncodeBuffer = m_EncodeBufferQueue.GetAvailable();

    if (!pEncodeBuffer)
    {
        pEncodeBuffer = m_EncodeBufferQueue.GetPending();
        m_pNvHWEncoder->ProcessOutput(pEncodeBuffer);

        // UnMap the input buffer after frame done
        if (pEncodeBuffer->stInputBfr.hInputSurface)
        {
            nvStatus = m_pNvHWEncoder->NvEncUnmapInputResource(pEncodeBuffer->stInputBfr.hInputSurface);
            pEncodeBuffer->stInputBfr.hInputSurface = NULL;
        }

        pEncodeBuffer = m_EncodeBufferQueue.GetAvailable();
    }

    // encode width and height
    unsigned int dwWidth  = pEncodeBuffer->stInputBfr.dwWidth;
    unsigned int dwHeight = pEncodeBuffer->stInputBfr.dwHeight;

    // Here we copy from Host to Device Memory (CUDA)
    cuvidCtxLock(m_ctxLock, 0);
    assert(pEncodeFrame->width == dwWidth && pEncodeFrame->height == dwHeight);

    // Device-to-device 2D copy of the NV12 frame into the encoder's input
    // buffer; height * 3/2 rows covers the luma plane plus the half-height
    // interleaved chroma plane.
    CUDA_MEMCPY2D memcpy2D = {0};
    memcpy2D.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    memcpy2D.srcDevice     = pEncodeFrame->dptr;
    memcpy2D.srcPitch      = pEncodeFrame->pitch;
    memcpy2D.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    memcpy2D.dstDevice     = (CUdeviceptr)pEncodeBuffer->stInputBfr.pNV12devPtr;
    memcpy2D.dstPitch      = pEncodeBuffer->stInputBfr.uNV12Stride;
    memcpy2D.WidthInBytes  = dwWidth;
    memcpy2D.Height        = dwHeight*3/2;
    __cu(cuMemcpy2D(&memcpy2D));
    cuvidCtxUnlock(m_ctxLock, 0);

    // Map the registered CUDA resource as an NVENC input surface.
    nvStatus = m_pNvHWEncoder->NvEncMapInputResource(pEncodeBuffer->stInputBfr.nvRegisteredResource, &pEncodeBuffer->stInputBfr.hInputSurface);
    if (nvStatus != NV_ENC_SUCCESS)
    {
        PRINTERR("Failed to Map input buffer %p\n", pEncodeBuffer->stInputBfr.hInputSurface);
        return nvStatus;
    }

    // Submit the frame for asynchronous encoding; output is collected later
    // via ProcessOutput when the buffer is recycled (see above).
    m_pNvHWEncoder->NvEncEncodeFrame(pEncodeBuffer, NULL, pEncodeFrame->width, pEncodeFrame->height, picType);
    m_iEncodedFrames++;

    return NV_ENC_SUCCESS;
}
void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic) { /* determine format */ CUarray_format_enum format; size_t dsize = datatype_size(mem.data_type); size_t size = mem.memory_size(); switch(mem.data_type) { case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break; case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break; case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break; case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break; default: assert(0); return; } CUtexref texref = NULL; cuda_push_context(); cuda_assert(cuModuleGetTexRef(&texref, cuModule, name)) if(!texref) { cuda_pop_context(); return; } if(interpolation) { CUarray handle = NULL; CUDA_ARRAY_DESCRIPTOR desc; desc.Width = mem.data_width; desc.Height = mem.data_height; desc.Format = format; desc.NumChannels = mem.data_elements; cuda_assert(cuArrayCreate(&handle, &desc)) if(!handle) { cuda_pop_context(); return; } if(mem.data_height > 1) { CUDA_MEMCPY2D param; memset(¶m, 0, sizeof(param)); param.dstMemoryType = CU_MEMORYTYPE_ARRAY; param.dstArray = handle; param.srcMemoryType = CU_MEMORYTYPE_HOST; param.srcHost = (void*)mem.data_pointer; param.srcPitch = mem.data_width*dsize*mem.data_elements; param.WidthInBytes = param.srcPitch; param.Height = mem.data_height; cuda_assert(cuMemcpy2D(¶m)) } else
// Converts a host-side planar YUV image (yuv[0]=Y, yuv[1]=U, yuv[2]=V) into
// the NV12 layout expected by the encoder at dNV12devPtr: the luma plane is
// copied directly host->device, the two quarter-size chroma planes are staged
// to device scratch buffers and interleaved by a CUDA kernel.
//
// Fix: the address-of expressions had been corrupted by HTML-entity mangling
// ("&copy" -> '©'); restored to &copyParam. The kernel block dimensions are
// now plain constants instead of #define/#undef pairs.
//
// Returns NV_ENC_SUCCESS, or NV_ENC_ERR_GENERIC if the stream is in error.
NVENCSTATUS CNvEncoderLowLatency::ConvertYUVToNV12(CUdeviceptr dNV12devPtr, int dstPitch, unsigned char *yuv[3], int width, int height, int maxWidth, int maxHeight)
{
    CCudaAutoLock cuLock(m_cuContext);

    // Copy the full-resolution luma plane straight into the NV12 surface.
    CUDA_MEMCPY2D copyParam;
    memset(&copyParam, 0, sizeof(copyParam));
    copyParam.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    copyParam.dstDevice = dNV12devPtr;
    copyParam.dstPitch = dstPitch;
    copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
    copyParam.srcHost = yuv[0];
    copyParam.srcPitch = width;
    copyParam.WidthInBytes = width;
    copyParam.Height = height;
    __cu(cuMemcpy2D(&copyParam));

    // Stage the quarter-size U and V planes in the device scratch buffers.
    __cu(cuMemcpyHtoD(m_ChromaDevPtr[0], yuv[1], width*height / 4));
    __cu(cuMemcpyHtoD(m_ChromaDevPtr[1], yuv[2], width*height / 4));

    // Launch the UV-interleave kernel over the chroma dimensions.
    const int BLOCK_X = 32;
    const int BLOCK_Y = 16;
    int chromaHeight = height / 2;
    int chromaWidth = width / 2;
    dim3 block(BLOCK_X, BLOCK_Y, 1);
    dim3 grid((chromaWidth + BLOCK_X - 1) / BLOCK_X, (chromaHeight + BLOCK_Y - 1) / BLOCK_Y, 1);

    // The NV12 chroma plane starts maxHeight rows into the surface (the luma
    // allocation is padded to the maximum frame height).
    CUdeviceptr dNV12Chroma = (CUdeviceptr)((unsigned char*)dNV12devPtr + dstPitch*maxHeight);
    void *args[8] = { &m_ChromaDevPtr[0], &m_ChromaDevPtr[1], &dNV12Chroma,
                      &chromaWidth, &chromaHeight, &chromaWidth, &chromaWidth, &dstPitch };

    __cu(cuLaunchKernel(m_cuInterleaveUVFunction, grid.x, grid.y, grid.z,
                        block.x, block.y, block.z,
                        0, NULL, args, NULL));

    // Query (without blocking) that the default stream is healthy.
    CUresult cuResult = cuStreamQuery(NULL);
    if (!((cuResult == CUDA_SUCCESS) || (cuResult == CUDA_ERROR_NOT_READY)))
    {
        return NV_ENC_ERR_GENERIC;
    }
    return NV_ENC_SUCCESS;
}
// Copies [offset, offset + bytes) of this device buffer into host memory at
// dest. bytes == 0 means "copy the whole allocation". Plain device memory
// uses a linear DtoH copy; 1D texture-backed memory copies from its CUDA
// array; 2D texture-backed memory uses a 2D array->host copy.
//
// Fixes: the CUDA_MEMCPY2D descriptor was previously left partially
// uninitialized; dstPitch was 0, which the driver rejects for multi-row
// copies (it must be >= WidthInBytes); and the cuMemcpy2D result was not
// error-checked, unlike the other branches.
void memory_t<CUDA>::copyTo(void *dest, const uintptr_t bytes, const uintptr_t offset){
  const uintptr_t bytes_ = (bytes == 0) ? size : bytes;
  OCCA_CHECK((bytes_ + offset) <= size);

  if(!isTexture)
    OCCA_CUDA_CHECK("Memory: Copy To",
                    cuMemcpyDtoH(dest, *((CUdeviceptr*) handle) + offset, bytes_) );
  else{
    if(textureInfo.dim == 1)
      OCCA_CUDA_CHECK("Texture Memory: Copy To",
                      cuMemcpyAtoH(dest, ((CUDATextureData_t*) handle)->array, offset, bytes_) );
    else{
      CUDA_MEMCPY2D info;
      memset(&info, 0, sizeof(info)); // zero all driver-ignored fields

      info.srcXInBytes   = offset;
      info.srcY          = 0;
      info.srcMemoryType = CU_MEMORYTYPE_ARRAY;
      info.srcArray      = ((CUDATextureData_t*) handle)->array;

      info.dstXInBytes   = 0;
      info.dstY          = 0;
      info.dstMemoryType = CU_MEMORYTYPE_HOST;
      info.dstHost       = dest;

      info.WidthInBytes  = textureInfo.w * textureInfo.bytesInEntry;
      // Host rows are tightly packed; the driver requires
      // dstPitch >= WidthInBytes (0 is rejected for Height > 1).
      info.dstPitch      = info.WidthInBytes;
      info.Height        = (bytes_ / info.WidthInBytes);

      OCCA_CUDA_CHECK("Texture Memory: Copy To", cuMemcpy2D(&info) );

      dev->finish();
    }
  }
}
static int cuvid_output_frame(AVCodecContext *avctx, AVFrame *frame) { CuvidContext *ctx = avctx->priv_data; AVHWDeviceContext *device_ctx = (AVHWDeviceContext*)ctx->hwdevice->data; AVCUDADeviceContext *device_hwctx = device_ctx->hwctx; CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx; CUdeviceptr mapped_frame = 0; int ret = 0, eret = 0; av_log(avctx, AV_LOG_TRACE, "cuvid_output_frame\n"); if (ctx->decoder_flushing) { ret = cuvid_decode_packet(avctx, NULL); if (ret < 0 && ret != AVERROR_EOF) return ret; } ret = CHECK_CU(cuCtxPushCurrent(cuda_ctx)); if (ret < 0) return ret; if (av_fifo_size(ctx->frame_queue)) { CuvidParsedFrame parsed_frame; CUVIDPROCPARAMS params; unsigned int pitch = 0; int offset = 0; int i; av_fifo_generic_read(ctx->frame_queue, &parsed_frame, sizeof(CuvidParsedFrame), NULL); memset(¶ms, 0, sizeof(params)); params.progressive_frame = parsed_frame.dispinfo.progressive_frame; params.second_field = parsed_frame.second_field; params.top_field_first = parsed_frame.dispinfo.top_field_first; ret = CHECK_CU(cuvidMapVideoFrame(ctx->cudecoder, parsed_frame.dispinfo.picture_index, &mapped_frame, &pitch, ¶ms)); if (ret < 0) goto error; if (avctx->pix_fmt == AV_PIX_FMT_CUDA) { ret = av_hwframe_get_buffer(ctx->hwframe, frame, 0); if (ret < 0) { av_log(avctx, AV_LOG_ERROR, "av_hwframe_get_buffer failed\n"); goto error; } ret = ff_decode_frame_props(avctx, frame); if (ret < 0) { av_log(avctx, AV_LOG_ERROR, "ff_decode_frame_props failed\n"); goto error; } for (i = 0; i < 2; i++) { CUDA_MEMCPY2D cpy = { .srcMemoryType = CU_MEMORYTYPE_DEVICE, .dstMemoryType = CU_MEMORYTYPE_DEVICE, .srcDevice = mapped_frame, .dstDevice = (CUdeviceptr)frame->data[i], .srcPitch = pitch, .dstPitch = frame->linesize[i], .srcY = offset, .WidthInBytes = FFMIN(pitch, frame->linesize[i]), .Height = avctx->height >> (i ? 1 : 0), }; ret = CHECK_CU(cuMemcpy2D(&cpy)); if (ret < 0) goto error; offset += avctx->coded_height; } } else if (avctx->pix_fmt == AV_PIX_FMT_NV12) {
int main(int argc, char * argv[]) { CBlasTranspose transA, transB; size_t m, n, k; int d = 0; if (argc < 6 || argc > 7) { fprintf(stderr, "Usage: %s <transA> <transB> <m> <n> <k> [device]\n" "where:\n" " transA and transB are 'n' or 'N' for CBlasNoTrans, 't' or 'T' for CBlasTrans or 'c' or 'C' for CBlasConjTrans\n" " m, n and k are the sizes of the matrices\n" " device is the GPU to use (default 0)\n", argv[0]); return 1; } char t; if (sscanf(argv[1], "%c", &t) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[1]); return 1; } switch (t) { case 'N': case 'n': transA = CBlasNoTrans; break; case 'T': case 't': transA = CBlasTrans; break; case 'C': case 'c': transA = CBlasConjTrans; break; default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 1; } if (sscanf(argv[2], "%c", &t) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[2]); return 2; } switch (t) { case 'N': case 'n': transB = CBlasNoTrans; break; case 'T': case 't': transB = CBlasTrans; break; case 'C': case 'c': transB = CBlasConjTrans; break; default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 1; } if (sscanf(argv[3], "%zu", &m) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]); return 3; } if (sscanf(argv[4], "%zu", &n) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[4]); return 4; } if (sscanf(argv[5], "%zu", &k) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]); return 5; } if (argc > 6) { if (sscanf(argv[6], "%d", &d) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[6]); return 6; } } srand(0); float complex alpha, beta, * A, * B, * C, * refC; CUdeviceptr dA, dB, dC, dD; size_t lda, ldb, ldc, dlda, dldb, dldc, dldd; CU_ERROR_CHECK(cuInit(0)); CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device)); CUBLAShandle handle; CU_ERROR_CHECK(cuBLASCreate(&handle)); alpha = 
((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; beta = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; if (transA == CBlasNoTrans) { lda = (m + 1u) & ~1u; if ((A = malloc(lda * k * sizeof(float complex))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, m * sizeof(float complex), k, sizeof(float complex))); dlda /= sizeof(float complex); for (size_t j = 0; j < k; j++) { for (size_t i = 0; i < m; i++) A[j * lda + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(float complex), m * sizeof(float complex), k }; CU_ERROR_CHECK(cuMemcpy2D(©)); } else { lda = (k + 1u) & ~1u; if ((A = malloc(lda * m * sizeof(float complex))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, k * sizeof(float complex), m, sizeof(float complex))); dlda /= sizeof(float complex); for (size_t j = 0; j < m; j++) { for (size_t i = 0; i < k; i++) A[j * lda + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(float complex), k * sizeof(float complex), m }; CU_ERROR_CHECK(cuMemcpy2D(©)); } if (transB == CBlasNoTrans) { ldb = (k + 1u) & ~1u; if ((B = malloc(ldb * n * sizeof(float complex))) == NULL) { fputs("Unable to allocate B\n", stderr); return -2; } CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, k * sizeof(float complex), n, sizeof(float complex))); dldb /= sizeof(float complex); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < k; i++) B[j * ldb + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, 
CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(float complex), k * sizeof(float complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); } else { ldb = (n + 1u) & ~1u; if ((B = malloc(ldb * k * sizeof(float complex))) == NULL) { fputs("Unable to allocate B\n", stderr); return -2; } CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, n * sizeof(float complex), k, sizeof(float complex))); dldb /= sizeof(float complex); for (size_t j = 0; j < k; j++) { for (size_t i = 0; i < n; i++) B[j * ldb + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(float complex), n * sizeof(float complex), k }; CU_ERROR_CHECK(cuMemcpy2D(©)); } ldc = (m + 1u) & ~1u; if ((C = malloc(ldc * n * sizeof(float complex))) == NULL) { fputs("Unable to allocate C\n", stderr); return -3; } if ((refC = malloc(ldc * n * sizeof(float complex))) == NULL) { fputs("Unable to allocate refC\n", stderr); return -4; } CU_ERROR_CHECK(cuMemAllocPitch(&dC, &dldc, m * sizeof(float complex), n, sizeof(float complex))); dldc /= sizeof(float complex); CU_ERROR_CHECK(cuMemAllocPitch(&dD, &dldd, m * sizeof(float complex), n, sizeof(float complex))); dldd /= sizeof(float complex); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < m; i++) refC[j * ldc + i] = C[j * ldc + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dC, NULL, dldc * sizeof(float complex), m * sizeof(float complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); cgemm_ref(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, refC, ldc); CU_ERROR_CHECK(cuCgemm2(handle, transA, transB, m, n, k, alpha, dA, dlda, dB, dldb, beta, dC, dldc, dD, dldd, NULL)); copy = (CUDA_MEMCPY2D){ 
0, 0, CU_MEMORYTYPE_DEVICE, NULL, dD, NULL, dldd * sizeof(float complex), 0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(float complex), m * sizeof(float complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); float rdiff = 0.0f, idiff = 0.0f; for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < m; i++) { float d = fabsf(crealf(C[j * ldc + i]) - crealf(refC[j * ldc + i])); if (d > rdiff) rdiff = d; d = fabsf(cimagf(C[j * ldc + i]) - cimagf(refC[j * ldc + i])); if (d > idiff) idiff = d; } } CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventRecord(start, NULL)); for (size_t i = 0; i < 20; i++) CU_ERROR_CHECK(cuCgemm2(handle, transA, transB, m, n, k, alpha, dA, dlda, dB, dldb, beta, dC, dldc, dD, dldd, NULL)); CU_ERROR_CHECK(cuEventRecord(stop, NULL)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= 20; CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); size_t flops = k * 6 + (k - 1) * 2; // k multiplies and k - 1 adds per element if (alpha != 1.0f + 0.0f * I) flops += 6; // additional multiply by alpha if (beta != 0.0f + 0.0f * I) flops += 8; // additional multiply and add by beta float error = (float)flops * 2.0f * FLT_EPSILON; // maximum per element error flops *= m * n; // m * n elements bool passed = (rdiff <= error) && (idiff <= error); fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e + %.3ei\n%sED!\n", time * 1.e-3f, ((float)flops * 1.e-6f) / time, rdiff, idiff, (passed) ? "PASS" : "FAIL"); free(A); free(B); free(C); free(refC); CU_ERROR_CHECK(cuMemFree(dA)); CU_ERROR_CHECK(cuMemFree(dB)); CU_ERROR_CHECK(cuMemFree(dC)); CU_ERROR_CHECK(cuMemFree(dD)); CU_ERROR_CHECK(cuBLASDestroy(handle)); CU_ERROR_CHECK(cuCtxDestroy(context)); return (int)!passed; }
int main(int argc, char * argv[]) { CBlasUplo uplo; CBlasTranspose trans; size_t n, k; int d = 0; if (argc < 5 || argc > 6) { fprintf(stderr, "Usage: %s <uplo> <trans> <n> <k> [device]\n" "where:\n" " uplo is 'u' or 'U' for CBlasUpper or 'l' or 'L' for CBlasLower\n" " trans are 'n' or 'N' for CBlasNoTrans or 't' or 'T' for CBlasTrans\n" " n and k are the sizes of the matrices\n" " device is the GPU to use (default 0)\n", argv[0]); return 1; } char u; if (sscanf(argv[1], "%c", &u) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[1]); return 1; } switch (u) { case 'U': case 'u': uplo = CBlasUpper; break; case 'L': case 'l': uplo = CBlasLower; break; default: fprintf(stderr, "Unknown uplo '%c'\n", u); return 1; } char t; if (sscanf(argv[2], "%c", &t) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[2]); return 2; } switch (t) { case 'N': case 'n': trans = CBlasNoTrans; break; case 'T': case 't': trans = CBlasTrans; break; case 'C': case 'c': trans = CBlasConjTrans; break; default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 2; } if (sscanf(argv[3], "%zu", &n) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]); return 3; } if (sscanf(argv[4], "%zu", &k) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[4]); return 4; } if (argc > 5) { if (sscanf(argv[5], "%d", &d) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]); return 5; } } srand(0); double alpha, beta, * A, * C, * refC; CUdeviceptr dA, dC; size_t lda, ldc, dlda, dldc; CU_ERROR_CHECK(cuInit(0)); CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device)); CUBLAShandle handle; CU_ERROR_CHECK(cuBLASCreate(&handle)); alpha = (double)rand() / (double)RAND_MAX; beta = (double)rand() / (double)RAND_MAX; if (trans == CBlasNoTrans) { lda = (n + 1u) & ~1u; if ((A = malloc(lda * k * sizeof(double))) == NULL) { fputs("Unable to 
allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, n * sizeof(double), k, sizeof(double))); dlda /= sizeof(double); for (size_t j = 0; j < k; j++) { for (size_t i = 0; i < n; i++) A[j * lda + i] = (double)rand() / (double)RAND_MAX; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double), n * sizeof(double), k }; CU_ERROR_CHECK(cuMemcpy2D(©)); } else { lda = (k + 1u) & ~1u; if ((A = malloc(lda * n * sizeof(double))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, k * sizeof(double), n, sizeof(double))); dlda /= sizeof(double); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < k; i++) A[j * lda + i] = (double)rand() / (double)RAND_MAX; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double), k * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); } ldc = (n + 1u) & ~1u; if ((C = malloc(ldc * n * sizeof(double))) == NULL) { fputs("Unable to allocate C\n", stderr); return -3; } if ((refC = malloc(ldc * n * sizeof(double))) == NULL) { fputs("Unable to allocate refC\n", stderr); return -4; } CU_ERROR_CHECK(cuMemAllocPitch(&dC, &dldc, n * sizeof(double), n, sizeof(double))); dldc /= sizeof(double); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < n; i++) refC[j * ldc + i] = C[j * ldc + i] = (double)rand() / (double)RAND_MAX; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(double), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dC, NULL, dldc * sizeof(double), n * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); dsyrk_ref(uplo, trans, n, k, alpha, A, lda, beta, refC, ldc); CU_ERROR_CHECK(cuDsyrk(handle, uplo, trans, n, k, alpha, dA, dlda, beta, dC, dldc, NULL)); copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dC, NULL, dldc * sizeof(double), 0, 0, 
CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(double), n * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); double diff = 0.0; for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < n; i++) { double d = fabs(C[j * ldc + i] - refC[j * ldc + i]); if (d > diff) diff = d; } } CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventRecord(start, NULL)); for (size_t i = 0; i < 20; i++) CU_ERROR_CHECK(cuDsyrk(handle, uplo, trans, n, k, alpha, dA, dlda, beta, dC, dldc, NULL)); CU_ERROR_CHECK(cuEventRecord(stop, NULL)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= 20; CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); size_t flops = 2 * k - 1; // k multiplies and k - 1 adds per element if (alpha != 1.0) flops += 1; // additional multiply by alpha if (beta != 0.0) flops += 2; // additional multiply and add by beta double error = (double)flops * 2.0 * DBL_EPSILON; // maximum per element error flops *= n * (n + 1) / 2; // n(n + 1) / 2 elements bool passed = (diff <= error); fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e\n%sED!\n", time * 1.e-3f, ((float)flops * 1.e-6f) / time, diff, (passed) ? "PASS" : "FAIL"); free(A); free(C); free(refC); CU_ERROR_CHECK(cuMemFree(dA)); CU_ERROR_CHECK(cuMemFree(dC)); CU_ERROR_CHECK(cuBLASDestroy(handle)); CU_ERROR_CHECK(cuCtxDestroy(context)); return (int)!passed; }
int main(int argc, char * argv[]) { CBlasUplo uplo; size_t n; int d = 0; if (argc < 3 || argc > 4) { fprintf(stderr, "Usage: %s <uplo> <n>\n" "where:\n" " uplo is 'u' or 'U' for CBlasUpper or 'l' or 'L' for CBlasLower\n" " n is the size of the matrix\n" " device is the GPU to use (default 0)\n", argv[0]); return 1; } char u; if (sscanf(argv[1], "%c", &u) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[1]); return 1; } switch (u) { case 'U': case 'u': uplo = CBlasUpper; break; case 'L': case 'l': uplo = CBlasLower; break; default: fprintf(stderr, "Unknown uplo '%c'\n", u); return 1; } if (sscanf(argv[2], "%zu", &n) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[2]); return 2; } if (argc > 3) { if (sscanf(argv[3], "%d", &d) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]); return 3; } } srand(0); double * A, * refA; CUdeviceptr dA; size_t lda, dlda; long info, rInfo; CU_ERROR_CHECK(cuInit(0)); CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device)); CULAPACKhandle handle; CU_ERROR_CHECK(cuLAPACKCreate(&handle)); lda = (n + 1u) & ~1u; if ((A = malloc(lda * n * sizeof(double))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } if ((refA = malloc(lda * n * sizeof(double))) == NULL) { fputs("Unable to allocate refA\n", stderr); return -2; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, n * sizeof(double), n, sizeof(double))); dlda /= sizeof(double); if (dlatmc(n, 2.0, A, lda) != 0) { fputs("Unable to initialise A\n", stderr); return -1; } // dpotrf(uplo, n, A, lda, &info); // if (info != 0) { // fputs("Failed to compute Cholesky decomposition of A\n", stderr); // return (int)info; // } for (size_t j = 0; j < n; j++) memcpy(&refA[j * lda], &A[j * lda], n * sizeof(double)); CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * 
sizeof(double), n * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); dlauum_ref(uplo, n, refA, lda, &rInfo); CU_ERROR_CHECK(cuDlauum(handle, uplo, n, dA, dlda, &info)); copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double), 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double), n * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); bool passed = (info == rInfo); double diff = 0.0; for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < n; i++) { double d = fabs(A[j * lda + i] - refA[j * lda + i]); if (d > diff) diff = d; } } // Set A to identity so that repeated applications of the cholesky // decomposition while benchmarking do not exit early due to // non-positive-definite-ness. for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < n; i++) A[j * lda + i] = (i == j) ? 1.0 : 0.0; } copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double), n * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventRecord(start, NULL)); for (size_t i = 0; i < 20; i++) CU_ERROR_CHECK(cuDlauum(handle, uplo, n, dA, dlda, &info)); CU_ERROR_CHECK(cuEventRecord(stop, NULL)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= 20; CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); const size_t flops = ((n * n * n) / 3) + ((n * n) / 2) + (n / 6); fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e\n%sED!\n", time * 1.e-3f, ((float)flops * 1.e-6f) / time, diff, (passed) ? "PASS" : "FAIL"); free(A); free(refA); CU_ERROR_CHECK(cuMemFree(dA)); CU_ERROR_CHECK(cuLAPACKDestroy(handle)); CU_ERROR_CHECK(cuCtxDestroy(context)); return (int)!passed; }
bool EGLInteropResource::map(int picIndex, const CUVIDPROCPARAMS ¶m, GLuint tex, int w, int h, int H, int plane) { // plane is always 0 because frame is rgb AutoCtxLock locker((cuda_api*)this, lock); Q_UNUSED(locker); if (!ensureResource(w, h, param.Reserved[0], H, tex)) // TODO surface size instead of frame size because we copy the device data return false; //CUDA_ENSURE(cuCtxPushCurrent(ctx), false); CUdeviceptr devptr; unsigned int pitch; CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(¶m)), false); CUVIDAutoUnmapper unmapper(this, dec, devptr); Q_UNUSED(unmapper); // TODO: why can not use res[plane].stream? CUDA_ERROR_INVALID_HANDLE CUDA_ENSURE(cuGraphicsMapResources(1, &res[plane].cuRes, 0), false); CUarray array; CUDA_ENSURE(cuGraphicsSubResourceGetMappedArray(&array, res[plane].cuRes, 0, 0), false); CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false); // mapped array still accessible! CUDA_MEMCPY2D cu2d; memset(&cu2d, 0, sizeof(cu2d)); // Y plane cu2d.srcDevice = devptr; cu2d.srcMemoryType = CU_MEMORYTYPE_DEVICE; cu2d.srcPitch = pitch; cu2d.dstArray = array; cu2d.dstMemoryType = CU_MEMORYTYPE_ARRAY; cu2d.dstPitch = pitch; // the whole size or copy size? cu2d.WidthInBytes = res[plane].W; // the same value as texture9_nv12 cu2d.Height = H*3/2; if (res[plane].stream) CUDA_ENSURE(cuMemcpy2DAsync(&cu2d, res[plane].stream), false); else CUDA_ENSURE(cuMemcpy2D(&cu2d), false); //TODO: delay cuCtxSynchronize && unmap. do it in unmap(tex)? // map to an already mapped resource will crash. sometimes I can not unmap the resource in unmap(tex) because if context switch error // so I simply unmap the resource here if (WORKAROUND_UNMAP_CONTEXT_SWITCH) { if (res[plane].stream) { //CUDA_WARN(cuCtxSynchronize(), false); //wait too long time? use cuStreamQuery? 
CUDA_WARN(cuStreamSynchronize(res[plane].stream)); //slower than CtxSynchronize } /* * This function provides the synchronization guarantee that any CUDA work issued * in \p stream before ::cuGraphicsUnmapResources() will complete before any * subsequently issued graphics work begins. * The graphics API from which \p resources were registered * should not access any resources while they are mapped by CUDA. If an * application does so, the results are undefined. */ // CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false); } D3DLOCKED_RECT rect_src, rect_dst; DX_ENSURE(texture9_nv12->LockRect(0, &rect_src, NULL, D3DLOCK_READONLY), false); DX_ENSURE(surface9_nv12->LockRect(&rect_dst, NULL, D3DLOCK_DISCARD), false); memcpy(rect_dst.pBits, rect_src.pBits, res[plane].W*H*3/2); // exactly w and h DX_ENSURE(surface9_nv12->UnlockRect(), false); DX_ENSURE(texture9_nv12->UnlockRect(0), false); #if 0 //IDirect3DSurface9 *raw_surface = NULL; //DX_ENSURE(texture9_nv12->GetSurfaceLevel(0, &raw_surface), false); const RECT src = { 0, 0, w, h*3/2}; DX_ENSURE(device9->StretchRect(raw_surface, &src, surface9_nv12, NULL, D3DTEXF_NONE), false); #endif if (!map(surface9_nv12, tex, w, h, H)) return false; return true; }
// Map the decoded CUVID picture `picIndex` (one plane at a time) into GL
// texture `tex` by copying from the cuvid-mapped device frame into the
// CUarray backing the CUDA-registered GL texture.
// w/h are the frame dimensions, H is the padded (coded) height: plane 1
// (chroma) starts at row H of the mapped frame and is half as tall.
// The CUDA_ENSURE macros check the call and (by convention of the second
// argument) make this function return `false` on failure; CUDA_WARN only logs.
bool GLInteropResource::map(int picIndex, const CUVIDPROCPARAMS &param, GLuint tex, int w, int h, int H, int plane)
{
    AutoCtxLock locker((cuda_api*)this, lock); // scoped cuvid ctx lock for the whole mapping
    Q_UNUSED(locker);
    if (!ensureResource(w, h, H, tex, plane)) // TODO surface size instead of frame size because we copy the device data
        return false;
    //CUDA_ENSURE(cuCtxPushCurrent(ctx), false);
    CUdeviceptr devptr;
    unsigned int pitch;
    // Map the decoded picture; `pitch` is the device-frame row pitch in bytes.
    CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(&param)), false);
    CUVIDAutoUnmapper unmapper(this, dec, devptr); // RAII: unmaps the cuvid frame on scope exit
    Q_UNUSED(unmapper);
    // TODO: why can not use res[plane].stream? CUDA_ERROR_INVALID_HANDLE
    CUDA_ENSURE(cuGraphicsMapResources(1, &res[plane].cuRes, 0), false);
    CUarray array;
    CUDA_ENSURE(cuGraphicsSubResourceGetMappedArray(&array, res[plane].cuRes, 0, 0), false);
    CUDA_MEMCPY2D cu2d;
    memset(&cu2d, 0, sizeof(cu2d));
    cu2d.srcDevice = devptr;
    cu2d.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    cu2d.srcPitch = pitch;
    cu2d.dstArray = array;
    cu2d.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    cu2d.dstPitch = pitch; // the whole size or copy size? (ignored for array destinations)
    cu2d.WidthInBytes = pitch; // copy full pitch rows, not just w bytes
    cu2d.Height = h;
    if (plane == 1) {
        // Chroma plane: source rows start right after the padded luma plane.
        cu2d.srcXInBytes = 0;// +srcY*srcPitch + srcXInBytes
        cu2d.srcY = H; // skip the padding height
        cu2d.Height /= 2;
    }
    if (res[plane].stream)
        CUDA_ENSURE(cuMemcpy2DAsync(&cu2d, res[plane].stream), false);
    else
        CUDA_ENSURE(cuMemcpy2D(&cu2d), false);
    //TODO: delay cuCtxSynchronize && unmap. do it in unmap(tex)?
    // map to an already mapped resource will crash. sometimes I can not unmap the resource in unmap(tex) because if context switch error
    // so I simply unmap the resource here
    if (WORKAROUND_UNMAP_CONTEXT_SWITCH) {
        if (res[plane].stream) {
            //CUDA_WARN(cuCtxSynchronize(), false); //wait too long time? use cuStreamQuery?
            CUDA_WARN(cuStreamSynchronize(res[plane].stream)); //slower than CtxSynchronize
        }
        /*
         * This function provides the synchronization guarantee that any CUDA work issued
         * in \p stream before ::cuGraphicsUnmapResources() will complete before any
         * subsequently issued graphics work begins.
         * The graphics API from which \p resources were registered
         * should not access any resources while they are mapped by CUDA. If an
         * application does so, the results are undefined.
         */
        CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false);
    } else {
        // call it at last. current context will be used by other cuda calls (unmap() for example)
        CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // not required
    }
    return true;
}
// Copy the CUDA linear buffer (m_dptr, row pitch m_pitch) into the CUarray
// backing the registered graphics resource m_cudaResource, publishing kernel
// output to the graphics API.
// streamID: stream used for the map/unmap of the graphics resource.
// Returns true on success; on any CUDA error it logs, unmaps the resource if
// it was mapped, and returns false.
bool ResourceCUDA::updateResourceFromCUDA(CUstream streamID)
{
    CUresult res;
    res = cuGraphicsMapResources( 1, &m_cudaResource, streamID );
    if(res)
    {
        LOGE("Error>> CUDA failed map some target resources\n");
        return false;
    }
    //
    // Walk through output resources and perform the copies
    //
    CUarray cuArray;
//# define DBGDUMMYCOPY
# ifdef DBGDUMMYCOPY
    // interop has issues... let's compare with a copy to a basic cuda array
    int www = m_xByteSz/4;
    CUDA_ARRAY_DESCRIPTOR descr = {
        www,                        //unsigned int Width;
        m_creationData.sz[1],       //unsigned int Height;
        CU_AD_FORMAT_UNSIGNED_INT8, //CUarray_format Format;
        4                           //unsigned int NumChannels;
    };
    res = cuArrayCreate(&cuArray, &descr);
# else
    res = cuGraphicsSubResourceGetMappedArray( &cuArray, m_cudaResource, 0/*arrayIndex*/, 0/*mipLevel*/);
# endif
    if(res)
    {
        // Fix: this path used to bail out silently (and uselessly reassigned
        // `res` from the unmap call) -- log the failure like the other paths.
        LOGE("Error>> CUDA failed to get the mapped array of the target resource\n");
        cuGraphicsUnmapResources( 1, &m_cudaResource, streamID );
        return false;
    }
    // Describe a full-surface 2D device->array copy.
    CUDA_MEMCPY2D copyInfo = {
        0,                   ///< Source X in bytes
        0,                   ///< Source Y
        CU_MEMORYTYPE_DEVICE,///< Source memory type (host, device, array)
        NULL,                ///< Source host pointer
        m_dptr,              ///< Source device pointer
        NULL,                ///< Source array reference
        m_pitch,             ///< Source pitch (ignored when src is array)
        0,                   ///< Destination X in bytes
        0,                   ///< Destination Y
        CU_MEMORYTYPE_ARRAY, ///< Destination memory type (host, device, array)
        NULL,                ///< Destination host pointer
        NULL,                ///< Destination device pointer
        cuArray,             ///< Destination array reference
        0,                   ///< Destination pitch (ignored when dst is array)
        m_xByteSz,           ///< Width of 2D memory copy in bytes
        m_creationData.sz[1] ///< Height of 2D memory copy
    };
    //LOGI("cuMemcpy2D(): CU_MEMORYTYPE_DEVICE source=%x pitch=%d CU_MEMORYTYPE_ARRAY=%x widthBytes=%d height=%d\n",m_dptr, m_pitch, cuArray, m_xByteSz, m_creationData.sz[1]);
    res = cuMemcpy2D( &copyInfo );
    if(res)
    {
        LOGE("Error>> CUDA failed to copy linear memory to texture (array memory)\n");
# ifdef DBGDUMMYCOPY
        cuArrayDestroy(cuArray); // Fix: don't leak the debug array on this error path
# endif
        cuGraphicsUnmapResources( 1, &m_cudaResource, streamID );
        return false;
    }
# ifdef DBGDUMMYCOPY
    res = cuArrayDestroy(cuArray);
# endif
    res = cuGraphicsUnmapResources( 1, &m_cudaResource, streamID );
    if(res)
    {
        LOGE("Error>> CUDA failed unmap the resource for output result of the kernel\n");
        return false;
    }
    return true;
}
int main(int argc, char * argv[]) { CBlasSide side; CBlasUplo uplo; CBlasTranspose trans; CBlasDiag diag; size_t m, n; int d = 0; if (argc < 7 || argc > 8) { fprintf(stderr, "Usage: %s <side> <uplo> <trans> <diag> <m> <n> [device]\n" "where:\n" " side is 'l' or 'L' for CBlasLeft and 'r' or 'R' for CBlasRight\n" " uplo is 'u' or 'U' for CBlasUpper and 'l' or 'L' for CBlasLower\n" " trans is 'n' or 'N' for CBlasNoTrans, 't' or 'T' for CBlasTrans or 'c' or 'C' for CBlasConjTrans\n" " diag is 'n' or 'N' for CBlasNonUnit and 'u' or 'U' for CBlasUnit\n" " m and n are the sizes of the matrices\n" " device is the GPU to use (default 0)\n", argv[0]); return 1; } char s; if (sscanf(argv[1], "%c", &s) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[1]); return 1; } switch (s) { case 'L': case 'l': side = CBlasLeft; break; case 'R': case 'r': side = CBlasRight; break; default: fprintf(stderr, "Unknown side '%c'\n", s); return 1; } char u; if (sscanf(argv[2], "%c", &u) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[2]); return 2; } switch (u) { case 'U': case 'u': uplo = CBlasUpper; break; case 'L': case 'l': uplo = CBlasLower; break; default: fprintf(stderr, "Unknown uplo '%c'\n", u); return 2; } char t; if (sscanf(argv[3], "%c", &t) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[3]); return 3; } switch (t) { case 'N': case 'n': trans = CBlasNoTrans; break; case 'T': case 't': trans = CBlasTrans; break; case 'C': case 'c': trans = CBlasConjTrans; break; default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 3; } char di; if (sscanf(argv[4], "%c", &di) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[4]); return 4; } switch (di) { case 'N': case 'n': diag = CBlasNonUnit; break; case 'U': case 'u': diag = CBlasUnit; break; default: fprintf(stderr, "Unknown diag '%c'\n", t); return 4; } if (sscanf(argv[5], "%zu", &m) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]); 
return 5; } if (sscanf(argv[6], "%zu", &n) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[6]); return 6; } if (argc > 7) { if (sscanf(argv[7], "%d", &d) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[7]); return 7; } } srand(0); double complex alpha, * A, * B, * refB; CUdeviceptr dA, dB, dX; size_t lda, ldb, dlda, dldb, dldx; CU_ERROR_CHECK(cuInit(0)); CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device)); CUBLAShandle handle; CU_ERROR_CHECK(cuBLASCreate(&handle)); alpha = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I; if (side == CBlasLeft) { lda = m; if ((A = malloc(lda * m * sizeof(double complex))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, m * sizeof(double complex), m, sizeof(double complex))); dlda /= sizeof(double complex); for (size_t j = 0; j < m; j++) { for (size_t i = 0; i < m; i++) A[j * lda + i] = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double complex), m * sizeof(double complex), m }; CU_ERROR_CHECK(cuMemcpy2D(©)); } else { lda = n; if ((A = malloc(lda * n * sizeof(double complex))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, n * sizeof(double complex), n, sizeof(double complex))); dlda /= sizeof(double complex); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < n; i++) A[j * lda + i] = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double complex), n * sizeof(double complex), n }; 
CU_ERROR_CHECK(cuMemcpy2D(©)); } ldb = m; if ((B = malloc(ldb * n * sizeof(double complex))) == NULL) { fputs("Unable to allocate B\n", stderr); return -3; } if ((refB = malloc(ldb * n * sizeof(double complex))) == NULL) { fputs("Unable to allocate refB\n", stderr); return -4; } CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, m * sizeof(double complex), n, sizeof(double complex))); dldb /= sizeof(double complex); CU_ERROR_CHECK(cuMemAllocPitch(&dX, &dldx, m * sizeof(double complex), n, sizeof(double complex))); dldx /= sizeof(double complex); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < m; i++) refB[j * ldb + i] = B[j * ldb + i] = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(double complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(double complex), m * sizeof(double complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); ztrmm_ref(side, uplo, trans, diag, m, n, alpha, A, lda, refB, ldb); CU_ERROR_CHECK(cuZtrmm2(handle, side, uplo, trans, diag, m, n, alpha, dA, dlda, dB, dldb, dX, dldx, NULL)); copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dX, NULL, dldx * sizeof(double complex), 0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(double complex), m * sizeof(double complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); bool passed = true; double rdiff = 0.0, idiff = 0.0; for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < m; i++) { double d = fabs(creal(B[j * ldb + i]) - creal(refB[j * ldb + i])); if (d > rdiff) rdiff = d; double c = fabs(cimag(B[j * ldb + i]) - cimag(refB[j * ldb + i])); if (c > idiff) idiff = c; size_t flops; if (side == CBlasLeft) flops = 2 * i + 1; else flops = 2 * j + 1; if (diag == CBlasNonUnit) flops++; flops *= 3; if (d > (double)flops * 2.0 * DBL_EPSILON || c > (double)flops * 2.0 * DBL_EPSILON) passed = false; } } CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC)); 
CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventRecord(start, NULL)); for (size_t i = 0; i < 20; i++) CU_ERROR_CHECK(cuZtrmm2(handle, side, uplo, trans, diag, m, n, alpha, dA, dlda, dB, dldb, dX, dldx, NULL)); CU_ERROR_CHECK(cuEventRecord(stop, NULL)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= 20; CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); const size_t flops = (side == CBlasLeft) ? (6 * (n * m * (m + 1) / 2) + 2 * (n * m * (m - 1) / 2)) : (6 * (m * n * (n + 1) / 2) + 2 * (m * n * (n - 1) / 2)); fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e + %.3ei\n%sED!\n", time * 1.e-3f, ((float)flops * 1.e-6f) / time, rdiff, idiff, (passed) ? "PASS" : "FAIL"); free(A); free(B); free(refB); CU_ERROR_CHECK(cuMemFree(dA)); CU_ERROR_CHECK(cuMemFree(dB)); CU_ERROR_CHECK(cuMemFree(dX)); CU_ERROR_CHECK(cuBLASDestroy(handle)); CU_ERROR_CHECK(cuCtxDestroy(context)); return (int)!passed; }
/*
 * NVDEC/cuvid decode entry point: optionally runs the packet through the
 * configured bitstream filter, feeds it to the CUVID parser, and -- when a
 * decoded picture is queued -- maps it and copies its planes into the output
 * frame (hardware AV_PIX_FMT_CUDA path shown; further pixel-format branches
 * follow below).
 */
static int cuvid_decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *avpkt)
{
    CuvidContext *ctx = avctx->priv_data;
    AVHWDeviceContext *device_ctx = (AVHWDeviceContext*)ctx->hwdevice->data;
    AVCUDADeviceContext *device_hwctx = device_ctx->hwctx;
    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
    AVFrame *frame = data;
    CUVIDSOURCEDATAPACKET cupkt;
    AVPacket filter_packet = { 0 };
    AVPacket filtered_packet = { 0 };
    CUdeviceptr mapped_frame = 0;
    int ret = 0, eret = 0;

    // Run the packet through the bitstream filter (e.g. annex-b conversion)
    // when one is configured and this is not a flush packet.
    if (ctx->bsf && avpkt->size) {
        if ((ret = av_packet_ref(&filter_packet, avpkt)) < 0) {
            av_log(avctx, AV_LOG_ERROR, "av_packet_ref failed\n");
            return ret;
        }

        if ((ret = av_bsf_send_packet(ctx->bsf, &filter_packet)) < 0) {
            av_log(avctx, AV_LOG_ERROR, "av_bsf_send_packet failed\n");
            av_packet_unref(&filter_packet);
            return ret;
        }

        if ((ret = av_bsf_receive_packet(ctx->bsf, &filtered_packet)) < 0) {
            av_log(avctx, AV_LOG_ERROR, "av_bsf_receive_packet failed\n");
            return ret;
        }
        avpkt = &filtered_packet; // decode the filtered packet from here on
    }

    ret = CHECK_CU(cuCtxPushCurrent(cuda_ctx));
    if (ret < 0) {
        av_packet_unref(&filtered_packet);
        return ret;
    }

    memset(&cupkt, 0, sizeof(cupkt));

    if (avpkt->size) {
        cupkt.payload_size = avpkt->size;
        cupkt.payload = avpkt->data;

        if (avpkt->pts != AV_NOPTS_VALUE) {
            cupkt.flags = CUVID_PKT_TIMESTAMP;
            // CUVID timestamps run on a 10 MHz clock (1/10000000 timebase).
            if (avctx->pkt_timebase.num && avctx->pkt_timebase.den)
                cupkt.timestamp = av_rescale_q(avpkt->pts, avctx->pkt_timebase, (AVRational){1, 10000000});
            else
                cupkt.timestamp = avpkt->pts;
        }
    } else {
        // Empty packet signals end of stream / flush to the parser.
        cupkt.flags = CUVID_PKT_ENDOFSTREAM;
    }

    ret = CHECK_CU(cuvidParseVideoData(ctx->cuparser, &cupkt));

    av_packet_unref(&filtered_packet);

    if (ret < 0) {
        goto error;
    }

    // cuvidParseVideoData doesn't return an error just because stuff failed...
    if (ctx->internal_error) {
        av_log(avctx, AV_LOG_ERROR, "cuvid decode callback error\n");
        ret = ctx->internal_error;
        goto error;
    }

    // A decoded picture is ready: pop it from the queue and map it.
    if (av_fifo_size(ctx->frame_queue)) {
        CUVIDPARSERDISPINFO dispinfo;
        CUVIDPROCPARAMS params;
        unsigned int pitch = 0;
        int offset = 0;
        int i;

        av_fifo_generic_read(ctx->frame_queue, &dispinfo, sizeof(CUVIDPARSERDISPINFO), NULL);

        memset(&params, 0, sizeof(params));
        params.progressive_frame = dispinfo.progressive_frame;
        params.second_field = 0;
        params.top_field_first = dispinfo.top_field_first;

        ret = CHECK_CU(cuvidMapVideoFrame(ctx->cudecoder, dispinfo.picture_index, &mapped_frame, &pitch, &params));
        if (ret < 0)
            goto error;

        if (avctx->pix_fmt == AV_PIX_FMT_CUDA) {
            ret = av_hwframe_get_buffer(ctx->hwframe, frame, 0);
            if (ret < 0) {
                av_log(avctx, AV_LOG_ERROR, "av_hwframe_get_buffer failed\n");
                goto error;
            }

            ret = ff_decode_frame_props(avctx, frame);
            if (ret < 0) {
                av_log(avctx, AV_LOG_ERROR, "ff_decode_frame_props failed\n");
                goto error;
            }

            // Device-to-device copy of the two NV12 planes: luma at row 0,
            // chroma at row coded_height (half height) of the mapped frame.
            for (i = 0; i < 2; i++) {
                CUDA_MEMCPY2D cpy = {
                    .srcMemoryType = CU_MEMORYTYPE_DEVICE,
                    .dstMemoryType = CU_MEMORYTYPE_DEVICE,
                    .srcDevice     = mapped_frame,
                    .dstDevice     = (CUdeviceptr)frame->data[i],
                    .srcPitch      = pitch,
                    .dstPitch      = frame->linesize[i],
                    .srcY          = offset,
                    .WidthInBytes  = FFMIN(pitch, frame->linesize[i]),
                    .Height        = avctx->coded_height >> (i ? 1 : 0),
                };

                ret = CHECK_CU(cuMemcpy2D(&cpy));
                if (ret < 0)
                    goto error;

                offset += avctx->coded_height;
            }
        } else if (avctx->pix_fmt == AV_PIX_FMT_NV12) {
/*
 * Cudagrind wrapper for cuMemcpy2DAsync (intercepts the symbol in libcuda.so
 * via Valgrind's function wrapping).
 *
 * It checks that hStream is a defined value, then records per-stream access
 * state for the device-memory/array endpoints of the copy:
 *  - destination: marked with bit 0 (pending write); a conflict is reported
 *    if another stream already holds bit 1 (pending read) on the region;
 *  - source: marked with bit 1 (pending read); a conflict is reported if
 *    another stream holds any pending access on the region.
 * Host-memory endpoints fall through both switches unhandled.
 *
 * NOTE(review): the actual transfer is forwarded to the *synchronous*
 * cuMemcpy2D() and hStream is not passed on. Calling cuMemcpy2DAsync here
 * would re-enter this wrapper, so this may be deliberate -- but confirm that
 * serializing the copy is the intended behavior.
 */
CUresult I_WRAP_SONAME_FNNAME_ZZ(libcudaZdsoZa, cuMemcpy2DAsync)(const CUDA_MEMCPY2D *pCopy, CUstream hStream) {
    int error = 0;
    long vgErrorAddress;

    vgErrorAddress = VALGRIND_CHECK_MEM_IS_DEFINED(&hStream, sizeof(CUstream));
    if (vgErrorAddress) {
        error++;
        VALGRIND_PRINTF("Error: 'hStream' in call to cuMemcpy2DAsync not defined.\n");
    }

    cgLock();

    CUcontext ctx = NULL;
    cgGetCtx(&ctx);

    // Check if destination (device) memory/array is already being written to.
    switch (pCopy->dstMemoryType) {
        case CU_MEMORYTYPE_DEVICE: {
            cgMemListType *nodeMem;
            nodeMem = cgFindMem(cgFindCtx(ctx), pCopy->dstDevice);
            if (nodeMem) {
                // Are we trying to read a memory region that's being written by diffrent stream?
                if (nodeMem->locked & 2 && nodeMem->stream != hStream) {
                    error++;
                    VALGRIND_PRINTF("Error: Concurrent write and read access by different streams.\n");
                }
                nodeMem->locked = nodeMem->locked | 1; // mark pending write
                nodeMem->stream = hStream;
            }
            break;
        }
        case CU_MEMORYTYPE_ARRAY: {
            cgArrListType *nodeArr;
            nodeArr = cgFindArr(cgFindCtx(ctx), pCopy->dstArray);
            if (nodeArr) {
                // Are we trying to read an array that's being written by different stream?
                if (nodeArr->locked & 2 && nodeArr->stream != hStream) {
                    error++;
                    VALGRIND_PRINTF("Error: Concurrent write and read access to array by different streams.\n");
                }
                nodeArr->locked = nodeArr->locked | 1; // mark pending write
                nodeArr->stream = hStream;
            }
            break;
        }
    }

    // Check if source (device) memory/array is already being written to/read from.
    switch (pCopy->srcMemoryType) {
        case CU_MEMORYTYPE_DEVICE: {
            cgMemListType *nodeMem;
            nodeMem = cgFindMem(cgFindCtx(ctx), pCopy->srcDevice);
            if (nodeMem) {
                // Are we trying to read a memory region that's being written by diffrent stream?
                if (nodeMem->locked && nodeMem->stream != hStream) {
                    error++;
                    VALGRIND_PRINTF("Error: Concurrent write and read access by different streams.\n");
                }
                nodeMem->locked = nodeMem->locked | 2; // mark pending read
                nodeMem->stream = hStream;
            }
            break;
        }
        case CU_MEMORYTYPE_ARRAY: {
            cgArrListType *nodeArr;
            nodeArr = cgFindArr(cgFindCtx(ctx), pCopy->srcArray);
            if (nodeArr) {
                // Are we trying to read an array that's being written by different stream?
                if (nodeArr->locked && nodeArr->stream != hStream) {
                    error++;
                    VALGRIND_PRINTF("Error: Concurrent write and read access to array by different streams.\n");
                }
                nodeArr->locked = nodeArr->locked | 2; // mark pending read
                nodeArr->stream = hStream;
            }
            break;
        }
    }

    cgUnlock();

    if (error) {
        VALGRIND_PRINTF_BACKTRACE("");
    }

    return cuMemcpy2D(pCopy);
}
int main() { CU_ERROR_CHECK(cuInit(0)); int count; CU_ERROR_CHECK(cuDeviceGetCount(&count)); for (int i = 0; i < count; i++) { CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, i)); int memoryClockRate, globalMemoryBusWidth; CU_ERROR_CHECK(cuDeviceGetAttribute(&memoryClockRate, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device)); CU_ERROR_CHECK(cuDeviceGetAttribute(&globalMemoryBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device)); // Calculate pin bandwidth in bytes/sec (clock rate is actual in kHz, memory is DDR so multiply clock rate by 2.e3 to get effective clock rate in Hz) double pinBandwidth = memoryClockRate * 2.e3 * (globalMemoryBusWidth / CHAR_BIT); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, 0, device)); fprintf(stdout, "Device %d (pin bandwidth %6.2f GB/s):\n", i, pinBandwidth / (1 << 30)); CUDA_MEMCPY2D copy; copy.srcMemoryType = CU_MEMORYTYPE_DEVICE; copy.dstMemoryType = CU_MEMORYTYPE_DEVICE; CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_DEFAULT)); CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_DEFAULT)); float time; // Calculate aligned copy for 32, 64 and 128-bit word sizes for (unsigned int j = 4; j <= 16; j *= 2) { copy.WidthInBytes = SIZE; copy.Height = 1; copy.srcXInBytes = 0; copy.srcY = 0; copy.dstXInBytes = 0; copy.dstY = 0; CU_ERROR_CHECK(cuMemAllocPitch(©.srcDevice, ©.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuMemAllocPitch(©.dstDevice, ©.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuEventRecord(start, 0)); for (size_t i = 0; i < ITERATIONS; i++) CU_ERROR_CHECK(cuMemcpy2D(©)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= ITERATIONS * 1.e3f; double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time; fprintf(stdout, "\taligned copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth 
/ pinBandwidth) * 100.0); CU_ERROR_CHECK(cuMemFree(copy.srcDevice)); CU_ERROR_CHECK(cuMemFree(copy.dstDevice)); } // Calculate misaligned copy for 32, 64 and 128-bit word sizes for (unsigned int j = 4; j <= 16; j *= 2) { copy.WidthInBytes = SIZE; copy.Height = 1; copy.srcXInBytes = j; copy.srcY = 0; copy.dstXInBytes = j; copy.dstY = 0; CU_ERROR_CHECK(cuMemAllocPitch(©.srcDevice, ©.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuMemAllocPitch(©.dstDevice, ©.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuEventRecord(start, 0)); for (size_t j = 0; j < ITERATIONS; j++) CU_ERROR_CHECK(cuMemcpy2D(©)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= ITERATIONS * 1.e3f; double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time; fprintf(stdout, "\tmisaligned copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0); CU_ERROR_CHECK(cuMemFree(copy.srcDevice)); CU_ERROR_CHECK(cuMemFree(copy.dstDevice)); } // Calculate stride-2 copy for 32, 64 and 128-bit word sizes for (unsigned int j = 4; j <= 16; j *= 2) { copy.WidthInBytes = SIZE / 2; copy.Height = 1; copy.srcXInBytes = 0; copy.srcY = 0; copy.dstXInBytes = 0; copy.dstY = 0; CU_ERROR_CHECK(cuMemAllocPitch(©.srcDevice, ©.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuMemAllocPitch(©.dstDevice, ©.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j)); copy.srcPitch *= 2; copy.dstPitch *= 2; CU_ERROR_CHECK(cuEventRecord(start, 0)); for (size_t i = 0; i < ITERATIONS; i++) CU_ERROR_CHECK(cuMemcpy2D(©)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= ITERATIONS * 1.e3f; double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time; 
fprintf(stdout, "\tstride-2 copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0); CU_ERROR_CHECK(cuMemFree(copy.srcDevice)); CU_ERROR_CHECK(cuMemFree(copy.dstDevice)); } // Calculate stride-10 copy for 32, 64 and 128-bit word sizes for (unsigned int j = 4; j <= 16; j *= 2) { copy.WidthInBytes = SIZE / 10; copy.Height = 1; copy.srcXInBytes = 0; copy.srcY = 0; copy.dstXInBytes = 0; copy.dstY = 0; CU_ERROR_CHECK(cuMemAllocPitch(©.srcDevice, ©.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuMemAllocPitch(©.dstDevice, ©.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j)); copy.srcPitch *= 10; copy.dstPitch *= 10; CU_ERROR_CHECK(cuEventRecord(start, 0)); for (size_t i = 0; i < ITERATIONS; i++) CU_ERROR_CHECK(cuMemcpy2D(©)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= ITERATIONS * 1.e3f; double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time; fprintf(stdout, "\tstride-10 copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0); CU_ERROR_CHECK(cuMemFree(copy.srcDevice)); CU_ERROR_CHECK(cuMemFree(copy.dstDevice)); } // Calculate stride-1000 copy for 32, 64 and 128-bit word sizes for (unsigned int j = 4; j <= 16; j *= 2) { copy.WidthInBytes = SIZE / 1000; copy.Height = 1; copy.srcXInBytes = 0; copy.srcY = 0; copy.dstXInBytes = 0; copy.dstY = 0; CU_ERROR_CHECK(cuMemAllocPitch(©.srcDevice, ©.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuMemAllocPitch(©.dstDevice, ©.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j)); copy.srcPitch *= 1000; copy.dstPitch *= 1000; CU_ERROR_CHECK(cuEventRecord(start, 0)); for (size_t j = 0; j < ITERATIONS; j++) CU_ERROR_CHECK(cuMemcpy2D(©)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); 
CU_ERROR_CHECK(cuEventSynchronize(stop)); CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= ITERATIONS * 1.e3f; double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time; fprintf(stdout, "\tstride-1000 copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0); CU_ERROR_CHECK(cuMemFree(copy.srcDevice)); CU_ERROR_CHECK(cuMemFree(copy.dstDevice)); } CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); CU_ERROR_CHECK(cuCtxDestroy(context)); } return 0; }