// YV12/IYUV are both 4:2:0 planar formats (12bpc)
// Luma, U, V chroma planar (12bpc), chroma is subsampled (w/2,h/2)
void
VideoEncoder::CopyYV12orIYUVFrame(NVVE_EncodeFrameParams &sFrameParams, CUdeviceptr dptr_VideoFrame, CUvideoctxlock ctxLock)
{
    // Source is YV12/IYUV, this native format is converted to NV12 format by the video encoder
    // (1) luma copy setup
    CUDA_MEMCPY2D stCopyLuma;
    memset((void *)&stCopyLuma, 0, sizeof(stCopyLuma));
    stCopyLuma.srcXInBytes          = 0;
    stCopyLuma.srcY                 = 0;
    stCopyLuma.srcMemoryType        = CU_MEMORYTYPE_HOST;
    stCopyLuma.srcHost              = sFrameParams.picBuf;
    stCopyLuma.srcDevice            = 0;
    stCopyLuma.srcArray             = 0;
    stCopyLuma.srcPitch             = sFrameParams.Width;

    stCopyLuma.dstXInBytes          = 0;
    stCopyLuma.dstY                 = 0;
    stCopyLuma.dstMemoryType        = CU_MEMORYTYPE_DEVICE;
    stCopyLuma.dstHost              = 0;
    stCopyLuma.dstDevice            = dptr_VideoFrame;
    stCopyLuma.dstArray             = 0;
    stCopyLuma.dstPitch             = m_pEncoderParams->nDeviceMemPitch;

    stCopyLuma.WidthInBytes         = m_pEncoderParams->iInputSize[0];
    stCopyLuma.Height               = m_pEncoderParams->iInputSize[1];

    // (2) chroma copy setup, U/V can be done together
    CUDA_MEMCPY2D stCopyChroma;
    memset((void *)&stCopyChroma, 0, sizeof(stCopyChroma));
    stCopyChroma.srcXInBytes        = 0;
    stCopyChroma.srcY               = m_pEncoderParams->iInputSize[1]<<1; // U/V chroma offset
    stCopyChroma.srcMemoryType      = CU_MEMORYTYPE_HOST;
    stCopyChroma.srcHost            = sFrameParams.picBuf;
    stCopyChroma.srcDevice          = 0;
    stCopyChroma.srcArray           = 0;
    stCopyChroma.srcPitch           = sFrameParams.Width>>1; // chroma is subsampled by 2 (but it has U/V are next to each other)

    stCopyChroma.dstXInBytes        = 0;
    stCopyChroma.dstY               = m_pEncoderParams->iInputSize[1]<<1; // chroma offset (srcY*srcPitch now points to the chroma planes)
    stCopyChroma.dstMemoryType      = CU_MEMORYTYPE_DEVICE;
    stCopyChroma.dstHost            = 0;
    stCopyChroma.dstDevice          = dptr_VideoFrame;
    stCopyChroma.dstArray           = 0;
    stCopyChroma.dstPitch           = m_pEncoderParams->nDeviceMemPitch>>1;

    stCopyChroma.WidthInBytes       = m_pEncoderParams->iInputSize[0]>>1;
    stCopyChroma.Height             = m_pEncoderParams->iInputSize[1]; // U/V are sent together

    // Don't forget we need to lock/unlock between memcopies
    checkCudaErrors(cuvidCtxLock(ctxLock, 0));
    checkCudaErrors(cuMemcpy2D(&stCopyLuma));       // Now DMA Luma
    checkCudaErrors(cuMemcpy2D(&stCopyChroma));     // Now DMA Chroma channels (UV side by side)
    checkCudaErrors(cuvidCtxUnlock(ctxLock, 0));
}
// UYVY/YUY2 are both 4:2:2 formats (16bpc)
// Luma, U, V are interleaved, chroma is subsampled (w/2,h)
void
VideoEncoder::CopyUYVYorYUY2Frame(NVVE_EncodeFrameParams &sFrameParams, CUdeviceptr dptr_VideoFrame, CUvideoctxlock ctxLock)
{
    // Source is YUVY/YUY2 4:2:2, the YUV data in a packed and interleaved
    // YUV Copy setup
    CUDA_MEMCPY2D stCopyYUV422;
    memset((void *)&stCopyYUV422, 0, sizeof(stCopyYUV422));
    stCopyYUV422.srcXInBytes          = 0;
    stCopyYUV422.srcY                 = 0;
    stCopyYUV422.srcMemoryType        = CU_MEMORYTYPE_HOST;
    stCopyYUV422.srcHost              = sFrameParams.picBuf;
    stCopyYUV422.srcDevice            = 0;
    stCopyYUV422.srcArray             = 0;
    stCopyYUV422.srcPitch             = sFrameParams.Width * 2;

    stCopyYUV422.dstXInBytes          = 0;
    stCopyYUV422.dstY                 = 0;
    stCopyYUV422.dstMemoryType        = CU_MEMORYTYPE_DEVICE;
    stCopyYUV422.dstHost              = 0;
    stCopyYUV422.dstDevice            = dptr_VideoFrame;
    stCopyYUV422.dstArray             = 0;
    stCopyYUV422.dstPitch             = m_pEncoderParams->nDeviceMemPitch;

    stCopyYUV422.WidthInBytes         = m_pEncoderParams->iInputSize[0]*2;
    stCopyYUV422.Height               = m_pEncoderParams->iInputSize[1];

    // Don't forget we need to lock/unlock between memcopies
    checkCudaErrors(cuvidCtxLock(ctxLock, 0));
    checkCudaErrors(cuMemcpy2D(&stCopyYUV422));     // Now DMA Luma/Chroma
    checkCudaErrors(cuvidCtxUnlock(ctxLock, 0));
}
// NV12 is 4:2:0 format (12bpc)
// Luma followed by U/V chroma interleaved (12bpc), chroma is subsampled (w/2,h/2)
void
VideoEncoder::CopyNV12Frame(NVVE_EncodeFrameParams &sFrameParams, CUdeviceptr dptr_VideoFrame, CUvideoctxlock ctxLock)
{
    // Source is NV12 in pitch linear memory
    // Because we are assume input is NV12 (if we take input in the native format), the encoder handles NV12 as a native format in pitch linear memory
    // Luma/Chroma can be done in a single transfer
    CUDA_MEMCPY2D stCopyNV12;
    memset((void *)&stCopyNV12, 0, sizeof(stCopyNV12));
    stCopyNV12.srcXInBytes          = 0;
    stCopyNV12.srcY                 = 0;
    stCopyNV12.srcMemoryType        = CU_MEMORYTYPE_HOST;
    stCopyNV12.srcHost              = sFrameParams.picBuf;
    stCopyNV12.srcDevice            = 0;
    stCopyNV12.srcArray             = 0;
    stCopyNV12.srcPitch             = sFrameParams.Width;

    stCopyNV12.dstXInBytes          = 0;
    stCopyNV12.dstY                 = 0;
    stCopyNV12.dstMemoryType        = CU_MEMORYTYPE_DEVICE;
    stCopyNV12.dstHost              = 0;
    stCopyNV12.dstDevice            = dptr_VideoFrame;
    stCopyNV12.dstArray             = 0;
    stCopyNV12.dstPitch             = m_pEncoderParams->nDeviceMemPitch;

    stCopyNV12.WidthInBytes         = m_pEncoderParams->iInputSize[0];
    stCopyNV12.Height               =(m_pEncoderParams->iInputSize[1] * 3) >> 1;

    // Don't forget we need to lock/unlock between memcopies
    checkCudaErrors(cuvidCtxLock(ctxLock, 0));
    checkCudaErrors(cuMemcpy2D(&stCopyNV12));    // Now DMA Luma/Chroma
    checkCudaErrors(cuvidCtxUnlock(ctxLock, 0));
}
NVENCSTATUS VideoEncoder::EncodeFrame(EncodeFrameConfig *pEncodeFrame, NV_ENC_PIC_STRUCT picType, bool bFlush)
{
    NVENCSTATUS nvStatus = NV_ENC_SUCCESS;

    if (bFlush)
    {
        FlushEncoder();
        return NV_ENC_SUCCESS;
    }

    assert(pEncodeFrame);

    EncodeBuffer *pEncodeBuffer = m_EncodeBufferQueue.GetAvailable();
    if (!pEncodeBuffer)
    {
        pEncodeBuffer = m_EncodeBufferQueue.GetPending();
        m_pNvHWEncoder->ProcessOutput(pEncodeBuffer);
        // UnMap the input buffer after frame done
        if (pEncodeBuffer->stInputBfr.hInputSurface)
        {
            nvStatus = m_pNvHWEncoder->NvEncUnmapInputResource(pEncodeBuffer->stInputBfr.hInputSurface);
            pEncodeBuffer->stInputBfr.hInputSurface = NULL;
        }
        pEncodeBuffer = m_EncodeBufferQueue.GetAvailable();
    }

    // encode width and height
    unsigned int dwWidth  = pEncodeBuffer->stInputBfr.dwWidth;
    unsigned int dwHeight = pEncodeBuffer->stInputBfr.dwHeight;

    // Here we copy from Host to Device Memory (CUDA)
    cuvidCtxLock(m_ctxLock, 0);
    assert(pEncodeFrame->width == dwWidth && pEncodeFrame->height == dwHeight);

    CUDA_MEMCPY2D memcpy2D  = {0};
    memcpy2D.srcMemoryType  = CU_MEMORYTYPE_DEVICE;
    memcpy2D.srcDevice      = pEncodeFrame->dptr;
    memcpy2D.srcPitch       = pEncodeFrame->pitch;
    memcpy2D.dstMemoryType  = CU_MEMORYTYPE_DEVICE;
    memcpy2D.dstDevice      = (CUdeviceptr)pEncodeBuffer->stInputBfr.pNV12devPtr;
    memcpy2D.dstPitch       = pEncodeBuffer->stInputBfr.uNV12Stride;
    memcpy2D.WidthInBytes   = dwWidth;
    memcpy2D.Height         = dwHeight*3/2;
    __cu(cuMemcpy2D(&memcpy2D));

    cuvidCtxUnlock(m_ctxLock, 0);

    nvStatus = m_pNvHWEncoder->NvEncMapInputResource(pEncodeBuffer->stInputBfr.nvRegisteredResource, &pEncodeBuffer->stInputBfr.hInputSurface);
    if (nvStatus != NV_ENC_SUCCESS)
    {
        PRINTERR("Failed to Map input buffer %p\n", pEncodeBuffer->stInputBfr.hInputSurface);
        return nvStatus;
    }

    m_pNvHWEncoder->NvEncEncodeFrame(pEncodeBuffer, NULL, pEncodeFrame->width, pEncodeFrame->height, picType);
    m_iEncodedFrames++;

    return NV_ENC_SUCCESS;
}
Exemple #5
0
	void tex_alloc(const char *name, device_memory& mem, bool interpolation, bool periodic)
	{
		/* determine format */
		CUarray_format_enum format;
		size_t dsize = datatype_size(mem.data_type);
		size_t size = mem.memory_size();

		switch(mem.data_type) {
			case TYPE_UCHAR: format = CU_AD_FORMAT_UNSIGNED_INT8; break;
			case TYPE_UINT: format = CU_AD_FORMAT_UNSIGNED_INT32; break;
			case TYPE_INT: format = CU_AD_FORMAT_SIGNED_INT32; break;
			case TYPE_FLOAT: format = CU_AD_FORMAT_FLOAT; break;
			default: assert(0); return;
		}

		CUtexref texref = NULL;

		cuda_push_context();
		cuda_assert(cuModuleGetTexRef(&texref, cuModule, name))

		if(!texref) {
			cuda_pop_context();
			return;
		}

		if(interpolation) {
			CUarray handle = NULL;
			CUDA_ARRAY_DESCRIPTOR desc;

			desc.Width = mem.data_width;
			desc.Height = mem.data_height;
			desc.Format = format;
			desc.NumChannels = mem.data_elements;

			cuda_assert(cuArrayCreate(&handle, &desc))

			if(!handle) {
				cuda_pop_context();
				return;
			}

			if(mem.data_height > 1) {
				CUDA_MEMCPY2D param;
				memset(&param, 0, sizeof(param));
				param.dstMemoryType = CU_MEMORYTYPE_ARRAY;
				param.dstArray = handle;
				param.srcMemoryType = CU_MEMORYTYPE_HOST;
				param.srcHost = (void*)mem.data_pointer;
				param.srcPitch = mem.data_width*dsize*mem.data_elements;
				param.WidthInBytes = param.srcPitch;
				param.Height = mem.data_height;

				cuda_assert(cuMemcpy2D(&param))
			}
			else
NVENCSTATUS CNvEncoderLowLatency::ConvertYUVToNV12(CUdeviceptr dNV12devPtr, int dstPitch, unsigned char *yuv[3],
                                                    int width, int height, int maxWidth, int maxHeight)
{
    CCudaAutoLock cuLock(m_cuContext);
    // copy luma
    CUDA_MEMCPY2D copyParam;
    memset(&copyParam, 0, sizeof(copyParam));
    copyParam.dstMemoryType = CU_MEMORYTYPE_DEVICE;
    copyParam.dstDevice = dNV12devPtr;
    copyParam.dstPitch = dstPitch;
    copyParam.srcMemoryType = CU_MEMORYTYPE_HOST;
    copyParam.srcHost = yuv[0];
    copyParam.srcPitch = width;
    copyParam.WidthInBytes = width;
    copyParam.Height = height;
    __cu(cuMemcpy2D(&copyParam));

    // copy chroma

    __cu(cuMemcpyHtoD(m_ChromaDevPtr[0], yuv[1], width*height / 4));
    __cu(cuMemcpyHtoD(m_ChromaDevPtr[1], yuv[2], width*height / 4));

#define BLOCK_X 32
#define BLOCK_Y 16
    int chromaHeight = height / 2;
    int chromaWidth = width / 2;
    dim3 block(BLOCK_X, BLOCK_Y, 1);
    dim3 grid((chromaWidth + BLOCK_X - 1) / BLOCK_X, (chromaHeight + BLOCK_Y - 1) / BLOCK_Y, 1);
#undef BLOCK_Y
#undef BLOCK_X

    CUdeviceptr dNV12Chroma = (CUdeviceptr)((unsigned char*)dNV12devPtr + dstPitch*maxHeight);
    void *args[8] = { &m_ChromaDevPtr[0], &m_ChromaDevPtr[1], &dNV12Chroma, &chromaWidth, &chromaHeight, &chromaWidth, &chromaWidth, &dstPitch };

    __cu(cuLaunchKernel(m_cuInterleaveUVFunction, grid.x, grid.y, grid.z,
        block.x, block.y, block.z,
        0,
        NULL, args, NULL));
    CUresult cuResult = cuStreamQuery(NULL);
    if (!((cuResult == CUDA_SUCCESS) || (cuResult == CUDA_ERROR_NOT_READY)))
    {
        return NV_ENC_ERR_GENERIC;
    }
    return NV_ENC_SUCCESS;
}
Exemple #7
0
  void memory_t<CUDA>::copyTo(void *dest,
                              const uintptr_t bytes,
                              const uintptr_t offset){
    const uintptr_t bytes_ = (bytes == 0) ? size : bytes;

    OCCA_CHECK((bytes_ + offset) <= size);

    if(!isTexture)
      OCCA_CUDA_CHECK("Memory: Copy To",
                      cuMemcpyDtoH(dest, *((CUdeviceptr*) handle) + offset, bytes_) );
    else{
      if(textureInfo.dim == 1)
        OCCA_CUDA_CHECK("Texture Memory: Copy To",
                        cuMemcpyAtoH(dest, ((CUDATextureData_t*) handle)->array, offset, bytes_) );
      else{
        CUDA_MEMCPY2D info;

        info.srcXInBytes   = offset;
        info.srcY          = 0;
        info.srcMemoryType = CU_MEMORYTYPE_ARRAY;
        info.srcArray      = ((CUDATextureData_t*) handle)->array;

        info.dstXInBytes   = 0;
        info.dstY          = 0;
        info.dstMemoryType = CU_MEMORYTYPE_HOST;
        info.dstHost       = dest;
        info.dstPitch      = 0;

        info.WidthInBytes = textureInfo.w * textureInfo.bytesInEntry;
        info.Height       = (bytes_ / info.WidthInBytes);

        cuMemcpy2D(&info);

        dev->finish();
      }
    }
  }
Exemple #8
0
static int cuvid_output_frame(AVCodecContext *avctx, AVFrame *frame)
{
    CuvidContext *ctx = avctx->priv_data;
    AVHWDeviceContext *device_ctx = (AVHWDeviceContext*)ctx->hwdevice->data;
    AVCUDADeviceContext *device_hwctx = device_ctx->hwctx;
    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
    CUdeviceptr mapped_frame = 0;
    int ret = 0, eret = 0;

    av_log(avctx, AV_LOG_TRACE, "cuvid_output_frame\n");

    if (ctx->decoder_flushing) {
        ret = cuvid_decode_packet(avctx, NULL);
        if (ret < 0 && ret != AVERROR_EOF)
            return ret;
    }

    ret = CHECK_CU(cuCtxPushCurrent(cuda_ctx));
    if (ret < 0)
        return ret;

    if (av_fifo_size(ctx->frame_queue)) {
        CuvidParsedFrame parsed_frame;
        CUVIDPROCPARAMS params;
        unsigned int pitch = 0;
        int offset = 0;
        int i;

        av_fifo_generic_read(ctx->frame_queue, &parsed_frame, sizeof(CuvidParsedFrame), NULL);

        memset(&params, 0, sizeof(params));
        params.progressive_frame = parsed_frame.dispinfo.progressive_frame;
        params.second_field = parsed_frame.second_field;
        params.top_field_first = parsed_frame.dispinfo.top_field_first;

        ret = CHECK_CU(cuvidMapVideoFrame(ctx->cudecoder, parsed_frame.dispinfo.picture_index, &mapped_frame, &pitch, &params));
        if (ret < 0)
            goto error;

        if (avctx->pix_fmt == AV_PIX_FMT_CUDA) {
            ret = av_hwframe_get_buffer(ctx->hwframe, frame, 0);
            if (ret < 0) {
                av_log(avctx, AV_LOG_ERROR, "av_hwframe_get_buffer failed\n");
                goto error;
            }

            ret = ff_decode_frame_props(avctx, frame);
            if (ret < 0) {
                av_log(avctx, AV_LOG_ERROR, "ff_decode_frame_props failed\n");
                goto error;
            }

            for (i = 0; i < 2; i++) {
                CUDA_MEMCPY2D cpy = {
                    .srcMemoryType = CU_MEMORYTYPE_DEVICE,
                    .dstMemoryType = CU_MEMORYTYPE_DEVICE,
                    .srcDevice     = mapped_frame,
                    .dstDevice     = (CUdeviceptr)frame->data[i],
                    .srcPitch      = pitch,
                    .dstPitch      = frame->linesize[i],
                    .srcY          = offset,
                    .WidthInBytes  = FFMIN(pitch, frame->linesize[i]),
                    .Height        = avctx->height >> (i ? 1 : 0),
                };

                ret = CHECK_CU(cuMemcpy2D(&cpy));
                if (ret < 0)
                    goto error;

                offset += avctx->coded_height;
            }
        } else if (avctx->pix_fmt == AV_PIX_FMT_NV12) {
int main(int argc, char * argv[]) {
  CBlasTranspose transA, transB;
  size_t m, n, k;
  int d = 0;

  if (argc < 6 || argc > 7) {
    fprintf(stderr, "Usage: %s <transA> <transB> <m> <n> <k> [device]\n"
                    "where:\n"
                    "  transA and transB  are 'n' or 'N' for CBlasNoTrans, 't' or 'T' for CBlasTrans or 'c' or 'C' for CBlasConjTrans\n"
                    "  m, n and k         are the sizes of the matrices\n"
                    "  device             is the GPU to use (default 0)\n", argv[0]);
    return 1;
  }

  char t;
  if (sscanf(argv[1], "%c", &t) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[1]);
    return 1;
  }
  switch (t) {
    case 'N': case 'n': transA = CBlasNoTrans; break;
    case 'T': case 't': transA = CBlasTrans; break;
    case 'C': case 'c': transA = CBlasConjTrans; break;
    default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 1;
  }

  if (sscanf(argv[2], "%c", &t) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[2]);
    return 2;
  }
  switch (t) {
    case 'N': case 'n': transB = CBlasNoTrans; break;
    case 'T': case 't': transB = CBlasTrans; break;
    case 'C': case 'c': transB = CBlasConjTrans; break;
    default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 1;
  }

  if (sscanf(argv[3], "%zu", &m) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]);
    return 3;
  }

  if (sscanf(argv[4], "%zu", &n) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[4]);
    return 4;
  }

  if (sscanf(argv[5], "%zu", &k) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]);
    return 5;
  }

  if (argc > 6) {
    if (sscanf(argv[6], "%d", &d) != 1) {
      fprintf(stderr, "Unable to parse number from '%s'\n", argv[6]);
      return 6;
    }
  }

  srand(0);

  float complex alpha, beta, * A, * B, * C, * refC;
  CUdeviceptr dA, dB, dC, dD;
  size_t lda, ldb, ldc, dlda, dldb, dldc, dldd;

  CU_ERROR_CHECK(cuInit(0));

  CUdevice device;
  CU_ERROR_CHECK(cuDeviceGet(&device, d));

  CUcontext context;
  CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device));

  CUBLAShandle handle;
  CU_ERROR_CHECK(cuBLASCreate(&handle));

  alpha = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I;
  beta = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I;

  if (transA == CBlasNoTrans) {
    lda = (m + 1u) & ~1u;
    if ((A = malloc(lda * k * sizeof(float complex))) == NULL) {
      fputs("Unable to allocate A\n", stderr);
      return -1;
    }
    CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, m * sizeof(float complex), k, sizeof(float complex)));
    dlda /= sizeof(float complex);

    for (size_t j = 0; j < k; j++) {
      for (size_t i = 0; i < m; i++)
        A[j * lda + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I;
    }

    CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(float complex),
                           0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(float complex),
                           m * sizeof(float complex), k };
    CU_ERROR_CHECK(cuMemcpy2D(&copy));
  }
  else {
    lda = (k + 1u) & ~1u;
    if ((A = malloc(lda * m * sizeof(float complex))) == NULL) {
      fputs("Unable to allocate A\n", stderr);
      return -1;
    }
    CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, k * sizeof(float complex), m, sizeof(float complex)));
    dlda /= sizeof(float complex);

    for (size_t j = 0; j < m; j++) {
      for (size_t i = 0; i < k; i++)
        A[j * lda + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I;
    }

    CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(float complex),
                           0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(float complex),
                           k * sizeof(float complex), m };
    CU_ERROR_CHECK(cuMemcpy2D(&copy));
  }

  if (transB == CBlasNoTrans) {
    ldb = (k + 1u) & ~1u;
    if ((B = malloc(ldb * n * sizeof(float complex))) == NULL) {
      fputs("Unable to allocate B\n", stderr);
      return -2;
    }
    CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, k * sizeof(float complex), n, sizeof(float complex)));
    dldb /= sizeof(float complex);

    for (size_t j = 0; j < n; j++) {
      for (size_t i = 0; i < k; i++)
        B[j * ldb + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I;
    }

    CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(float complex),
                           0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(float complex),
                           k * sizeof(float complex), n };
    CU_ERROR_CHECK(cuMemcpy2D(&copy));
  }
  else {
    ldb = (n + 1u) & ~1u;
    if ((B = malloc(ldb * k * sizeof(float complex))) == NULL) {
      fputs("Unable to allocate B\n", stderr);
      return -2;
    }
    CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, n * sizeof(float complex), k, sizeof(float complex)));
    dldb /= sizeof(float complex);

    for (size_t j = 0; j < k; j++) {
      for (size_t i = 0; i < n; i++)
        B[j * ldb + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I;
    }

    CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(float complex),
                           0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(float complex),
                           n * sizeof(float complex), k };
    CU_ERROR_CHECK(cuMemcpy2D(&copy));
  }

  ldc = (m + 1u) & ~1u;
  if ((C = malloc(ldc * n * sizeof(float complex))) == NULL) {
    fputs("Unable to allocate C\n", stderr);
    return -3;
  }
  if ((refC = malloc(ldc * n * sizeof(float complex))) == NULL) {
    fputs("Unable to allocate refC\n", stderr);
    return -4;
  }
  CU_ERROR_CHECK(cuMemAllocPitch(&dC, &dldc, m * sizeof(float complex), n, sizeof(float complex)));
  dldc /= sizeof(float complex);
  CU_ERROR_CHECK(cuMemAllocPitch(&dD, &dldd, m * sizeof(float complex), n, sizeof(float complex)));
  dldd /= sizeof(float complex);

  for (size_t j = 0; j < n; j++) {
    for (size_t i = 0; i < m; i++)
      refC[j * ldc + i] = C[j * ldc + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I;
  }

  CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(float complex),
                         0, 0, CU_MEMORYTYPE_DEVICE, NULL, dC, NULL, dldc * sizeof(float complex),
                         m * sizeof(float complex), n };
  CU_ERROR_CHECK(cuMemcpy2D(&copy));

  cgemm_ref(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, refC, ldc);
  CU_ERROR_CHECK(cuCgemm2(handle, transA, transB, m, n, k, alpha, dA, dlda, dB, dldb, beta, dC, dldc, dD, dldd, NULL));

  copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dD, NULL, dldd * sizeof(float complex),
           0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(float complex),
           m * sizeof(float complex), n };
  CU_ERROR_CHECK(cuMemcpy2D(&copy));

  float rdiff = 0.0f, idiff = 0.0f;
  for (size_t j = 0; j < n; j++) {
    for (size_t i = 0; i < m; i++) {
      float d = fabsf(crealf(C[j * ldc + i]) - crealf(refC[j * ldc + i]));
      if (d > rdiff)
        rdiff = d;
      d = fabsf(cimagf(C[j * ldc + i]) - cimagf(refC[j * ldc + i]));
      if (d > idiff)
        idiff = d;
    }
  }

  CUevent start, stop;
  CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC));
  CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC));

  CU_ERROR_CHECK(cuEventRecord(start, NULL));
  for (size_t i = 0; i < 20; i++)
    CU_ERROR_CHECK(cuCgemm2(handle, transA, transB, m, n, k, alpha, dA, dlda, dB, dldb, beta, dC, dldc, dD, dldd, NULL));
  CU_ERROR_CHECK(cuEventRecord(stop, NULL));
  CU_ERROR_CHECK(cuEventSynchronize(stop));

  float time;
  CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));
  time /= 20;

  CU_ERROR_CHECK(cuEventDestroy(start));
  CU_ERROR_CHECK(cuEventDestroy(stop));

  size_t flops = k * 6 + (k - 1) * 2;   // k multiplies and k - 1 adds per element
  if (alpha != 1.0f + 0.0f * I)
    flops += 6;                 // additional multiply by alpha
  if (beta != 0.0f + 0.0f * I)
    flops += 8;                 // additional multiply and add by beta
  float error = (float)flops * 2.0f * FLT_EPSILON;     // maximum per element error
  flops *= m * n;               // m * n elements

  bool passed = (rdiff <= error) && (idiff <= error);
  fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e + %.3ei\n%sED!\n", time * 1.e-3f,
          ((float)flops * 1.e-6f) / time, rdiff, idiff, (passed) ? "PASS" : "FAIL");

  free(A);
  free(B);
  free(C);
  free(refC);
  CU_ERROR_CHECK(cuMemFree(dA));
  CU_ERROR_CHECK(cuMemFree(dB));
  CU_ERROR_CHECK(cuMemFree(dC));
  CU_ERROR_CHECK(cuMemFree(dD));

  CU_ERROR_CHECK(cuBLASDestroy(handle));

  CU_ERROR_CHECK(cuCtxDestroy(context));

  return (int)!passed;
}
Exemple #10
0
int main(int argc, char * argv[]) {
  CBlasUplo uplo;
  CBlasTranspose trans;
  size_t n, k;
  int d = 0;

  if (argc < 5 || argc > 6) {
    fprintf(stderr, "Usage: %s <uplo> <trans> <n> <k> [device]\n"
                    "where:\n"
                    "  uplo     is 'u' or 'U' for CBlasUpper or 'l' or 'L' for CBlasLower\n"
                    "  trans    are 'n' or 'N' for CBlasNoTrans or 't' or 'T' for CBlasTrans\n"
                    "  n and k  are the sizes of the matrices\n"
                    "  device   is the GPU to use (default 0)\n", argv[0]);
    return 1;
  }

  char u;
  if (sscanf(argv[1], "%c", &u) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[1]);
    return 1;
  }
  switch (u) {
    case 'U': case 'u': uplo = CBlasUpper; break;
    case 'L': case 'l': uplo = CBlasLower; break;
    default: fprintf(stderr, "Unknown uplo '%c'\n", u); return 1;
  }

  char t;
  if (sscanf(argv[2], "%c", &t) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[2]);
    return 2;
  }
  switch (t) {
    case 'N': case 'n': trans = CBlasNoTrans; break;
    case 'T': case 't': trans = CBlasTrans; break;
    case 'C': case 'c': trans = CBlasConjTrans; break;
    default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 2;
  }

  if (sscanf(argv[3], "%zu", &n) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]);
    return 3;
  }

  if (sscanf(argv[4], "%zu", &k) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[4]);
    return 4;
  }

  if (argc > 5) {
    if (sscanf(argv[5], "%d", &d) != 1) {
      fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]);
      return 5;
    }
  }

  srand(0);

  double alpha, beta, * A, * C, * refC;
  CUdeviceptr dA, dC;
  size_t lda, ldc, dlda, dldc;

  CU_ERROR_CHECK(cuInit(0));

  CUdevice device;
  CU_ERROR_CHECK(cuDeviceGet(&device, d));

  CUcontext context;
  CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device));

  CUBLAShandle handle;
  CU_ERROR_CHECK(cuBLASCreate(&handle));

  alpha = (double)rand() / (double)RAND_MAX;
  beta = (double)rand() / (double)RAND_MAX;

  if (trans == CBlasNoTrans) {
    lda = (n + 1u) & ~1u;
    if ((A = malloc(lda * k * sizeof(double))) == NULL) {
      fputs("Unable to allocate A\n", stderr);
      return -1;
    }
    CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, n * sizeof(double), k, sizeof(double)));
    dlda /= sizeof(double);

    for (size_t j = 0; j < k; j++) {
      for (size_t i = 0; i < n; i++)
        A[j * lda + i] = (double)rand() / (double)RAND_MAX;
    }

    CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double),
                           0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double),
                           n * sizeof(double), k };
    CU_ERROR_CHECK(cuMemcpy2D(&copy));
  }
  else {
    lda = (k + 1u) & ~1u;
    if ((A = malloc(lda * n * sizeof(double))) == NULL) {
      fputs("Unable to allocate A\n", stderr);
      return -1;
    }
    CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, k * sizeof(double), n, sizeof(double)));
    dlda /= sizeof(double);

    for (size_t j = 0; j < n; j++) {
      for (size_t i = 0; i < k; i++)
        A[j * lda + i] = (double)rand() / (double)RAND_MAX;
    }

    CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double),
                           0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double),
                           k * sizeof(double), n };
    CU_ERROR_CHECK(cuMemcpy2D(&copy));
  }

  ldc = (n + 1u) & ~1u;
  if ((C = malloc(ldc * n * sizeof(double))) == NULL) {
    fputs("Unable to allocate C\n", stderr);
    return -3;
  }
  if ((refC = malloc(ldc * n * sizeof(double))) == NULL) {
    fputs("Unable to allocate refC\n", stderr);
    return -4;
  }
  CU_ERROR_CHECK(cuMemAllocPitch(&dC, &dldc, n * sizeof(double), n, sizeof(double)));
  dldc /= sizeof(double);

  for (size_t j = 0; j < n; j++) {
    for (size_t i = 0; i < n; i++)
      refC[j * ldc + i] = C[j * ldc + i] = (double)rand() / (double)RAND_MAX;
  }

  CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(double),
                         0, 0, CU_MEMORYTYPE_DEVICE, NULL, dC, NULL, dldc * sizeof(double),
                         n * sizeof(double), n };
  CU_ERROR_CHECK(cuMemcpy2D(&copy));

  dsyrk_ref(uplo, trans, n, k, alpha, A, lda, beta, refC, ldc);
  CU_ERROR_CHECK(cuDsyrk(handle, uplo, trans, n, k, alpha, dA, dlda, beta, dC, dldc, NULL));

  copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dC, NULL, dldc * sizeof(double),
           0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(double),
           n * sizeof(double), n };
  CU_ERROR_CHECK(cuMemcpy2D(&copy));

  double diff = 0.0;
  for (size_t j = 0; j < n; j++) {
    for (size_t i = 0; i < n; i++) {
      double d = fabs(C[j * ldc + i] - refC[j * ldc + i]);
      if (d > diff)
        diff = d;
    }
  }

  CUevent start, stop;
  CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC));
  CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC));

  CU_ERROR_CHECK(cuEventRecord(start, NULL));
  for (size_t i = 0; i < 20; i++)
    CU_ERROR_CHECK(cuDsyrk(handle, uplo, trans, n, k, alpha, dA, dlda, beta, dC, dldc, NULL));
  CU_ERROR_CHECK(cuEventRecord(stop, NULL));
  CU_ERROR_CHECK(cuEventSynchronize(stop));

  float time;
  CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));
  time /= 20;

  CU_ERROR_CHECK(cuEventDestroy(start));
  CU_ERROR_CHECK(cuEventDestroy(stop));

  size_t flops = 2 * k - 1;     // k multiplies and k - 1 adds per element
  if (alpha != 1.0)
    flops += 1;                 // additional multiply by alpha
  if (beta != 0.0)
    flops += 2;                 // additional multiply and add by beta
  double error = (double)flops * 2.0 * DBL_EPSILON;   // maximum per element error
  flops *= n * (n + 1) / 2;     // n(n + 1) / 2 elements

  bool passed = (diff <= error);
  fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e\n%sED!\n", time * 1.e-3f,
          ((float)flops * 1.e-6f) / time, diff, (passed) ? "PASS" : "FAIL");

  free(A);
  free(C);
  free(refC);
  CU_ERROR_CHECK(cuMemFree(dA));
  CU_ERROR_CHECK(cuMemFree(dC));

  CU_ERROR_CHECK(cuBLASDestroy(handle));

  CU_ERROR_CHECK(cuCtxDestroy(context));

  return (int)!passed;
}
Exemple #11
0
int main(int argc, char * argv[]) {
  CBlasUplo uplo;
  size_t n;
  int d = 0;

  if (argc < 3 || argc > 4) {
    fprintf(stderr, "Usage: %s <uplo> <n>\n"
                    "where:\n"
                    "  uplo    is 'u' or 'U' for CBlasUpper or 'l' or 'L' for CBlasLower\n"
                    "  n       is the size of the matrix\n"
                    "  device  is the GPU to use (default 0)\n", argv[0]);
    return 1;
  }

  char u;
  if (sscanf(argv[1], "%c", &u) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[1]);
    return 1;
  }
  switch (u) {
    case 'U': case 'u': uplo = CBlasUpper; break;
    case 'L': case 'l': uplo = CBlasLower; break;
    default: fprintf(stderr, "Unknown uplo '%c'\n", u); return 1;
  }

  if (sscanf(argv[2], "%zu", &n) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[2]);
    return 2;
  }

  if (argc > 3) {
    if (sscanf(argv[3], "%d", &d) != 1) {
      fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]);
      return 3;
    }
  }

  srand(0);

  double * A, * refA;
  CUdeviceptr dA;
  size_t lda, dlda;
  long info, rInfo;

  CU_ERROR_CHECK(cuInit(0));

  CUdevice device;
  CU_ERROR_CHECK(cuDeviceGet(&device, d));

  CUcontext context;
  CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device));

  CULAPACKhandle handle;
  CU_ERROR_CHECK(cuLAPACKCreate(&handle));

  lda = (n + 1u) & ~1u;
  if ((A = malloc(lda *  n * sizeof(double))) == NULL) {
    fputs("Unable to allocate A\n", stderr);
    return -1;
  }
  if ((refA = malloc(lda * n * sizeof(double))) == NULL) {
    fputs("Unable to allocate refA\n", stderr);
    return -2;
  }
  CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, n * sizeof(double), n, sizeof(double)));
  dlda /= sizeof(double);

  if (dlatmc(n, 2.0, A, lda) != 0) {
    fputs("Unable to initialise A\n", stderr);
    return -1;
  }

//   dpotrf(uplo, n, A, lda, &info);
//   if (info != 0) {
//     fputs("Failed to compute Cholesky decomposition of A\n", stderr);
//     return (int)info;
//   }

  for (size_t j = 0; j < n; j++)
    memcpy(&refA[j * lda], &A[j * lda], n * sizeof(double));

  CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double),
                         0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double),
                         n * sizeof(double), n };
  CU_ERROR_CHECK(cuMemcpy2D(&copy));

  dlauum_ref(uplo, n, refA, lda, &rInfo);
  CU_ERROR_CHECK(cuDlauum(handle, uplo, n, dA, dlda, &info));

  copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double),
                          0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double),
                          n * sizeof(double), n };
  CU_ERROR_CHECK(cuMemcpy2D(&copy));

  bool passed = (info == rInfo);
  double diff = 0.0;
  for (size_t j = 0; j < n; j++) {
    for (size_t i = 0; i < n; i++) {
      double d = fabs(A[j * lda + i] - refA[j * lda + i]);
      if (d > diff)
        diff = d;
    }
  }

  // Set A to identity so that repeated applications of the cholesky
  // decomposition while benchmarking do not exit early due to
  // non-positive-definite-ness.
  for (size_t j = 0; j < n; j++) {
    for (size_t i = 0; i < n; i++)
      A[j * lda + i] = (i == j) ? 1.0 : 0.0;
  }

  copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double),
                          0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double),
                          n * sizeof(double), n };
  CU_ERROR_CHECK(cuMemcpy2D(&copy));

  CUevent start, stop;
  CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC));
  CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC));

  CU_ERROR_CHECK(cuEventRecord(start, NULL));
  for (size_t i = 0; i < 20; i++)
    CU_ERROR_CHECK(cuDlauum(handle, uplo, n, dA, dlda, &info));
  CU_ERROR_CHECK(cuEventRecord(stop, NULL));
  CU_ERROR_CHECK(cuEventSynchronize(stop));

  float time;
  CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));
  time /= 20;

  CU_ERROR_CHECK(cuEventDestroy(start));
  CU_ERROR_CHECK(cuEventDestroy(stop));

  const size_t flops = ((n * n * n) / 3) + ((n * n) / 2) + (n / 6);
  fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e\n%sED!\n", time * 1.e-3f,
          ((float)flops * 1.e-6f) / time, diff, (passed) ? "PASS" : "FAIL");

  free(A);
  free(refA);
  CU_ERROR_CHECK(cuMemFree(dA));

  CU_ERROR_CHECK(cuLAPACKDestroy(handle));

  CU_ERROR_CHECK(cuCtxDestroy(context));

  return (int)!passed;
}
Exemple #12
0
bool EGLInteropResource::map(int picIndex, const CUVIDPROCPARAMS &param, GLuint tex, int w, int h, int H, int plane)
{
    // plane is always 0 because frame is rgb
    AutoCtxLock locker((cuda_api*)this, lock);
    Q_UNUSED(locker);
    if (!ensureResource(w, h, param.Reserved[0], H, tex)) // TODO surface size instead of frame size because we copy the device data
        return false;
    //CUDA_ENSURE(cuCtxPushCurrent(ctx), false);
    CUdeviceptr devptr;
    unsigned int pitch;

    CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(&param)), false);
    CUVIDAutoUnmapper unmapper(this, dec, devptr);
    Q_UNUSED(unmapper);
    // TODO: why can not use res[plane].stream? CUDA_ERROR_INVALID_HANDLE
    CUDA_ENSURE(cuGraphicsMapResources(1, &res[plane].cuRes, 0), false);
    CUarray array;
    CUDA_ENSURE(cuGraphicsSubResourceGetMappedArray(&array, res[plane].cuRes, 0, 0), false);
    CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false); // mapped array still accessible!

    CUDA_MEMCPY2D cu2d;
    memset(&cu2d, 0, sizeof(cu2d));
    // Y plane
    cu2d.srcDevice = devptr;
    cu2d.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    cu2d.srcPitch = pitch;
    cu2d.dstArray = array;
    cu2d.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    cu2d.dstPitch = pitch;
    // the whole size or copy size?
    cu2d.WidthInBytes = res[plane].W; // the same value as texture9_nv12
    cu2d.Height = H*3/2;
    if (res[plane].stream)
        CUDA_ENSURE(cuMemcpy2DAsync(&cu2d, res[plane].stream), false);
    else
        CUDA_ENSURE(cuMemcpy2D(&cu2d), false);
    //TODO: delay cuCtxSynchronize && unmap. do it in unmap(tex)?
    // map to an already mapped resource will crash. sometimes I can not unmap the resource in unmap(tex) because if context switch error
    // so I simply unmap the resource here
    if (WORKAROUND_UNMAP_CONTEXT_SWITCH) {
        if (res[plane].stream) {
            //CUDA_WARN(cuCtxSynchronize(), false); //wait too long time? use cuStreamQuery?
            CUDA_WARN(cuStreamSynchronize(res[plane].stream)); //slower than CtxSynchronize
        }
        /*
         * This function provides the synchronization guarantee that any CUDA work issued
         * in \p stream before ::cuGraphicsUnmapResources() will complete before any
         * subsequently issued graphics work begins.
         * The graphics API from which \p resources were registered
         * should not access any resources while they are mapped by CUDA. If an
         * application does so, the results are undefined.
         */
//        CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false);
    }
    D3DLOCKED_RECT rect_src, rect_dst;
    DX_ENSURE(texture9_nv12->LockRect(0, &rect_src, NULL, D3DLOCK_READONLY), false);
    DX_ENSURE(surface9_nv12->LockRect(&rect_dst, NULL, D3DLOCK_DISCARD), false);
    memcpy(rect_dst.pBits, rect_src.pBits, res[plane].W*H*3/2); // exactly w and h
    DX_ENSURE(surface9_nv12->UnlockRect(), false);
    DX_ENSURE(texture9_nv12->UnlockRect(0), false);
#if 0
    //IDirect3DSurface9 *raw_surface = NULL;
    //DX_ENSURE(texture9_nv12->GetSurfaceLevel(0, &raw_surface), false);
    const RECT src = { 0, 0, w, h*3/2};
    DX_ENSURE(device9->StretchRect(raw_surface, &src, surface9_nv12, NULL, D3DTEXF_NONE), false);
#endif
    if (!map(surface9_nv12, tex, w, h, H))
        return false;
    return true;
}
Exemple #13
0
bool GLInteropResource::map(int picIndex, const CUVIDPROCPARAMS &param, GLuint tex, int w, int h, int H, int plane)
{
    AutoCtxLock locker((cuda_api*)this, lock);
    Q_UNUSED(locker);
    if (!ensureResource(w, h, H, tex, plane)) // TODO surface size instead of frame size because we copy the device data
        return false;
    //CUDA_ENSURE(cuCtxPushCurrent(ctx), false);
    CUdeviceptr devptr;
    unsigned int pitch;

    CUDA_ENSURE(cuvidMapVideoFrame(dec, picIndex, &devptr, &pitch, const_cast<CUVIDPROCPARAMS*>(&param)), false);
    CUVIDAutoUnmapper unmapper(this, dec, devptr);
    Q_UNUSED(unmapper);
    // TODO: why can not use res[plane].stream? CUDA_ERROR_INVALID_HANDLE
    CUDA_ENSURE(cuGraphicsMapResources(1, &res[plane].cuRes, 0), false);
    CUarray array;
    CUDA_ENSURE(cuGraphicsSubResourceGetMappedArray(&array, res[plane].cuRes, 0, 0), false);

    CUDA_MEMCPY2D cu2d;
    memset(&cu2d, 0, sizeof(cu2d));
    cu2d.srcDevice = devptr;
    cu2d.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    cu2d.srcPitch = pitch;
    cu2d.dstArray = array;
    cu2d.dstMemoryType = CU_MEMORYTYPE_ARRAY;
    cu2d.dstPitch = pitch;
    // the whole size or copy size?
    cu2d.WidthInBytes = pitch;
    cu2d.Height = h;
    if (plane == 1) {
        cu2d.srcXInBytes = 0;// +srcY*srcPitch + srcXInBytes
        cu2d.srcY = H; // skip the padding height
        cu2d.Height /= 2;
    }
    if (res[plane].stream)
        CUDA_ENSURE(cuMemcpy2DAsync(&cu2d, res[plane].stream), false);
    else
        CUDA_ENSURE(cuMemcpy2D(&cu2d), false);
    //TODO: delay cuCtxSynchronize && unmap. do it in unmap(tex)?
    // map to an already mapped resource will crash. sometimes I can not unmap the resource in unmap(tex) because if context switch error
    // so I simply unmap the resource here
    if (WORKAROUND_UNMAP_CONTEXT_SWITCH) {
        if (res[plane].stream) {
            //CUDA_WARN(cuCtxSynchronize(), false); //wait too long time? use cuStreamQuery?
            CUDA_WARN(cuStreamSynchronize(res[plane].stream)); //slower than CtxSynchronize
        }
        /*
         * This function provides the synchronization guarantee that any CUDA work issued
         * in \p stream before ::cuGraphicsUnmapResources() will complete before any
         * subsequently issued graphics work begins.
         * The graphics API from which \p resources were registered
         * should not access any resources while they are mapped by CUDA. If an
         * application does so, the results are undefined.
         */
        CUDA_ENSURE(cuGraphicsUnmapResources(1, &res[plane].cuRes, 0), false);
    } else {
        // call it at last. current context will be used by other cuda calls (unmap() for example)
        CUDA_ENSURE(cuCtxPopCurrent(&ctx), false); // not required
    }
    return true;
}
Exemple #14
0
bool ResourceCUDA::updateResourceFromCUDA(CUstream streamID)
{
    CUresult res;
    res = cuGraphicsMapResources( 1, &m_cudaResource, streamID );
    if(res)
    {
        LOGE("Error>> CUDA failed map some target resources\n");
        return false;
    }
    //
    // Walk through output resources and perform the copies
    //
    CUarray cuArray;
//#   define DBGDUMMYCOPY
#   ifdef DBGDUMMYCOPY
    // interop has issues... let's compare with a copy to a basic cuda array
    int www = m_xByteSz/4;
    CUDA_ARRAY_DESCRIPTOR descr = {
            www,//unsigned int Width;
            m_creationData.sz[1],//unsigned int Height;
            CU_AD_FORMAT_UNSIGNED_INT8,//CUarray_format Format;
            4//unsigned int NumChannels;
        };
    res = cuArrayCreate(&cuArray, &descr);
#   else
    res = cuGraphicsSubResourceGetMappedArray( &cuArray, m_cudaResource, 0/*arrayIndex*/, 0/*mipLevel*/);
#   endif
    if(res)
    {
        res = cuGraphicsUnmapResources( 1, &m_cudaResource, streamID );
        return false;
    }
    CUDA_MEMCPY2D copyInfo = {
        0,                    ///< Source X in bytes
        0,                    ///< Source Y
        CU_MEMORYTYPE_DEVICE,//< Source memory type (host, device, array)
        NULL,                ///< Source host pointer
        m_dptr,            ///< Source device pointer
        NULL,                ///< Source array reference
        m_pitch,            ///< Source pitch (ignored when src is array)

        0,                    ///< Destination X in bytes
        0,                    ///< Destination Y
        CU_MEMORYTYPE_ARRAY,///< Destination memory type (host, device, array)
        NULL,                ///< Destination host pointer
        NULL,                ///< Destination device pointer
        cuArray,            ///< Destination array reference
        0,                    ///< Destination pitch (ignored when dst is array)

        m_xByteSz,        ///< Width of 2D memory copy in bytes
        m_creationData.sz[1]             ///< Height of 2D memory copy
    };
    //LOGI("cuMemcpy2D(): CU_MEMORYTYPE_DEVICE source=%x pitch=%d CU_MEMORYTYPE_ARRAY=%x widthBytes=%d height=%d\n",m_dptr, m_pitch, cuArray, m_xByteSz, m_creationData.sz[1]);
    res = cuMemcpy2D( &copyInfo );
    if(res)
    {
        LOGE("Error>> CUDA failed to copy linear memory to texture (array memory)\n");
        res = cuGraphicsUnmapResources( 1, &m_cudaResource, streamID );
        return false;
    }
#   ifdef DBGDUMMYCOPY
    res = cuArrayDestroy(cuArray);
#   endif
    res = cuGraphicsUnmapResources( 1, &m_cudaResource, streamID );
    if(res)
    {
        LOGE("Error>> CUDA failed unmap the resource for output result of the kernel\n");
        return false;
    }
    return true;
}
Exemple #15
0
int main(int argc, char * argv[]) {
  CBlasSide side;
  CBlasUplo uplo;
  CBlasTranspose trans;
  CBlasDiag diag;
  size_t m, n;
  int d = 0;

  if (argc < 7 || argc > 8) {
    fprintf(stderr, "Usage: %s <side> <uplo> <trans> <diag> <m> <n> [device]\n"
                    "where:\n"
                    "  side     is 'l' or 'L' for CBlasLeft and 'r' or 'R' for CBlasRight\n"
                    "  uplo     is 'u' or 'U' for CBlasUpper and 'l' or 'L' for CBlasLower\n"
                    "  trans    is 'n' or 'N' for CBlasNoTrans, 't' or 'T' for CBlasTrans or 'c' or 'C' for CBlasConjTrans\n"
                    "  diag     is 'n' or 'N' for CBlasNonUnit and 'u' or 'U' for CBlasUnit\n"
                    "  m and n  are the sizes of the matrices\n"
                    "  device   is the GPU to use (default 0)\n", argv[0]);
    return 1;
  }

  char s;
  if (sscanf(argv[1], "%c", &s) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[1]);
    return 1;
  }
  switch (s) {
    case 'L': case 'l': side = CBlasLeft; break;
    case 'R': case 'r': side = CBlasRight; break;
    default: fprintf(stderr, "Unknown side '%c'\n", s); return 1;
  }

  char u;
  if (sscanf(argv[2], "%c", &u) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[2]);
    return 2;
  }
  switch (u) {
    case 'U': case 'u': uplo = CBlasUpper; break;
    case 'L': case 'l': uplo = CBlasLower; break;
    default: fprintf(stderr, "Unknown uplo '%c'\n", u); return 2;
  }

  char t;
  if (sscanf(argv[3], "%c", &t) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[3]);
    return 3;
  }
  switch (t) {
    case 'N': case 'n': trans = CBlasNoTrans; break;
    case 'T': case 't': trans = CBlasTrans; break;
    case 'C': case 'c': trans = CBlasConjTrans; break;
    default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 3;
  }

  char di;
  if (sscanf(argv[4], "%c", &di) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[4]);
    return 4;
  }
  switch (di) {
    case 'N': case 'n': diag = CBlasNonUnit; break;
    case 'U': case 'u': diag = CBlasUnit; break;
    default: fprintf(stderr, "Unknown diag '%c'\n", t); return 4;
  }

  if (sscanf(argv[5], "%zu", &m) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]);
    return 5;
  }

  if (sscanf(argv[6], "%zu", &n) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[6]);
    return 6;
  }

  if (argc > 7) {
    if (sscanf(argv[7], "%d", &d) != 1) {
      fprintf(stderr, "Unable to parse number from '%s'\n", argv[7]);
      return 7;
    }
  }

  srand(0);

  double complex alpha, * A, * B, * refB;
  CUdeviceptr dA, dB, dX;
  size_t lda, ldb, dlda, dldb, dldx;

  CU_ERROR_CHECK(cuInit(0));

  CUdevice device;
  CU_ERROR_CHECK(cuDeviceGet(&device, d));

  CUcontext context;
  CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device));

  CUBLAShandle handle;
  CU_ERROR_CHECK(cuBLASCreate(&handle));

  alpha = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I;

  if (side == CBlasLeft) {
    lda = m;
    if ((A = malloc(lda * m * sizeof(double complex))) == NULL) {
      fputs("Unable to allocate A\n", stderr);
      return -1;
    }
    CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, m * sizeof(double complex), m, sizeof(double complex)));
    dlda /= sizeof(double complex);

    for (size_t j = 0; j < m; j++) {
      for (size_t i = 0; i < m; i++)
        A[j * lda + i] = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I;
    }

    CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double complex),
                           0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double complex),
                           m * sizeof(double complex), m };
    CU_ERROR_CHECK(cuMemcpy2D(&copy));
  }
  else {
    lda = n;
    if ((A = malloc(lda * n * sizeof(double complex))) == NULL) {
      fputs("Unable to allocate A\n", stderr);
      return -1;
    }
    CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, n * sizeof(double complex), n, sizeof(double complex)));
    dlda /= sizeof(double complex);

    for (size_t j = 0; j < n; j++) {
      for (size_t i = 0; i < n; i++)
        A[j * lda + i] = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I;
    }

    CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double complex),
                           0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double complex),
                           n * sizeof(double complex), n };
    CU_ERROR_CHECK(cuMemcpy2D(&copy));
  }

  ldb = m;
  if ((B = malloc(ldb * n * sizeof(double complex))) == NULL) {
    fputs("Unable to allocate B\n", stderr);
    return -3;
  }
  if ((refB = malloc(ldb * n * sizeof(double complex))) == NULL) {
    fputs("Unable to allocate refB\n", stderr);
    return -4;
  }
  CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, m * sizeof(double complex), n, sizeof(double complex)));
  dldb /= sizeof(double complex);
  CU_ERROR_CHECK(cuMemAllocPitch(&dX, &dldx, m * sizeof(double complex), n, sizeof(double complex)));
  dldx /= sizeof(double complex);

  for (size_t j = 0; j < n; j++) {
    for (size_t i = 0; i < m; i++)
      refB[j * ldb + i] = B[j * ldb + i] = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I;
  }

  CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(double complex),
                         0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(double complex),
                         m * sizeof(double complex), n };
  CU_ERROR_CHECK(cuMemcpy2D(&copy));

  ztrmm_ref(side, uplo, trans, diag, m, n, alpha, A, lda, refB, ldb);
  CU_ERROR_CHECK(cuZtrmm2(handle, side, uplo, trans, diag, m, n, alpha, dA, dlda, dB, dldb, dX, dldx, NULL));

  copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dX, NULL, dldx * sizeof(double complex),
                          0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(double complex),
                          m * sizeof(double complex), n };
  CU_ERROR_CHECK(cuMemcpy2D(&copy));

  bool passed = true;
  double rdiff = 0.0, idiff = 0.0;
  for (size_t j = 0; j < n; j++) {
    for (size_t i = 0; i < m; i++) {
      double d = fabs(creal(B[j * ldb + i]) - creal(refB[j * ldb + i]));
      if (d > rdiff)
        rdiff = d;

      double c = fabs(cimag(B[j * ldb + i]) - cimag(refB[j * ldb + i]));
      if (c > idiff)
        idiff = c;

      size_t flops;
      if (side == CBlasLeft)
        flops = 2 * i + 1;
      else
        flops = 2 * j + 1;
      if (diag == CBlasNonUnit)
        flops++;
      flops *= 3;

      if (d > (double)flops * 2.0 * DBL_EPSILON ||
          c > (double)flops * 2.0 * DBL_EPSILON)
        passed = false;
    }
  }

  CUevent start, stop;
  CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC));
  CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC));

  CU_ERROR_CHECK(cuEventRecord(start, NULL));
  for (size_t i = 0; i < 20; i++)
    CU_ERROR_CHECK(cuZtrmm2(handle, side, uplo, trans, diag, m, n, alpha, dA, dlda, dB, dldb, dX, dldx, NULL));
  CU_ERROR_CHECK(cuEventRecord(stop, NULL));
  CU_ERROR_CHECK(cuEventSynchronize(stop));

  float time;
  CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));
  time /= 20;

  CU_ERROR_CHECK(cuEventDestroy(start));
  CU_ERROR_CHECK(cuEventDestroy(stop));

  const size_t flops = (side == CBlasLeft) ?
                        (6 * (n * m * (m + 1) / 2) + 2 * (n * m * (m - 1) / 2)) :
                        (6 * (m * n * (n + 1) / 2) + 2 * (m * n * (n - 1) / 2));

  fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e + %.3ei\n%sED!\n", time * 1.e-3f,
          ((float)flops * 1.e-6f) / time, rdiff, idiff, (passed) ? "PASS" : "FAIL");

  free(A);
  free(B);
  free(refB);
  CU_ERROR_CHECK(cuMemFree(dA));
  CU_ERROR_CHECK(cuMemFree(dB));
  CU_ERROR_CHECK(cuMemFree(dX));

  CU_ERROR_CHECK(cuBLASDestroy(handle));

  CU_ERROR_CHECK(cuCtxDestroy(context));

  return (int)!passed;
}
Exemple #16
0
static int cuvid_decode_frame(AVCodecContext *avctx, void *data, int *got_frame, AVPacket *avpkt)
{
    CuvidContext *ctx = avctx->priv_data;
    AVHWDeviceContext *device_ctx = (AVHWDeviceContext*)ctx->hwdevice->data;
    AVCUDADeviceContext *device_hwctx = device_ctx->hwctx;
    CUcontext dummy, cuda_ctx = device_hwctx->cuda_ctx;
    AVFrame *frame = data;
    CUVIDSOURCEDATAPACKET cupkt;
    AVPacket filter_packet = { 0 };
    AVPacket filtered_packet = { 0 };
    CUdeviceptr mapped_frame = 0;
    int ret = 0, eret = 0;

    if (ctx->bsf && avpkt->size) {
        if ((ret = av_packet_ref(&filter_packet, avpkt)) < 0) {
            av_log(avctx, AV_LOG_ERROR, "av_packet_ref failed\n");
            return ret;
        }

        if ((ret = av_bsf_send_packet(ctx->bsf, &filter_packet)) < 0) {
            av_log(avctx, AV_LOG_ERROR, "av_bsf_send_packet failed\n");
            av_packet_unref(&filter_packet);
            return ret;
        }

        if ((ret = av_bsf_receive_packet(ctx->bsf, &filtered_packet)) < 0) {
            av_log(avctx, AV_LOG_ERROR, "av_bsf_receive_packet failed\n");
            return ret;
        }

        avpkt = &filtered_packet;
    }

    ret = CHECK_CU(cuCtxPushCurrent(cuda_ctx));
    if (ret < 0) {
        av_packet_unref(&filtered_packet);
        return ret;
    }

    memset(&cupkt, 0, sizeof(cupkt));

    if (avpkt->size) {
        cupkt.payload_size = avpkt->size;
        cupkt.payload = avpkt->data;

        if (avpkt->pts != AV_NOPTS_VALUE) {
            cupkt.flags = CUVID_PKT_TIMESTAMP;
            if (avctx->pkt_timebase.num && avctx->pkt_timebase.den)
                cupkt.timestamp = av_rescale_q(avpkt->pts, avctx->pkt_timebase, (AVRational){1, 10000000});
            else
                cupkt.timestamp = avpkt->pts;
        }
    } else {
        cupkt.flags = CUVID_PKT_ENDOFSTREAM;
    }

    ret = CHECK_CU(cuvidParseVideoData(ctx->cuparser, &cupkt));

    av_packet_unref(&filtered_packet);

    if (ret < 0) {
        goto error;
    }

    // cuvidParseVideoData doesn't return an error just because stuff failed...
    if (ctx->internal_error) {
        av_log(avctx, AV_LOG_ERROR, "cuvid decode callback error\n");
        ret = ctx->internal_error;
        goto error;
    }

    if (av_fifo_size(ctx->frame_queue)) {
        CUVIDPARSERDISPINFO dispinfo;
        CUVIDPROCPARAMS params;
        unsigned int pitch = 0;
        int offset = 0;
        int i;

        av_fifo_generic_read(ctx->frame_queue, &dispinfo, sizeof(CUVIDPARSERDISPINFO), NULL);

        memset(&params, 0, sizeof(params));
        params.progressive_frame = dispinfo.progressive_frame;
        params.second_field = 0;
        params.top_field_first = dispinfo.top_field_first;

        ret = CHECK_CU(cuvidMapVideoFrame(ctx->cudecoder, dispinfo.picture_index, &mapped_frame, &pitch, &params));
        if (ret < 0)
            goto error;

        if (avctx->pix_fmt == AV_PIX_FMT_CUDA) {
            ret = av_hwframe_get_buffer(ctx->hwframe, frame, 0);
            if (ret < 0) {
                av_log(avctx, AV_LOG_ERROR, "av_hwframe_get_buffer failed\n");
                goto error;
            }

            ret = ff_decode_frame_props(avctx, frame);
            if (ret < 0) {
                av_log(avctx, AV_LOG_ERROR, "ff_decode_frame_props failed\n");
                goto error;
            }

            for (i = 0; i < 2; i++) {
                CUDA_MEMCPY2D cpy = {
                    .srcMemoryType = CU_MEMORYTYPE_DEVICE,
                    .dstMemoryType = CU_MEMORYTYPE_DEVICE,
                    .srcDevice     = mapped_frame,
                    .dstDevice     = (CUdeviceptr)frame->data[i],
                    .srcPitch      = pitch,
                    .dstPitch      = frame->linesize[i],
                    .srcY          = offset,
                    .WidthInBytes  = FFMIN(pitch, frame->linesize[i]),
                    .Height        = avctx->coded_height >> (i ? 1 : 0),
                };

                ret = CHECK_CU(cuMemcpy2D(&cpy));
                if (ret < 0)
                    goto error;

                offset += avctx->coded_height;
            }
        } else if (avctx->pix_fmt == AV_PIX_FMT_NV12) {
Exemple #17
0
CUresult I_WRAP_SONAME_FNNAME_ZZ(libcudaZdsoZa, cuMemcpy2DAsync)(const CUDA_MEMCPY2D *pCopy, CUstream hStream) {
    int error = 0;
    long vgErrorAddress;

    vgErrorAddress = VALGRIND_CHECK_MEM_IS_DEFINED(&hStream, sizeof(CUstream));
    if (vgErrorAddress) {
        error++;
        VALGRIND_PRINTF("Error: 'hStream' in call to cuMemcpy2DAsync not defined.\n");
    }

    cgLock();

    CUcontext      ctx = NULL;
    cgGetCtx(&ctx);

    // Check if destination (device) memory/array is already being written to.
    switch (pCopy->dstMemoryType) {
    case CU_MEMORYTYPE_DEVICE: {
        cgMemListType  *nodeMem;

        nodeMem = cgFindMem(cgFindCtx(ctx), pCopy->dstDevice);

        if (nodeMem) {
            // Are we trying to read a memory region that's being written by diffrent stream?
            if (nodeMem->locked & 2 && nodeMem->stream != hStream) {
                error++;
                VALGRIND_PRINTF("Error: Concurrent write and read access by different streams.\n");
            }

            nodeMem->locked = nodeMem->locked | 1;
            nodeMem->stream = hStream;
        }

        break;
    }
    case CU_MEMORYTYPE_ARRAY: {
        cgArrListType  *nodeArr;

        nodeArr = cgFindArr(cgFindCtx(ctx), pCopy->dstArray);

        if (nodeArr) {
            // Are we trying to read an array that's being written by different stream?
            if (nodeArr->locked & 2 && nodeArr->stream != hStream) {
                error++;
                VALGRIND_PRINTF("Error: Concurrent write and read access to array by different streams.\n");
            }

            nodeArr->locked = nodeArr->locked | 1;
            nodeArr->stream = hStream;
        }

        break;
    }
    }

    // Check if source (device) memory/array is already being written to/read from.
    switch (pCopy->srcMemoryType) {
    case CU_MEMORYTYPE_DEVICE: {
        cgMemListType  *nodeMem;

        nodeMem = cgFindMem(cgFindCtx(ctx), pCopy->srcDevice);

        if (nodeMem) {
            // Are we trying to read a memory region that's being written by diffrent stream?
            if (nodeMem->locked && nodeMem->stream != hStream) {
                error++;
                VALGRIND_PRINTF("Error: Concurrent write and read access by different streams.\n");
            }

            nodeMem->locked = nodeMem->locked | 2;
            nodeMem->stream = hStream;
        }

        break;
    }
    case CU_MEMORYTYPE_ARRAY: {
        cgArrListType  *nodeArr;

        nodeArr = cgFindArr(cgFindCtx(ctx), pCopy->srcArray);

        if (nodeArr) {
            // Are we trying to read an array that's being written by different stream?
            if (nodeArr->locked && nodeArr->stream != hStream) {
                error++;
                VALGRIND_PRINTF("Error: Concurrent write and read access to array by different streams.\n");
            }

            nodeArr->locked = nodeArr->locked | 2;
            nodeArr->stream = hStream;
        }

        break;
    }
    }

    cgUnlock();

    if (error) {
        VALGRIND_PRINTF_BACKTRACE("");
    }

    return cuMemcpy2D(pCopy);
}
int main() {
  CU_ERROR_CHECK(cuInit(0));

  int count;
  CU_ERROR_CHECK(cuDeviceGetCount(&count));

  for (int i = 0; i < count; i++) {
    CUdevice device;
    CU_ERROR_CHECK(cuDeviceGet(&device, i));

    int memoryClockRate, globalMemoryBusWidth;
    CU_ERROR_CHECK(cuDeviceGetAttribute(&memoryClockRate, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device));
    CU_ERROR_CHECK(cuDeviceGetAttribute(&globalMemoryBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device));

    // Calculate pin bandwidth in bytes/sec (clock rate is actual in kHz, memory is DDR so multiply clock rate by 2.e3 to get effective clock rate in Hz)
    double pinBandwidth = memoryClockRate * 2.e3 * (globalMemoryBusWidth / CHAR_BIT);

    CUcontext context;
    CU_ERROR_CHECK(cuCtxCreate(&context, 0, device));

    fprintf(stdout, "Device %d (pin bandwidth %6.2f GB/s):\n", i, pinBandwidth / (1 << 30));

    CUDA_MEMCPY2D copy;
    copy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    copy.dstMemoryType = CU_MEMORYTYPE_DEVICE;

    CUevent start, stop;
    CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_DEFAULT));
    CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_DEFAULT));

    float time;

    // Calculate aligned copy for 32, 64 and 128-bit word sizes
    for (unsigned int j = 4; j <= 16; j *= 2) {
      copy.WidthInBytes = SIZE;
      copy.Height = 1;

      copy.srcXInBytes = 0;
      copy.srcY = 0;
      copy.dstXInBytes = 0;
      copy.dstY = 0;

      CU_ERROR_CHECK(cuMemAllocPitch(&copy.srcDevice, &copy.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j));
      CU_ERROR_CHECK(cuMemAllocPitch(&copy.dstDevice, &copy.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j));

      CU_ERROR_CHECK(cuEventRecord(start, 0));
      for (size_t i = 0; i < ITERATIONS; i++)
        CU_ERROR_CHECK(cuMemcpy2D(&copy));
      CU_ERROR_CHECK(cuEventRecord(stop, 0));
      CU_ERROR_CHECK(cuEventSynchronize(stop));
      CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));
      time /= ITERATIONS * 1.e3f;
      double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time;

      fprintf(stdout, "\taligned copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0);

      CU_ERROR_CHECK(cuMemFree(copy.srcDevice));
      CU_ERROR_CHECK(cuMemFree(copy.dstDevice));
    }

    // Calculate misaligned copy for 32, 64 and 128-bit word sizes
    for (unsigned int j = 4; j <= 16; j *= 2) {
      copy.WidthInBytes = SIZE;
      copy.Height = 1;

      copy.srcXInBytes = j;
      copy.srcY = 0;
      copy.dstXInBytes = j;
      copy.dstY = 0;

      CU_ERROR_CHECK(cuMemAllocPitch(&copy.srcDevice, &copy.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j));
      CU_ERROR_CHECK(cuMemAllocPitch(&copy.dstDevice, &copy.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j));

      CU_ERROR_CHECK(cuEventRecord(start, 0));
      for (size_t j = 0; j < ITERATIONS; j++)
        CU_ERROR_CHECK(cuMemcpy2D(&copy));
      CU_ERROR_CHECK(cuEventRecord(stop, 0));
      CU_ERROR_CHECK(cuEventSynchronize(stop));
      CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));
      time /= ITERATIONS * 1.e3f;
      double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time;

      fprintf(stdout, "\tmisaligned copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0);

      CU_ERROR_CHECK(cuMemFree(copy.srcDevice));
      CU_ERROR_CHECK(cuMemFree(copy.dstDevice));
    }

    // Calculate stride-2 copy for 32, 64 and 128-bit word sizes
    for (unsigned int j = 4; j <= 16; j *= 2) {
      copy.WidthInBytes = SIZE / 2;
      copy.Height = 1;

      copy.srcXInBytes = 0;
      copy.srcY = 0;
      copy.dstXInBytes = 0;
      copy.dstY = 0;

      CU_ERROR_CHECK(cuMemAllocPitch(&copy.srcDevice, &copy.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j));
      CU_ERROR_CHECK(cuMemAllocPitch(&copy.dstDevice, &copy.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j));

      copy.srcPitch *= 2;
      copy.dstPitch *= 2;

      CU_ERROR_CHECK(cuEventRecord(start, 0));
      for (size_t i = 0; i < ITERATIONS; i++)
        CU_ERROR_CHECK(cuMemcpy2D(&copy));
      CU_ERROR_CHECK(cuEventRecord(stop, 0));
      CU_ERROR_CHECK(cuEventSynchronize(stop));
      CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));
      time /= ITERATIONS * 1.e3f;
      double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time;

      fprintf(stdout, "\tstride-2 copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0);

      CU_ERROR_CHECK(cuMemFree(copy.srcDevice));
      CU_ERROR_CHECK(cuMemFree(copy.dstDevice));
    }

    // Calculate stride-10 copy for 32, 64 and 128-bit word sizes
    for (unsigned int j = 4; j <= 16; j *= 2) {
      copy.WidthInBytes = SIZE / 10;
      copy.Height = 1;

      copy.srcXInBytes = 0;
      copy.srcY = 0;
      copy.dstXInBytes = 0;
      copy.dstY = 0;

      CU_ERROR_CHECK(cuMemAllocPitch(&copy.srcDevice, &copy.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j));
      CU_ERROR_CHECK(cuMemAllocPitch(&copy.dstDevice, &copy.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j));

      copy.srcPitch *= 10;
      copy.dstPitch *= 10;

      CU_ERROR_CHECK(cuEventRecord(start, 0));
      for (size_t i = 0; i < ITERATIONS; i++)
        CU_ERROR_CHECK(cuMemcpy2D(&copy));
      CU_ERROR_CHECK(cuEventRecord(stop, 0));
      CU_ERROR_CHECK(cuEventSynchronize(stop));
      CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));
      time /= ITERATIONS * 1.e3f;
      double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time;

      fprintf(stdout, "\tstride-10 copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0);

      CU_ERROR_CHECK(cuMemFree(copy.srcDevice));
      CU_ERROR_CHECK(cuMemFree(copy.dstDevice));
    }

    // Calculate stride-1000 copy for 32, 64 and 128-bit word sizes
    for (unsigned int j = 4; j <= 16; j *= 2) {
      copy.WidthInBytes = SIZE / 1000;
      copy.Height = 1;

      copy.srcXInBytes = 0;
      copy.srcY = 0;
      copy.dstXInBytes = 0;
      copy.dstY = 0;

      CU_ERROR_CHECK(cuMemAllocPitch(&copy.srcDevice, &copy.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j));
      CU_ERROR_CHECK(cuMemAllocPitch(&copy.dstDevice, &copy.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j));

      copy.srcPitch *= 1000;
      copy.dstPitch *= 1000;

      CU_ERROR_CHECK(cuEventRecord(start, 0));
      for (size_t j = 0; j < ITERATIONS; j++)
        CU_ERROR_CHECK(cuMemcpy2D(&copy));
      CU_ERROR_CHECK(cuEventRecord(stop, 0));
      CU_ERROR_CHECK(cuEventSynchronize(stop));
      CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));
      time /= ITERATIONS * 1.e3f;
      double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time;

      fprintf(stdout, "\tstride-1000 copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0);

      CU_ERROR_CHECK(cuMemFree(copy.srcDevice));
      CU_ERROR_CHECK(cuMemFree(copy.dstDevice));
    }

    CU_ERROR_CHECK(cuEventDestroy(start));
    CU_ERROR_CHECK(cuEventDestroy(stop));

    CU_ERROR_CHECK(cuCtxDestroy(context));
  }

  return 0;
}