/*
 * swanMallocPitch - allocate pitched (row-aligned) device memory.
 *
 * pitch_in_bytes  [out] receives the pitch chosen by the driver.
 * width_in_bytes  width of one row in bytes.
 * height          number of rows.
 *
 * Returns the device allocation as a void*, or NULL for a zero-sized
 * request.  Aborts via error() if the driver call fails.
 */
void *swanMallocPitch( size_t *pitch_in_bytes, size_t width_in_bytes, size_t height ) {
	CUdeviceptr devPtr;
	CUresult status;

	/* Treat a zero-sized request as a no-op rather than an error. */
	if( width_in_bytes == 0 || height == 0 ) {
		return NULL;
	}

	/* Element size sizeof(float4) gives 16-byte alignment for every row. */
	status = cuMemAllocPitch( &devPtr, pitch_in_bytes, width_in_bytes, height, sizeof(float4) );
	if( status != CUDA_SUCCESS ) {
		error("swanMallocPitch failed\n" );
	}

	/* CUdeviceptr is an integer handle; widen through size_t before the pointer cast. */
	return (void*)(size_t) devPtr;
}
// Allocate the per-frame encoder I/O resources: one pitched NV12 CUDA input
// buffer plus one NVENC bitstream output buffer per queue slot, and the
// dedicated EOS output buffer.  Returns the first failing NVENC status, or
// NV_ENC_SUCCESS.  NOTE(review): on early return, resources allocated in
// earlier iterations are not torn down here — presumably the destructor /
// ReleaseIOBuffers handles that; confirm.
NVENCSTATUS VideoEncoder::AllocateIOBuffers(EncodeConfig* pEncodeConfig)
{
    NVENCSTATUS nvStatus = NV_ENC_SUCCESS;

    // Queue depth: enough slots to cover the B-frame reordering window plus headroom.
    m_uEncodeBufferCount = pEncodeConfig->numB + 4;

    uint32_t uInputWidth = pEncodeConfig->width;
    uint32_t uInputHeight = pEncodeConfig->height;
    m_EncodeBufferQueue.Initialize(m_stEncodeBuffer, m_uEncodeBufferCount);

    // Allocate input buffer
    for (uint32_t i = 0; i < m_uEncodeBufferCount; i++)
    {
        // Pitched NV12 allocation: height * 3/2 rows (luma plane + interleaved
        // chroma plane), 16-byte element alignment.  CUDA calls are wrapped in
        // the CUVID context lock; __cu aborts/reports on CUDA errors.
        __cu(cuvidCtxLock(m_ctxLock, 0));
        __cu(cuMemAllocPitch(&m_stEncodeBuffer[i].stInputBfr.pNV12devPtr,
            (size_t*)&m_stEncodeBuffer[i].stInputBfr.uNV12Stride,
            uInputWidth, uInputHeight * 3 / 2, 16));
        __cu(cuvidCtxUnlock(m_ctxLock, 0));

        // Hand the raw CUDA device pointer to NVENC as an input resource.
        nvStatus = m_pNvHWEncoder->NvEncRegisterResource(NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR,
            (void*)m_stEncodeBuffer[i].stInputBfr.pNV12devPtr,
            uInputWidth, uInputHeight,
            m_stEncodeBuffer[i].stInputBfr.uNV12Stride,
            &m_stEncodeBuffer[i].stInputBfr.nvRegisteredResource);
        if (nvStatus != NV_ENC_SUCCESS)
            return nvStatus;

        m_stEncodeBuffer[i].stInputBfr.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12_PL;
        m_stEncodeBuffer[i].stInputBfr.dwWidth = uInputWidth;
        m_stEncodeBuffer[i].stInputBfr.dwHeight = uInputHeight;

        // Matching compressed-output buffer for this slot.
        nvStatus = m_pNvHWEncoder->NvEncCreateBitstreamBuffer(BITSTREAM_BUFFER_SIZE, &m_stEncodeBuffer[i].stOutputBfr.hBitstreamBuffer);
        if (nvStatus != NV_ENC_SUCCESS)
            return nvStatus;
        m_stEncodeBuffer[i].stOutputBfr.dwBitstreamBufferSize = BITSTREAM_BUFFER_SIZE;

#if defined(NV_WINDOWS)
        // Windows: completion is signalled through an async event per slot.
        nvStatus = m_pNvHWEncoder->NvEncRegisterAsyncEvent(&m_stEncodeBuffer[i].stOutputBfr.hOutputEvent);
        if (nvStatus != NV_ENC_SUCCESS)
            return nvStatus;
        m_stEncodeBuffer[i].stOutputBfr.bWaitOnEvent = true;
#else
        // Non-Windows builds poll synchronously; no event object.
        m_stEncodeBuffer[i].stOutputBfr.hOutputEvent = NULL;
#endif
    }

    // Dedicated end-of-stream marker buffer (no input surface attached).
    m_stEOSOutputBfr.bEOSFlag = TRUE;
#if defined(NV_WINDOWS)
    nvStatus = m_pNvHWEncoder->NvEncRegisterAsyncEvent(&m_stEOSOutputBfr.hOutputEvent);
    if (nvStatus != NV_ENC_SUCCESS)
        return nvStatus;
#else
    m_stEOSOutputBfr.hOutputEvent = NULL;
#endif

    return NV_ENC_SUCCESS;
}
//---------------------------------------------------------------- /// setup for interop : prt refers to a texture in which we write /// results. This must be seen as a linear buffer. So we need to /// allocate a temporary linear buffer that will be then copied /// back to the texture // bool ResourceCUDA::setupAsCUDATarget() { int fmtSz = ResourceFormatByteSize(m_creationData.fmt); CUresult res; m_xByteSz = m_creationData.sz[0] * fmtSz; m_size = m_xByteSz * m_creationData.sz[1]; if(m_dptr) { res = cuMemFree(m_dptr); if(res) return false; } res = cuMemAllocPitch( &m_dptr, &m_pitch, m_xByteSz, m_creationData.sz[1], 4); if(res) return false; float pitchToSendToKernel = (float)m_pitch / (float)fmtSz; #pragma MESSAGE("TODO TODO TODO TODO TODO TODO : send pitch to the kernel !") LOGI("Event>>cuMemAllocPitch : Pitch of Target buffer (%d, %d) allocation = %d, %f\n", m_creationData.sz[0], m_xByteSz, m_pitch, pitchToSendToKernel); // // Register the texture to CUDA to be able to copy data back in it // GLenum target; if(m_cudaResource) { res = cuGraphicsUnregisterResource(m_cudaResource); if(res) return false; } switch(m_type) { case RESTEX_1D: target = GL_TEXTURE_1D; break; case RESTEX_2D: target = GL_TEXTURE_2D; break; case RESTEX_2DRECT: case RESRBUF_2D: case RESOURCE_UNKNOWN: //case RESTEX_3D: //case RESTEX_CUBE_MAP: default: LOGE("Failed to register the resource %s for CUDA : may be a render buffer\n", m_name.c_str()); return false; }; res = cuGraphicsGLRegisterImage( &m_cudaResource, m_OGLId, target, CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD ); if(res) { LOGE("Failed to register the texture %s for CUDA (as write discard)\n", m_name.c_str()); return 0; } return true; }
/* R binding for cuMemAllocPitch().
 * Arguments arrive as numeric scalars; on success returns a named list
 * (dptr = external reference to the device pointer, pPitch = pitch in bytes),
 * on failure returns the R error-info object for the CUresult. */
SEXP
R_auto_cuMemAllocPitch(SEXP r_WidthInBytes, SEXP r_Height, SEXP r_ElementSizeBytes)
{
    /* Decode the R arguments. */
    size_t width = REAL(r_WidthInBytes)[0];
    size_t height = REAL(r_Height)[0];
    unsigned int elemSize = REAL(r_ElementSizeBytes)[0];

    CUdeviceptr devPtr;
    size_t pitch;
    CUresult status = cuMemAllocPitch(&devPtr, &pitch, width, height, elemSize);
    if(status)
        return(R_cudaErrorInfo(status));

    /* Assemble the two-element named result list. */
    SEXP r_ans, r_names;
    PROTECT(r_ans = NEW_LIST(2));
    PROTECT(r_names = NEW_CHARACTER(2));
    SET_STRING_ELT(r_names, 0, mkChar("dptr"));
    SET_STRING_ELT(r_names, 1, mkChar("pPitch"));
    SET_VECTOR_ELT(r_ans, 0, R_createRef((void*) devPtr, "CUdeviceptr"));
    SET_VECTOR_ELT(r_ans, 1, ScalarReal(pitch));
    SET_NAMES(r_ans, r_names);
    UNPROTECT(2);
    return(r_ans);
}
int main(int argc, char * argv[]) { CBlasUplo uplo; CBlasTranspose trans; size_t n, k; int d = 0; if (argc < 5 || argc > 6) { fprintf(stderr, "Usage: %s <uplo> <trans> <n> <k> [device]\n" "where:\n" " uplo is 'u' or 'U' for CBlasUpper or 'l' or 'L' for CBlasLower\n" " trans are 'n' or 'N' for CBlasNoTrans or 't' or 'T' for CBlasTrans\n" " n and k are the sizes of the matrices\n" " device is the GPU to use (default 0)\n", argv[0]); return 1; } char u; if (sscanf(argv[1], "%c", &u) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[1]); return 1; } switch (u) { case 'U': case 'u': uplo = CBlasUpper; break; case 'L': case 'l': uplo = CBlasLower; break; default: fprintf(stderr, "Unknown uplo '%c'\n", u); return 1; } char t; if (sscanf(argv[2], "%c", &t) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[2]); return 2; } switch (t) { case 'N': case 'n': trans = CBlasNoTrans; break; case 'T': case 't': trans = CBlasTrans; break; case 'C': case 'c': trans = CBlasConjTrans; break; default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 2; } if (sscanf(argv[3], "%zu", &n) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]); return 3; } if (sscanf(argv[4], "%zu", &k) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[4]); return 4; } if (argc > 5) { if (sscanf(argv[5], "%d", &d) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]); return 5; } } srand(0); double alpha, beta, * A, * C, * refC; CUdeviceptr dA, dC; size_t lda, ldc, dlda, dldc; CU_ERROR_CHECK(cuInit(0)); CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device)); CUBLAShandle handle; CU_ERROR_CHECK(cuBLASCreate(&handle)); alpha = (double)rand() / (double)RAND_MAX; beta = (double)rand() / (double)RAND_MAX; if (trans == CBlasNoTrans) { lda = (n + 1u) & ~1u; if ((A = malloc(lda * k * sizeof(double))) == NULL) { fputs("Unable to 
allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, n * sizeof(double), k, sizeof(double))); dlda /= sizeof(double); for (size_t j = 0; j < k; j++) { for (size_t i = 0; i < n; i++) A[j * lda + i] = (double)rand() / (double)RAND_MAX; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double), n * sizeof(double), k }; CU_ERROR_CHECK(cuMemcpy2D(©)); } else { lda = (k + 1u) & ~1u; if ((A = malloc(lda * n * sizeof(double))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, k * sizeof(double), n, sizeof(double))); dlda /= sizeof(double); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < k; i++) A[j * lda + i] = (double)rand() / (double)RAND_MAX; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double), k * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); } ldc = (n + 1u) & ~1u; if ((C = malloc(ldc * n * sizeof(double))) == NULL) { fputs("Unable to allocate C\n", stderr); return -3; } if ((refC = malloc(ldc * n * sizeof(double))) == NULL) { fputs("Unable to allocate refC\n", stderr); return -4; } CU_ERROR_CHECK(cuMemAllocPitch(&dC, &dldc, n * sizeof(double), n, sizeof(double))); dldc /= sizeof(double); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < n; i++) refC[j * ldc + i] = C[j * ldc + i] = (double)rand() / (double)RAND_MAX; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(double), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dC, NULL, dldc * sizeof(double), n * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); dsyrk_ref(uplo, trans, n, k, alpha, A, lda, beta, refC, ldc); CU_ERROR_CHECK(cuDsyrk(handle, uplo, trans, n, k, alpha, dA, dlda, beta, dC, dldc, NULL)); copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dC, NULL, dldc * sizeof(double), 0, 0, 
CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(double), n * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); double diff = 0.0; for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < n; i++) { double d = fabs(C[j * ldc + i] - refC[j * ldc + i]); if (d > diff) diff = d; } } CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventRecord(start, NULL)); for (size_t i = 0; i < 20; i++) CU_ERROR_CHECK(cuDsyrk(handle, uplo, trans, n, k, alpha, dA, dlda, beta, dC, dldc, NULL)); CU_ERROR_CHECK(cuEventRecord(stop, NULL)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= 20; CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); size_t flops = 2 * k - 1; // k multiplies and k - 1 adds per element if (alpha != 1.0) flops += 1; // additional multiply by alpha if (beta != 0.0) flops += 2; // additional multiply and add by beta double error = (double)flops * 2.0 * DBL_EPSILON; // maximum per element error flops *= n * (n + 1) / 2; // n(n + 1) / 2 elements bool passed = (diff <= error); fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e\n%sED!\n", time * 1.e-3f, ((float)flops * 1.e-6f) / time, diff, (passed) ? "PASS" : "FAIL"); free(A); free(C); free(refC); CU_ERROR_CHECK(cuMemFree(dA)); CU_ERROR_CHECK(cuMemFree(dC)); CU_ERROR_CHECK(cuBLASDestroy(handle)); CU_ERROR_CHECK(cuCtxDestroy(context)); return (int)!passed; }
// Constructor: loads nvEncodeAPI64.dll, creates a CUDA context, allocates the
// frame buffers (either a linear NV12 buffer for the swscale path or a pitched
// NV12 buffer plus an RGB32 staging buffer for the CUDA path), opens and
// configures an H.264 NVENC session, registers/maps the CUDA input buffer, and
// creates the output bitstream buffer.  Optionally opens a per-instance output
// file "nvenc-<id>.h264".
// NOTE(review): LoadLibraryA/GetProcAddress results and several NVENCSTATUS
// values (nStatus) are not checked before use — confirm whether failure here
// is possible in deployment.
CNvidiaNvencCodec(DWORD nCodecInstanceId, const CCodecContextBase& CodecContext) :
    m_NvidiaNvencCodecContext(static_cast<const CNvidiaNvencCodecContext&>(CodecContext)),
    m_hNvEncodeAPI64(LoadLibraryA("nvEncodeAPI64.dll"))
{
    // Resolve the single NVENC entry point and populate the function table.
    PNVENCODEAPICREATEINSTANCE pNvEncodeAPICreateInstance = reinterpret_cast<PNVENCODEAPICREATEINSTANCE>(GetProcAddress(m_hNvEncodeAPI64, "NvEncodeAPICreateInstance"));
    memset(&m_FunctionList, 0, sizeof(m_FunctionList));
    m_FunctionList.version = NV_ENCODE_API_FUNCTION_LIST_VER;
    // NOTE(review): nStatus is not checked here.
    NVENCSTATUS nStatus = pNvEncodeAPICreateInstance(&m_FunctionList);
    CHECK_CUDA_DRV_STATUS(cuCtxCreate(&m_Context, 0, 0));
    if (m_NvidiaNvencCodecContext.GetUseSwscaleInsteadOfCuda())
    {
        // swscale path: linear (unpitched) NV12 device buffer, pitch == width,
        // with a page-locked host buffer holding the NV12 planes that swscale
        // writes (plane 0 = luma, plane 1 = interleaved chroma at W*H offset).
        CHECK_CUDA_DRV_STATUS(cuMemAlloc(&m_pNv12Buffer, m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight() * 3 / 2));
        m_nNv12BufferPitch = m_NvidiaNvencCodecContext.GetWidth();
        CHECK_CUDA_DRV_STATUS(cuMemAllocHost(&m_pPageLockedNv12Buffer, m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight() * 3 / 2));
        m_pNv12Planes[0] = reinterpret_cast<unsigned char*>(m_pPageLockedNv12Buffer);
        m_pNv12Planes[1] = reinterpret_cast<unsigned char*>(m_pPageLockedNv12Buffer) + m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight();
        m_pNv12Strides[0] = m_NvidiaNvencCodecContext.GetWidth();
        m_pNv12Strides[1] = m_NvidiaNvencCodecContext.GetWidth();
        // BGR32 -> NV12 conversion context at identical input/output sizes.
        m_SwscaleContext = sws_getContext(m_NvidiaNvencCodecContext.GetWidth(), m_NvidiaNvencCodecContext.GetHeight(), AV_PIX_FMT_BGR32, m_NvidiaNvencCodecContext.GetWidth(), m_NvidiaNvencCodecContext.GetHeight(), AV_PIX_FMT_NV12, 0, 0, 0, 0);
    }
    else
    {
        // CUDA path: pitched NV12 buffer (H * 3/2 rows, 16-byte element
        // alignment); the real pitch comes back in m_nNv12BufferPitch.
        CHECK_CUDA_DRV_STATUS(cuMemAllocPitch(&m_pNv12Buffer, &m_nNv12BufferPitch, m_NvidiaNvencCodecContext.GetWidth(), m_NvidiaNvencCodecContext.GetHeight() * 3 / 2, 16));
        if (m_NvidiaNvencCodecContext.GetUsePageLockedIntermediateBuffer())
        {
            // Optional page-locked host staging buffer for the RGB32 source.
            CHECK_CUDA_DRV_STATUS(cuMemAllocHost(&m_pPageLockedRgb32Buffer, m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight() * 4));
        }
        // Device-side RGB32 buffer (4 bytes per pixel) for the CUDA converter.
        CHECK_CUDA_DRV_STATUS(cuMemAlloc(&m_pRgb32Buffer, m_NvidiaNvencCodecContext.GetWidth() * m_NvidiaNvencCodecContext.GetHeight() * 4));
    }
    CHECK_CUDA_DRV_STATUS(cuStreamCreate(&m_Stream, 0));
    // Open the NVENC session on the CUDA context.
    NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS SessionParameters;
    memset(&SessionParameters, 0, sizeof(SessionParameters));
    SessionParameters.version = NV_ENC_OPEN_ENCODE_SESSION_EX_PARAMS_VER;
    SessionParameters.apiVersion = NVENCAPI_VERSION;
    SessionParameters.device = m_Context;
    SessionParameters.deviceType = NV_ENC_DEVICE_TYPE_CUDA;
    // NOTE(review): nStatus is not checked after opening the session.
    nStatus = m_FunctionList.nvEncOpenEncodeSessionEx(&SessionParameters, &m_pEncoder);
    m_PictureParameters.version = NV_ENC_PIC_PARAMS_VER;
    // Start from the high-performance preset and customize.
    auto PresetGuid = NV_ENC_PRESET_HP_GUID;
    NV_ENC_PRESET_CONFIG PresetConfiguration = { NV_ENC_PRESET_CONFIG_VER, 0 };
    PresetConfiguration.presetCfg.version = NV_ENC_CONFIG_VER;
    CHECK_NVENC_STATUS(m_FunctionList.nvEncGetEncodePresetConfig(m_pEncoder, NV_ENC_CODEC_H264_GUID, PresetGuid, &PresetConfiguration));
    NV_ENC_CONFIG EncoderConfiguration = { NV_ENC_CONFIG_VER, 0 };
    EncoderConfiguration = PresetConfiguration.presetCfg;
    EncoderConfiguration.gopLength = NVENC_INFINITE_GOPLENGTH;
    EncoderConfiguration.profileGUID = NV_ENC_H264_PROFILE_BASELINE_GUID;
    EncoderConfiguration.frameIntervalP = 1; // No B frames
    EncoderConfiguration.frameFieldMode = NV_ENC_PARAMS_FRAME_FIELD_MODE_FRAME;
    // One IDR per clip; chromaFormatIDC 1 = 4:2:0; single-slice pictures.
    EncoderConfiguration.encodeCodecConfig.h264Config.idrPeriod = m_NvidiaNvencCodecContext.GetFrameCount();
    EncoderConfiguration.encodeCodecConfig.h264Config.chromaFormatIDC = 1;
    EncoderConfiguration.encodeCodecConfig.h264Config.sliceMode = 0;
    EncoderConfiguration.encodeCodecConfig.h264Config.sliceModeData = 0;
    NV_ENC_INITIALIZE_PARAMS InitializationParameters = { NV_ENC_INITIALIZE_PARAMS_VER, 0 };
    InitializationParameters.encodeGUID = NV_ENC_CODEC_H264_GUID;
    InitializationParameters.presetGUID = PresetGuid;
    InitializationParameters.frameRateNum = m_NvidiaNvencCodecContext.GetFps();
    InitializationParameters.frameRateDen = 1;
#ifdef ASYNCHRONOUS
    InitializationParameters.enableEncodeAsync = 1;
#else
    InitializationParameters.enableEncodeAsync = 0;
#endif
    InitializationParameters.enablePTD = 1; // Let the encoder decide the picture type
    InitializationParameters.reportSliceOffsets = 0;
    InitializationParameters.maxEncodeWidth = m_NvidiaNvencCodecContext.GetWidth();
    InitializationParameters.maxEncodeHeight = m_NvidiaNvencCodecContext.GetHeight();
    InitializationParameters.encodeConfig = &EncoderConfiguration;
    InitializationParameters.encodeWidth = m_NvidiaNvencCodecContext.GetWidth();
    InitializationParameters.encodeHeight = m_NvidiaNvencCodecContext.GetHeight();
    // NOTE(review): display aspect ratio is hard-coded to 16:9 regardless of
    // the actual frame dimensions — confirm intended.
    InitializationParameters.darWidth = 16;
    InitializationParameters.darHeight = 9;
    CHECK_NVENC_STATUS(m_FunctionList.nvEncInitializeEncoder(m_pEncoder, &InitializationParameters));
    // Picture parameters that are known ahead of encoding
    m_PictureParameters = { NV_ENC_PIC_PARAMS_VER, 0 };
    m_PictureParameters.codecPicParams.h264PicParams.sliceMode = 0;
    m_PictureParameters.codecPicParams.h264PicParams.sliceModeData = 0;
    m_PictureParameters.inputWidth = m_NvidiaNvencCodecContext.GetWidth();
    m_PictureParameters.inputHeight = m_NvidiaNvencCodecContext.GetHeight();
    m_PictureParameters.bufferFmt = NV_ENC_BUFFER_FORMAT_NV12_PL;
    m_PictureParameters.inputPitch = static_cast<uint32_t>(m_nNv12BufferPitch);
    m_PictureParameters.pictureStruct = NV_ENC_PIC_STRUCT_FRAME;
#ifdef ASYNCHRONOUS
    // Async mode: completion signalled via a Win32 auto-reset event.
    m_hCompletionEvent = CreateEvent(NULL, FALSE, FALSE, NULL);
    m_EventParameters = { NV_ENC_EVENT_PARAMS_VER, 0 };
    m_EventParameters.completionEvent = m_hCompletionEvent;
    CHECK_NVENC_STATUS(m_FunctionList.nvEncRegisterAsyncEvent(m_pEncoder, &m_EventParameters));
    m_PictureParameters.completionEvent = m_hCompletionEvent;
#endif
    // Register CUDA input pointer
    NV_ENC_REGISTER_RESOURCE RegisterResource = { NV_ENC_REGISTER_RESOURCE_VER, NV_ENC_INPUT_RESOURCE_TYPE_CUDADEVICEPTR, m_NvidiaNvencCodecContext.GetWidth(), m_NvidiaNvencCodecContext.GetHeight(), static_cast<uint32_t>(m_nNv12BufferPitch), 0, reinterpret_cast<void*>(m_pNv12Buffer), NULL, NV_ENC_BUFFER_FORMAT_NV12_PL };
    CHECK_NVENC_STATUS(m_FunctionList.nvEncRegisterResource(m_pEncoder, &RegisterResource));
    NV_ENC_MAP_INPUT_RESOURCE MapInputResource = { NV_ENC_MAP_INPUT_RESOURCE_VER, 0, 0, RegisterResource.registeredResource };
    m_pRegisteredResource = RegisterResource.registeredResource;
    CHECK_NVENC_STATUS(m_FunctionList.nvEncMapInputResource(m_pEncoder, &MapInputResource));
    m_PictureParameters.inputBuffer = MapInputResource.mappedResource;
    // Create output bitstream buffer
    m_nOutputBitstreamSize = 2 * 1024 * 1024;
    NV_ENC_CREATE_BITSTREAM_BUFFER CreateBitstreamBuffer = { NV_ENC_CREATE_BITSTREAM_BUFFER_VER, m_nOutputBitstreamSize, NV_ENC_MEMORY_HEAP_AUTOSELECT, 0 };
    CHECK_NVENC_STATUS(m_FunctionList.nvEncCreateBitstreamBuffer(m_pEncoder, &CreateBitstreamBuffer));
    m_pOutputBitstream = CreateBitstreamBuffer.bitstreamBuffer;
    m_PictureParameters.outputBitstream = m_pOutputBitstream;
    if (m_NvidiaNvencCodecContext.GetSaveOutputToFile())
    {
        // One output file per codec instance.
        char pOutputFilename[MAX_PATH];
        sprintf_s(pOutputFilename, "nvenc-%d.h264", nCodecInstanceId);
        if (fopen_s(&m_pOutputFile, pOutputFilename, "wb") != 0)
        {
            throw std::runtime_error(std::string("could not open ").append(pOutputFilename).append(" for writing!"));
        }
    }
}
int main(int argc, char * argv[]) { CBlasSide side; CBlasUplo uplo; CBlasTranspose trans; CBlasDiag diag; size_t m, n; int d = 0; if (argc < 7 || argc > 8) { fprintf(stderr, "Usage: %s <side> <uplo> <trans> <diag> <m> <n> [device]\n" "where:\n" " side is 'l' or 'L' for CBlasLeft and 'r' or 'R' for CBlasRight\n" " uplo is 'u' or 'U' for CBlasUpper and 'l' or 'L' for CBlasLower\n" " trans is 'n' or 'N' for CBlasNoTrans, 't' or 'T' for CBlasTrans or 'c' or 'C' for CBlasConjTrans\n" " diag is 'n' or 'N' for CBlasNonUnit and 'u' or 'U' for CBlasUnit\n" " m and n are the sizes of the matrices\n" " device is the GPU to use (default 0)\n", argv[0]); return 1; } char s; if (sscanf(argv[1], "%c", &s) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[1]); return 1; } switch (s) { case 'L': case 'l': side = CBlasLeft; break; case 'R': case 'r': side = CBlasRight; break; default: fprintf(stderr, "Unknown side '%c'\n", s); return 1; } char u; if (sscanf(argv[2], "%c", &u) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[2]); return 2; } switch (u) { case 'U': case 'u': uplo = CBlasUpper; break; case 'L': case 'l': uplo = CBlasLower; break; default: fprintf(stderr, "Unknown uplo '%c'\n", u); return 2; } char t; if (sscanf(argv[3], "%c", &t) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[3]); return 3; } switch (t) { case 'N': case 'n': trans = CBlasNoTrans; break; case 'T': case 't': trans = CBlasTrans; break; case 'C': case 'c': trans = CBlasConjTrans; break; default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 3; } char di; if (sscanf(argv[4], "%c", &di) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[4]); return 4; } switch (di) { case 'N': case 'n': diag = CBlasNonUnit; break; case 'U': case 'u': diag = CBlasUnit; break; default: fprintf(stderr, "Unknown diag '%c'\n", t); return 4; } if (sscanf(argv[5], "%zu", &m) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]); 
return 5; } if (sscanf(argv[6], "%zu", &n) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[6]); return 6; } if (argc > 7) { if (sscanf(argv[7], "%d", &d) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[7]); return 7; } } srand(0); double complex alpha, * A, * B, * refB; CUdeviceptr dA, dB, dX; size_t lda, ldb, dlda, dldb, dldx; CU_ERROR_CHECK(cuInit(0)); CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device)); CUBLAShandle handle; CU_ERROR_CHECK(cuBLASCreate(&handle)); alpha = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I; if (side == CBlasLeft) { lda = m; if ((A = malloc(lda * m * sizeof(double complex))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, m * sizeof(double complex), m, sizeof(double complex))); dlda /= sizeof(double complex); for (size_t j = 0; j < m; j++) { for (size_t i = 0; i < m; i++) A[j * lda + i] = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double complex), m * sizeof(double complex), m }; CU_ERROR_CHECK(cuMemcpy2D(©)); } else { lda = n; if ((A = malloc(lda * n * sizeof(double complex))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, n * sizeof(double complex), n, sizeof(double complex))); dlda /= sizeof(double complex); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < n; i++) A[j * lda + i] = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double complex), n * sizeof(double complex), n }; 
CU_ERROR_CHECK(cuMemcpy2D(©)); } ldb = m; if ((B = malloc(ldb * n * sizeof(double complex))) == NULL) { fputs("Unable to allocate B\n", stderr); return -3; } if ((refB = malloc(ldb * n * sizeof(double complex))) == NULL) { fputs("Unable to allocate refB\n", stderr); return -4; } CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, m * sizeof(double complex), n, sizeof(double complex))); dldb /= sizeof(double complex); CU_ERROR_CHECK(cuMemAllocPitch(&dX, &dldx, m * sizeof(double complex), n, sizeof(double complex))); dldx /= sizeof(double complex); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < m; i++) refB[j * ldb + i] = B[j * ldb + i] = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(double complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(double complex), m * sizeof(double complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); ztrmm_ref(side, uplo, trans, diag, m, n, alpha, A, lda, refB, ldb); CU_ERROR_CHECK(cuZtrmm2(handle, side, uplo, trans, diag, m, n, alpha, dA, dlda, dB, dldb, dX, dldx, NULL)); copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dX, NULL, dldx * sizeof(double complex), 0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(double complex), m * sizeof(double complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); bool passed = true; double rdiff = 0.0, idiff = 0.0; for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < m; i++) { double d = fabs(creal(B[j * ldb + i]) - creal(refB[j * ldb + i])); if (d > rdiff) rdiff = d; double c = fabs(cimag(B[j * ldb + i]) - cimag(refB[j * ldb + i])); if (c > idiff) idiff = c; size_t flops; if (side == CBlasLeft) flops = 2 * i + 1; else flops = 2 * j + 1; if (diag == CBlasNonUnit) flops++; flops *= 3; if (d > (double)flops * 2.0 * DBL_EPSILON || c > (double)flops * 2.0 * DBL_EPSILON) passed = false; } } CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC)); 
CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventRecord(start, NULL)); for (size_t i = 0; i < 20; i++) CU_ERROR_CHECK(cuZtrmm2(handle, side, uplo, trans, diag, m, n, alpha, dA, dlda, dB, dldb, dX, dldx, NULL)); CU_ERROR_CHECK(cuEventRecord(stop, NULL)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= 20; CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); const size_t flops = (side == CBlasLeft) ? (6 * (n * m * (m + 1) / 2) + 2 * (n * m * (m - 1) / 2)) : (6 * (m * n * (n + 1) / 2) + 2 * (m * n * (n - 1) / 2)); fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e + %.3ei\n%sED!\n", time * 1.e-3f, ((float)flops * 1.e-6f) / time, rdiff, idiff, (passed) ? "PASS" : "FAIL"); free(A); free(B); free(refB); CU_ERROR_CHECK(cuMemFree(dA)); CU_ERROR_CHECK(cuMemFree(dB)); CU_ERROR_CHECK(cuMemFree(dX)); CU_ERROR_CHECK(cuBLASDestroy(handle)); CU_ERROR_CHECK(cuCtxDestroy(context)); return (int)!passed; }
/*
 * vq_handle_output - virtio backend dispatcher for guest CUDA driver calls.
 *
 * Pops request elements off the virtqueue; out_sg[0] carries a struct param
 * describing the call and its arguments, extra out_sg/in_sg entries carry
 * bulk data.  The CUresult (or a host-side error code 999/9999) is written
 * back into p->result, the element is pushed, and the guest is notified.
 *
 * FIXES over the previous revision:
 *  - CUMODULELOAD sized binname with strlen(getenv("QEMU_NFS_PATH")+1),
 *    i.e. strlen of the string starting at its second byte, and never added
 *    room for the NUL terminator — the buffer was 2 bytes short (heap
 *    overflow).  Now strlen(nfs_path) + strlen(name) + 1.
 *  - getenv() may return NULL; now checked.
 *  - The malloc-failure path set p->result = 0 (== CUDA_SUCCESS) and pushed
 *    the element inside the case, so the push after the switch ran a second
 *    time on the same element.  Now reports 999 and pushes exactly once.
 *  - binname (CUMODULELOAD), name (CUMODULEGETGLOBAL) and args
 *    (CULAUNCHKERNEL) leaked on some paths; all are freed now.
 *  - CUMODULEGETGLOBAL copied an unbounded guest string into a 100-byte
 *    buffer; the copy is now bounded.
 */
static void vq_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtQueueElement elem;

    while (virtqueue_pop(vq, &elem)) {
        struct param *p = elem.out_sg[0].iov_base;

        /* for all library routines: get required arguments from buffer, execute, and push results back in virtqueue */
        switch (p->syscall_type) {
        case CUINIT:
        {
            p->result = cuInit(p->flags);
            break;
        }
        case CUDRIVERGETVERSION:
        {
            p->result = cuDriverGetVersion(&p->val1);
            break;
        }
        case CUDEVICEGETCOUNT:
        {
            p->result = cuDeviceGetCount(&p->val1);
            break;
        }
        case CUDEVICEGET:
        {
            p->result = cuDeviceGet(&p->device, p->val1);
            break;
        }
        case CUDEVICECOMPUTECAPABILITY:
        {
            p->result = cuDeviceComputeCapability(&p->val1, &p->val2, p->device);
            break;
        }
        case CUDEVICEGETNAME:
        {
            p->result = cuDeviceGetName(elem.in_sg[0].iov_base, p->val1, p->device);
            break;
        }
        case CUDEVICEGETATTRIBUTE:
        {
            p->result = cuDeviceGetAttribute(&p->val1, p->attrib, p->device);
            break;
        }
        case CUCTXCREATE:
        {
            p->result = cuCtxCreate(&p->ctx, p->flags, p->device);
            break;
        }
        case CUCTXDESTROY:
        {
            p->result = cuCtxDestroy(p->ctx);
            break;
        }
        case CUCTXGETCURRENT:
        {
            p->result = cuCtxGetCurrent(&p->ctx);
            break;
        }
        case CUCTXGETDEVICE:
        {
            p->result = cuCtxGetDevice(&p->device);
            break;
        }
        case CUCTXPOPCURRENT:
        {
            p->result = cuCtxPopCurrent(&p->ctx);
            break;
        }
        case CUCTXSETCURRENT:
        {
            p->result = cuCtxSetCurrent(p->ctx);
            break;
        }
        case CUCTXSYNCHRONIZE:
        {
            p->result = cuCtxSynchronize();
            break;
        }
        case CUMODULELOAD:
        {
            /* hardcoded path - needs improvement */
            /* all .cubin files should be stored in $QEMU_NFS_PATH - currently
             * $QEMU_NFS_PATH is shared between host and guest with NFS */
            /* change current CUDA context first - each CUDA context has its own
             * virtual memory space; isolation is ensured by switching contexts.
             * Doing this before the allocation also avoids leaking binname. */
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            char *nfs_path = getenv("QEMU_NFS_PATH");
            if (!nfs_path) {
                /* environment not configured on the host */
                p->result = 999;
                break;
            }
            /* FIX: correct size — both string lengths plus the terminator */
            char *binname = malloc(strlen(nfs_path) + strlen((char *)elem.out_sg[1].iov_base) + 1);
            if (!binname) {
                /* FIX: was p->result = 0 (success) plus a duplicate push */
                p->result = 999;
                break;
            }
            strcpy(binname, nfs_path);
            strcat(binname, (char *)elem.out_sg[1].iov_base);
            p->result = cuModuleLoad(&p->module, binname);
            free(binname);
            break;
        }
        case CUMODULEGETGLOBAL:
        {
            char *name = malloc(100*sizeof(char));
            if (!name) {
                p->result = 999;
                break;
            }
            /* FIX: bound the copy — the guest string length is untrusted */
            strncpy(name, (char *)elem.out_sg[1].iov_base, 99);
            name[99] = '\0';
            p->result = cuModuleGetGlobal(&p->dptr, &p->size1, p->module, (const char *)name);
            free(name); /* FIX: was leaked */
            break;
        }
        case CUMODULEUNLOAD:
        {
            p->result = cuModuleUnload(p->module);
            break;
        }
        case CUMEMALLOC:
        {
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAlloc(&p->dptr, p->bytesize);
            break;
        }
        case CUMEMALLOCPITCH:
        {
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAllocPitch(&p->dptr, &p->size3, p->size1, p->size2, p->bytesize);
            break;
        }
        /* large buffers are allocated in smaller chunks in guest kernel space;
         * get each chunk separately and copy it to device memory */
        case CUMEMCPYHTOD:
        {
            int i;
            size_t offset;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.out_sg[1+2*i+1].iov_base;
                p->result = cuMemcpyHtoD(p->dptr + offset, elem.out_sg[1+2*i].iov_base, s);
                if (p->result != 0) break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYHTODASYNC:
        {
            int i;
            size_t offset;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.out_sg[1+2*i+1].iov_base;
                p->result = cuMemcpyHtoDAsync(p->dptr + offset, elem.out_sg[1+2*i].iov_base, s, p->stream);
                if (p->result != 0) break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTODASYNC:
        {
            p->result = cuMemcpyDtoDAsync(p->dptr, p->dptr1, p->size1, p->stream);
            break;
        }
        case CUMEMCPYDTOH:
        {
            int i;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            size_t offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.in_sg[0+2*i+1].iov_base;
                p->result = cuMemcpyDtoH(elem.in_sg[0+2*i].iov_base, p->dptr + offset, s);
                if (p->result != 0) break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTOHASYNC:
        {
            int i;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            size_t offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.in_sg[0+2*i+1].iov_base;
                p->result = cuMemcpyDtoHAsync(elem.in_sg[0+2*i].iov_base, p->dptr + offset, s, p->stream);
                if (p->result != 0) break;
                offset += s;
            }
            break;
        }
        case CUMEMSETD32:
        {
            p->result = cuMemsetD32(p->dptr, p->bytecount, p->bytesize);
            break;
        }
        case CUMEMFREE:
        {
            p->result = cuMemFree(p->dptr);
            break;
        }
        case CUMODULEGETFUNCTION:
        {
            char *name = (char *)elem.out_sg[1].iov_base;
            name[p->length] = '\0';
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuModuleGetFunction(&p->function, p->module, name);
            break;
        }
        case CULAUNCHKERNEL:
        {
            void **args = malloc(p->val1*sizeof(void *));
            if (!args) {
                p->result = 9999;
                break;
            }
            int i;
            for (i = 0; i < p->val1; i++) {
                args[i] = elem.out_sg[1+i].iov_base;
            }
            if (cuCtxSetCurrent(p->ctx) != 0) {
                free(args); /* FIX: was leaked on this path */
                p->result = 999;
                break;
            }
            p->result = cuLaunchKernel(p->function,
                                       p->gridDimX, p->gridDimY, p->gridDimZ,
                                       p->blockDimX, p->blockDimY, p->blockDimZ,
                                       p->bytecount, 0, args, 0);
            free(args);
            break;
        }
        case CUEVENTCREATE:
        {
            p->result = cuEventCreate(&p->event1, p->flags);
            break;
        }
        case CUEVENTDESTROY:
        {
            p->result = cuEventDestroy(p->event1);
            break;
        }
        case CUEVENTRECORD:
        {
            p->result = cuEventRecord(p->event1, p->stream);
            break;
        }
        case CUEVENTSYNCHRONIZE:
        {
            p->result = cuEventSynchronize(p->event1);
            break;
        }
        case CUEVENTELAPSEDTIME:
        {
            p->result = cuEventElapsedTime(&p->pMilliseconds, p->event1, p->event2);
            break;
        }
        case CUSTREAMCREATE:
        {
            p->result = cuStreamCreate(&p->stream, 0);
            break;
        }
        case CUSTREAMSYNCHRONIZE:
        {
            p->result = cuStreamSynchronize(p->stream);
            break;
        }
        case CUSTREAMQUERY:
        {
            p->result = cuStreamQuery(p->stream);
            break;
        }
        case CUSTREAMDESTROY:
        {
            p->result = cuStreamDestroy(p->stream);
            break;
        }
        default:
            printf("Unknown syscall_type\n");
        }
        /* single completion push per element */
        virtqueue_push(vq, &elem, 0);
    }

    /* notify frontend - trigger virtual interrupt */
    virtio_notify(vdev, vq);
    return;
}
int main() { CU_ERROR_CHECK(cuInit(0)); int count; CU_ERROR_CHECK(cuDeviceGetCount(&count)); for (int i = 0; i < count; i++) { CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, i)); int memoryClockRate, globalMemoryBusWidth; CU_ERROR_CHECK(cuDeviceGetAttribute(&memoryClockRate, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device)); CU_ERROR_CHECK(cuDeviceGetAttribute(&globalMemoryBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device)); // Calculate pin bandwidth in bytes/sec (clock rate is actual in kHz, memory is DDR so multiply clock rate by 2.e3 to get effective clock rate in Hz) double pinBandwidth = memoryClockRate * 2.e3 * (globalMemoryBusWidth / CHAR_BIT); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, 0, device)); fprintf(stdout, "Device %d (pin bandwidth %6.2f GB/s):\n", i, pinBandwidth / (1 << 30)); CUDA_MEMCPY2D copy; copy.srcMemoryType = CU_MEMORYTYPE_DEVICE; copy.dstMemoryType = CU_MEMORYTYPE_DEVICE; CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_DEFAULT)); CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_DEFAULT)); float time; // Calculate aligned copy for 32, 64 and 128-bit word sizes for (unsigned int j = 4; j <= 16; j *= 2) { copy.WidthInBytes = SIZE; copy.Height = 1; copy.srcXInBytes = 0; copy.srcY = 0; copy.dstXInBytes = 0; copy.dstY = 0; CU_ERROR_CHECK(cuMemAllocPitch(©.srcDevice, ©.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuMemAllocPitch(©.dstDevice, ©.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuEventRecord(start, 0)); for (size_t i = 0; i < ITERATIONS; i++) CU_ERROR_CHECK(cuMemcpy2D(©)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= ITERATIONS * 1.e3f; double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time; fprintf(stdout, "\taligned copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth 
/ pinBandwidth) * 100.0); CU_ERROR_CHECK(cuMemFree(copy.srcDevice)); CU_ERROR_CHECK(cuMemFree(copy.dstDevice)); } // Calculate misaligned copy for 32, 64 and 128-bit word sizes for (unsigned int j = 4; j <= 16; j *= 2) { copy.WidthInBytes = SIZE; copy.Height = 1; copy.srcXInBytes = j; copy.srcY = 0; copy.dstXInBytes = j; copy.dstY = 0; CU_ERROR_CHECK(cuMemAllocPitch(©.srcDevice, ©.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuMemAllocPitch(©.dstDevice, ©.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuEventRecord(start, 0)); for (size_t j = 0; j < ITERATIONS; j++) CU_ERROR_CHECK(cuMemcpy2D(©)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= ITERATIONS * 1.e3f; double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time; fprintf(stdout, "\tmisaligned copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0); CU_ERROR_CHECK(cuMemFree(copy.srcDevice)); CU_ERROR_CHECK(cuMemFree(copy.dstDevice)); } // Calculate stride-2 copy for 32, 64 and 128-bit word sizes for (unsigned int j = 4; j <= 16; j *= 2) { copy.WidthInBytes = SIZE / 2; copy.Height = 1; copy.srcXInBytes = 0; copy.srcY = 0; copy.dstXInBytes = 0; copy.dstY = 0; CU_ERROR_CHECK(cuMemAllocPitch(©.srcDevice, ©.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuMemAllocPitch(©.dstDevice, ©.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j)); copy.srcPitch *= 2; copy.dstPitch *= 2; CU_ERROR_CHECK(cuEventRecord(start, 0)); for (size_t i = 0; i < ITERATIONS; i++) CU_ERROR_CHECK(cuMemcpy2D(©)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= ITERATIONS * 1.e3f; double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time; 
fprintf(stdout, "\tstride-2 copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0); CU_ERROR_CHECK(cuMemFree(copy.srcDevice)); CU_ERROR_CHECK(cuMemFree(copy.dstDevice)); } // Calculate stride-10 copy for 32, 64 and 128-bit word sizes for (unsigned int j = 4; j <= 16; j *= 2) { copy.WidthInBytes = SIZE / 10; copy.Height = 1; copy.srcXInBytes = 0; copy.srcY = 0; copy.dstXInBytes = 0; copy.dstY = 0; CU_ERROR_CHECK(cuMemAllocPitch(©.srcDevice, ©.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuMemAllocPitch(©.dstDevice, ©.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j)); copy.srcPitch *= 10; copy.dstPitch *= 10; CU_ERROR_CHECK(cuEventRecord(start, 0)); for (size_t i = 0; i < ITERATIONS; i++) CU_ERROR_CHECK(cuMemcpy2D(©)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= ITERATIONS * 1.e3f; double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time; fprintf(stdout, "\tstride-10 copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0); CU_ERROR_CHECK(cuMemFree(copy.srcDevice)); CU_ERROR_CHECK(cuMemFree(copy.dstDevice)); } // Calculate stride-1000 copy for 32, 64 and 128-bit word sizes for (unsigned int j = 4; j <= 16; j *= 2) { copy.WidthInBytes = SIZE / 1000; copy.Height = 1; copy.srcXInBytes = 0; copy.srcY = 0; copy.dstXInBytes = 0; copy.dstY = 0; CU_ERROR_CHECK(cuMemAllocPitch(©.srcDevice, ©.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuMemAllocPitch(©.dstDevice, ©.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j)); copy.srcPitch *= 1000; copy.dstPitch *= 1000; CU_ERROR_CHECK(cuEventRecord(start, 0)); for (size_t j = 0; j < ITERATIONS; j++) CU_ERROR_CHECK(cuMemcpy2D(©)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); 
CU_ERROR_CHECK(cuEventSynchronize(stop)); CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= ITERATIONS * 1.e3f; double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time; fprintf(stdout, "\tstride-1000 copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0); CU_ERROR_CHECK(cuMemFree(copy.srcDevice)); CU_ERROR_CHECK(cuMemFree(copy.dstDevice)); } CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); CU_ERROR_CHECK(cuCtxDestroy(context)); } return 0; }
int main(int argc, char * argv[]) { CBlasUplo uplo; size_t n; int d = 0; if (argc < 3 || argc > 4) { fprintf(stderr, "Usage: %s <uplo> <n>\n" "where:\n" " uplo is 'u' or 'U' for CBlasUpper or 'l' or 'L' for CBlasLower\n" " n is the size of the matrix\n" " device is the GPU to use (default 0)\n", argv[0]); return 1; } char u; if (sscanf(argv[1], "%c", &u) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[1]); return 1; } switch (u) { case 'U': case 'u': uplo = CBlasUpper; break; case 'L': case 'l': uplo = CBlasLower; break; default: fprintf(stderr, "Unknown uplo '%c'\n", u); return 1; } if (sscanf(argv[2], "%zu", &n) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[2]); return 2; } if (argc > 3) { if (sscanf(argv[3], "%d", &d) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]); return 3; } } srand(0); double * A, * refA; CUdeviceptr dA; size_t lda, dlda; long info, rInfo; CU_ERROR_CHECK(cuInit(0)); CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device)); CULAPACKhandle handle; CU_ERROR_CHECK(cuLAPACKCreate(&handle)); lda = (n + 1u) & ~1u; if ((A = malloc(lda * n * sizeof(double))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } if ((refA = malloc(lda * n * sizeof(double))) == NULL) { fputs("Unable to allocate refA\n", stderr); return -2; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, n * sizeof(double), n, sizeof(double))); dlda /= sizeof(double); if (dlatmc(n, 2.0, A, lda) != 0) { fputs("Unable to initialise A\n", stderr); return -1; } // dpotrf(uplo, n, A, lda, &info); // if (info != 0) { // fputs("Failed to compute Cholesky decomposition of A\n", stderr); // return (int)info; // } for (size_t j = 0; j < n; j++) memcpy(&refA[j * lda], &A[j * lda], n * sizeof(double)); CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * 
sizeof(double), n * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); dlauum_ref(uplo, n, refA, lda, &rInfo); CU_ERROR_CHECK(cuDlauum(handle, uplo, n, dA, dlda, &info)); copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double), 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double), n * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); bool passed = (info == rInfo); double diff = 0.0; for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < n; i++) { double d = fabs(A[j * lda + i] - refA[j * lda + i]); if (d > diff) diff = d; } } // Set A to identity so that repeated applications of the cholesky // decomposition while benchmarking do not exit early due to // non-positive-definite-ness. for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < n; i++) A[j * lda + i] = (i == j) ? 1.0 : 0.0; } copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double), n * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventRecord(start, NULL)); for (size_t i = 0; i < 20; i++) CU_ERROR_CHECK(cuDlauum(handle, uplo, n, dA, dlda, &info)); CU_ERROR_CHECK(cuEventRecord(stop, NULL)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= 20; CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); const size_t flops = ((n * n * n) / 3) + ((n * n) / 2) + (n / 6); fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e\n%sED!\n", time * 1.e-3f, ((float)flops * 1.e-6f) / time, diff, (passed) ? "PASS" : "FAIL"); free(A); free(refA); CU_ERROR_CHECK(cuMemFree(dA)); CU_ERROR_CHECK(cuLAPACKDestroy(handle)); CU_ERROR_CHECK(cuCtxDestroy(context)); return (int)!passed; }
int main(int argc, char * argv[]) { CBlasTranspose transA, transB; size_t m, n, k; int d = 0; if (argc < 6 || argc > 7) { fprintf(stderr, "Usage: %s <transA> <transB> <m> <n> <k> [device]\n" "where:\n" " transA and transB are 'n' or 'N' for CBlasNoTrans, 't' or 'T' for CBlasTrans or 'c' or 'C' for CBlasConjTrans\n" " m, n and k are the sizes of the matrices\n" " device is the GPU to use (default 0)\n", argv[0]); return 1; } char t; if (sscanf(argv[1], "%c", &t) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[1]); return 1; } switch (t) { case 'N': case 'n': transA = CBlasNoTrans; break; case 'T': case 't': transA = CBlasTrans; break; case 'C': case 'c': transA = CBlasConjTrans; break; default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 1; } if (sscanf(argv[2], "%c", &t) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[2]); return 2; } switch (t) { case 'N': case 'n': transB = CBlasNoTrans; break; case 'T': case 't': transB = CBlasTrans; break; case 'C': case 'c': transB = CBlasConjTrans; break; default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 1; } if (sscanf(argv[3], "%zu", &m) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]); return 3; } if (sscanf(argv[4], "%zu", &n) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[4]); return 4; } if (sscanf(argv[5], "%zu", &k) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]); return 5; } if (argc > 6) { if (sscanf(argv[6], "%d", &d) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[6]); return 6; } } srand(0); float complex alpha, beta, * A, * B, * C, * refC; CUdeviceptr dA, dB, dC, dD; size_t lda, ldb, ldc, dlda, dldb, dldc, dldd; CU_ERROR_CHECK(cuInit(0)); CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device)); CUBLAShandle handle; CU_ERROR_CHECK(cuBLASCreate(&handle)); alpha = 
((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; beta = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; if (transA == CBlasNoTrans) { lda = (m + 1u) & ~1u; if ((A = malloc(lda * k * sizeof(float complex))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, m * sizeof(float complex), k, sizeof(float complex))); dlda /= sizeof(float complex); for (size_t j = 0; j < k; j++) { for (size_t i = 0; i < m; i++) A[j * lda + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(float complex), m * sizeof(float complex), k }; CU_ERROR_CHECK(cuMemcpy2D(©)); } else { lda = (k + 1u) & ~1u; if ((A = malloc(lda * m * sizeof(float complex))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, k * sizeof(float complex), m, sizeof(float complex))); dlda /= sizeof(float complex); for (size_t j = 0; j < m; j++) { for (size_t i = 0; i < k; i++) A[j * lda + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(float complex), k * sizeof(float complex), m }; CU_ERROR_CHECK(cuMemcpy2D(©)); } if (transB == CBlasNoTrans) { ldb = (k + 1u) & ~1u; if ((B = malloc(ldb * n * sizeof(float complex))) == NULL) { fputs("Unable to allocate B\n", stderr); return -2; } CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, k * sizeof(float complex), n, sizeof(float complex))); dldb /= sizeof(float complex); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < k; i++) B[j * ldb + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, 
CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(float complex), k * sizeof(float complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); } else { ldb = (n + 1u) & ~1u; if ((B = malloc(ldb * k * sizeof(float complex))) == NULL) { fputs("Unable to allocate B\n", stderr); return -2; } CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, n * sizeof(float complex), k, sizeof(float complex))); dldb /= sizeof(float complex); for (size_t j = 0; j < k; j++) { for (size_t i = 0; i < n; i++) B[j * ldb + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(float complex), n * sizeof(float complex), k }; CU_ERROR_CHECK(cuMemcpy2D(©)); } ldc = (m + 1u) & ~1u; if ((C = malloc(ldc * n * sizeof(float complex))) == NULL) { fputs("Unable to allocate C\n", stderr); return -3; } if ((refC = malloc(ldc * n * sizeof(float complex))) == NULL) { fputs("Unable to allocate refC\n", stderr); return -4; } CU_ERROR_CHECK(cuMemAllocPitch(&dC, &dldc, m * sizeof(float complex), n, sizeof(float complex))); dldc /= sizeof(float complex); CU_ERROR_CHECK(cuMemAllocPitch(&dD, &dldd, m * sizeof(float complex), n, sizeof(float complex))); dldd /= sizeof(float complex); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < m; i++) refC[j * ldc + i] = C[j * ldc + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dC, NULL, dldc * sizeof(float complex), m * sizeof(float complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); cgemm_ref(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, refC, ldc); CU_ERROR_CHECK(cuCgemm2(handle, transA, transB, m, n, k, alpha, dA, dlda, dB, dldb, beta, dC, dldc, dD, dldd, NULL)); copy = (CUDA_MEMCPY2D){ 
0, 0, CU_MEMORYTYPE_DEVICE, NULL, dD, NULL, dldd * sizeof(float complex), 0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(float complex), m * sizeof(float complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); float rdiff = 0.0f, idiff = 0.0f; for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < m; i++) { float d = fabsf(crealf(C[j * ldc + i]) - crealf(refC[j * ldc + i])); if (d > rdiff) rdiff = d; d = fabsf(cimagf(C[j * ldc + i]) - cimagf(refC[j * ldc + i])); if (d > idiff) idiff = d; } } CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventRecord(start, NULL)); for (size_t i = 0; i < 20; i++) CU_ERROR_CHECK(cuCgemm2(handle, transA, transB, m, n, k, alpha, dA, dlda, dB, dldb, beta, dC, dldc, dD, dldd, NULL)); CU_ERROR_CHECK(cuEventRecord(stop, NULL)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= 20; CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); size_t flops = k * 6 + (k - 1) * 2; // k multiplies and k - 1 adds per element if (alpha != 1.0f + 0.0f * I) flops += 6; // additional multiply by alpha if (beta != 0.0f + 0.0f * I) flops += 8; // additional multiply and add by beta float error = (float)flops * 2.0f * FLT_EPSILON; // maximum per element error flops *= m * n; // m * n elements bool passed = (rdiff <= error) && (idiff <= error); fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e + %.3ei\n%sED!\n", time * 1.e-3f, ((float)flops * 1.e-6f) / time, rdiff, idiff, (passed) ? "PASS" : "FAIL"); free(A); free(B); free(C); free(refC); CU_ERROR_CHECK(cuMemFree(dA)); CU_ERROR_CHECK(cuMemFree(dB)); CU_ERROR_CHECK(cuMemFree(dC)); CU_ERROR_CHECK(cuMemFree(dD)); CU_ERROR_CHECK(cuBLASDestroy(handle)); CU_ERROR_CHECK(cuCtxDestroy(context)); return (int)!passed; }