// Drains one profiled interval into the in-memory profile stream.
// Emits a start record (opcode 1) and a stop record (opcode 2), each as
// {opcode, timestamp, id}, where the timestamp is the elapsed time from the
// session base event (mTimer) scaled by mManager->mTimeFormat. The per-record
// CUDA events are destroyed afterwards, the frame bounds are widened to cover
// the interval, and the interval length in milliseconds is returned.
float ApexCudaProfileSession::flushProfileInfo(ProfileData& pd)
{
    CUevent evBegin = (CUevent)pd.start;
    CUevent evEnd   = (CUevent)pd.stop;

    float beginMs = 0.f;
    float endMs   = 0.f;

    // Start record: opcode 1, timestamp relative to the session timer event.
    uint32_t opcode = 1;
    CUT_SAFE_CALL(cuEventSynchronize(evBegin));
    CUT_SAFE_CALL(cuEventElapsedTime(&beginMs, (CUevent)mTimer, evBegin));
    uint64_t beginTicks = static_cast<uint64_t>(beginMs * mManager->mTimeFormat);
    mMemBuf.write(&opcode, sizeof(opcode));
    mMemBuf.write(&beginTicks, sizeof(beginTicks));
    mMemBuf.write(&pd.id, sizeof(pd.id));

    // Stop record: opcode 2, same layout.
    opcode = 2;
    CUT_SAFE_CALL(cuEventSynchronize(evEnd));
    CUT_SAFE_CALL(cuEventElapsedTime(&endMs, (CUevent)mTimer, evEnd));
    uint64_t endTicks = static_cast<uint64_t>(endMs * mManager->mTimeFormat);
    mMemBuf.write(&opcode, sizeof(opcode));
    mMemBuf.write(&endTicks, sizeof(endTicks));
    mMemBuf.write(&pd.id, sizeof(pd.id));

    // The events are one-shot; release them now that both have been read.
    CUT_SAFE_CALL(cuEventDestroy(evBegin));
    CUT_SAFE_CALL(cuEventDestroy(evEnd));

    // Widen the frame bounds to include this interval.
    mFrameStart  = PxMin(mFrameStart, beginMs);
    mFrameFinish = PxMax(mFrameFinish, endMs);

    return endMs - beginMs;
}
/*
 * Blocks until all work recorded against buffer `b` has completed by
 * synchronizing on both its write event and its read event.  Returns
 * GA_NO_ERROR on success, or GA_IMPL_ERROR if either synchronization fails
 * (the failing CUresult is left in ctx->err).
 */
static int cuda_sync(gpudata *b) {
  cuda_context *ctx = (cuda_context *)b->ctx;
  int res = GA_NO_ERROR;

  ASSERT_BUF(b);
  cuda_enter(ctx);
  /* Wait on the write event first, then the read event, mirroring the order
   * in which they were recorded. */
  ctx->err = cuEventSynchronize(b->wev);
  if (ctx->err != CUDA_SUCCESS)
    res = GA_IMPL_ERROR;
  ctx->err = cuEventSynchronize(b->rev);
  if (ctx->err != CUDA_SUCCESS)
    res = GA_IMPL_ERROR;
  cuda_exit(ctx);
  return res;
}
/*
 * Copies `sz` bytes from device buffer `src`, starting at byte offset
 * `srcoff`, into host memory `dst`.
 *
 * Mapped buffers are read with a plain memcpy after waiting for pending
 * device writes; regular device buffers use an async DtoH copy on the
 * context's memory stream, ordered after pending writes and recorded as a
 * read so later operations can wait on it.
 *
 * Returns GA_NO_ERROR on success, GA_VALUE_ERROR if the requested range is
 * out of bounds, or GA_IMPL_ERROR on a CUDA failure (CUresult in ctx->err).
 */
static int cuda_read(void *dst, gpudata *src, size_t srcoff, size_t sz) {
  cuda_context *ctx = src->ctx;

  ASSERT_BUF(src);

  if (sz == 0) return GA_NO_ERROR;

  /* Check srcoff separately first: if srcoff > src->sz, the unsigned
   * subtraction below would wrap around to a huge value and the bounds
   * check would incorrectly pass, allowing an out-of-bounds read. */
  if (srcoff > src->sz || (src->sz - srcoff) < sz)
    return GA_VALUE_ERROR;

  cuda_enter(ctx);

  if (src->flags & CUDA_MAPPED_PTR) {
    /* Host-mapped memory: wait for outstanding device writes, then copy
     * directly on the host. */
    ctx->err = cuEventSynchronize(src->wev);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    memcpy(dst, (void *)(src->ptr + srcoff), sz);
  } else {
    /* Device memory: order the async copy after pending writes on the
     * memory stream, then record it as a read on that stream. */
    cuda_waits(src, CUDA_WAIT_READ, ctx->mem_s);
    ctx->err = cuMemcpyDtoHAsync(dst, src->ptr + srcoff, sz, ctx->mem_s);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    cuda_records(src, CUDA_WAIT_READ, ctx->mem_s);
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
/*
 * Copies `sz` bytes from host memory `src` into device buffer `dst`,
 * starting at byte offset `dstoff`.
 *
 * Mapped buffers are written with a plain memcpy after waiting for pending
 * device reads; regular device buffers use an async HtoD copy on the
 * context's memory stream, ordered after pending reads and recorded as a
 * write so later operations can wait on it.
 *
 * Returns GA_NO_ERROR on success, GA_VALUE_ERROR if the requested range is
 * out of bounds, or GA_IMPL_ERROR on a CUDA failure (CUresult in ctx->err).
 */
static int cuda_write(gpudata *dst, size_t dstoff, const void *src, size_t sz) {
  cuda_context *ctx = dst->ctx;

  ASSERT_BUF(dst);

  if (sz == 0) return GA_NO_ERROR;

  /* Check dstoff separately first: if dstoff > dst->sz, the unsigned
   * subtraction below would wrap around to a huge value and the bounds
   * check would incorrectly pass, allowing an out-of-bounds write. */
  if (dstoff > dst->sz || (dst->sz - dstoff) < sz)
    return GA_VALUE_ERROR;

  cuda_enter(ctx);

  if (dst->flags & CUDA_MAPPED_PTR) {
    /* Host-mapped memory: wait for outstanding device reads, then copy
     * directly on the host. */
    ctx->err = cuEventSynchronize(dst->rev);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    memcpy((void *)(dst->ptr + dstoff), src, sz);
  } else {
    /* Device memory: order the async copy after pending reads on the
     * memory stream, then record it as a write on that stream. */
    cuda_waits(dst, CUDA_WAIT_WRITE, ctx->mem_s);
    ctx->err = cuMemcpyHtoDAsync(dst->ptr + dstoff, src, sz, ctx->mem_s);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      return GA_IMPL_ERROR;
    }
    cuda_records(dst, CUDA_WAIT_WRITE, ctx->mem_s);
  }

  cuda_exit(ctx);
  return GA_NO_ERROR;
}
// Returns the wall time, in seconds, elapsed between the CUDA events held by
// startTag and endTag. Blocks until the end event has completed before
// querying; cuEventElapsedTime reports milliseconds, hence the 1e-3 scale.
double device_t<CUDA>::timeBetween(const tag &startTag, const tag &endTag){
  float elapsedMs = 0.0f;
  cuEventSynchronize(endTag.cuEvent);
  cuEventElapsedTime(&elapsedMs, startTag.cuEvent, endTag.cuEvent);
  return 1.0e-3 * (double) elapsedMs;
}
/** * This measures the overhead in launching a kernel function on each GPU in the * system. * * It does this by executing a small kernel (copying 1 value in global memory) a * very large number of times and taking the average execution time. This * program uses the CUDA driver API. */ int main() { CU_ERROR_CHECK(cuInit(0)); int count; CU_ERROR_CHECK(cuDeviceGetCount(&count)); float x = 5.0f; for (int d = 0; d < count; d++) { CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, 0, device)); CUdeviceptr in, out; CU_ERROR_CHECK(cuMemAlloc(&in, sizeof(float))); CU_ERROR_CHECK(cuMemAlloc(&out, sizeof(float))); CU_ERROR_CHECK(cuMemcpyHtoD(in, &x, sizeof(float))); CUmodule module; CU_ERROR_CHECK(cuModuleLoadData(&module, imageBytes)); CUfunction function; CU_ERROR_CHECK(cuModuleGetFunction(&function, module, "kernel")); void * params[] = { &in, &out }; CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, 0)); CU_ERROR_CHECK(cuEventCreate(&stop, 0)); CU_ERROR_CHECK(cuEventRecord(start, 0)); for (int i = 0; i < ITERATIONS; i++) CU_ERROR_CHECK(cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); CU_ERROR_CHECK(cuMemFree(in)); CU_ERROR_CHECK(cuMemFree(out)); fprintf(stdout, "Device %d: %fms\n", d, (time / (double)ITERATIONS)); CU_ERROR_CHECK(cuModuleUnload(module)); CU_ERROR_CHECK(cuCtxDestroy(context)); } return 0; }
// Returns the wall time, in seconds, between two recorded CUDA events passed
// as opaque pointers (each must point at a valid CUevent). Blocks until the
// end event has completed; cuEventElapsedTime reports milliseconds.
double kernel_t<CUDA>::timeTakenBetween(void *start, void *end){
  CUevent *evStart = (CUevent*) start;
  CUevent *evEnd   = (CUevent*) end;

  cuEventSynchronize(*evEnd);

  float elapsedMs = 0.0f;
  cuEventElapsedTime(&elapsedMs, *evStart, *evEnd);

  return 1.0e-3 * elapsedMs;
}
/*
 * Auto-generated R binding for cuEventSynchronize: unwraps the CUevent from
 * the R external reference, blocks until the event completes, and returns
 * the CUresult status converted to an R enum value.
 */
SEXP R_auto_cuEventSynchronize(SEXP r_hEvent)
{
    CUevent hEvent = (CUevent) getRReference(r_hEvent);
    CUresult status = cuEventSynchronize(hEvent);
    return Renum_convert_CUresult(status);
}
int main() { CUresult result; result = cuInit(0); CUdevice device; result = cuDeviceGet(&device, 0); CUcontext ctx; result = cuCtxCreate(&ctx, 0, device); CUmodule module; result = cuModuleLoad(&module, "cuda-shift-throughput.cubin"); CUfunction kernel; result = cuModuleGetFunction(&kernel, module, "kernel"); int block; result = cuFuncGetAttribute(&block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel); int grid = 1024 * 1024; CUevent event[2]; for (ptrdiff_t i = 0; i < 2; ++i) { result = cuEventCreate(&event[i], 0); } result = cuEventRecord(event[0], 0); result = cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, 0, 0, 0); result = cuEventRecord(event[1], 0); result = cuEventSynchronize(event[1]); float time; result = cuEventElapsedTime(&time, event[0], event[1]); int gpuclock; result = cuDeviceGetAttribute(&gpuclock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device); int gpump; result = cuDeviceGetAttribute(&gpump, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device); std::printf("Clock: %d KHz, # of MPs: %d\n", gpuclock, gpump); std::printf("Elapsed Time: %f milliseconds\n", time); std::printf("# of Threads: %d, # of SHLs : %lld\n", block, 1024ll * block * grid); std::printf("Throughput: %f\n", 1024.0 * block * grid / ((double) gpump * gpuclock * time)); for (ptrdiff_t i = 0; i < 2; ++i) { result = cuEventDestroy(event[i]); } result = cuModuleUnload(module); result = cuCtxDestroy(ctx); return 0; }
int main(int argc, char * argv[]) { CBlasTranspose transA, transB; size_t m, n, k; int d = 0; if (argc < 6 || argc > 7) { fprintf(stderr, "Usage: %s <transA> <transB> <m> <n> <k> [device]\n" "where:\n" " transA and transB are 'n' or 'N' for CBlasNoTrans, 't' or 'T' for CBlasTrans or 'c' or 'C' for CBlasConjTrans\n" " m, n and k are the sizes of the matrices\n" " device is the GPU to use (default 0)\n", argv[0]); return 1; } char t; if (sscanf(argv[1], "%c", &t) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[1]); return 1; } switch (t) { case 'N': case 'n': transA = CBlasNoTrans; break; case 'T': case 't': transA = CBlasTrans; break; case 'C': case 'c': transA = CBlasConjTrans; break; default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 1; } if (sscanf(argv[2], "%c", &t) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[2]); return 2; } switch (t) { case 'N': case 'n': transB = CBlasNoTrans; break; case 'T': case 't': transB = CBlasTrans; break; case 'C': case 'c': transB = CBlasConjTrans; break; default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 1; } if (sscanf(argv[3], "%zu", &m) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]); return 3; } if (sscanf(argv[4], "%zu", &n) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[4]); return 4; } if (sscanf(argv[5], "%zu", &k) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]); return 5; } if (argc > 6) { if (sscanf(argv[6], "%d", &d) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[6]); return 6; } } srand(0); float complex alpha, beta, * A, * B, * C, * refC; CUdeviceptr dA, dB, dC, dD; size_t lda, ldb, ldc, dlda, dldb, dldc, dldd; CU_ERROR_CHECK(cuInit(0)); CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device)); CUBLAShandle handle; CU_ERROR_CHECK(cuBLASCreate(&handle)); alpha = 
((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; beta = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; if (transA == CBlasNoTrans) { lda = (m + 1u) & ~1u; if ((A = malloc(lda * k * sizeof(float complex))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, m * sizeof(float complex), k, sizeof(float complex))); dlda /= sizeof(float complex); for (size_t j = 0; j < k; j++) { for (size_t i = 0; i < m; i++) A[j * lda + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(float complex), m * sizeof(float complex), k }; CU_ERROR_CHECK(cuMemcpy2D(©)); } else { lda = (k + 1u) & ~1u; if ((A = malloc(lda * m * sizeof(float complex))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, k * sizeof(float complex), m, sizeof(float complex))); dlda /= sizeof(float complex); for (size_t j = 0; j < m; j++) { for (size_t i = 0; i < k; i++) A[j * lda + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(float complex), k * sizeof(float complex), m }; CU_ERROR_CHECK(cuMemcpy2D(©)); } if (transB == CBlasNoTrans) { ldb = (k + 1u) & ~1u; if ((B = malloc(ldb * n * sizeof(float complex))) == NULL) { fputs("Unable to allocate B\n", stderr); return -2; } CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, k * sizeof(float complex), n, sizeof(float complex))); dldb /= sizeof(float complex); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < k; i++) B[j * ldb + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, 
CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(float complex), k * sizeof(float complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); } else { ldb = (n + 1u) & ~1u; if ((B = malloc(ldb * k * sizeof(float complex))) == NULL) { fputs("Unable to allocate B\n", stderr); return -2; } CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, n * sizeof(float complex), k, sizeof(float complex))); dldb /= sizeof(float complex); for (size_t j = 0; j < k; j++) { for (size_t i = 0; i < n; i++) B[j * ldb + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(float complex), n * sizeof(float complex), k }; CU_ERROR_CHECK(cuMemcpy2D(©)); } ldc = (m + 1u) & ~1u; if ((C = malloc(ldc * n * sizeof(float complex))) == NULL) { fputs("Unable to allocate C\n", stderr); return -3; } if ((refC = malloc(ldc * n * sizeof(float complex))) == NULL) { fputs("Unable to allocate refC\n", stderr); return -4; } CU_ERROR_CHECK(cuMemAllocPitch(&dC, &dldc, m * sizeof(float complex), n, sizeof(float complex))); dldc /= sizeof(float complex); CU_ERROR_CHECK(cuMemAllocPitch(&dD, &dldd, m * sizeof(float complex), n, sizeof(float complex))); dldd /= sizeof(float complex); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < m; i++) refC[j * ldc + i] = C[j * ldc + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dC, NULL, dldc * sizeof(float complex), m * sizeof(float complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); cgemm_ref(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, refC, ldc); CU_ERROR_CHECK(cuCgemm2(handle, transA, transB, m, n, k, alpha, dA, dlda, dB, dldb, beta, dC, dldc, dD, dldd, NULL)); copy = (CUDA_MEMCPY2D){ 
0, 0, CU_MEMORYTYPE_DEVICE, NULL, dD, NULL, dldd * sizeof(float complex), 0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(float complex), m * sizeof(float complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); float rdiff = 0.0f, idiff = 0.0f; for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < m; i++) { float d = fabsf(crealf(C[j * ldc + i]) - crealf(refC[j * ldc + i])); if (d > rdiff) rdiff = d; d = fabsf(cimagf(C[j * ldc + i]) - cimagf(refC[j * ldc + i])); if (d > idiff) idiff = d; } } CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventRecord(start, NULL)); for (size_t i = 0; i < 20; i++) CU_ERROR_CHECK(cuCgemm2(handle, transA, transB, m, n, k, alpha, dA, dlda, dB, dldb, beta, dC, dldc, dD, dldd, NULL)); CU_ERROR_CHECK(cuEventRecord(stop, NULL)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= 20; CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); size_t flops = k * 6 + (k - 1) * 2; // k multiplies and k - 1 adds per element if (alpha != 1.0f + 0.0f * I) flops += 6; // additional multiply by alpha if (beta != 0.0f + 0.0f * I) flops += 8; // additional multiply and add by beta float error = (float)flops * 2.0f * FLT_EPSILON; // maximum per element error flops *= m * n; // m * n elements bool passed = (rdiff <= error) && (idiff <= error); fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e + %.3ei\n%sED!\n", time * 1.e-3f, ((float)flops * 1.e-6f) / time, rdiff, idiff, (passed) ? "PASS" : "FAIL"); free(A); free(B); free(C); free(refC); CU_ERROR_CHECK(cuMemFree(dA)); CU_ERROR_CHECK(cuMemFree(dB)); CU_ERROR_CHECK(cuMemFree(dC)); CU_ERROR_CHECK(cuMemFree(dD)); CU_ERROR_CHECK(cuBLASDestroy(handle)); CU_ERROR_CHECK(cuCtxDestroy(context)); return (int)!passed; }
int main() { CU_ERROR_CHECK(cuInit(0)); int count; CU_ERROR_CHECK(cuDeviceGetCount(&count)); for (int i = 0; i < count; i++) { CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, i)); int memoryClockRate, globalMemoryBusWidth; CU_ERROR_CHECK(cuDeviceGetAttribute(&memoryClockRate, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device)); CU_ERROR_CHECK(cuDeviceGetAttribute(&globalMemoryBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device)); // Calculate pin bandwidth in bytes/sec (clock rate is actual in kHz, memory is DDR so multiply clock rate by 2.e3 to get effective clock rate in Hz) double pinBandwidth = memoryClockRate * 2.e3 * (globalMemoryBusWidth / CHAR_BIT); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, 0, device)); fprintf(stdout, "Device %d (pin bandwidth %6.2f GB/s):\n", i, pinBandwidth / (1 << 30)); CUDA_MEMCPY2D copy; copy.srcMemoryType = CU_MEMORYTYPE_DEVICE; copy.dstMemoryType = CU_MEMORYTYPE_DEVICE; CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_DEFAULT)); CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_DEFAULT)); float time; // Calculate aligned copy for 32, 64 and 128-bit word sizes for (unsigned int j = 4; j <= 16; j *= 2) { copy.WidthInBytes = SIZE; copy.Height = 1; copy.srcXInBytes = 0; copy.srcY = 0; copy.dstXInBytes = 0; copy.dstY = 0; CU_ERROR_CHECK(cuMemAllocPitch(©.srcDevice, ©.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuMemAllocPitch(©.dstDevice, ©.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuEventRecord(start, 0)); for (size_t i = 0; i < ITERATIONS; i++) CU_ERROR_CHECK(cuMemcpy2D(©)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= ITERATIONS * 1.e3f; double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time; fprintf(stdout, "\taligned copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth 
/ pinBandwidth) * 100.0); CU_ERROR_CHECK(cuMemFree(copy.srcDevice)); CU_ERROR_CHECK(cuMemFree(copy.dstDevice)); } // Calculate misaligned copy for 32, 64 and 128-bit word sizes for (unsigned int j = 4; j <= 16; j *= 2) { copy.WidthInBytes = SIZE; copy.Height = 1; copy.srcXInBytes = j; copy.srcY = 0; copy.dstXInBytes = j; copy.dstY = 0; CU_ERROR_CHECK(cuMemAllocPitch(©.srcDevice, ©.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuMemAllocPitch(©.dstDevice, ©.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuEventRecord(start, 0)); for (size_t j = 0; j < ITERATIONS; j++) CU_ERROR_CHECK(cuMemcpy2D(©)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= ITERATIONS * 1.e3f; double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time; fprintf(stdout, "\tmisaligned copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0); CU_ERROR_CHECK(cuMemFree(copy.srcDevice)); CU_ERROR_CHECK(cuMemFree(copy.dstDevice)); } // Calculate stride-2 copy for 32, 64 and 128-bit word sizes for (unsigned int j = 4; j <= 16; j *= 2) { copy.WidthInBytes = SIZE / 2; copy.Height = 1; copy.srcXInBytes = 0; copy.srcY = 0; copy.dstXInBytes = 0; copy.dstY = 0; CU_ERROR_CHECK(cuMemAllocPitch(©.srcDevice, ©.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuMemAllocPitch(©.dstDevice, ©.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j)); copy.srcPitch *= 2; copy.dstPitch *= 2; CU_ERROR_CHECK(cuEventRecord(start, 0)); for (size_t i = 0; i < ITERATIONS; i++) CU_ERROR_CHECK(cuMemcpy2D(©)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= ITERATIONS * 1.e3f; double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time; 
fprintf(stdout, "\tstride-2 copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0); CU_ERROR_CHECK(cuMemFree(copy.srcDevice)); CU_ERROR_CHECK(cuMemFree(copy.dstDevice)); } // Calculate stride-10 copy for 32, 64 and 128-bit word sizes for (unsigned int j = 4; j <= 16; j *= 2) { copy.WidthInBytes = SIZE / 10; copy.Height = 1; copy.srcXInBytes = 0; copy.srcY = 0; copy.dstXInBytes = 0; copy.dstY = 0; CU_ERROR_CHECK(cuMemAllocPitch(©.srcDevice, ©.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuMemAllocPitch(©.dstDevice, ©.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j)); copy.srcPitch *= 10; copy.dstPitch *= 10; CU_ERROR_CHECK(cuEventRecord(start, 0)); for (size_t i = 0; i < ITERATIONS; i++) CU_ERROR_CHECK(cuMemcpy2D(©)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= ITERATIONS * 1.e3f; double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time; fprintf(stdout, "\tstride-10 copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0); CU_ERROR_CHECK(cuMemFree(copy.srcDevice)); CU_ERROR_CHECK(cuMemFree(copy.dstDevice)); } // Calculate stride-1000 copy for 32, 64 and 128-bit word sizes for (unsigned int j = 4; j <= 16; j *= 2) { copy.WidthInBytes = SIZE / 1000; copy.Height = 1; copy.srcXInBytes = 0; copy.srcY = 0; copy.dstXInBytes = 0; copy.dstY = 0; CU_ERROR_CHECK(cuMemAllocPitch(©.srcDevice, ©.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuMemAllocPitch(©.dstDevice, ©.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j)); copy.srcPitch *= 1000; copy.dstPitch *= 1000; CU_ERROR_CHECK(cuEventRecord(start, 0)); for (size_t j = 0; j < ITERATIONS; j++) CU_ERROR_CHECK(cuMemcpy2D(©)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); 
CU_ERROR_CHECK(cuEventSynchronize(stop)); CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= ITERATIONS * 1.e3f; double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time; fprintf(stdout, "\tstride-1000 copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0); CU_ERROR_CHECK(cuMemFree(copy.srcDevice)); CU_ERROR_CHECK(cuMemFree(copy.dstDevice)); } CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); CU_ERROR_CHECK(cuCtxDestroy(context)); } return 0; }
int main(int argc, char * argv[]) { CBlasUplo uplo; size_t n; int d = 0; if (argc < 3 || argc > 4) { fprintf(stderr, "Usage: %s <uplo> <n>\n" "where:\n" " uplo is 'u' or 'U' for CBlasUpper or 'l' or 'L' for CBlasLower\n" " n is the size of the matrix\n" " device is the GPU to use (default 0)\n", argv[0]); return 1; } char u; if (sscanf(argv[1], "%c", &u) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[1]); return 1; } switch (u) { case 'U': case 'u': uplo = CBlasUpper; break; case 'L': case 'l': uplo = CBlasLower; break; default: fprintf(stderr, "Unknown uplo '%c'\n", u); return 1; } if (sscanf(argv[2], "%zu", &n) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[2]); return 2; } if (argc > 3) { if (sscanf(argv[3], "%d", &d) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]); return 3; } } srand(0); double * A, * refA; CUdeviceptr dA; size_t lda, dlda; long info, rInfo; CU_ERROR_CHECK(cuInit(0)); CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device)); CULAPACKhandle handle; CU_ERROR_CHECK(cuLAPACKCreate(&handle)); lda = (n + 1u) & ~1u; if ((A = malloc(lda * n * sizeof(double))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } if ((refA = malloc(lda * n * sizeof(double))) == NULL) { fputs("Unable to allocate refA\n", stderr); return -2; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, n * sizeof(double), n, sizeof(double))); dlda /= sizeof(double); if (dlatmc(n, 2.0, A, lda) != 0) { fputs("Unable to initialise A\n", stderr); return -1; } // dpotrf(uplo, n, A, lda, &info); // if (info != 0) { // fputs("Failed to compute Cholesky decomposition of A\n", stderr); // return (int)info; // } for (size_t j = 0; j < n; j++) memcpy(&refA[j * lda], &A[j * lda], n * sizeof(double)); CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * 
sizeof(double), n * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); dlauum_ref(uplo, n, refA, lda, &rInfo); CU_ERROR_CHECK(cuDlauum(handle, uplo, n, dA, dlda, &info)); copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double), 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double), n * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); bool passed = (info == rInfo); double diff = 0.0; for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < n; i++) { double d = fabs(A[j * lda + i] - refA[j * lda + i]); if (d > diff) diff = d; } } // Set A to identity so that repeated applications of the cholesky // decomposition while benchmarking do not exit early due to // non-positive-definite-ness. for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < n; i++) A[j * lda + i] = (i == j) ? 1.0 : 0.0; } copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double), n * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventRecord(start, NULL)); for (size_t i = 0; i < 20; i++) CU_ERROR_CHECK(cuDlauum(handle, uplo, n, dA, dlda, &info)); CU_ERROR_CHECK(cuEventRecord(stop, NULL)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= 20; CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); const size_t flops = ((n * n * n) / 3) + ((n * n) / 2) + (n / 6); fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e\n%sED!\n", time * 1.e-3f, ((float)flops * 1.e-6f) / time, diff, (passed) ? "PASS" : "FAIL"); free(A); free(refA); CU_ERROR_CHECK(cuMemFree(dA)); CU_ERROR_CHECK(cuLAPACKDestroy(handle)); CU_ERROR_CHECK(cuCtxDestroy(context)); return (int)!passed; }
int main(int argc, char * argv[]) { CBlasUplo uplo; CBlasTranspose trans; size_t n, k; int d = 0; if (argc < 5 || argc > 6) { fprintf(stderr, "Usage: %s <uplo> <trans> <n> <k> [device]\n" "where:\n" " uplo is 'u' or 'U' for CBlasUpper or 'l' or 'L' for CBlasLower\n" " trans are 'n' or 'N' for CBlasNoTrans or 't' or 'T' for CBlasTrans\n" " n and k are the sizes of the matrices\n" " device is the GPU to use (default 0)\n", argv[0]); return 1; } char u; if (sscanf(argv[1], "%c", &u) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[1]); return 1; } switch (u) { case 'U': case 'u': uplo = CBlasUpper; break; case 'L': case 'l': uplo = CBlasLower; break; default: fprintf(stderr, "Unknown uplo '%c'\n", u); return 1; } char t; if (sscanf(argv[2], "%c", &t) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[2]); return 2; } switch (t) { case 'N': case 'n': trans = CBlasNoTrans; break; case 'T': case 't': trans = CBlasTrans; break; case 'C': case 'c': trans = CBlasConjTrans; break; default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 2; } if (sscanf(argv[3], "%zu", &n) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]); return 3; } if (sscanf(argv[4], "%zu", &k) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[4]); return 4; } if (argc > 5) { if (sscanf(argv[5], "%d", &d) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]); return 5; } } srand(0); double alpha, beta, * A, * C, * refC; CUdeviceptr dA, dC; size_t lda, ldc, dlda, dldc; CU_ERROR_CHECK(cuInit(0)); CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device)); CUBLAShandle handle; CU_ERROR_CHECK(cuBLASCreate(&handle)); alpha = (double)rand() / (double)RAND_MAX; beta = (double)rand() / (double)RAND_MAX; if (trans == CBlasNoTrans) { lda = (n + 1u) & ~1u; if ((A = malloc(lda * k * sizeof(double))) == NULL) { fputs("Unable to 
allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, n * sizeof(double), k, sizeof(double))); dlda /= sizeof(double); for (size_t j = 0; j < k; j++) { for (size_t i = 0; i < n; i++) A[j * lda + i] = (double)rand() / (double)RAND_MAX; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double), n * sizeof(double), k }; CU_ERROR_CHECK(cuMemcpy2D(©)); } else { lda = (k + 1u) & ~1u; if ((A = malloc(lda * n * sizeof(double))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, k * sizeof(double), n, sizeof(double))); dlda /= sizeof(double); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < k; i++) A[j * lda + i] = (double)rand() / (double)RAND_MAX; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double), k * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); } ldc = (n + 1u) & ~1u; if ((C = malloc(ldc * n * sizeof(double))) == NULL) { fputs("Unable to allocate C\n", stderr); return -3; } if ((refC = malloc(ldc * n * sizeof(double))) == NULL) { fputs("Unable to allocate refC\n", stderr); return -4; } CU_ERROR_CHECK(cuMemAllocPitch(&dC, &dldc, n * sizeof(double), n, sizeof(double))); dldc /= sizeof(double); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < n; i++) refC[j * ldc + i] = C[j * ldc + i] = (double)rand() / (double)RAND_MAX; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(double), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dC, NULL, dldc * sizeof(double), n * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); dsyrk_ref(uplo, trans, n, k, alpha, A, lda, beta, refC, ldc); CU_ERROR_CHECK(cuDsyrk(handle, uplo, trans, n, k, alpha, dA, dlda, beta, dC, dldc, NULL)); copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dC, NULL, dldc * sizeof(double), 0, 0, 
CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(double), n * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); double diff = 0.0; for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < n; i++) { double d = fabs(C[j * ldc + i] - refC[j * ldc + i]); if (d > diff) diff = d; } } CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventRecord(start, NULL)); for (size_t i = 0; i < 20; i++) CU_ERROR_CHECK(cuDsyrk(handle, uplo, trans, n, k, alpha, dA, dlda, beta, dC, dldc, NULL)); CU_ERROR_CHECK(cuEventRecord(stop, NULL)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= 20; CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); size_t flops = 2 * k - 1; // k multiplies and k - 1 adds per element if (alpha != 1.0) flops += 1; // additional multiply by alpha if (beta != 0.0) flops += 2; // additional multiply and add by beta double error = (double)flops * 2.0 * DBL_EPSILON; // maximum per element error flops *= n * (n + 1) / 2; // n(n + 1) / 2 elements bool passed = (diff <= error); fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e\n%sED!\n", time * 1.e-3f, ((float)flops * 1.e-6f) / time, diff, (passed) ? "PASS" : "FAIL"); free(A); free(C); free(refC); CU_ERROR_CHECK(cuMemFree(dA)); CU_ERROR_CHECK(cuMemFree(dC)); CU_ERROR_CHECK(cuBLASDestroy(handle)); CU_ERROR_CHECK(cuCtxDestroy(context)); return (int)!passed; }
int main(int argc, char * argv[]) { CBlasSide side; CBlasUplo uplo; CBlasTranspose trans; CBlasDiag diag; size_t m, n; int d = 0; if (argc < 7 || argc > 8) { fprintf(stderr, "Usage: %s <side> <uplo> <trans> <diag> <m> <n> [device]\n" "where:\n" " side is 'l' or 'L' for CBlasLeft and 'r' or 'R' for CBlasRight\n" " uplo is 'u' or 'U' for CBlasUpper and 'l' or 'L' for CBlasLower\n" " trans is 'n' or 'N' for CBlasNoTrans, 't' or 'T' for CBlasTrans or 'c' or 'C' for CBlasConjTrans\n" " diag is 'n' or 'N' for CBlasNonUnit and 'u' or 'U' for CBlasUnit\n" " m and n are the sizes of the matrices\n" " device is the GPU to use (default 0)\n", argv[0]); return 1; } char s; if (sscanf(argv[1], "%c", &s) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[1]); return 1; } switch (s) { case 'L': case 'l': side = CBlasLeft; break; case 'R': case 'r': side = CBlasRight; break; default: fprintf(stderr, "Unknown side '%c'\n", s); return 1; } char u; if (sscanf(argv[2], "%c", &u) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[2]); return 2; } switch (u) { case 'U': case 'u': uplo = CBlasUpper; break; case 'L': case 'l': uplo = CBlasLower; break; default: fprintf(stderr, "Unknown uplo '%c'\n", u); return 2; } char t; if (sscanf(argv[3], "%c", &t) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[3]); return 3; } switch (t) { case 'N': case 'n': trans = CBlasNoTrans; break; case 'T': case 't': trans = CBlasTrans; break; case 'C': case 'c': trans = CBlasConjTrans; break; default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 3; } char di; if (sscanf(argv[4], "%c", &di) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[4]); return 4; } switch (di) { case 'N': case 'n': diag = CBlasNonUnit; break; case 'U': case 'u': diag = CBlasUnit; break; default: fprintf(stderr, "Unknown diag '%c'\n", t); return 4; } if (sscanf(argv[5], "%zu", &m) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]); 
return 5; } if (sscanf(argv[6], "%zu", &n) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[6]); return 6; } if (argc > 7) { if (sscanf(argv[7], "%d", &d) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[7]); return 7; } } srand(0); double complex alpha, * A, * B, * refB; CUdeviceptr dA, dB, dX; size_t lda, ldb, dlda, dldb, dldx; CU_ERROR_CHECK(cuInit(0)); CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device)); CUBLAShandle handle; CU_ERROR_CHECK(cuBLASCreate(&handle)); alpha = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I; if (side == CBlasLeft) { lda = m; if ((A = malloc(lda * m * sizeof(double complex))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, m * sizeof(double complex), m, sizeof(double complex))); dlda /= sizeof(double complex); for (size_t j = 0; j < m; j++) { for (size_t i = 0; i < m; i++) A[j * lda + i] = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double complex), m * sizeof(double complex), m }; CU_ERROR_CHECK(cuMemcpy2D(©)); } else { lda = n; if ((A = malloc(lda * n * sizeof(double complex))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, n * sizeof(double complex), n, sizeof(double complex))); dlda /= sizeof(double complex); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < n; i++) A[j * lda + i] = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double complex), n * sizeof(double complex), n }; 
CU_ERROR_CHECK(cuMemcpy2D(©)); } ldb = m; if ((B = malloc(ldb * n * sizeof(double complex))) == NULL) { fputs("Unable to allocate B\n", stderr); return -3; } if ((refB = malloc(ldb * n * sizeof(double complex))) == NULL) { fputs("Unable to allocate refB\n", stderr); return -4; } CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, m * sizeof(double complex), n, sizeof(double complex))); dldb /= sizeof(double complex); CU_ERROR_CHECK(cuMemAllocPitch(&dX, &dldx, m * sizeof(double complex), n, sizeof(double complex))); dldx /= sizeof(double complex); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < m; i++) refB[j * ldb + i] = B[j * ldb + i] = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(double complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(double complex), m * sizeof(double complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); ztrmm_ref(side, uplo, trans, diag, m, n, alpha, A, lda, refB, ldb); CU_ERROR_CHECK(cuZtrmm2(handle, side, uplo, trans, diag, m, n, alpha, dA, dlda, dB, dldb, dX, dldx, NULL)); copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dX, NULL, dldx * sizeof(double complex), 0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(double complex), m * sizeof(double complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); bool passed = true; double rdiff = 0.0, idiff = 0.0; for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < m; i++) { double d = fabs(creal(B[j * ldb + i]) - creal(refB[j * ldb + i])); if (d > rdiff) rdiff = d; double c = fabs(cimag(B[j * ldb + i]) - cimag(refB[j * ldb + i])); if (c > idiff) idiff = c; size_t flops; if (side == CBlasLeft) flops = 2 * i + 1; else flops = 2 * j + 1; if (diag == CBlasNonUnit) flops++; flops *= 3; if (d > (double)flops * 2.0 * DBL_EPSILON || c > (double)flops * 2.0 * DBL_EPSILON) passed = false; } } CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC)); 
CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventRecord(start, NULL)); for (size_t i = 0; i < 20; i++) CU_ERROR_CHECK(cuZtrmm2(handle, side, uplo, trans, diag, m, n, alpha, dA, dlda, dB, dldb, dX, dldx, NULL)); CU_ERROR_CHECK(cuEventRecord(stop, NULL)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= 20; CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); const size_t flops = (side == CBlasLeft) ? (6 * (n * m * (m + 1) / 2) + 2 * (n * m * (m - 1) / 2)) : (6 * (m * n * (n + 1) / 2) + 2 * (m * n * (n - 1) / 2)); fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e + %.3ei\n%sED!\n", time * 1.e-3f, ((float)flops * 1.e-6f) / time, rdiff, idiff, (passed) ? "PASS" : "FAIL"); free(A); free(B); free(refB); CU_ERROR_CHECK(cuMemFree(dA)); CU_ERROR_CHECK(cuMemFree(dB)); CU_ERROR_CHECK(cuMemFree(dX)); CU_ERROR_CHECK(cuBLASDestroy(handle)); CU_ERROR_CHECK(cuCtxDestroy(context)); return (int)!passed; }
int main(int argc, char* argv[]) { //int iTest = 2896; //while (iTest < 0x7fff) //{ // int iResult = iTest * iTest; // float fTest = (float)iTest; // int fResult = (int)(fTest * fTest); // printf("i*i:%08x f*f:%08x\n", iResult, fResult); // iTest += 0x0800; //} //exit(0); char deviceName[32]; int devCount, ordinal, major, minor; CUdevice hDevice; // Initialize the Driver API and find a device CUDA_CHECK( cuInit(0) ); CUDA_CHECK( cuDeviceGetCount(&devCount) ); for (ordinal = 0; ordinal < devCount; ordinal++) { CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) ); CUDA_CHECK( cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) ); CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice) ); CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) ); if (major >= 5 && minor >= 2) { printf("Using: Id:%d %s (%d.%d)\n\n", ordinal, deviceName, major, minor); break; } } if (ordinal == devCount) { printf("No compute 5.0 device found, exiting.\n"); exit(EXIT_FAILURE); } // First command line arg is the type: internal (CS2R) or external (cuEventElapsedTime) timing int internalTiming = 1; if (argc > 1) internalTiming = strcmp(argv[1], "i") == 0 ? 
1 : 0; // Second command line arg is the number of blocks int blocks = 1; if (argc > 2) blocks = atoi(argv[2]); if (blocks < 1) blocks = 1; // Third command line arg is the number of threads int threads = 128; if (argc > 3) threads = atoi(argv[3]); if (threads > 1024 || threads < 32) threads = 128; threads &= -32; // Forth command line arg: double fops = 1.0; int lanes = 1; if (argc > 4) { if (internalTiming) { // The number of lanes to print for each warp lanes = atoi(argv[4]); if (lanes > 32 || lanes < 1) lanes = 1; } else // The number of floating point operations in a full kernel launch fops = atof(argv[4]); } // Fifth command line arg is the repeat count for benchmarking int repeat = 1; if (argc > 5) repeat = atoi(argv[5]); if (repeat > 1000 || repeat < 1) repeat = 1; // threads = total number of threads size_t size = sizeof(int) * threads * blocks; // Setup our input and output buffers int* dataIn = (int*)malloc(size); int* dataOut = (int*)malloc(size); int* clocks = (int*)malloc(size); memset(dataIn, 0, size); CUmodule hModule; CUfunction hKernel; CUevent hStart, hStop; CUdeviceptr devIn, devOut, devClocks; // Init our context and device memory buffers CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) ); CUDA_CHECK( cuMemAlloc(&devIn, size) ); CUDA_CHECK( cuMemAlloc(&devOut, size) ); CUDA_CHECK( cuMemAlloc(&devClocks, size) ); CUDA_CHECK( cuMemcpyHtoD(devIn, dataIn, size) ); CUDA_CHECK( cuMemsetD8(devOut, 0, size) ); CUDA_CHECK( cuMemsetD8(devClocks, 0, size) ); CUDA_CHECK( cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) ); CUDA_CHECK( cuEventCreate(&hStop, CU_EVENT_BLOCKING_SYNC) ); // Load our kernel CUDA_CHECK( cuModuleLoad(&hModule, "microbench.cubin") ); CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, "microbench") ); // Setup the params void* params[] = { &devOut, &devClocks, &devIn }; float ms = 0; // Warm up the clock (unless under nsight) if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER for (int i = 0; i < repeat; i++) 
CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) ); // Launch the kernel CUDA_CHECK( cuEventRecord(hStart, NULL) ); //CUDA_CHECK( cuProfilerStart() ); CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) ); //CUDA_CHECK( cuProfilerStop() ); CUDA_CHECK( cuEventRecord(hStop, NULL) ); CUDA_CHECK( cuEventSynchronize(hStop) ); CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) ); //CUDA_CHECK( cuCtxSynchronize() ); // Get back our results from each kernel CUDA_CHECK( cuMemcpyDtoH(dataOut, devOut, size) ); CUDA_CHECK( cuMemcpyDtoH(clocks, devClocks, size) ); // Cleanup and shutdown of cuda CUDA_CHECK( cuEventDestroy(hStart) ); CUDA_CHECK( cuEventDestroy(hStop) ); CUDA_CHECK( cuModuleUnload(hModule) ); CUDA_CHECK( cuMemFree(devIn) ); CUDA_CHECK( cuMemFree(devOut) ); CUDA_CHECK( cuMemFree(devClocks) ); CUDA_CHECK( cuCtxDestroy(hContext) ); hContext = 0; // When using just one block, print out the internal timing data if (internalTiming) { int count = 0, total = 0, min = 999999, max = 0; int* clocks_p = clocks; int* dataOut_p = dataOut; // Loop over and print results for (int blk = 0; blk < blocks; blk++) { float *fDataOut = reinterpret_cast<float*>(dataOut_p); for(int tid = 0; tid < threads; tid += 32) { // Sometimes we want data on each thread, sometimes just one sample per warp is fine for (int lane = 0; lane < lanes; lane++) printf("b:%02d w:%03d t:%04d l:%02d clocks:%08d out:%08x\n", blk, tid/32, tid, lane, clocks_p[tid+lane], dataOut_p[tid+lane]); // %04u count++; total += clocks_p[tid]; if (clocks_p[tid] < min) min = clocks_p[tid]; if (clocks_p[tid] > max) max = clocks_p[tid]; } clocks_p += threads; dataOut_p += threads; } printf("average: %.3f, min %d, max: %d\n", (float)total/count, min, max); } else { // For more than one block we're testing throughput and want external timing data printf("MilliSecs: %.3f, GFLOPS: %.3f\n", ms, fops / (ms * 1000000.0)); } // And free up host memory free(dataIn); 
free(dataOut); free(clocks); return 0; }
void device_t<CUDA>::waitFor(tag tag_){ cuEventSynchronize(tag_.cuEvent); }
/*
 * Virtqueue handler for the virtio-CUDA passthrough device.
 *
 * Pops each request element off the queue; out_sg[0] carries a
 * struct param describing which CUDA Driver API call the guest wants
 * (p->syscall_type) plus its arguments.  The call is executed on the
 * host, its status is written back into p->result, the element is
 * pushed back exactly once, and the guest is notified at the end.
 *
 * Large buffers arrive split into page-sized chunks (p->nr_pages pairs
 * of data/size scatter-gather entries); the memcpy cases walk those
 * chunks and copy them at increasing device offsets.
 */
static void vq_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtQueueElement elem;

    while (virtqueue_pop(vq, &elem)) {
        struct param *p = elem.out_sg[0].iov_base;

        /* For all library routines: get the required arguments from the
         * buffer, execute, and push the results back into the virtqueue. */
        switch (p->syscall_type) {
        case CUINIT: {
            p->result = cuInit(p->flags);
            break;
        }
        case CUDRIVERGETVERSION: {
            p->result = cuDriverGetVersion(&p->val1);
            break;
        }
        case CUDEVICEGETCOUNT: {
            p->result = cuDeviceGetCount(&p->val1);
            break;
        }
        case CUDEVICEGET: {
            p->result = cuDeviceGet(&p->device, p->val1);
            break;
        }
        case CUDEVICECOMPUTECAPABILITY: {
            p->result = cuDeviceComputeCapability(&p->val1, &p->val2, p->device);
            break;
        }
        case CUDEVICEGETNAME: {
            p->result = cuDeviceGetName(elem.in_sg[0].iov_base, p->val1, p->device);
            break;
        }
        case CUDEVICEGETATTRIBUTE: {
            p->result = cuDeviceGetAttribute(&p->val1, p->attrib, p->device);
            break;
        }
        case CUCTXCREATE: {
            p->result = cuCtxCreate(&p->ctx, p->flags, p->device);
            break;
        }
        case CUCTXDESTROY: {
            p->result = cuCtxDestroy(p->ctx);
            break;
        }
        case CUCTXGETCURRENT: {
            p->result = cuCtxGetCurrent(&p->ctx);
            break;
        }
        case CUCTXGETDEVICE: {
            p->result = cuCtxGetDevice(&p->device);
            break;
        }
        case CUCTXPOPCURRENT: {
            p->result = cuCtxPopCurrent(&p->ctx);
            break;
        }
        case CUCTXSETCURRENT: {
            p->result = cuCtxSetCurrent(p->ctx);
            break;
        }
        case CUCTXSYNCHRONIZE: {
            p->result = cuCtxSynchronize();
            break;
        }
        case CUMODULELOAD: {
            /* Hardcoded path - needs improvement.  All .cubin files should be
             * stored in $QEMU_NFS_PATH - currently $QEMU_NFS_PATH is shared
             * between host and guest with NFS. */
            const char *nfs_path = getenv("QEMU_NFS_PATH");
            if (nfs_path == NULL) {
                /* robustness: avoid strlen(NULL) crash when the variable is unset */
                p->result = 999;
                break;
            }
            /* BUG FIX: the size expression used to be
             *   strlen(sg) + strlen(getenv("QEMU_NFS_PATH") + 1)
             * - the misplaced parenthesis both skipped the path's first
             * character and left no room for the terminating NUL, so the
             * buffer was undersized and strcpy/strcat overflowed it. */
            char *binname = malloc(strlen((char *)elem.out_sg[1].iov_base) + strlen(nfs_path) + 1);
            if (!binname) {
                /* BUG FIX: previously reported 0 (CUDA_SUCCESS) on allocation
                 * failure, and pushed the element here as well as after the
                 * switch (a double push).  Report the error sentinel and let
                 * the single push after the switch handle the reply. */
                p->result = 999;
                break;
            }
            strcpy(binname, nfs_path);
            strcat(binname, (char *)elem.out_sg[1].iov_base);

            /* Change current CUDA context: each CUDA context has its own
             * virtual memory space - isolation is ensured by switching
             * contexts. */
            if (cuCtxSetCurrent(p->ctx) != 0) {
                free(binname);   /* BUG FIX: was leaked on this path */
                p->result = 999;
                break;
            }
            p->result = cuModuleLoad(&p->module, binname);
            free(binname);
            break;
        }
        case CUMODULEGETGLOBAL: {
            /* Copy the symbol name out of the guest buffer.  Sizing the copy
             * by strlen+1 (instead of a fixed 100 bytes) avoids overflowing
             * the buffer on long names; assumes the guest NUL-terminates the
             * string, as the original strcpy already did. */
            char *name = malloc(strlen((char *)elem.out_sg[1].iov_base) + 1);
            if (!name) {
                p->result = 999;
                break;
            }
            strcpy(name, (char *)elem.out_sg[1].iov_base);
            p->result = cuModuleGetGlobal(&p->dptr, &p->size1, p->module, (const char *)name);
            free(name);   /* BUG FIX: was leaked */
            break;
        }
        case CUMODULEUNLOAD: {
            p->result = cuModuleUnload(p->module);
            break;
        }
        case CUMEMALLOC: {
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAlloc(&p->dptr, p->bytesize);
            break;
        }
        case CUMEMALLOCPITCH: {
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAllocPitch(&p->dptr, &p->size3, p->size1, p->size2, p->bytesize);
            break;
        }
        /* Large buffers are allocated in smaller chunks in guest kernel
         * space; get each chunk separately and copy it to device memory. */
        case CUMEMCPYHTOD: {
            int i;
            size_t offset;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            offset = 0;
            for (i = 0; i < nr_pages; i++) {
                /* out_sg[1+2i] is the data chunk, out_sg[1+2i+1] its size */
                s = *(long *)elem.out_sg[1 + 2 * i + 1].iov_base;
                p->result = cuMemcpyHtoD(p->dptr + offset, elem.out_sg[1 + 2 * i].iov_base, s);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYHTODASYNC: {
            int i;
            size_t offset;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.out_sg[1 + 2 * i + 1].iov_base;
                p->result = cuMemcpyHtoDAsync(p->dptr + offset, elem.out_sg[1 + 2 * i].iov_base, s, p->stream);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTODASYNC: {
            p->result = cuMemcpyDtoDAsync(p->dptr, p->dptr1, p->size1, p->stream);
            break;
        }
        case CUMEMCPYDTOH: {
            int i;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            size_t offset = 0;
            for (i = 0; i < nr_pages; i++) {
                /* in_sg[2i] receives the data chunk, in_sg[2i+1] holds its size */
                s = *(long *)elem.in_sg[0 + 2 * i + 1].iov_base;
                p->result = cuMemcpyDtoH(elem.in_sg[0 + 2 * i].iov_base, p->dptr + offset, s);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTOHASYNC: {
            int i;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            size_t offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.in_sg[0 + 2 * i + 1].iov_base;
                p->result = cuMemcpyDtoHAsync(elem.in_sg[0 + 2 * i].iov_base, p->dptr + offset, s, p->stream);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMSETD32: {
            p->result = cuMemsetD32(p->dptr, p->bytecount, p->bytesize);
            break;
        }
        case CUMEMFREE: {
            p->result = cuMemFree(p->dptr);
            break;
        }
        case CUMODULEGETFUNCTION: {
            char *name = (char *)elem.out_sg[1].iov_base;
            name[p->length] = '\0';   /* guest supplies the length; terminate in place */
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuModuleGetFunction(&p->function, p->module, name);
            break;
        }
        case CULAUNCHKERNEL: {
            /* p->val1 kernel parameters follow in out_sg[1..val1]. */
            void **args = malloc(p->val1 * sizeof(void *));
            if (!args) {
                p->result = 9999;
                break;
            }
            int i;
            for (i = 0; i < p->val1; i++) {
                args[i] = elem.out_sg[1 + i].iov_base;
            }
            if (cuCtxSetCurrent(p->ctx) != 0) {
                free(args);   /* BUG FIX: was leaked on this path */
                p->result = 999;
                break;
            }
            p->result = cuLaunchKernel(p->function,
                                       p->gridDimX, p->gridDimY, p->gridDimZ,
                                       p->blockDimX, p->blockDimY, p->blockDimZ,
                                       p->bytecount, 0, args, 0);
            free(args);
            break;
        }
        case CUEVENTCREATE: {
            p->result = cuEventCreate(&p->event1, p->flags);
            break;
        }
        case CUEVENTDESTROY: {
            p->result = cuEventDestroy(p->event1);
            break;
        }
        case CUEVENTRECORD: {
            p->result = cuEventRecord(p->event1, p->stream);
            break;
        }
        case CUEVENTSYNCHRONIZE: {
            p->result = cuEventSynchronize(p->event1);
            break;
        }
        case CUEVENTELAPSEDTIME: {
            p->result = cuEventElapsedTime(&p->pMilliseconds, p->event1, p->event2);
            break;
        }
        case CUSTREAMCREATE: {
            p->result = cuStreamCreate(&p->stream, 0);
            break;
        }
        case CUSTREAMSYNCHRONIZE: {
            p->result = cuStreamSynchronize(p->stream);
            break;
        }
        case CUSTREAMQUERY: {
            p->result = cuStreamQuery(p->stream);
            break;
        }
        case CUSTREAMDESTROY: {
            p->result = cuStreamDestroy(p->stream);
            break;
        }
        default:
            printf("Unknown syscall_type\n");
        }

        /* Exactly one push per popped element. */
        virtqueue_push(vq, &elem, 0);
    }

    /* Notify frontend - trigger virtual interrupt. */
    virtio_notify(vdev, vq);
    return;
}