// Drains the start/stop CUDA events recorded for one profiled operation:
// waits for each event, converts its time offset from the session timer
// event (mTimer) into the manager's time format, appends tagged
// start/stop records to the in-memory profile buffer, destroys the
// per-operation events, widens the frame's time window, and returns the
// operation's elapsed time (units as reported by cuEventElapsedTime,
// i.e. milliseconds).
float ApexCudaProfileSession::flushProfileInfo(ProfileData& pd) {
    CUevent start = (CUevent)pd.start;
    CUevent stop = (CUevent)pd.stop;
    uint32_t op = 1; // record tag: 1 = operation start
    float startTf = 0.f, stopTf = 0.f;
    uint64_t startT = 0, stopT = 0;

    // Wait for the start event, then measure its offset from the session
    // timer event.
    CUT_SAFE_CALL(cuEventSynchronize(start));
    CUT_SAFE_CALL(cuEventElapsedTime(&startTf, (CUevent)mTimer, start));
    startT = static_cast<uint64_t>(startTf * mManager->mTimeFormat);
    // Record layout: tag, timestamp, operation id.
    mMemBuf.write(&op, sizeof(op));
    mMemBuf.write(&startT, sizeof(startT));
    mMemBuf.write(&pd.id, sizeof(pd.id));

    op = 2; // record tag: 2 = operation stop
    CUT_SAFE_CALL(cuEventSynchronize((CUevent)stop));
    CUT_SAFE_CALL(cuEventElapsedTime(&stopTf, (CUevent)mTimer, (CUevent)stop));
    stopT = static_cast<uint64_t>(stopTf * mManager->mTimeFormat);
    mMemBuf.write(&op, sizeof(op));
    mMemBuf.write(&stopT, sizeof(stopT));
    mMemBuf.write(&pd.id, sizeof(pd.id));

    // The per-operation events are single-use; release them now.
    CUT_SAFE_CALL(cuEventDestroy((CUevent)start));
    CUT_SAFE_CALL(cuEventDestroy((CUevent)stop));

    // Track the overall frame time window across all flushed operations.
    mFrameStart = PxMin(mFrameStart, startTf);
    mFrameFinish = PxMax(mFrameFinish, stopTf);
    return stopTf - startTf;
}
// Tears down the global CUDA state (timing events, loaded module, and the
// context if we created it). Everything is guarded by cuda_ctx_ptr, so it
// is safe to call even when no CUDA work was ever issued. Resources are
// released in dependency order: events and module first, then the context
// that owns them.
WEAK void halide_release(void *user_context) {
    // Do not do any of this if there is not context set. E.g.
    // if halide_release is called and no CUDA calls have been made.
    if (cuda_ctx_ptr != NULL) {
        // It's possible that this is being called from the destructor of
        // a static variable, in which case the driver may already be
        // shutting down. For this reason we allow the deinitialized
        // error.
        CHECK_CALL_DEINIT_OK( cuCtxSynchronize(), "cuCtxSynchronize on exit" );

        // Destroy the events
        if (__start) {
            cuEventDestroy(__start);
            cuEventDestroy(__end);
            __start = __end = 0;
        }

        // Unload the module
        if (__mod) {
            CHECK_CALL_DEINIT_OK( cuModuleUnload(__mod), "cuModuleUnload" );
            __mod = 0;
        }

        // Only destroy the context if we own it
        if (weak_cuda_ctx) {
            CHECK_CALL_DEINIT_OK( cuCtxDestroy(weak_cuda_ctx), "cuCtxDestroy on exit" );
            weak_cuda_ctx = 0;
        }

        cuda_ctx_ptr = NULL;
    }

    //CHECK_CALL( cuCtxPopCurrent(&ignore), "cuCtxPopCurrent" );
}
// Tears down the global CUDA state.
//
// Fix: the events and the module must be released BEFORE the context they
// were created in is destroyed. The original destroyed the context first,
// leaving the subsequent cuEventDestroy/cuModuleUnload calls operating on
// a dead context (the ordering used by the other halide_release variants
// in this file).
WEAK void halide_release() {
    // CUcontext ignore;

    // TODO: this is for timing; bad for release-mode performance
    CHECK_CALL( cuCtxSynchronize(), "cuCtxSynchronize on exit" );

    // Destroy the events
    if (__start) {
        cuEventDestroy(__start);
        cuEventDestroy(__end);
        __start = __end = 0;
    }

    // Unload the module
    if (__mod) {
        CHECK_CALL( cuModuleUnload(__mod), "cuModuleUnload" );
        __mod = 0;
    }

    // Only destroy the context if we own it — and only after everything
    // that lives inside it has been released.
    if (weak_cuda_ctx) {
        CHECK_CALL( cuCtxDestroy(weak_cuda_ctx), "cuCtxDestroy on exit" );
        weak_cuda_ctx = 0;
    }

    //CHECK_CALL( cuCtxPopCurrent(&ignore), "cuCtxPopCurrent" );
}
// Tears down the global CUDA state.
//
// Fix: destroy the events and unload the module BEFORE destroying the
// context that owns them. The original destroyed the context first, so
// the later cuEventDestroy/cuModuleUnload calls referred to objects of an
// already-destroyed context (the variant taking user_context in this file
// uses the corrected order).
WEAK void halide_release() {
    // It's possible that this is being called from the destructor of
    // a static variable, in which case the driver may already be
    // shutting down. For this reason we allow the deinitialized
    // error.
    CHECK_CALL_DEINIT_OK( cuCtxSynchronize(), "cuCtxSynchronize on exit" );

    // Destroy the events
    if (__start) {
        cuEventDestroy(__start);
        cuEventDestroy(__end);
        __start = __end = 0;
    }

    // Unload the module
    if (__mod) {
        CHECK_CALL_DEINIT_OK( cuModuleUnload(__mod), "cuModuleUnload" );
        __mod = 0;
    }

    // Only destroy the context if we own it — after its events and module
    // have been released.
    if (weak_cuda_ctx) {
        CHECK_CALL_DEINIT_OK( cuCtxDestroy(weak_cuda_ctx), "cuCtxDestroy on exit" );
        weak_cuda_ctx = 0;
    }

    //CHECK_CALL( cuCtxPopCurrent(&ignore), "cuCtxPopCurrent" );
}
/* Releases a gpudata descriptor: destroys its read/write events inside the
 * owning context, then scrubs and frees the host-side structure. */
static void deallocate(gpudata *d) {
  cuda_context *owner = d->ctx;

  cuda_enter(owner);
  cuEventDestroy(d->rev);
  cuEventDestroy(d->wev);
  cuda_exit(owner);

  CLEAR(d);
  free(d);
}
/** * This measures the overhead in launching a kernel function on each GPU in the * system. * * It does this by executing a small kernel (copying 1 value in global memory) a * very large number of times and taking the average execution time. This * program uses the CUDA driver API. */ int main() { CU_ERROR_CHECK(cuInit(0)); int count; CU_ERROR_CHECK(cuDeviceGetCount(&count)); float x = 5.0f; for (int d = 0; d < count; d++) { CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, 0, device)); CUdeviceptr in, out; CU_ERROR_CHECK(cuMemAlloc(&in, sizeof(float))); CU_ERROR_CHECK(cuMemAlloc(&out, sizeof(float))); CU_ERROR_CHECK(cuMemcpyHtoD(in, &x, sizeof(float))); CUmodule module; CU_ERROR_CHECK(cuModuleLoadData(&module, imageBytes)); CUfunction function; CU_ERROR_CHECK(cuModuleGetFunction(&function, module, "kernel")); void * params[] = { &in, &out }; CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, 0)); CU_ERROR_CHECK(cuEventCreate(&stop, 0)); CU_ERROR_CHECK(cuEventRecord(start, 0)); for (int i = 0; i < ITERATIONS; i++) CU_ERROR_CHECK(cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); CU_ERROR_CHECK(cuMemFree(in)); CU_ERROR_CHECK(cuMemFree(out)); fprintf(stdout, "Device %d: %fms\n", d, (time / (double)ITERATIONS)); CU_ERROR_CHECK(cuModuleUnload(module)); CU_ERROR_CHECK(cuCtxDestroy(context)); } return 0; }
// Releases the session's timer event, if one was ever created.
ApexCudaProfileSession::~ApexCudaProfileSession() {
    if (!mTimer)
        return;
    CUT_SAFE_CALL(cuEventDestroy((CUevent)mTimer));
}
void GPUInterface::ResizeStreamCount(int newStreamCount) { #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tEntering GPUInterface::ResizeStreamCount\n"); #endif SAFE_CUDA(cuCtxPushCurrent(cudaContext)); SAFE_CUDA(cuCtxSynchronize()); if (cudaStreams != NULL) { for(int i=0; i<numStreams; i++) { if (cudaStreams[i] != NULL) SAFE_CUDA(cuStreamDestroy(cudaStreams[i])); } free(cudaStreams); } if (cudaEvents != NULL) { for(int i=0; i<numStreams; i++) { if (cudaEvents[i] != NULL) SAFE_CUDA(cuEventDestroy(cudaEvents[i])); } free(cudaEvents); } if (newStreamCount == 1) { numStreams = 1; cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams); cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1)); cudaStreams[0] = NULL; CUevent event; for(int i=0; i<2; i++) { SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); cudaEvents[i] = event; } } else { numStreams = newStreamCount; if (numStreams > BEAGLE_STREAM_COUNT) { numStreams = BEAGLE_STREAM_COUNT; } cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams); CUstream stream; cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1)); CUevent event; for(int i=0; i<numStreams; i++) { SAFE_CUDA(cuStreamCreate(&stream, CU_STREAM_DEFAULT)); cudaStreams[i] = stream; SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); cudaEvents[i] = event; } SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING)); cudaEvents[numStreams] = event; } SAFE_CUDA(cuCtxPopCurrent(&cudaContext)); #ifdef BEAGLE_DEBUG_FLOW fprintf(stderr,"\t\t\tLeaving GPUInterface::ResizeStreamCount\n"); #endif }
// Destroys every CUDA event this timer accumulated over its lifetime.
AsyncTimer::~AsyncTimer() {
    const size_t count = m_entries.size();
    for (size_t i = 0; i < count; ++i)
        cuEventDestroy((CUevent)m_entries[i]);
}
static gpudata *cuda_alloc(void *c, size_t size, void *data, int flags, int *ret) { gpudata *res; cuda_context *ctx = (cuda_context *)c; int fl = CU_EVENT_DISABLE_TIMING; if ((flags & GA_BUFFER_INIT) && data == NULL) FAIL(NULL, GA_VALUE_ERROR); if ((flags & (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY)) == (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY)) FAIL(NULL, GA_VALUE_ERROR); /* TODO: figure out how to make this work */ if (flags & GA_BUFFER_HOST) FAIL(NULL, GA_DEVSUP_ERROR); res = malloc(sizeof(*res)); if (res == NULL) FAIL(NULL, GA_SYS_ERROR); res->refcnt = 1; res->sz = size; res->flags = flags & (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY); cuda_enter(ctx); if (ctx->err != CUDA_SUCCESS) { free(res); FAIL(NULL, GA_IMPL_ERROR); } if (ctx->flags & GA_CTX_MULTI_THREAD) fl |= CU_EVENT_BLOCKING_SYNC; ctx->err = cuEventCreate(&res->ev, fl); if (ctx->err != CUDA_SUCCESS) { free(res); cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } if (size == 0) size = 1; ctx->err = cuMemAlloc(&res->ptr, size); if (ctx->err != CUDA_SUCCESS) { cuEventDestroy(res->ev); free(res); cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } res->ctx = ctx; ctx->refcnt++; if (flags & GA_BUFFER_INIT) { ctx->err = cuMemcpyHtoD(res->ptr, data, size); if (ctx->err != CUDA_SUCCESS) { cuda_free(res); FAIL(NULL, GA_IMPL_ERROR) }
void CudaModule::staticDeinit(void) { if (!s_inited) { return; } s_inited = false; if (s_startEvent) { checkError("cuEventDestroy", cuEventDestroy(s_startEvent)); } s_startEvent = NULL; if (s_endEvent) { checkError("cuEventDestroy", cuEventDestroy(s_endEvent)); } s_endEvent = NULL; if (s_context) { checkError("cuCtxDestroy", cuCtxDestroy(s_context)); } s_context = NULL; s_device = 0; }
int main() { CUresult result; result = cuInit(0); CUdevice device; result = cuDeviceGet(&device, 0); CUcontext ctx; result = cuCtxCreate(&ctx, 0, device); CUmodule module; result = cuModuleLoad(&module, "cuda-shift-throughput.cubin"); CUfunction kernel; result = cuModuleGetFunction(&kernel, module, "kernel"); int block; result = cuFuncGetAttribute(&block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel); int grid = 1024 * 1024; CUevent event[2]; for (ptrdiff_t i = 0; i < 2; ++i) { result = cuEventCreate(&event[i], 0); } result = cuEventRecord(event[0], 0); result = cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, 0, 0, 0); result = cuEventRecord(event[1], 0); result = cuEventSynchronize(event[1]); float time; result = cuEventElapsedTime(&time, event[0], event[1]); int gpuclock; result = cuDeviceGetAttribute(&gpuclock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device); int gpump; result = cuDeviceGetAttribute(&gpump, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device); std::printf("Clock: %d KHz, # of MPs: %d\n", gpuclock, gpump); std::printf("Elapsed Time: %f milliseconds\n", time); std::printf("# of Threads: %d, # of SHLs : %lld\n", block, 1024ll * block * grid); std::printf("Throughput: %f\n", 1024.0 * block * grid / ((double) gpump * gpuclock * time)); for (ptrdiff_t i = 0; i < 2; ++i) { result = cuEventDestroy(event[i]); } result = cuModuleUnload(module); result = cuCtxDestroy(ctx); return 0; }
/* Wraps an existing device pointer in a freshly allocated gpudata
 * descriptor, creating its read/write ordering events inside ctx.
 * Returns NULL on allocation or driver failure (the driver error is left
 * in ctx->err); the caller retains ownership of ptr in that case. */
static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size) {
  int evflags = CU_EVENT_DISABLE_TIMING;
  gpudata *gd;

  gd = malloc(sizeof(*gd));
  if (gd == NULL)
    return NULL;

  gd->refcnt = 0;
  gd->sz = size;
  gd->flags = 0;

  cuda_enter(ctx);
  /* CU_EVENT_BLOCKING_SYNC makes waits block instead of busy-spin when the
     context is shared between host threads. */
  if (ctx->flags & GA_CTX_MULTI_THREAD)
    evflags |= CU_EVENT_BLOCKING_SYNC;

  ctx->err = cuEventCreate(&gd->rev, evflags);
  if (ctx->err != CUDA_SUCCESS) {
    cuda_exit(ctx);
    free(gd);
    return NULL;
  }

  ctx->err = cuEventCreate(&gd->wev, evflags);
  if (ctx->err != CUDA_SUCCESS) {
    /* Undo the first event before bailing out. */
    cuEventDestroy(gd->rev);
    cuda_exit(ctx);
    free(gd);
    return NULL;
  }
  cuda_exit(ctx);

  gd->ptr = ptr;
  gd->next = NULL;
  gd->ctx = ctx;
  TAG_BUF(gd);
  return gd;
}
int main(int argc, char * argv[]) { CBlasTranspose transA, transB; size_t m, n, k; int d = 0; if (argc < 6 || argc > 7) { fprintf(stderr, "Usage: %s <transA> <transB> <m> <n> <k> [device]\n" "where:\n" " transA and transB are 'n' or 'N' for CBlasNoTrans, 't' or 'T' for CBlasTrans or 'c' or 'C' for CBlasConjTrans\n" " m, n and k are the sizes of the matrices\n" " device is the GPU to use (default 0)\n", argv[0]); return 1; } char t; if (sscanf(argv[1], "%c", &t) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[1]); return 1; } switch (t) { case 'N': case 'n': transA = CBlasNoTrans; break; case 'T': case 't': transA = CBlasTrans; break; case 'C': case 'c': transA = CBlasConjTrans; break; default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 1; } if (sscanf(argv[2], "%c", &t) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[2]); return 2; } switch (t) { case 'N': case 'n': transB = CBlasNoTrans; break; case 'T': case 't': transB = CBlasTrans; break; case 'C': case 'c': transB = CBlasConjTrans; break; default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 1; } if (sscanf(argv[3], "%zu", &m) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]); return 3; } if (sscanf(argv[4], "%zu", &n) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[4]); return 4; } if (sscanf(argv[5], "%zu", &k) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]); return 5; } if (argc > 6) { if (sscanf(argv[6], "%d", &d) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[6]); return 6; } } srand(0); float complex alpha, beta, * A, * B, * C, * refC; CUdeviceptr dA, dB, dC, dD; size_t lda, ldb, ldc, dlda, dldb, dldc, dldd; CU_ERROR_CHECK(cuInit(0)); CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device)); CUBLAShandle handle; CU_ERROR_CHECK(cuBLASCreate(&handle)); alpha = 
((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; beta = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; if (transA == CBlasNoTrans) { lda = (m + 1u) & ~1u; if ((A = malloc(lda * k * sizeof(float complex))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, m * sizeof(float complex), k, sizeof(float complex))); dlda /= sizeof(float complex); for (size_t j = 0; j < k; j++) { for (size_t i = 0; i < m; i++) A[j * lda + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(float complex), m * sizeof(float complex), k }; CU_ERROR_CHECK(cuMemcpy2D(©)); } else { lda = (k + 1u) & ~1u; if ((A = malloc(lda * m * sizeof(float complex))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, k * sizeof(float complex), m, sizeof(float complex))); dlda /= sizeof(float complex); for (size_t j = 0; j < m; j++) { for (size_t i = 0; i < k; i++) A[j * lda + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(float complex), k * sizeof(float complex), m }; CU_ERROR_CHECK(cuMemcpy2D(©)); } if (transB == CBlasNoTrans) { ldb = (k + 1u) & ~1u; if ((B = malloc(ldb * n * sizeof(float complex))) == NULL) { fputs("Unable to allocate B\n", stderr); return -2; } CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, k * sizeof(float complex), n, sizeof(float complex))); dldb /= sizeof(float complex); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < k; i++) B[j * ldb + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, 
CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(float complex), k * sizeof(float complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); } else { ldb = (n + 1u) & ~1u; if ((B = malloc(ldb * k * sizeof(float complex))) == NULL) { fputs("Unable to allocate B\n", stderr); return -2; } CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, n * sizeof(float complex), k, sizeof(float complex))); dldb /= sizeof(float complex); for (size_t j = 0; j < k; j++) { for (size_t i = 0; i < n; i++) B[j * ldb + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(float complex), n * sizeof(float complex), k }; CU_ERROR_CHECK(cuMemcpy2D(©)); } ldc = (m + 1u) & ~1u; if ((C = malloc(ldc * n * sizeof(float complex))) == NULL) { fputs("Unable to allocate C\n", stderr); return -3; } if ((refC = malloc(ldc * n * sizeof(float complex))) == NULL) { fputs("Unable to allocate refC\n", stderr); return -4; } CU_ERROR_CHECK(cuMemAllocPitch(&dC, &dldc, m * sizeof(float complex), n, sizeof(float complex))); dldc /= sizeof(float complex); CU_ERROR_CHECK(cuMemAllocPitch(&dD, &dldd, m * sizeof(float complex), n, sizeof(float complex))); dldd /= sizeof(float complex); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < m; i++) refC[j * ldc + i] = C[j * ldc + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(float complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dC, NULL, dldc * sizeof(float complex), m * sizeof(float complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); cgemm_ref(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, refC, ldc); CU_ERROR_CHECK(cuCgemm2(handle, transA, transB, m, n, k, alpha, dA, dlda, dB, dldb, beta, dC, dldc, dD, dldd, NULL)); copy = (CUDA_MEMCPY2D){ 
0, 0, CU_MEMORYTYPE_DEVICE, NULL, dD, NULL, dldd * sizeof(float complex), 0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(float complex), m * sizeof(float complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); float rdiff = 0.0f, idiff = 0.0f; for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < m; i++) { float d = fabsf(crealf(C[j * ldc + i]) - crealf(refC[j * ldc + i])); if (d > rdiff) rdiff = d; d = fabsf(cimagf(C[j * ldc + i]) - cimagf(refC[j * ldc + i])); if (d > idiff) idiff = d; } } CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventRecord(start, NULL)); for (size_t i = 0; i < 20; i++) CU_ERROR_CHECK(cuCgemm2(handle, transA, transB, m, n, k, alpha, dA, dlda, dB, dldb, beta, dC, dldc, dD, dldd, NULL)); CU_ERROR_CHECK(cuEventRecord(stop, NULL)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= 20; CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); size_t flops = k * 6 + (k - 1) * 2; // k multiplies and k - 1 adds per element if (alpha != 1.0f + 0.0f * I) flops += 6; // additional multiply by alpha if (beta != 0.0f + 0.0f * I) flops += 8; // additional multiply and add by beta float error = (float)flops * 2.0f * FLT_EPSILON; // maximum per element error flops *= m * n; // m * n elements bool passed = (rdiff <= error) && (idiff <= error); fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e + %.3ei\n%sED!\n", time * 1.e-3f, ((float)flops * 1.e-6f) / time, rdiff, idiff, (passed) ? "PASS" : "FAIL"); free(A); free(B); free(C); free(refC); CU_ERROR_CHECK(cuMemFree(dA)); CU_ERROR_CHECK(cuMemFree(dB)); CU_ERROR_CHECK(cuMemFree(dC)); CU_ERROR_CHECK(cuMemFree(dD)); CU_ERROR_CHECK(cuBLASDestroy(handle)); CU_ERROR_CHECK(cuCtxDestroy(context)); return (int)!passed; }
/* Garbage-collect completed CUDA events for the current device.  Walks the
   global ptx_events list under ptx_event_lock; for every entry that belongs
   to this thread's device and whose event cuEventQuery reports complete,
   performs the per-type cleanup action, destroys and frees the CUevent,
   unlinks the entry from the list and frees it.

   MEMMAP_LOCKABLE says whether we may claim the memory-map splay tree lock;
   when false, PTX_EVT_ASYNC_CLEANUP entries are left for a later pass.  */
static void event_gc (bool memmap_lockable)
{
  struct ptx_event *ptx_event = ptx_events;
  struct nvptx_thread *nvthd = nvptx_thread ();

  pthread_mutex_lock (&ptx_event_lock);

  while (ptx_event != NULL)
    {
      CUresult r;
      struct ptx_event *e = ptx_event;

      /* Advance the cursor now: E may be unlinked and freed below.  */
      ptx_event = ptx_event->next;

      /* Only collect events recorded on this thread's device.  */
      if (e->ord != nvthd->ptx_dev->ord)
	continue;

      r = cuEventQuery (*e->evt);
      if (r == CUDA_SUCCESS)
	{
	  CUevent *te;

	  te = e->evt;

	  switch (e->type)
	    {
	    case PTX_EVT_MEM:
	    case PTX_EVT_SYNC:
	      break;

	    case PTX_EVT_KNL:
	      map_pop (e->addr);
	      break;

	    case PTX_EVT_ASYNC_CLEANUP:
	      {
		/* The function gomp_plugin_async_unmap_vars needs to claim the
		   memory-map splay tree lock for the current device, so we
		   can't call it when one of our callers has already claimed
		   the lock.  In that case, just delay the GC for this event
		   until later.  */
		if (!memmap_lockable)
		  continue;

		GOMP_PLUGIN_async_unmap_vars (e->addr);
	      }
	      break;
	    }

	  cuEventDestroy (*te);
	  free ((void *)te);

	  /* Unlink E from the singly-linked global list.  */
	  if (ptx_events == e)
	    ptx_events = ptx_events->next;
	  else
	    {
	      struct ptx_event *e_ = ptx_events;
	      while (e_->next != e)
		e_ = e_->next;
	      e_->next = e_->next->next;
	    }

	  free (e);
	}
    }

  pthread_mutex_unlock (&ptx_event_lock);
}
int main(int argc, char * argv[]) { CBlasUplo uplo; CBlasTranspose trans; size_t n, k; int d = 0; if (argc < 5 || argc > 6) { fprintf(stderr, "Usage: %s <uplo> <trans> <n> <k> [device]\n" "where:\n" " uplo is 'u' or 'U' for CBlasUpper or 'l' or 'L' for CBlasLower\n" " trans are 'n' or 'N' for CBlasNoTrans or 't' or 'T' for CBlasTrans\n" " n and k are the sizes of the matrices\n" " device is the GPU to use (default 0)\n", argv[0]); return 1; } char u; if (sscanf(argv[1], "%c", &u) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[1]); return 1; } switch (u) { case 'U': case 'u': uplo = CBlasUpper; break; case 'L': case 'l': uplo = CBlasLower; break; default: fprintf(stderr, "Unknown uplo '%c'\n", u); return 1; } char t; if (sscanf(argv[2], "%c", &t) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[2]); return 2; } switch (t) { case 'N': case 'n': trans = CBlasNoTrans; break; case 'T': case 't': trans = CBlasTrans; break; case 'C': case 'c': trans = CBlasConjTrans; break; default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 2; } if (sscanf(argv[3], "%zu", &n) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]); return 3; } if (sscanf(argv[4], "%zu", &k) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[4]); return 4; } if (argc > 5) { if (sscanf(argv[5], "%d", &d) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]); return 5; } } srand(0); double alpha, beta, * A, * C, * refC; CUdeviceptr dA, dC; size_t lda, ldc, dlda, dldc; CU_ERROR_CHECK(cuInit(0)); CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device)); CUBLAShandle handle; CU_ERROR_CHECK(cuBLASCreate(&handle)); alpha = (double)rand() / (double)RAND_MAX; beta = (double)rand() / (double)RAND_MAX; if (trans == CBlasNoTrans) { lda = (n + 1u) & ~1u; if ((A = malloc(lda * k * sizeof(double))) == NULL) { fputs("Unable to 
allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, n * sizeof(double), k, sizeof(double))); dlda /= sizeof(double); for (size_t j = 0; j < k; j++) { for (size_t i = 0; i < n; i++) A[j * lda + i] = (double)rand() / (double)RAND_MAX; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double), n * sizeof(double), k }; CU_ERROR_CHECK(cuMemcpy2D(©)); } else { lda = (k + 1u) & ~1u; if ((A = malloc(lda * n * sizeof(double))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, k * sizeof(double), n, sizeof(double))); dlda /= sizeof(double); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < k; i++) A[j * lda + i] = (double)rand() / (double)RAND_MAX; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double), k * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); } ldc = (n + 1u) & ~1u; if ((C = malloc(ldc * n * sizeof(double))) == NULL) { fputs("Unable to allocate C\n", stderr); return -3; } if ((refC = malloc(ldc * n * sizeof(double))) == NULL) { fputs("Unable to allocate refC\n", stderr); return -4; } CU_ERROR_CHECK(cuMemAllocPitch(&dC, &dldc, n * sizeof(double), n, sizeof(double))); dldc /= sizeof(double); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < n; i++) refC[j * ldc + i] = C[j * ldc + i] = (double)rand() / (double)RAND_MAX; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(double), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dC, NULL, dldc * sizeof(double), n * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); dsyrk_ref(uplo, trans, n, k, alpha, A, lda, beta, refC, ldc); CU_ERROR_CHECK(cuDsyrk(handle, uplo, trans, n, k, alpha, dA, dlda, beta, dC, dldc, NULL)); copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dC, NULL, dldc * sizeof(double), 0, 0, 
CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(double), n * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); double diff = 0.0; for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < n; i++) { double d = fabs(C[j * ldc + i] - refC[j * ldc + i]); if (d > diff) diff = d; } } CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventRecord(start, NULL)); for (size_t i = 0; i < 20; i++) CU_ERROR_CHECK(cuDsyrk(handle, uplo, trans, n, k, alpha, dA, dlda, beta, dC, dldc, NULL)); CU_ERROR_CHECK(cuEventRecord(stop, NULL)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= 20; CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); size_t flops = 2 * k - 1; // k multiplies and k - 1 adds per element if (alpha != 1.0) flops += 1; // additional multiply by alpha if (beta != 0.0) flops += 2; // additional multiply and add by beta double error = (double)flops * 2.0 * DBL_EPSILON; // maximum per element error flops *= n * (n + 1) / 2; // n(n + 1) / 2 elements bool passed = (diff <= error); fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e\n%sED!\n", time * 1.e-3f, ((float)flops * 1.e-6f) / time, diff, (passed) ? "PASS" : "FAIL"); free(A); free(C); free(refC); CU_ERROR_CHECK(cuMemFree(dA)); CU_ERROR_CHECK(cuMemFree(dC)); CU_ERROR_CHECK(cuBLASDestroy(handle)); CU_ERROR_CHECK(cuCtxDestroy(context)); return (int)!passed; }
int main(int argc, char * argv[]) { CBlasSide side; CBlasUplo uplo; CBlasTranspose trans; CBlasDiag diag; size_t m, n; int d = 0; if (argc < 7 || argc > 8) { fprintf(stderr, "Usage: %s <side> <uplo> <trans> <diag> <m> <n> [device]\n" "where:\n" " side is 'l' or 'L' for CBlasLeft and 'r' or 'R' for CBlasRight\n" " uplo is 'u' or 'U' for CBlasUpper and 'l' or 'L' for CBlasLower\n" " trans is 'n' or 'N' for CBlasNoTrans, 't' or 'T' for CBlasTrans or 'c' or 'C' for CBlasConjTrans\n" " diag is 'n' or 'N' for CBlasNonUnit and 'u' or 'U' for CBlasUnit\n" " m and n are the sizes of the matrices\n" " device is the GPU to use (default 0)\n", argv[0]); return 1; } char s; if (sscanf(argv[1], "%c", &s) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[1]); return 1; } switch (s) { case 'L': case 'l': side = CBlasLeft; break; case 'R': case 'r': side = CBlasRight; break; default: fprintf(stderr, "Unknown side '%c'\n", s); return 1; } char u; if (sscanf(argv[2], "%c", &u) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[2]); return 2; } switch (u) { case 'U': case 'u': uplo = CBlasUpper; break; case 'L': case 'l': uplo = CBlasLower; break; default: fprintf(stderr, "Unknown uplo '%c'\n", u); return 2; } char t; if (sscanf(argv[3], "%c", &t) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[3]); return 3; } switch (t) { case 'N': case 'n': trans = CBlasNoTrans; break; case 'T': case 't': trans = CBlasTrans; break; case 'C': case 'c': trans = CBlasConjTrans; break; default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 3; } char di; if (sscanf(argv[4], "%c", &di) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[4]); return 4; } switch (di) { case 'N': case 'n': diag = CBlasNonUnit; break; case 'U': case 'u': diag = CBlasUnit; break; default: fprintf(stderr, "Unknown diag '%c'\n", t); return 4; } if (sscanf(argv[5], "%zu", &m) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]); 
return 5; } if (sscanf(argv[6], "%zu", &n) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[6]); return 6; } if (argc > 7) { if (sscanf(argv[7], "%d", &d) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[7]); return 7; } } srand(0); double complex alpha, * A, * B, * refB; CUdeviceptr dA, dB, dX; size_t lda, ldb, dlda, dldb, dldx; CU_ERROR_CHECK(cuInit(0)); CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device)); CUBLAShandle handle; CU_ERROR_CHECK(cuBLASCreate(&handle)); alpha = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I; if (side == CBlasLeft) { lda = m; if ((A = malloc(lda * m * sizeof(double complex))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, m * sizeof(double complex), m, sizeof(double complex))); dlda /= sizeof(double complex); for (size_t j = 0; j < m; j++) { for (size_t i = 0; i < m; i++) A[j * lda + i] = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double complex), m * sizeof(double complex), m }; CU_ERROR_CHECK(cuMemcpy2D(©)); } else { lda = n; if ((A = malloc(lda * n * sizeof(double complex))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, n * sizeof(double complex), n, sizeof(double complex))); dlda /= sizeof(double complex); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < n; i++) A[j * lda + i] = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double complex), n * sizeof(double complex), n }; 
CU_ERROR_CHECK(cuMemcpy2D(©)); } ldb = m; if ((B = malloc(ldb * n * sizeof(double complex))) == NULL) { fputs("Unable to allocate B\n", stderr); return -3; } if ((refB = malloc(ldb * n * sizeof(double complex))) == NULL) { fputs("Unable to allocate refB\n", stderr); return -4; } CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, m * sizeof(double complex), n, sizeof(double complex))); dldb /= sizeof(double complex); CU_ERROR_CHECK(cuMemAllocPitch(&dX, &dldx, m * sizeof(double complex), n, sizeof(double complex))); dldx /= sizeof(double complex); for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < m; i++) refB[j * ldb + i] = B[j * ldb + i] = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I; } CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(double complex), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(double complex), m * sizeof(double complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); ztrmm_ref(side, uplo, trans, diag, m, n, alpha, A, lda, refB, ldb); CU_ERROR_CHECK(cuZtrmm2(handle, side, uplo, trans, diag, m, n, alpha, dA, dlda, dB, dldb, dX, dldx, NULL)); copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dX, NULL, dldx * sizeof(double complex), 0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(double complex), m * sizeof(double complex), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); bool passed = true; double rdiff = 0.0, idiff = 0.0; for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < m; i++) { double d = fabs(creal(B[j * ldb + i]) - creal(refB[j * ldb + i])); if (d > rdiff) rdiff = d; double c = fabs(cimag(B[j * ldb + i]) - cimag(refB[j * ldb + i])); if (c > idiff) idiff = c; size_t flops; if (side == CBlasLeft) flops = 2 * i + 1; else flops = 2 * j + 1; if (diag == CBlasNonUnit) flops++; flops *= 3; if (d > (double)flops * 2.0 * DBL_EPSILON || c > (double)flops * 2.0 * DBL_EPSILON) passed = false; } } CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC)); 
CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventRecord(start, NULL)); for (size_t i = 0; i < 20; i++) CU_ERROR_CHECK(cuZtrmm2(handle, side, uplo, trans, diag, m, n, alpha, dA, dlda, dB, dldb, dX, dldx, NULL)); CU_ERROR_CHECK(cuEventRecord(stop, NULL)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= 20; CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); const size_t flops = (side == CBlasLeft) ? (6 * (n * m * (m + 1) / 2) + 2 * (n * m * (m - 1) / 2)) : (6 * (m * n * (n + 1) / 2) + 2 * (m * n * (n - 1) / 2)); fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e + %.3ei\n%sED!\n", time * 1.e-3f, ((float)flops * 1.e-6f) / time, rdiff, idiff, (passed) ? "PASS" : "FAIL"); free(A); free(B); free(refB); CU_ERROR_CHECK(cuMemFree(dA)); CU_ERROR_CHECK(cuMemFree(dB)); CU_ERROR_CHECK(cuMemFree(dX)); CU_ERROR_CHECK(cuBLASDestroy(handle)); CU_ERROR_CHECK(cuCtxDestroy(context)); return (int)!passed; }
/*
 * Virtqueue handler for the CUDA passthrough device.
 *
 * Each popped element carries a marshalled CUDA Driver API request:
 * out_sg[0] holds a struct param with the syscall type and arguments,
 * further out_sg/in_sg entries hold variable-size payloads (names, data
 * pages). The request is executed on the host and the status/output
 * values are written back into the same element, which is then pushed
 * and the guest notified.
 */
static void vq_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtQueueElement elem;

    while (virtqueue_pop(vq, &elem)) {
        struct param *p = elem.out_sg[0].iov_base;

        //for all library routines: get required arguments from buffer, execute, and push results back in virtqueue
        switch (p->syscall_type) {
        case CUINIT: {
            p->result = cuInit(p->flags);
            break;
        }
        case CUDRIVERGETVERSION: {
            p->result = cuDriverGetVersion(&p->val1);
            break;
        }
        case CUDEVICEGETCOUNT: {
            p->result = cuDeviceGetCount(&p->val1);
            break;
        }
        case CUDEVICEGET: {
            p->result = cuDeviceGet(&p->device, p->val1);
            break;
        }
        case CUDEVICECOMPUTECAPABILITY: {
            p->result = cuDeviceComputeCapability(&p->val1, &p->val2, p->device);
            break;
        }
        case CUDEVICEGETNAME: {
            p->result = cuDeviceGetName(elem.in_sg[0].iov_base, p->val1, p->device);
            break;
        }
        case CUDEVICEGETATTRIBUTE: {
            p->result = cuDeviceGetAttribute(&p->val1, p->attrib, p->device);
            break;
        }
        case CUCTXCREATE: {
            p->result = cuCtxCreate(&p->ctx, p->flags, p->device);
            break;
        }
        case CUCTXDESTROY: {
            p->result = cuCtxDestroy(p->ctx);
            break;
        }
        case CUCTXGETCURRENT: {
            p->result = cuCtxGetCurrent(&p->ctx);
            break;
        }
        case CUCTXGETDEVICE: {
            p->result = cuCtxGetDevice(&p->device);
            break;
        }
        case CUCTXPOPCURRENT: {
            p->result = cuCtxPopCurrent(&p->ctx);
            break;
        }
        case CUCTXSETCURRENT: {
            p->result = cuCtxSetCurrent(p->ctx);
            break;
        }
        case CUCTXSYNCHRONIZE: {
            p->result = cuCtxSynchronize();
            break;
        }
        case CUMODULELOAD: {
            //hardcoded path - needs improvement
            //all .cubin files should be stored in $QEMU_NFS_PATH - currently $QEMU_NFS_PATH is shared between host and guest with NFS
            char *nfs_path = getenv("QEMU_NFS_PATH");
            if (nfs_path == NULL) {
                // Environment not configured; the original would have passed
                // NULL to strlen/strcpy and crashed.
                p->result = 999;
                break;
            }
            // BUGFIX: the original computed strlen(getenv("QEMU_NFS_PATH") + 1)
            // - the "+1" for the NUL terminator was inside the getenv argument,
            // undersizing the buffer and overflowing it in strcpy/strcat below.
            char *binname = malloc(strlen((char *)elem.out_sg[1].iov_base) +
                                   strlen(nfs_path) + 1);
            if (!binname) {
                // BUGFIX: do not virtqueue_push here - the element is pushed
                // once after the switch; pushing twice corrupted the queue.
                p->result = 0;
                break;
            }
            strcpy(binname, nfs_path);
            strcat(binname, (char *)elem.out_sg[1].iov_base);
            //change current CUDA context
            //each CUDA context has its own virtual memory space - isolation is ensured by switching contexts
            if (cuCtxSetCurrent(p->ctx) != 0) {
                free(binname);  // BUGFIX: was leaked on this path
                p->result = 999;
                break;
            }
            p->result = cuModuleLoad(&p->module, binname);
            free(binname);
            break;
        }
        case CUMODULEGETGLOBAL: {
            char *name = malloc(100 * sizeof(char));
            if (!name) {
                p->result = 999;
                break;
            }
            // BUGFIX: bound the copy - the guest-supplied string is not
            // guaranteed to be NUL-terminated within 100 bytes.
            strncpy(name, (char *)elem.out_sg[1].iov_base, 99);
            name[99] = '\0';
            p->result = cuModuleGetGlobal(&p->dptr, &p->size1, p->module, (const char *)name);
            free(name);  // BUGFIX: was leaked in the original
            break;
        }
        case CUMODULEUNLOAD: {
            p->result = cuModuleUnload(p->module);
            break;
        }
        case CUMEMALLOC: {
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAlloc(&p->dptr, p->bytesize);
            break;
        }
        case CUMEMALLOCPITCH: {
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAllocPitch(&p->dptr, &p->size3, p->size1, p->size2, p->bytesize);
            break;
        }
        //large buffers are allocated in smaller chunks in guest kernel space
        //gets each chunk separately and copies it to device memory
        case CUMEMCPYHTOD: {
            int i;
            size_t offset;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            offset = 0;
            // out_sg layout per page: [1+2*i] = data, [1+2*i+1] = its length
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.out_sg[1 + 2 * i + 1].iov_base;
                p->result = cuMemcpyHtoD(p->dptr + offset, elem.out_sg[1 + 2 * i].iov_base, s);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYHTODASYNC: {
            int i;
            size_t offset;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.out_sg[1 + 2 * i + 1].iov_base;
                p->result = cuMemcpyHtoDAsync(p->dptr + offset, elem.out_sg[1 + 2 * i].iov_base, s, p->stream);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTODASYNC: {
            p->result = cuMemcpyDtoDAsync(p->dptr, p->dptr1, p->size1, p->stream);
            break;
        }
        case CUMEMCPYDTOH: {
            int i;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            size_t offset = 0;
            // in_sg layout per page: [2*i] = destination, [2*i+1] = its length
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.in_sg[0 + 2 * i + 1].iov_base;
                p->result = cuMemcpyDtoH(elem.in_sg[0 + 2 * i].iov_base, p->dptr + offset, s);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTOHASYNC: {
            int i;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            size_t offset = 0;
            for (i = 0; i < nr_pages; i++) {
                s = *(long *)elem.in_sg[0 + 2 * i + 1].iov_base;
                p->result = cuMemcpyDtoHAsync(elem.in_sg[0 + 2 * i].iov_base, p->dptr + offset, s, p->stream);
                if (p->result != 0)
                    break;
                offset += s;
            }
            break;
        }
        case CUMEMSETD32: {
            p->result = cuMemsetD32(p->dptr, p->bytecount, p->bytesize);
            break;
        }
        case CUMEMFREE: {
            p->result = cuMemFree(p->dptr);
            break;
        }
        case CUMODULEGETFUNCTION: {
            char *name = (char *)elem.out_sg[1].iov_base;
            name[p->length] = '\0';
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuModuleGetFunction(&p->function, p->module, name);
            break;
        }
        case CULAUNCHKERNEL: {
            // p->val1 = number of kernel arguments, each in its own out_sg entry
            void **args = malloc(p->val1 * sizeof(void *));
            if (!args) {
                p->result = 9999;
                break;
            }
            int i;
            for (i = 0; i < p->val1; i++) {
                args[i] = elem.out_sg[1 + i].iov_base;
            }
            if (cuCtxSetCurrent(p->ctx) != 0) {
                free(args);  // BUGFIX: was leaked on this path
                p->result = 999;
                break;
            }
            p->result = cuLaunchKernel(p->function,
                                       p->gridDimX, p->gridDimY, p->gridDimZ,
                                       p->blockDimX, p->blockDimY, p->blockDimZ,
                                       p->bytecount, 0, args, 0);
            free(args);
            break;
        }
        case CUEVENTCREATE: {
            p->result = cuEventCreate(&p->event1, p->flags);
            break;
        }
        case CUEVENTDESTROY: {
            p->result = cuEventDestroy(p->event1);
            break;
        }
        case CUEVENTRECORD: {
            p->result = cuEventRecord(p->event1, p->stream);
            break;
        }
        case CUEVENTSYNCHRONIZE: {
            p->result = cuEventSynchronize(p->event1);
            break;
        }
        case CUEVENTELAPSEDTIME: {
            p->result = cuEventElapsedTime(&p->pMilliseconds, p->event1, p->event2);
            break;
        }
        case CUSTREAMCREATE: {
            p->result = cuStreamCreate(&p->stream, 0);
            break;
        }
        case CUSTREAMSYNCHRONIZE: {
            p->result = cuStreamSynchronize(p->stream);
            break;
        }
        case CUSTREAMQUERY: {
            p->result = cuStreamQuery(p->stream);
            break;
        }
        case CUSTREAMDESTROY: {
            p->result = cuStreamDestroy(p->stream);
            break;
        }
        default:
            printf("Unknown syscall_type\n");
        }
        virtqueue_push(vq, &elem, 0);
    }

    //notify frontend - trigger virtual interrupt
    virtio_notify(vdev, vq);
    return;
}
int main(int argc, char* argv[]) { //int iTest = 2896; //while (iTest < 0x7fff) //{ // int iResult = iTest * iTest; // float fTest = (float)iTest; // int fResult = (int)(fTest * fTest); // printf("i*i:%08x f*f:%08x\n", iResult, fResult); // iTest += 0x0800; //} //exit(0); char deviceName[32]; int devCount, ordinal, major, minor; CUdevice hDevice; // Initialize the Driver API and find a device CUDA_CHECK( cuInit(0) ); CUDA_CHECK( cuDeviceGetCount(&devCount) ); for (ordinal = 0; ordinal < devCount; ordinal++) { CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) ); CUDA_CHECK( cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) ); CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice) ); CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) ); if (major >= 5 && minor >= 2) { printf("Using: Id:%d %s (%d.%d)\n\n", ordinal, deviceName, major, minor); break; } } if (ordinal == devCount) { printf("No compute 5.0 device found, exiting.\n"); exit(EXIT_FAILURE); } // First command line arg is the type: internal (CS2R) or external (cuEventElapsedTime) timing int internalTiming = 1; if (argc > 1) internalTiming = strcmp(argv[1], "i") == 0 ? 
1 : 0; // Second command line arg is the number of blocks int blocks = 1; if (argc > 2) blocks = atoi(argv[2]); if (blocks < 1) blocks = 1; // Third command line arg is the number of threads int threads = 128; if (argc > 3) threads = atoi(argv[3]); if (threads > 1024 || threads < 32) threads = 128; threads &= -32; // Forth command line arg: double fops = 1.0; int lanes = 1; if (argc > 4) { if (internalTiming) { // The number of lanes to print for each warp lanes = atoi(argv[4]); if (lanes > 32 || lanes < 1) lanes = 1; } else // The number of floating point operations in a full kernel launch fops = atof(argv[4]); } // Fifth command line arg is the repeat count for benchmarking int repeat = 1; if (argc > 5) repeat = atoi(argv[5]); if (repeat > 1000 || repeat < 1) repeat = 1; // threads = total number of threads size_t size = sizeof(int) * threads * blocks; // Setup our input and output buffers int* dataIn = (int*)malloc(size); int* dataOut = (int*)malloc(size); int* clocks = (int*)malloc(size); memset(dataIn, 0, size); CUmodule hModule; CUfunction hKernel; CUevent hStart, hStop; CUdeviceptr devIn, devOut, devClocks; // Init our context and device memory buffers CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) ); CUDA_CHECK( cuMemAlloc(&devIn, size) ); CUDA_CHECK( cuMemAlloc(&devOut, size) ); CUDA_CHECK( cuMemAlloc(&devClocks, size) ); CUDA_CHECK( cuMemcpyHtoD(devIn, dataIn, size) ); CUDA_CHECK( cuMemsetD8(devOut, 0, size) ); CUDA_CHECK( cuMemsetD8(devClocks, 0, size) ); CUDA_CHECK( cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) ); CUDA_CHECK( cuEventCreate(&hStop, CU_EVENT_BLOCKING_SYNC) ); // Load our kernel CUDA_CHECK( cuModuleLoad(&hModule, "microbench.cubin") ); CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, "microbench") ); // Setup the params void* params[] = { &devOut, &devClocks, &devIn }; float ms = 0; // Warm up the clock (unless under nsight) if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER for (int i = 0; i < repeat; i++) 
CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) ); // Launch the kernel CUDA_CHECK( cuEventRecord(hStart, NULL) ); //CUDA_CHECK( cuProfilerStart() ); CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) ); //CUDA_CHECK( cuProfilerStop() ); CUDA_CHECK( cuEventRecord(hStop, NULL) ); CUDA_CHECK( cuEventSynchronize(hStop) ); CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) ); //CUDA_CHECK( cuCtxSynchronize() ); // Get back our results from each kernel CUDA_CHECK( cuMemcpyDtoH(dataOut, devOut, size) ); CUDA_CHECK( cuMemcpyDtoH(clocks, devClocks, size) ); // Cleanup and shutdown of cuda CUDA_CHECK( cuEventDestroy(hStart) ); CUDA_CHECK( cuEventDestroy(hStop) ); CUDA_CHECK( cuModuleUnload(hModule) ); CUDA_CHECK( cuMemFree(devIn) ); CUDA_CHECK( cuMemFree(devOut) ); CUDA_CHECK( cuMemFree(devClocks) ); CUDA_CHECK( cuCtxDestroy(hContext) ); hContext = 0; // When using just one block, print out the internal timing data if (internalTiming) { int count = 0, total = 0, min = 999999, max = 0; int* clocks_p = clocks; int* dataOut_p = dataOut; // Loop over and print results for (int blk = 0; blk < blocks; blk++) { float *fDataOut = reinterpret_cast<float*>(dataOut_p); for(int tid = 0; tid < threads; tid += 32) { // Sometimes we want data on each thread, sometimes just one sample per warp is fine for (int lane = 0; lane < lanes; lane++) printf("b:%02d w:%03d t:%04d l:%02d clocks:%08d out:%08x\n", blk, tid/32, tid, lane, clocks_p[tid+lane], dataOut_p[tid+lane]); // %04u count++; total += clocks_p[tid]; if (clocks_p[tid] < min) min = clocks_p[tid]; if (clocks_p[tid] > max) max = clocks_p[tid]; } clocks_p += threads; dataOut_p += threads; } printf("average: %.3f, min %d, max: %d\n", (float)total/count, min, max); } else { // For more than one block we're testing throughput and want external timing data printf("MilliSecs: %.3f, GFLOPS: %.3f\n", ms, fops / (ms * 1000000.0)); } // And free up host memory free(dataIn); 
free(dataOut); free(clocks); return 0; }
int main() { CU_ERROR_CHECK(cuInit(0)); int count; CU_ERROR_CHECK(cuDeviceGetCount(&count)); for (int i = 0; i < count; i++) { CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, i)); int memoryClockRate, globalMemoryBusWidth; CU_ERROR_CHECK(cuDeviceGetAttribute(&memoryClockRate, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device)); CU_ERROR_CHECK(cuDeviceGetAttribute(&globalMemoryBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device)); // Calculate pin bandwidth in bytes/sec (clock rate is actual in kHz, memory is DDR so multiply clock rate by 2.e3 to get effective clock rate in Hz) double pinBandwidth = memoryClockRate * 2.e3 * (globalMemoryBusWidth / CHAR_BIT); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, 0, device)); fprintf(stdout, "Device %d (pin bandwidth %6.2f GB/s):\n", i, pinBandwidth / (1 << 30)); CUDA_MEMCPY2D copy; copy.srcMemoryType = CU_MEMORYTYPE_DEVICE; copy.dstMemoryType = CU_MEMORYTYPE_DEVICE; CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_DEFAULT)); CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_DEFAULT)); float time; // Calculate aligned copy for 32, 64 and 128-bit word sizes for (unsigned int j = 4; j <= 16; j *= 2) { copy.WidthInBytes = SIZE; copy.Height = 1; copy.srcXInBytes = 0; copy.srcY = 0; copy.dstXInBytes = 0; copy.dstY = 0; CU_ERROR_CHECK(cuMemAllocPitch(©.srcDevice, ©.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuMemAllocPitch(©.dstDevice, ©.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuEventRecord(start, 0)); for (size_t i = 0; i < ITERATIONS; i++) CU_ERROR_CHECK(cuMemcpy2D(©)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= ITERATIONS * 1.e3f; double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time; fprintf(stdout, "\taligned copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth 
/ pinBandwidth) * 100.0); CU_ERROR_CHECK(cuMemFree(copy.srcDevice)); CU_ERROR_CHECK(cuMemFree(copy.dstDevice)); } // Calculate misaligned copy for 32, 64 and 128-bit word sizes for (unsigned int j = 4; j <= 16; j *= 2) { copy.WidthInBytes = SIZE; copy.Height = 1; copy.srcXInBytes = j; copy.srcY = 0; copy.dstXInBytes = j; copy.dstY = 0; CU_ERROR_CHECK(cuMemAllocPitch(©.srcDevice, ©.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuMemAllocPitch(©.dstDevice, ©.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuEventRecord(start, 0)); for (size_t j = 0; j < ITERATIONS; j++) CU_ERROR_CHECK(cuMemcpy2D(©)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= ITERATIONS * 1.e3f; double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time; fprintf(stdout, "\tmisaligned copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0); CU_ERROR_CHECK(cuMemFree(copy.srcDevice)); CU_ERROR_CHECK(cuMemFree(copy.dstDevice)); } // Calculate stride-2 copy for 32, 64 and 128-bit word sizes for (unsigned int j = 4; j <= 16; j *= 2) { copy.WidthInBytes = SIZE / 2; copy.Height = 1; copy.srcXInBytes = 0; copy.srcY = 0; copy.dstXInBytes = 0; copy.dstY = 0; CU_ERROR_CHECK(cuMemAllocPitch(©.srcDevice, ©.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuMemAllocPitch(©.dstDevice, ©.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j)); copy.srcPitch *= 2; copy.dstPitch *= 2; CU_ERROR_CHECK(cuEventRecord(start, 0)); for (size_t i = 0; i < ITERATIONS; i++) CU_ERROR_CHECK(cuMemcpy2D(©)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= ITERATIONS * 1.e3f; double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time; 
fprintf(stdout, "\tstride-2 copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0); CU_ERROR_CHECK(cuMemFree(copy.srcDevice)); CU_ERROR_CHECK(cuMemFree(copy.dstDevice)); } // Calculate stride-10 copy for 32, 64 and 128-bit word sizes for (unsigned int j = 4; j <= 16; j *= 2) { copy.WidthInBytes = SIZE / 10; copy.Height = 1; copy.srcXInBytes = 0; copy.srcY = 0; copy.dstXInBytes = 0; copy.dstY = 0; CU_ERROR_CHECK(cuMemAllocPitch(©.srcDevice, ©.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuMemAllocPitch(©.dstDevice, ©.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j)); copy.srcPitch *= 10; copy.dstPitch *= 10; CU_ERROR_CHECK(cuEventRecord(start, 0)); for (size_t i = 0; i < ITERATIONS; i++) CU_ERROR_CHECK(cuMemcpy2D(©)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); CU_ERROR_CHECK(cuEventSynchronize(stop)); CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= ITERATIONS * 1.e3f; double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time; fprintf(stdout, "\tstride-10 copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0); CU_ERROR_CHECK(cuMemFree(copy.srcDevice)); CU_ERROR_CHECK(cuMemFree(copy.dstDevice)); } // Calculate stride-1000 copy for 32, 64 and 128-bit word sizes for (unsigned int j = 4; j <= 16; j *= 2) { copy.WidthInBytes = SIZE / 1000; copy.Height = 1; copy.srcXInBytes = 0; copy.srcY = 0; copy.dstXInBytes = 0; copy.dstY = 0; CU_ERROR_CHECK(cuMemAllocPitch(©.srcDevice, ©.srcPitch, copy.srcXInBytes + copy.WidthInBytes, copy.Height, j)); CU_ERROR_CHECK(cuMemAllocPitch(©.dstDevice, ©.dstPitch, copy.dstXInBytes + copy.WidthInBytes, copy.Height, j)); copy.srcPitch *= 1000; copy.dstPitch *= 1000; CU_ERROR_CHECK(cuEventRecord(start, 0)); for (size_t j = 0; j < ITERATIONS; j++) CU_ERROR_CHECK(cuMemcpy2D(©)); CU_ERROR_CHECK(cuEventRecord(stop, 0)); 
CU_ERROR_CHECK(cuEventSynchronize(stop)); CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= ITERATIONS * 1.e3f; double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time; fprintf(stdout, "\tstride-1000 copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n", j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0); CU_ERROR_CHECK(cuMemFree(copy.srcDevice)); CU_ERROR_CHECK(cuMemFree(copy.dstDevice)); } CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); CU_ERROR_CHECK(cuCtxDestroy(context)); } return 0; }
int main(int argc, char * argv[]) { CBlasUplo uplo; size_t n; int d = 0; if (argc < 3 || argc > 4) { fprintf(stderr, "Usage: %s <uplo> <n>\n" "where:\n" " uplo is 'u' or 'U' for CBlasUpper or 'l' or 'L' for CBlasLower\n" " n is the size of the matrix\n" " device is the GPU to use (default 0)\n", argv[0]); return 1; } char u; if (sscanf(argv[1], "%c", &u) != 1) { fprintf(stderr, "Unable to read character from '%s'\n", argv[1]); return 1; } switch (u) { case 'U': case 'u': uplo = CBlasUpper; break; case 'L': case 'l': uplo = CBlasLower; break; default: fprintf(stderr, "Unknown uplo '%c'\n", u); return 1; } if (sscanf(argv[2], "%zu", &n) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[2]); return 2; } if (argc > 3) { if (sscanf(argv[3], "%d", &d) != 1) { fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]); return 3; } } srand(0); double * A, * refA; CUdeviceptr dA; size_t lda, dlda; long info, rInfo; CU_ERROR_CHECK(cuInit(0)); CUdevice device; CU_ERROR_CHECK(cuDeviceGet(&device, d)); CUcontext context; CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device)); CULAPACKhandle handle; CU_ERROR_CHECK(cuLAPACKCreate(&handle)); lda = (n + 1u) & ~1u; if ((A = malloc(lda * n * sizeof(double))) == NULL) { fputs("Unable to allocate A\n", stderr); return -1; } if ((refA = malloc(lda * n * sizeof(double))) == NULL) { fputs("Unable to allocate refA\n", stderr); return -2; } CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, n * sizeof(double), n, sizeof(double))); dlda /= sizeof(double); if (dlatmc(n, 2.0, A, lda) != 0) { fputs("Unable to initialise A\n", stderr); return -1; } // dpotrf(uplo, n, A, lda, &info); // if (info != 0) { // fputs("Failed to compute Cholesky decomposition of A\n", stderr); // return (int)info; // } for (size_t j = 0; j < n; j++) memcpy(&refA[j * lda], &A[j * lda], n * sizeof(double)); CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * 
sizeof(double), n * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); dlauum_ref(uplo, n, refA, lda, &rInfo); CU_ERROR_CHECK(cuDlauum(handle, uplo, n, dA, dlda, &info)); copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double), 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double), n * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); bool passed = (info == rInfo); double diff = 0.0; for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < n; i++) { double d = fabs(A[j * lda + i] - refA[j * lda + i]); if (d > diff) diff = d; } } // Set A to identity so that repeated applications of the cholesky // decomposition while benchmarking do not exit early due to // non-positive-definite-ness. for (size_t j = 0; j < n; j++) { for (size_t i = 0; i < n; i++) A[j * lda + i] = (i == j) ? 1.0 : 0.0; } copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double), 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double), n * sizeof(double), n }; CU_ERROR_CHECK(cuMemcpy2D(©)); CUevent start, stop; CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC)); CU_ERROR_CHECK(cuEventRecord(start, NULL)); for (size_t i = 0; i < 20; i++) CU_ERROR_CHECK(cuDlauum(handle, uplo, n, dA, dlda, &info)); CU_ERROR_CHECK(cuEventRecord(stop, NULL)); CU_ERROR_CHECK(cuEventSynchronize(stop)); float time; CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop)); time /= 20; CU_ERROR_CHECK(cuEventDestroy(start)); CU_ERROR_CHECK(cuEventDestroy(stop)); const size_t flops = ((n * n * n) / 3) + ((n * n) / 2) + (n / 6); fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e\n%sED!\n", time * 1.e-3f, ((float)flops * 1.e-6f) / time, diff, (passed) ? "PASS" : "FAIL"); free(A); free(refA); CU_ERROR_CHECK(cuMemFree(dA)); CU_ERROR_CHECK(cuLAPACKDestroy(handle)); CU_ERROR_CHECK(cuCtxDestroy(context)); return (int)!passed; }