void CudaModule::staticInit(void)
{
    if (s_inited) {
        return;
    }
    s_inited = true;
    s_available = false;

    checkError("cuInit", cuInit(0));
    s_available = true;

    s_device = selectDevice();
    printDeviceInfo(s_device);

    U32 flags = 0;
    flags |= CU_CTX_SCHED_SPIN; // use sync() if you want to yield

#if (CUDA_VERSION >= 2030)
    if (getDriverVersion() >= 23) {
        // reduce launch overhead with large localmem
        flags |= CU_CTX_LMEM_RESIZE_TO_MAX;
    }
#endif

    // OpenGL & window context must have been initialized!
    checkError("cuGLCtxCreate", cuGLCtxCreate(&s_context, flags, s_device));

    checkError("cuEventCreate", cuEventCreate(&s_startEvent, 0));
    checkError("cuEventCreate", cuEventCreate(&s_endEvent, 0));
}
void GPUInterface::ResizeStreamCount(int newStreamCount) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::ResizeStreamCount\n");
#endif

    SAFE_CUDA(cuCtxPushCurrent(cudaContext));
    SAFE_CUDA(cuCtxSynchronize());

    if (cudaStreams != NULL) {
        for(int i=0; i<numStreams; i++) {
            if (cudaStreams[i] != NULL)
                SAFE_CUDA(cuStreamDestroy(cudaStreams[i]));
        }
        free(cudaStreams);
    }

    if (cudaEvents != NULL) {
        // The events array is allocated with numStreams + 1 entries (one
        // extra event beyond the per-stream events), so destroy all of them.
        for(int i=0; i<numStreams+1; i++) {
            if (cudaEvents[i] != NULL)
                SAFE_CUDA(cuEventDestroy(cudaEvents[i]));
        }
        free(cudaEvents);
    }

    if (newStreamCount == 1) {
        numStreams = 1;
        cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams);
        cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1));
        cudaStreams[0] = NULL;
        CUevent event;
        for(int i=0; i<2; i++) {
            SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
            cudaEvents[i] = event;
        }
    } else {
        numStreams = newStreamCount;
        if (numStreams > BEAGLE_STREAM_COUNT) {
            numStreams = BEAGLE_STREAM_COUNT;
        }
        cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams);
        CUstream stream;
        cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1));
        CUevent event;
        for(int i=0; i<numStreams; i++) {
            SAFE_CUDA(cuStreamCreate(&stream, CU_STREAM_DEFAULT));
            cudaStreams[i] = stream;
            SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
            cudaEvents[i] = event;
        }
        SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
        cudaEvents[numStreams] = event;
    }

    SAFE_CUDA(cuCtxPopCurrent(&cudaContext));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving GPUInterface::ResizeStreamCount\n");
#endif
}
/**
 * This measures the overhead in launching a kernel function on each GPU in the
 * system.
 *
 * It does this by executing a small kernel (copying 1 value in global memory) a
 * very large number of times and taking the average execution time.  This
 * program uses the CUDA driver API.
 */
int main() {
  CU_ERROR_CHECK(cuInit(0));

  int count;
  CU_ERROR_CHECK(cuDeviceGetCount(&count));

  float x = 5.0f;
  for (int d = 0; d < count; d++) {
    CUdevice device;
    CU_ERROR_CHECK(cuDeviceGet(&device, d));

    CUcontext context;
    CU_ERROR_CHECK(cuCtxCreate(&context, 0, device));

    CUdeviceptr in, out;
    CU_ERROR_CHECK(cuMemAlloc(&in, sizeof(float)));
    CU_ERROR_CHECK(cuMemAlloc(&out, sizeof(float)));
    CU_ERROR_CHECK(cuMemcpyHtoD(in, &x, sizeof(float)));

    CUmodule module;
    CU_ERROR_CHECK(cuModuleLoadData(&module, imageBytes));

    CUfunction function;
    CU_ERROR_CHECK(cuModuleGetFunction(&function, module, "kernel"));

    void * params[] = { &in, &out };

    CUevent start, stop;
    CU_ERROR_CHECK(cuEventCreate(&start, 0));
    CU_ERROR_CHECK(cuEventCreate(&stop, 0));

    CU_ERROR_CHECK(cuEventRecord(start, 0));
    for (int i = 0; i < ITERATIONS; i++)
      CU_ERROR_CHECK(cuLaunchKernel(function, 1, 1, 1, 1, 1, 1, 0, 0, params, NULL));
    CU_ERROR_CHECK(cuEventRecord(stop, 0));
    CU_ERROR_CHECK(cuEventSynchronize(stop));

    float time;
    CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));

    CU_ERROR_CHECK(cuEventDestroy(start));
    CU_ERROR_CHECK(cuEventDestroy(stop));
    CU_ERROR_CHECK(cuMemFree(in));
    CU_ERROR_CHECK(cuMemFree(out));

    fprintf(stdout, "Device %d: %fms\n", d, (time / (double)ITERATIONS));

    CU_ERROR_CHECK(cuModuleUnload(module));
    CU_ERROR_CHECK(cuCtxDestroy(context));
  }

  return 0;
}
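The benchmark above leaves both the CU_ERROR_CHECK macro and the module image (imageBytes) undefined. A minimal sketch of plausible definitions, assuming a macro that aborts on any failing CUresult and a device function that copies one float; only the "kernel" entry name and its two pointer parameters are taken from the code above, everything else here is an assumption:

/* Hypothetical error-check macro: aborts on any CUresult other than CUDA_SUCCESS. */
#define CU_ERROR_CHECK(call) \
  do { \
    CUresult error_ = (call); \
    if (error_ != CUDA_SUCCESS) { \
      fprintf(stderr, "%s:%d: CUDA error %d\n", __FILE__, __LINE__, (int)error_); \
      exit(EXIT_FAILURE); \
    } \
  } while (0)

/* Device code matching the description "copying 1 value in global memory";
   compiled to a cubin/PTX image this would be the source of imageBytes. */
extern "C" __global__ void kernel(const float * in, float * out) {
  *out = *in;
}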
gpudata *cuda_make_buf(void *c, CUdeviceptr p, size_t sz) {
  cuda_context *ctx = (cuda_context *)c;
  gpudata *res;
  int flags = CU_EVENT_DISABLE_TIMING;

  res = malloc(sizeof(*res));
  if (res == NULL) return NULL;

  res->refcnt = 1;

  cuda_enter(ctx);
  if (ctx->err != CUDA_SUCCESS) {
    free(res);
    return NULL;
  }

  res->ptr = p;
  if (ctx->flags & GA_CTX_MULTI_THREAD)
    flags |= CU_EVENT_BLOCKING_SYNC;
  ctx->err = cuEventCreate(&res->ev, flags);
  if (ctx->err != CUDA_SUCCESS) {
    free(res);
    cuda_exit(ctx);
    return NULL;
  }
  res->sz = sz;
  res->flags = DONTFREE;
  res->ctx = ctx;
  ctx->refcnt++;

  cuda_exit(ctx);

  TAG_BUF(res);
  return res;
}
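A minimal usage sketch for cuda_make_buf, assuming an existing cuda_context *ctx; since the function sets DONTFREE, the wrapper never frees the underlying allocation, so the caller retains ownership of the raw pointer (the variable names here are hypothetical):

CUdeviceptr dev;
if (cuMemAlloc(&dev, 1024) == CUDA_SUCCESS) {
  gpudata *buf = cuda_make_buf(ctx, dev, 1024); /* wraps, does not take ownership */
  if (buf == NULL)
    cuMemFree(dev); /* wrapper creation failed; caller still owns dev */
}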
tag device_t<CUDA>::tagStream(){
  tag ret;
  cuEventCreate(&(ret.cuEvent), CU_EVENT_DEFAULT);
  cuEventRecord(ret.cuEvent, 0);
  return ret;
}
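A sketch of how two such tags might be used to time a span of work queued on the null stream; timeBetween is a hypothetical helper, not part of the class above. The events are created with CU_EVENT_DEFAULT, so timing is supported:

float timeBetween(const tag &startTag, const tag &stopTag) {
  float ms = 0.0f;
  cuEventSynchronize(stopTag.cuEvent);                        // wait until the later event has occurred
  cuEventElapsedTime(&ms, startTag.cuEvent, stopTag.cuEvent); // milliseconds between the two records
  return ms;
}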
void ApexCudaProfileSession::onFuncStart(uint32_t id, void* stream)
{
    mLock.lock();

    CUevent start;
    CUevent stop;
    CUT_SAFE_CALL(cuEventCreate(&start, CU_EVENT_DEFAULT));
    CUT_SAFE_CALL(cuEventCreate(&stop, CU_EVENT_DEFAULT));
    CUT_SAFE_CALL(cuEventRecord(start, (CUstream)stream));

    ProfileData data;
    data.id = id;
    data.start = start;
    data.stop = stop;
    mProfileDataList.pushBack(data);
}
void AsyncTimer::TimerGrow( unsigned int timers )
{
    size_t prevSize = m_entries.size();
    m_entries.resize( prevSize + timers, 0 );
    for ( size_t i = prevSize; i < m_entries.size(); i++ )
    {
        CHECK_CUDA_CALL( cuEventCreate( (CUevent*)&m_entries[i], CU_EVENT_BLOCKING_SYNC ),
                         "Failed to create CUDA event" );
    }
}
static gpudata *cuda_alloc(void *c, size_t size, void *data, int flags, int *ret) {
  gpudata *res;
  cuda_context *ctx = (cuda_context *)c;
  int fl = CU_EVENT_DISABLE_TIMING;

  if ((flags & GA_BUFFER_INIT) && data == NULL) FAIL(NULL, GA_VALUE_ERROR);
  if ((flags & (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY)) ==
      (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY)) FAIL(NULL, GA_VALUE_ERROR);

  /* TODO: figure out how to make this work */
  if (flags & GA_BUFFER_HOST) FAIL(NULL, GA_DEVSUP_ERROR);

  res = malloc(sizeof(*res));
  if (res == NULL) FAIL(NULL, GA_SYS_ERROR);

  res->refcnt = 1;
  res->sz = size;
  res->flags = flags & (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY);

  cuda_enter(ctx);
  if (ctx->err != CUDA_SUCCESS) {
    free(res);
    FAIL(NULL, GA_IMPL_ERROR);
  }

  if (ctx->flags & GA_CTX_MULTI_THREAD)
    fl |= CU_EVENT_BLOCKING_SYNC;
  ctx->err = cuEventCreate(&res->ev, fl);
  if (ctx->err != CUDA_SUCCESS) {
    free(res);
    cuda_exit(ctx);
    FAIL(NULL, GA_IMPL_ERROR);
  }

  if (size == 0) size = 1;

  ctx->err = cuMemAlloc(&res->ptr, size);
  if (ctx->err != CUDA_SUCCESS) {
    cuEventDestroy(res->ev);
    free(res);
    cuda_exit(ctx);
    FAIL(NULL, GA_IMPL_ERROR);
  }
  res->ctx = ctx;
  ctx->refcnt++;

  if (flags & GA_BUFFER_INIT) {
    ctx->err = cuMemcpyHtoD(res->ptr, data, size);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_free(res);
      FAIL(NULL, GA_IMPL_ERROR);
    }
  }

  cuda_exit(ctx);

  TAG_BUF(res);
  return res;
}
static gpudata *new_gpudata(cuda_context *ctx, CUdeviceptr ptr, size_t size) {
  gpudata *res;
  int fl = CU_EVENT_DISABLE_TIMING;

  res = malloc(sizeof(*res));
  if (res == NULL) return NULL;

  res->refcnt = 0;
  res->sz = size;
  res->flags = 0;

  cuda_enter(ctx);

  if (ctx->flags & GA_CTX_MULTI_THREAD)
    fl |= CU_EVENT_BLOCKING_SYNC;

  ctx->err = cuEventCreate(&res->rev, fl);
  if (ctx->err != CUDA_SUCCESS) {
    cuda_exit(ctx);
    free(res);
    return NULL;
  }

  ctx->err = cuEventCreate(&res->wev, fl);
  if (ctx->err != CUDA_SUCCESS) {
    cuEventDestroy(res->rev);
    cuda_exit(ctx);
    free(res);
    return NULL;
  }

  cuda_exit(ctx);

  res->ptr = ptr;
  res->next = NULL;
  res->ctx = ctx;
  TAG_BUF(res);
  return res;
}
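The two events above give each buffer separate read and write synchronization points. A minimal sketch of the intended pattern, assuming buf came from new_gpudata and using a hypothetical helper name: a reader first waits on the last recorded write, then records the read event so a later writer can in turn wait on it.

static void read_buffer_on_stream(gpudata *buf, CUstream s) {
  cuStreamWaitEvent(s, buf->wev, 0); /* order this read after the last write */
  /* ... enqueue kernels or copies on s that read buf->ptr ... */
  cuEventRecord(buf->rev, s);        /* publish the read for future writers */
}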
SEXP R_auto_cuEventCreate(SEXP r_Flags)
{
    SEXP r_ans = R_NilValue;
    CUevent phEvent;
    unsigned int Flags = REAL(r_Flags)[0];
    CUresult ans;
    ans = cuEventCreate(& phEvent, Flags);
    if(ans)
        return(R_cudaErrorInfo(ans));
    r_ans = R_createRef(phEvent, "CUevent");
    return(r_ans);
}
static void
nvptx_wait_all_async (int async)
{
  CUresult r;
  struct ptx_stream *waiting_stream, *other_stream;
  CUevent *e;
  struct nvptx_thread *nvthd = nvptx_thread ();
  pthread_t self = pthread_self ();

  /* The stream doing the waiting.  This could be the first mention of the
     stream, so create it if necessary.  */
  waiting_stream
    = select_stream_for_async (async, pthread_self (), true, NULL);

  /* Launches on the null stream already block on other streams in the
     context.  */
  if (!waiting_stream || waiting_stream == nvthd->ptx_dev->null_stream)
    return;

  event_gc (true);

  pthread_mutex_lock (&nvthd->ptx_dev->stream_lock);

  for (other_stream = nvthd->ptx_dev->active_streams;
       other_stream != NULL;
       other_stream = other_stream->next)
    {
      if (!other_stream->multithreaded
          && !pthread_equal (other_stream->host_thread, self))
        continue;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      /* Record an event on the waited-for stream.  */
      r = cuEventRecord (*e, other_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_SYNC, e, NULL);

      r = cuStreamWaitEvent (waiting_stream->stream, *e, 0);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
    }

  pthread_mutex_unlock (&nvthd->ptx_dev->stream_lock);
}
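cuda_error above is the plugin's own CUresult-to-string helper and is not shown in this excerpt. A minimal stand-in, assuming CUDA 6.0 or later where the driver API provides cuGetErrorString (the plugin's real helper may differ):

static const char *
cuda_error (CUresult r)
{
  const char *msg = NULL;
  if (cuGetErrorString (r, &msg) != CUDA_SUCCESS || msg == NULL)
    return "unknown CUDA error";
  return msg;
}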
int main() {
  CUresult result;
  result = cuInit(0);

  CUdevice device;
  result = cuDeviceGet(&device, 0);

  CUcontext ctx;
  result = cuCtxCreate(&ctx, 0, device);

  CUmodule module;
  result = cuModuleLoad(&module, "cuda-shift-throughput.cubin");

  CUfunction kernel;
  result = cuModuleGetFunction(&kernel, module, "kernel");

  int block;
  result = cuFuncGetAttribute(&block, CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, kernel);
  int grid = 1024 * 1024;

  CUevent event[2];
  for (ptrdiff_t i = 0; i < 2; ++i) {
    result = cuEventCreate(&event[i], 0);
  }

  result = cuEventRecord(event[0], 0);
  result = cuLaunchKernel(kernel, grid, 1, 1, block, 1, 1, 0, 0, 0, 0);
  result = cuEventRecord(event[1], 0);
  result = cuEventSynchronize(event[1]);

  float time;
  result = cuEventElapsedTime(&time, event[0], event[1]);

  int gpuclock;
  result = cuDeviceGetAttribute(&gpuclock, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device);
  int gpump;
  result = cuDeviceGetAttribute(&gpump, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device);

  std::printf("Clock: %d kHz, # of MPs: %d\n", gpuclock, gpump);
  std::printf("Elapsed Time: %f milliseconds\n", time);
  std::printf("# of Threads: %d, # of SHLs: %lld\n", block, 1024ll * block * grid);
  std::printf("Throughput: %f\n", 1024.0 * block * grid / ((double)gpump * gpuclock * time));

  for (ptrdiff_t i = 0; i < 2; ++i) {
    result = cuEventDestroy(event[i]);
  }
  result = cuModuleUnload(module);
  result = cuCtxDestroy(ctx);
  return 0;
}
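A note on the units in the final printf: CU_DEVICE_ATTRIBUTE_CLOCK_RATE is reported in kHz (one kHz is one cycle per millisecond) and cuEventElapsedTime returns milliseconds, so gpuclock * time is a cycle count and the printed throughput is shift operations per multiprocessor per clock. A hypothetical helper making the conversion explicit:

static double opsPerSMPerClock(double ops, int multiprocessors, int clockKHz, float elapsedMs) {
  double cycles = (double)clockKHz * (double)elapsedMs; /* (cycles/ms) * ms */
  return ops / ((double)multiprocessors * cycles);
}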
void
GOMP_OFFLOAD_openacc_register_async_cleanup (void *targ_mem_desc)
{
  CUevent *e;
  CUresult r;
  struct nvptx_thread *nvthd = nvptx_thread ();

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  r = cuEventRecord (*e, nvthd->current_stream->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_ASYNC_CLEANUP, e, targ_mem_desc);
}
static void
nvptx_wait_async (int async1, int async2)
{
  CUresult r;
  CUevent *e;
  struct ptx_stream *s1, *s2;
  pthread_t self = pthread_self ();

  /* The stream that is waiting (rather than being waited for) doesn't
     necessarily have to exist already.  */
  s2 = select_stream_for_async (async2, self, true, NULL);

  s1 = select_stream_for_async (async1, self, false, NULL);
  if (!s1)
    GOMP_PLUGIN_fatal ("invalid async 1\n");

  if (s1 == s2)
    GOMP_PLUGIN_fatal ("identical parameters");

  e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

  r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

  event_gc (true);

  r = cuEventRecord (*e, s1->stream);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

  event_add (PTX_EVT_SYNC, e, NULL);

  r = cuStreamWaitEvent (s2->stream, *e, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuStreamWaitEvent error: %s", cuda_error (r));
}
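Stripped of the plugin's bookkeeping (event_gc/event_add), the cross-stream ordering used above is the standard record-then-wait idiom. A self-contained sketch with hypothetical names; work enqueued on s2 after this call cannot start before everything enqueued on s1 up to the record has completed, and the host never blocks:

static void
make_s2_wait_for_s1 (CUstream s1, CUstream s2)
{
  CUevent ev;
  cuEventCreate (&ev, CU_EVENT_DISABLE_TIMING); /* timing not needed for ordering */
  cuEventRecord (ev, s1);                       /* mark the current tail of s1 */
  cuStreamWaitEvent (s2, ev, 0);                /* s2 waits on the device side */
  cuEventDestroy (ev);                          /* destruction is deferred until the event completes */
}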
static void *
nvptx_dev2host (void *h, const void *d, size_t s)
{
  CUresult r;
  CUdeviceptr pb;
  size_t ps;
  struct nvptx_thread *nvthd = nvptx_thread ();

  if (!s)
    return 0;

  if (!d)
    GOMP_PLUGIN_fatal ("invalid device address");

  r = cuMemGetAddressRange (&pb, &ps, (CUdeviceptr)d);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemGetAddressRange error: %s", cuda_error (r));

  if (!pb)
    GOMP_PLUGIN_fatal ("invalid device address");

  if (!h)
    GOMP_PLUGIN_fatal ("invalid host address");

  if (d == h)
    GOMP_PLUGIN_fatal ("invalid host or device address");

  if ((void *)(d + s) > (void *)(pb + ps))
    GOMP_PLUGIN_fatal ("invalid size");

#ifndef DISABLE_ASYNC
  if (nvthd->current_stream != nvthd->ptx_dev->null_stream)
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s\n", cuda_error (r));

      event_gc (false);

      r = cuMemcpyDtoHAsync (h, (CUdeviceptr)d, s, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoHAsync error: %s", cuda_error (r));

      r = cuEventRecord (*e, nvthd->current_stream->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_MEM, e, (void *)h);
    }
  else
#endif
    {
      r = cuMemcpyDtoH (h, (CUdeviceptr)d, s);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuMemcpyDtoH error: %s", cuda_error (r));
    }

  return 0;
}
int main(int argc, char * argv[]) {
  CBlasUplo uplo;
  CBlasTranspose trans;
  size_t n, k;
  int d = 0;

  if (argc < 5 || argc > 6) {
    fprintf(stderr, "Usage: %s <uplo> <trans> <n> <k> [device]\n"
                    "where:\n"
                    "  uplo is 'u' or 'U' for CBlasUpper or 'l' or 'L' for CBlasLower\n"
                    "  trans is 'n' or 'N' for CBlasNoTrans or 't' or 'T' for CBlasTrans\n"
                    "  n and k are the sizes of the matrices\n"
                    "  device is the GPU to use (default 0)\n", argv[0]);
    return 1;
  }

  char u;
  if (sscanf(argv[1], "%c", &u) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[1]);
    return 1;
  }
  switch (u) {
    case 'U': case 'u': uplo = CBlasUpper; break;
    case 'L': case 'l': uplo = CBlasLower; break;
    default: fprintf(stderr, "Unknown uplo '%c'\n", u); return 1;
  }

  char t;
  if (sscanf(argv[2], "%c", &t) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[2]);
    return 2;
  }
  switch (t) {
    case 'N': case 'n': trans = CBlasNoTrans; break;
    case 'T': case 't': trans = CBlasTrans; break;
    case 'C': case 'c': trans = CBlasConjTrans; break;
    default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 2;
  }

  if (sscanf(argv[3], "%zu", &n) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]);
    return 3;
  }
  if (sscanf(argv[4], "%zu", &k) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[4]);
    return 4;
  }
  if (argc > 5) {
    if (sscanf(argv[5], "%d", &d) != 1) {
      fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]);
      return 5;
    }
  }

  srand(0);

  double alpha, beta, * A, * C, * refC;
  CUdeviceptr dA, dC;
  size_t lda, ldc, dlda, dldc;

  CU_ERROR_CHECK(cuInit(0));

  CUdevice device;
  CU_ERROR_CHECK(cuDeviceGet(&device, d));

  CUcontext context;
  CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device));

  CUBLAShandle handle;
  CU_ERROR_CHECK(cuBLASCreate(&handle));

  alpha = (double)rand() / (double)RAND_MAX;
  beta = (double)rand() / (double)RAND_MAX;

  if (trans == CBlasNoTrans) {
    lda = (n + 1u) & ~1u;
    if ((A = malloc(lda * k * sizeof(double))) == NULL) {
      fputs("Unable to allocate A\n", stderr);
      return -1;
    }
    CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, n * sizeof(double), k, sizeof(double)));
    dlda /= sizeof(double);

    for (size_t j = 0; j < k; j++) {
      for (size_t i = 0; i < n; i++)
        A[j * lda + i] = (double)rand() / (double)RAND_MAX;
    }

    CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double),
                           0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double),
                           n * sizeof(double), k };
    CU_ERROR_CHECK(cuMemcpy2D(&copy));
  } else {
    lda = (k + 1u) & ~1u;
    if ((A = malloc(lda * n * sizeof(double))) == NULL) {
      fputs("Unable to allocate A\n", stderr);
      return -1;
    }
    CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, k * sizeof(double), n, sizeof(double)));
    dlda /= sizeof(double);

    for (size_t j = 0; j < n; j++) {
      for (size_t i = 0; i < k; i++)
        A[j * lda + i] = (double)rand() / (double)RAND_MAX;
    }

    CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double),
                           0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double),
                           k * sizeof(double), n };
    CU_ERROR_CHECK(cuMemcpy2D(&copy));
  }

  ldc = (n + 1u) & ~1u;
  if ((C = malloc(ldc * n * sizeof(double))) == NULL) {
    fputs("Unable to allocate C\n", stderr);
    return -3;
  }
  if ((refC = malloc(ldc * n * sizeof(double))) == NULL) {
    fputs("Unable to allocate refC\n", stderr);
    return -4;
  }
  CU_ERROR_CHECK(cuMemAllocPitch(&dC, &dldc, n * sizeof(double), n, sizeof(double)));
  dldc /= sizeof(double);

  for (size_t j = 0; j < n; j++) {
    for (size_t i = 0; i < n; i++)
      refC[j * ldc + i] = C[j * ldc + i] = (double)rand() / (double)RAND_MAX;
  }

  CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(double),
                         0, 0, CU_MEMORYTYPE_DEVICE, NULL, dC, NULL, dldc * sizeof(double),
                         n * sizeof(double), n };
  CU_ERROR_CHECK(cuMemcpy2D(&copy));

  dsyrk_ref(uplo, trans, n, k, alpha, A, lda, beta, refC, ldc);
  CU_ERROR_CHECK(cuDsyrk(handle, uplo, trans, n, k, alpha, dA, dlda, beta, dC, dldc, NULL));

  copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dC, NULL, dldc * sizeof(double),
                          0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(double),
                          n * sizeof(double), n };
  CU_ERROR_CHECK(cuMemcpy2D(&copy));

  double diff = 0.0;
  for (size_t j = 0; j < n; j++) {
    for (size_t i = 0; i < n; i++) {
      double d = fabs(C[j * ldc + i] - refC[j * ldc + i]);
      if (d > diff) diff = d;
    }
  }

  CUevent start, stop;
  CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC));
  CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC));

  CU_ERROR_CHECK(cuEventRecord(start, NULL));
  for (size_t i = 0; i < 20; i++)
    CU_ERROR_CHECK(cuDsyrk(handle, uplo, trans, n, k, alpha, dA, dlda, beta, dC, dldc, NULL));
  CU_ERROR_CHECK(cuEventRecord(stop, NULL));
  CU_ERROR_CHECK(cuEventSynchronize(stop));

  float time;
  CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));
  time /= 20;

  CU_ERROR_CHECK(cuEventDestroy(start));
  CU_ERROR_CHECK(cuEventDestroy(stop));

  size_t flops = 2 * k - 1;      // k multiplies and k - 1 adds per element
  if (alpha != 1.0) flops += 1;  // additional multiply by alpha
  if (beta != 0.0) flops += 2;   // additional multiply and add by beta
  double error = (double)flops * 2.0 * DBL_EPSILON;  // maximum per element error
  flops *= n * (n + 1) / 2;      // n(n + 1) / 2 elements

  bool passed = (diff <= error);
  fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e\n%sED!\n", time * 1.e-3f,
          ((float)flops * 1.e-6f) / time, diff, (passed) ? "PASS" : "FAIL");

  free(A);
  free(C);
  free(refC);
  CU_ERROR_CHECK(cuMemFree(dA));
  CU_ERROR_CHECK(cuMemFree(dC));

  CU_ERROR_CHECK(cuBLASDestroy(handle));
  CU_ERROR_CHECK(cuCtxDestroy(context));

  return (int)!passed;
}
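dsyrk_ref is the host-side reference the result is checked against but is not shown here. A minimal column-major implementation consistent with how it is called above (updating only the uplo triangle of C); this is a hypothetical sketch, not the project's actual reference code:

static void dsyrk_ref(CBlasUplo uplo, CBlasTranspose trans, size_t n, size_t k,
                      double alpha, const double * A, size_t lda,
                      double beta, double * C, size_t ldc) {
  for (size_t j = 0; j < n; j++) {
    const size_t i0 = (uplo == CBlasUpper) ? 0 : j;
    const size_t i1 = (uplo == CBlasUpper) ? j + 1 : n;
    for (size_t i = i0; i < i1; i++) {
      double t = 0.0;
      for (size_t l = 0; l < k; l++)   // C = alpha*A*A^T + beta*C (or alpha*A^T*A + beta*C)
        t += (trans == CBlasNoTrans) ? A[l * lda + i] * A[l * lda + j]
                                     : A[i * lda + l] * A[j * lda + l];
      C[j * ldc + i] = alpha * t + beta * C[j * ldc + i];
    }
  }
}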
void ApexCudaProfileSession::start()
{
    if (!mManager || !mManager->mApexScene) return;
    mLock.lock();

    mMemBuf.seekWrite(0);
    uint32_t op = 0, sz, id = 0;
    const char* frameEvent = "Frame";
    sz = (uint32_t)strlen(frameEvent);   // length of the name, not sizeof the pointer
    mMemBuf.write(&op, sizeof(op));
    mMemBuf.write(&sz, sizeof(sz));
    mMemBuf.write(frameEvent, sz);
    mMemBuf.write(&id, sizeof(id));

    const char* summaryElapsed = "Summary of elapsed time";
    sz = (uint32_t)strlen(summaryElapsed);
    id = 1;
    mMemBuf.write(&op, sizeof(op));
    mMemBuf.write(&sz, sizeof(sz));
    mMemBuf.write(summaryElapsed, sz);
    mMemBuf.write(&id, sizeof(id));

    //Register kernels
    for (uint32_t i = 0; i < mManager->mKernels.size(); i++)
    {
        ApexCudaProfileManager::KernelInfo& ki = mManager->mKernels[i];
        sz = ki.functionName.size();
        mMemBuf.write(&op, sizeof(op));
        mMemBuf.write(&sz, sizeof(sz));
        mMemBuf.write(ki.functionName.c_str(), sz);
        mMemBuf.write(&ki.id, sizeof(ki.id));

        ModuleSceneIntl* moduleScene = mManager->mApexScene->getInternalModuleScene(ki.moduleName.c_str());
        ApexCudaObj* obj = NULL;
        if (moduleScene)
        {
            obj = static_cast<ApexCudaObj*>(moduleScene->getHeadCudaObj());
        }
        while(obj)
        {
            if (obj->getType() == ApexCudaObj::FUNCTION)
            {
                if (ApexSimpleString(DYNAMIC_CAST(ApexCudaFunc*)(obj)->getName()) == ki.functionName)
                {
                    DYNAMIC_CAST(ApexCudaFunc*)(obj)->setProfileSession(this);
                    break;
                }
            }
            obj = obj->next();
        }
    }

    {
        PxCudaContextManager* ctx = mManager->mApexScene->getTaskManager()->getGpuDispatcher()->getCudaContextManager();
        PxScopedCudaLock s(*ctx);

        //Run timer
        if (mTimer == NULL)
        {
            CUT_SAFE_CALL(cuEventCreate((CUevent*)&mTimer, CU_EVENT_DEFAULT));
        }
        CUT_SAFE_CALL(cuEventRecord((CUevent)mTimer, 0));
    }
    mLock.unlock();
}
int main(int argc, char * argv[]) {
  CBlasSide side;
  CBlasUplo uplo;
  CBlasTranspose trans;
  CBlasDiag diag;
  size_t m, n;
  int d = 0;

  if (argc < 7 || argc > 8) {
    fprintf(stderr, "Usage: %s <side> <uplo> <trans> <diag> <m> <n> [device]\n"
                    "where:\n"
                    "  side is 'l' or 'L' for CBlasLeft and 'r' or 'R' for CBlasRight\n"
                    "  uplo is 'u' or 'U' for CBlasUpper and 'l' or 'L' for CBlasLower\n"
                    "  trans is 'n' or 'N' for CBlasNoTrans, 't' or 'T' for CBlasTrans or 'c' or 'C' for CBlasConjTrans\n"
                    "  diag is 'n' or 'N' for CBlasNonUnit and 'u' or 'U' for CBlasUnit\n"
                    "  m and n are the sizes of the matrices\n"
                    "  device is the GPU to use (default 0)\n", argv[0]);
    return 1;
  }

  char s;
  if (sscanf(argv[1], "%c", &s) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[1]);
    return 1;
  }
  switch (s) {
    case 'L': case 'l': side = CBlasLeft; break;
    case 'R': case 'r': side = CBlasRight; break;
    default: fprintf(stderr, "Unknown side '%c'\n", s); return 1;
  }

  char u;
  if (sscanf(argv[2], "%c", &u) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[2]);
    return 2;
  }
  switch (u) {
    case 'U': case 'u': uplo = CBlasUpper; break;
    case 'L': case 'l': uplo = CBlasLower; break;
    default: fprintf(stderr, "Unknown uplo '%c'\n", u); return 2;
  }

  char t;
  if (sscanf(argv[3], "%c", &t) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[3]);
    return 3;
  }
  switch (t) {
    case 'N': case 'n': trans = CBlasNoTrans; break;
    case 'T': case 't': trans = CBlasTrans; break;
    case 'C': case 'c': trans = CBlasConjTrans; break;
    default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 3;
  }

  char di;
  if (sscanf(argv[4], "%c", &di) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[4]);
    return 4;
  }
  switch (di) {
    case 'N': case 'n': diag = CBlasNonUnit; break;
    case 'U': case 'u': diag = CBlasUnit; break;
    default: fprintf(stderr, "Unknown diag '%c'\n", di); return 4;
  }

  if (sscanf(argv[5], "%zu", &m) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]);
    return 5;
  }
  if (sscanf(argv[6], "%zu", &n) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[6]);
    return 6;
  }
  if (argc > 7) {
    if (sscanf(argv[7], "%d", &d) != 1) {
      fprintf(stderr, "Unable to parse number from '%s'\n", argv[7]);
      return 7;
    }
  }

  srand(0);

  double complex alpha, * A, * B, * refB;
  CUdeviceptr dA, dB, dX;
  size_t lda, ldb, dlda, dldb, dldx;

  CU_ERROR_CHECK(cuInit(0));

  CUdevice device;
  CU_ERROR_CHECK(cuDeviceGet(&device, d));

  CUcontext context;
  CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device));

  CUBLAShandle handle;
  CU_ERROR_CHECK(cuBLASCreate(&handle));

  alpha = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I;

  if (side == CBlasLeft) {
    lda = m;
    if ((A = malloc(lda * m * sizeof(double complex))) == NULL) {
      fputs("Unable to allocate A\n", stderr);
      return -1;
    }
    CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, m * sizeof(double complex), m, sizeof(double complex)));
    dlda /= sizeof(double complex);

    for (size_t j = 0; j < m; j++) {
      for (size_t i = 0; i < m; i++)
        A[j * lda + i] = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I;
    }

    CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double complex),
                           0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double complex),
                           m * sizeof(double complex), m };
    CU_ERROR_CHECK(cuMemcpy2D(&copy));
  } else {
    lda = n;
    if ((A = malloc(lda * n * sizeof(double complex))) == NULL) {
      fputs("Unable to allocate A\n", stderr);
      return -1;
    }
    CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, n * sizeof(double complex), n, sizeof(double complex)));
    dlda /= sizeof(double complex);

    for (size_t j = 0; j < n; j++) {
      for (size_t i = 0; i < n; i++)
        A[j * lda + i] = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I;
    }

    CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double complex),
                           0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double complex),
                           n * sizeof(double complex), n };
    CU_ERROR_CHECK(cuMemcpy2D(&copy));
  }

  ldb = m;
  if ((B = malloc(ldb * n * sizeof(double complex))) == NULL) {
    fputs("Unable to allocate B\n", stderr);
    return -3;
  }
  if ((refB = malloc(ldb * n * sizeof(double complex))) == NULL) {
    fputs("Unable to allocate refB\n", stderr);
    return -4;
  }
  CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, m * sizeof(double complex), n, sizeof(double complex)));
  dldb /= sizeof(double complex);
  CU_ERROR_CHECK(cuMemAllocPitch(&dX, &dldx, m * sizeof(double complex), n, sizeof(double complex)));
  dldx /= sizeof(double complex);

  for (size_t j = 0; j < n; j++) {
    for (size_t i = 0; i < m; i++)
      refB[j * ldb + i] = B[j * ldb + i] = (double)rand() / (double)RAND_MAX + ((double)rand() / (double)RAND_MAX) * I;
  }

  CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(double complex),
                         0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(double complex),
                         m * sizeof(double complex), n };
  CU_ERROR_CHECK(cuMemcpy2D(&copy));

  ztrmm_ref(side, uplo, trans, diag, m, n, alpha, A, lda, refB, ldb);
  CU_ERROR_CHECK(cuZtrmm2(handle, side, uplo, trans, diag, m, n, alpha, dA, dlda, dB, dldb, dX, dldx, NULL));

  copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dX, NULL, dldx * sizeof(double complex),
                          0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(double complex),
                          m * sizeof(double complex), n };
  CU_ERROR_CHECK(cuMemcpy2D(&copy));

  bool passed = true;
  double rdiff = 0.0, idiff = 0.0;
  for (size_t j = 0; j < n; j++) {
    for (size_t i = 0; i < m; i++) {
      double d = fabs(creal(B[j * ldb + i]) - creal(refB[j * ldb + i]));
      if (d > rdiff) rdiff = d;

      double c = fabs(cimag(B[j * ldb + i]) - cimag(refB[j * ldb + i]));
      if (c > idiff) idiff = c;

      size_t flops;
      if (side == CBlasLeft)
        flops = 2 * i + 1;
      else
        flops = 2 * j + 1;
      if (diag == CBlasNonUnit) flops++;
      flops *= 3;

      if (d > (double)flops * 2.0 * DBL_EPSILON || c > (double)flops * 2.0 * DBL_EPSILON)
        passed = false;
    }
  }

  CUevent start, stop;
  CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC));
  CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC));

  CU_ERROR_CHECK(cuEventRecord(start, NULL));
  for (size_t i = 0; i < 20; i++)
    CU_ERROR_CHECK(cuZtrmm2(handle, side, uplo, trans, diag, m, n, alpha, dA, dlda, dB, dldb, dX, dldx, NULL));
  CU_ERROR_CHECK(cuEventRecord(stop, NULL));
  CU_ERROR_CHECK(cuEventSynchronize(stop));

  float time;
  CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));
  time /= 20;

  CU_ERROR_CHECK(cuEventDestroy(start));
  CU_ERROR_CHECK(cuEventDestroy(stop));

  const size_t flops = (side == CBlasLeft)
      ? (6 * (n * m * (m + 1) / 2) + 2 * (n * m * (m - 1) / 2))
      : (6 * (m * n * (n + 1) / 2) + 2 * (m * n * (n - 1) / 2));

  fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e + %.3ei\n%sED!\n", time * 1.e-3f,
          ((float)flops * 1.e-6f) / time, rdiff, idiff, (passed) ? "PASS" : "FAIL");

  free(A);
  free(B);
  free(refB);
  CU_ERROR_CHECK(cuMemFree(dA));
  CU_ERROR_CHECK(cuMemFree(dB));
  CU_ERROR_CHECK(cuMemFree(dX));

  CU_ERROR_CHECK(cuBLASDestroy(handle));
  CU_ERROR_CHECK(cuCtxDestroy(context));

  return (int)!passed;
}
WEAK void halide_init_kernels(const char* ptx_src) {
    // If the context pointer isn't hooked up yet, point it at this module's weak-linkage context.
    if (cuda_ctx_ptr == NULL) {
        cuda_ctx_ptr = &weak_cuda_ctx;
    }

    // Initialize one shared context for all Halide compiled instances
    if (*cuda_ctx_ptr == 0) {
        // Initialize CUDA
        CHECK_CALL( cuInit(0), "cuInit" );

        // Make sure we have a device
        int deviceCount = 0;
        CHECK_CALL( cuDeviceGetCount(&deviceCount), "cuDeviceGetCount" );
        assert(deviceCount > 0);

        char *device_str = getenv("HL_GPU_DEVICE");

        CUdevice dev;
        // Get device
        CUresult status;
        if (device_str) {
            status = cuDeviceGet(&dev, atoi(device_str));
        } else {
            for (int id = 2; id >= 0; id--) {
                // Try to get a device >0 first, since 0 should be our display device
                status = cuDeviceGet(&dev, id);
                if (status == CUDA_SUCCESS) break;
            }
        }
        if (status != CUDA_SUCCESS) {
            fprintf(stderr, "Failed to get device\n");
            exit(-1);
        }

        #ifndef NDEBUG
        fprintf(stderr, "Got device %d, about to create context (t=%d)\n", dev, halide_current_time());
        #endif

        // Create context
        CHECK_CALL( cuCtxCreate(cuda_ctx_ptr, 0, dev), "cuCtxCreate" );
    } else {
        //CHECK_CALL( cuCtxPushCurrent(*cuda_ctx_ptr), "cuCtxPushCurrent" );
    }

    // Initialize a module for just this Halide module
    if (!__mod) {
        // Create module
        CHECK_CALL( cuModuleLoadData(&__mod, ptx_src), "cuModuleLoadData" );

        #ifndef NDEBUG
        fprintf(stderr, "-------\nCompiling PTX:\n%s\n--------\n", ptx_src);
        #endif
    }

    // Create two events for timing
    if (!__start) {
        cuEventCreate(&__start, 0);
        cuEventCreate(&__end, 0);
    }
}
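CHECK_CALL is not defined in this excerpt; a minimal stand-in consistent with how it is invoked above (a hypothetical sketch; Halide's actual macro may differ):

#define CHECK_CALL(c, str)                                              \
    do {                                                                \
        CUresult err = (c);                                             \
        if (err != CUDA_SUCCESS) {                                      \
            fprintf(stderr, "CUDA: %s returned %d\n", (str), (int)err); \
            exit(-1);                                                   \
        }                                                               \
    } while (0)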
static void vq_handle_output(VirtIODevice *vdev, VirtQueue *vq)
{
    VirtQueueElement elem;

    while(virtqueue_pop(vq, &elem)) {
        struct param *p = elem.out_sg[0].iov_base;

        //for all library routines: get required arguments from buffer, execute, and push results back in virtqueue
        switch (p->syscall_type) {
        case CUINIT: {
            p->result = cuInit(p->flags);
            break;
        }
        case CUDRIVERGETVERSION: {
            p->result = cuDriverGetVersion(&p->val1);
            break;
        }
        case CUDEVICEGETCOUNT: {
            p->result = cuDeviceGetCount(&p->val1);
            break;
        }
        case CUDEVICEGET: {
            p->result = cuDeviceGet(&p->device, p->val1);
            break;
        }
        case CUDEVICECOMPUTECAPABILITY: {
            p->result = cuDeviceComputeCapability(&p->val1, &p->val2, p->device);
            break;
        }
        case CUDEVICEGETNAME: {
            p->result = cuDeviceGetName(elem.in_sg[0].iov_base, p->val1, p->device);
            break;
        }
        case CUDEVICEGETATTRIBUTE: {
            p->result = cuDeviceGetAttribute(&p->val1, p->attrib, p->device);
            break;
        }
        case CUCTXCREATE: {
            p->result = cuCtxCreate(&p->ctx, p->flags, p->device);
            break;
        }
        case CUCTXDESTROY: {
            p->result = cuCtxDestroy(p->ctx);
            break;
        }
        case CUCTXGETCURRENT: {
            p->result = cuCtxGetCurrent(&p->ctx);
            break;
        }
        case CUCTXGETDEVICE: {
            p->result = cuCtxGetDevice(&p->device);
            break;
        }
        case CUCTXPOPCURRENT: {
            p->result = cuCtxPopCurrent(&p->ctx);
            break;
        }
        case CUCTXSETCURRENT: {
            p->result = cuCtxSetCurrent(p->ctx);
            break;
        }
        case CUCTXSYNCHRONIZE: {
            p->result = cuCtxSynchronize();
            break;
        }
        case CUMODULELOAD: {
            //hardcoded path - needs improvement
            //all .cubin files should be stored in $QEMU_NFS_PATH - currently $QEMU_NFS_PATH is shared between host and guest with NFS
            char *binname = malloc(strlen((char *)elem.out_sg[1].iov_base) +
                                   strlen(getenv("QEMU_NFS_PATH")) + 1);
            if (!binname) {
                p->result = 0;
                virtqueue_push(vq, &elem, 0);
                break;
            }
            strcpy(binname, getenv("QEMU_NFS_PATH"));
            strcat(binname, (char *)elem.out_sg[1].iov_base);
            //change current CUDA context
            //each CUDA context has its own virtual memory space - isolation is ensured by switching contexts
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuModuleLoad(&p->module, binname);
            free(binname);
            break;
        }
        case CUMODULEGETGLOBAL: {
            char *name = malloc(100*sizeof(char));
            if (!name) {
                p->result = 999;
                break;
            }
            strcpy(name, (char *)elem.out_sg[1].iov_base);
            p->result = cuModuleGetGlobal(&p->dptr, &p->size1, p->module, (const char *)name);
            break;
        }
        case CUMODULEUNLOAD: {
            p->result = cuModuleUnload(p->module);
            break;
        }
        case CUMEMALLOC: {
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAlloc(&p->dptr, p->bytesize);
            break;
        }
        case CUMEMALLOCPITCH: {
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuMemAllocPitch(&p->dptr, &p->size3, p->size1, p->size2, p->bytesize);
            break;
        }
        //large buffers are allocated in smaller chunks in guest kernel space
        //gets each chunk separately and copies it to device memory
        case CUMEMCPYHTOD: {
            int i;
            size_t offset;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            offset = 0;
            for (i=0; i<nr_pages; i++) {
                s = *(long *)elem.out_sg[1+2*i+1].iov_base;
                p->result = cuMemcpyHtoD(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s);
                if (p->result != 0) break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYHTODASYNC: {
            int i;
            size_t offset;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            offset = 0;
            for (i=0; i<nr_pages; i++) {
                s = *(long *)elem.out_sg[1+2*i+1].iov_base;
                p->result = cuMemcpyHtoDAsync(p->dptr+offset, elem.out_sg[1+2*i].iov_base, s, p->stream);
                if (p->result != 0) break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTODASYNC: {
            p->result = cuMemcpyDtoDAsync(p->dptr, p->dptr1, p->size1, p->stream);
            break;
        }
        case CUMEMCPYDTOH: {
            int i;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            size_t offset = 0;
            for (i=0; i<nr_pages; i++) {
                s = *(long *)elem.in_sg[0+2*i+1].iov_base;
                p->result = cuMemcpyDtoH(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s);
                if (p->result != 0) break;
                offset += s;
            }
            break;
        }
        case CUMEMCPYDTOHASYNC: {
            int i;
            unsigned long s, nr_pages = p->nr_pages;
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            size_t offset = 0;
            for (i=0; i<nr_pages; i++) {
                s = *(long *)elem.in_sg[0+2*i+1].iov_base;
                p->result = cuMemcpyDtoHAsync(elem.in_sg[0+2*i].iov_base, p->dptr+offset, s, p->stream);
                if (p->result != 0) break;
                offset += s;
            }
            break;
        }
        case CUMEMSETD32: {
            p->result = cuMemsetD32(p->dptr, p->bytecount, p->bytesize);
            break;
        }
        case CUMEMFREE: {
            p->result = cuMemFree(p->dptr);
            break;
        }
        case CUMODULEGETFUNCTION: {
            char *name = (char *)elem.out_sg[1].iov_base;
            name[p->length] = '\0';
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuModuleGetFunction(&p->function, p->module, name);
            break;
        }
        case CULAUNCHKERNEL: {
            void **args = malloc(p->val1*sizeof(void *));
            if (!args) {
                p->result = 9999;
                break;
            }
            int i;
            for (i=0; i<p->val1; i++) {
                args[i] = elem.out_sg[1+i].iov_base;
            }
            if (cuCtxSetCurrent(p->ctx) != 0) {
                p->result = 999;
                break;
            }
            p->result = cuLaunchKernel(p->function,
                                       p->gridDimX, p->gridDimY, p->gridDimZ,
                                       p->blockDimX, p->blockDimY, p->blockDimZ,
                                       p->bytecount, 0, args, 0);
            free(args);
            break;
        }
        case CUEVENTCREATE: {
            p->result = cuEventCreate(&p->event1, p->flags);
            break;
        }
        case CUEVENTDESTROY: {
            p->result = cuEventDestroy(p->event1);
            break;
        }
        case CUEVENTRECORD: {
            p->result = cuEventRecord(p->event1, p->stream);
            break;
        }
        case CUEVENTSYNCHRONIZE: {
            p->result = cuEventSynchronize(p->event1);
            break;
        }
        case CUEVENTELAPSEDTIME: {
            p->result = cuEventElapsedTime(&p->pMilliseconds, p->event1, p->event2);
            break;
        }
        case CUSTREAMCREATE: {
            p->result = cuStreamCreate(&p->stream, 0);
            break;
        }
        case CUSTREAMSYNCHRONIZE: {
            p->result = cuStreamSynchronize(p->stream);
            break;
        }
        case CUSTREAMQUERY: {
            p->result = cuStreamQuery(p->stream);
            break;
        }
        case CUSTREAMDESTROY: {
            p->result = cuStreamDestroy(p->stream);
            break;
        }
        default:
            printf("Unknown syscall_type\n");
        }
        virtqueue_push(vq, &elem, 0);
    }

    //notify frontend - trigger virtual interrupt
    virtio_notify(vdev, vq);
    return;
}
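The handler above dereferences many fields of struct param whose definition does not appear in this excerpt. A reconstruction inferred purely from the accesses in this function; the field order, exact types, and any members not touched here are assumptions:

struct param {
    int syscall_type;              /* selects the driver-API call */
    unsigned int result;           /* CUresult, or private codes such as 999 */
    unsigned int flags;
    int val1, val2;                /* generic integer in/out values */
    CUdevice device;
    CUdevice_attribute attrib;
    CUcontext ctx;
    CUmodule module;
    CUfunction function;
    CUdeviceptr dptr, dptr1;
    CUevent event1, event2;
    CUstream stream;
    size_t size1, size2, size3;
    size_t bytesize;
    unsigned int bytecount;
    unsigned int length;           /* length of a function name */
    unsigned long nr_pages;        /* chunk count for chunked copies */
    unsigned int gridDimX, gridDimY, gridDimZ;
    unsigned int blockDimX, blockDimY, blockDimZ;
    float pMilliseconds;           /* cuEventElapsedTime result */
};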
WEAK void halide_init_kernels(void *user_context, const char* ptx_src, int size) {
    // If the context pointer isn't hooked up yet, point it at this module's weak-linkage context.
    if (cuda_ctx_ptr == NULL) {
        cuda_ctx_ptr = &weak_cuda_ctx;
    }

    // Initialize one shared context for all Halide compiled instances
    if (*cuda_ctx_ptr == 0) {
        // Initialize CUDA
        CHECK_CALL( cuInit(0), "cuInit" );

        // Make sure we have a device
        int deviceCount = 0;
        CHECK_CALL( cuDeviceGetCount(&deviceCount), "cuDeviceGetCount" );
        halide_assert(user_context, deviceCount > 0);

        char *device_str = getenv("HL_GPU_DEVICE");

        CUdevice dev;
        // Get device
        CUresult status;
        if (device_str) {
            status = cuDeviceGet(&dev, atoi(device_str));
        } else {
            // Try to get a device >0 first, since 0 should be our display device
            // For now, don't try devices > 2 to maintain compatibility with previous behavior.
            if (deviceCount > 2) deviceCount = 2;
            for (int id = deviceCount - 1; id >= 0; id--) {
                status = cuDeviceGet(&dev, id);
                if (status == CUDA_SUCCESS) break;
            }
        }
        halide_assert(user_context, status == CUDA_SUCCESS && "Failed to get device\n");

        #ifdef DEBUG
        halide_printf(user_context, "Got device %d, about to create context (t=%lld)\n",
                      dev, (long long)halide_current_time_ns(user_context));
        #endif

        // Create context
        CHECK_CALL( cuCtxCreate(cuda_ctx_ptr, 0, dev), "cuCtxCreate" );
    } else {
        //CHECK_CALL( cuCtxPushCurrent(*cuda_ctx_ptr), "cuCtxPushCurrent" );
    }

    // Initialize a module for just this Halide module
    if (!__mod) {
        // Create module
        CHECK_CALL( cuModuleLoadData(&__mod, ptx_src), "cuModuleLoadData" );

        #ifdef DEBUG
        halide_printf(user_context, "-------\nCompiling PTX:\n%s\n--------\n", ptx_src);
        #endif
    }

    // Create two events for timing
    if (!__start) {
        cuEventCreate(&__start, 0);
        cuEventCreate(&__end, 0);
    }
}
void
nvptx_exec (void (*fn), size_t mapnum, void **hostaddrs, void **devaddrs,
            int async, unsigned *dims, void *targ_mem_desc)
{
  struct targ_fn_descriptor *targ_fn = (struct targ_fn_descriptor *) fn;
  CUfunction function;
  CUresult r;
  int i;
  struct ptx_stream *dev_str;
  void *kargs[1];
  void *hp, *dp;
  struct nvptx_thread *nvthd = nvptx_thread ();
  const char *maybe_abort_msg = "(perhaps abort was called)";

  function = targ_fn->fn;

  dev_str = select_stream_for_async (async, pthread_self (), false, NULL);
  assert (dev_str == nvthd->current_stream);

  /* Initialize the launch dimensions.  Typically this is constant,
     provided by the device compiler, but we must permit runtime
     values.  */
  for (i = 0; i != 3; i++)
    if (targ_fn->launch->dim[i])
      dims[i] = targ_fn->launch->dim[i];

  /* This reserves a chunk of a pre-allocated page of memory mapped on both
     the host and the device.  HP is a host pointer to the new chunk, and DP is
     the corresponding device pointer.  */
  map_push (dev_str, async, mapnum * sizeof (void *), &hp, &dp);

  GOMP_PLUGIN_debug (0, "  %s: prepare mappings\n", __FUNCTION__);

  /* Copy the array of arguments to the mapped page.  */
  for (i = 0; i < mapnum; i++)
    ((void **) hp)[i] = devaddrs[i];

  /* Copy the (device) pointers to arguments to the device (dp and hp might in
     fact have the same value on a unified-memory system).  */
  r = cuMemcpy ((CUdeviceptr) dp, (CUdeviceptr) hp, mapnum * sizeof (void *));
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuMemcpy failed: %s", cuda_error (r));

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: launch"
                     " gangs=%u, workers=%u, vectors=%u\n",
                     __FUNCTION__, targ_fn->launch->fn,
                     dims[0], dims[1], dims[2]);

  // OpenACC         CUDA
  //
  // num_gangs       nctaid.x
  // num_workers     ntid.y
  // vector length   ntid.x

  kargs[0] = &dp;
  r = cuLaunchKernel (function,
                      dims[GOMP_DIM_GANG], 1, 1,
                      dims[GOMP_DIM_VECTOR], dims[GOMP_DIM_WORKER], 1,
                      0, dev_str->stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuLaunchKernel error: %s", cuda_error (r));

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
    {
      r = cuStreamSynchronize (dev_str->stream);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuStreamSynchronize error: %s", cuda_error (r));
    }
  else
    {
      CUevent *e;

      e = (CUevent *) GOMP_PLUGIN_malloc (sizeof (CUevent));

      r = cuEventCreate (e, CU_EVENT_DISABLE_TIMING);
      if (r == CUDA_ERROR_LAUNCH_FAILED)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s %s\n", cuda_error (r),
                           maybe_abort_msg);
      else if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventCreate error: %s", cuda_error (r));

      event_gc (true);

      r = cuEventRecord (*e, dev_str->stream);
      if (r != CUDA_SUCCESS)
        GOMP_PLUGIN_fatal ("cuEventRecord error: %s", cuda_error (r));

      event_add (PTX_EVT_KNL, e, (void *) dev_str);
    }
#else
  r = cuCtxSynchronize ();
  if (r == CUDA_ERROR_LAUNCH_FAILED)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s %s\n", cuda_error (r),
                       maybe_abort_msg);
  else if (r != CUDA_SUCCESS)
    GOMP_PLUGIN_fatal ("cuCtxSynchronize error: %s", cuda_error (r));
#endif

  GOMP_PLUGIN_debug (0, "  %s: kernel %s: finished\n", __FUNCTION__,
                     targ_fn->launch->fn);

#ifndef DISABLE_ASYNC
  if (async < acc_async_noval)
#endif
    map_pop (dev_str);
}
int main() {
  CU_ERROR_CHECK(cuInit(0));

  int count;
  CU_ERROR_CHECK(cuDeviceGetCount(&count));

  for (int i = 0; i < count; i++) {
    CUdevice device;
    CU_ERROR_CHECK(cuDeviceGet(&device, i));

    int memoryClockRate, globalMemoryBusWidth;
    CU_ERROR_CHECK(cuDeviceGetAttribute(&memoryClockRate, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device));
    CU_ERROR_CHECK(cuDeviceGetAttribute(&globalMemoryBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device));

    // Calculate pin bandwidth in bytes/sec (the clock rate is in kHz and the
    // memory is DDR, so multiply the clock rate by 2.e3 to get the effective
    // clock rate in Hz)
    double pinBandwidth = memoryClockRate * 2.e3 * (globalMemoryBusWidth / CHAR_BIT);

    CUcontext context;
    CU_ERROR_CHECK(cuCtxCreate(&context, 0, device));

    fprintf(stdout, "Device %d (pin bandwidth %6.2f GB/s):\n", i, pinBandwidth / (1 << 30));

    CUDA_MEMCPY2D copy;
    copy.srcMemoryType = CU_MEMORYTYPE_DEVICE;
    copy.dstMemoryType = CU_MEMORYTYPE_DEVICE;

    CUevent start, stop;
    CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_DEFAULT));
    CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_DEFAULT));
    float time;

    // Calculate aligned copy for 32, 64 and 128-bit word sizes
    for (unsigned int j = 4; j <= 16; j *= 2) {
      copy.WidthInBytes = SIZE;
      copy.Height = 1;
      copy.srcXInBytes = 0;
      copy.srcY = 0;
      copy.dstXInBytes = 0;
      copy.dstY = 0;

      CU_ERROR_CHECK(cuMemAllocPitch(&copy.srcDevice, &copy.srcPitch,
                                     copy.srcXInBytes + copy.WidthInBytes, copy.Height, j));
      CU_ERROR_CHECK(cuMemAllocPitch(&copy.dstDevice, &copy.dstPitch,
                                     copy.dstXInBytes + copy.WidthInBytes, copy.Height, j));

      CU_ERROR_CHECK(cuEventRecord(start, 0));
      for (size_t it = 0; it < ITERATIONS; it++)
        CU_ERROR_CHECK(cuMemcpy2D(&copy));
      CU_ERROR_CHECK(cuEventRecord(stop, 0));
      CU_ERROR_CHECK(cuEventSynchronize(stop));
      CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));
      time /= ITERATIONS * 1.e3f;

      double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time;
      fprintf(stdout, "\taligned copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n",
              j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0);

      CU_ERROR_CHECK(cuMemFree(copy.srcDevice));
      CU_ERROR_CHECK(cuMemFree(copy.dstDevice));
    }

    // Calculate misaligned copy for 32, 64 and 128-bit word sizes
    for (unsigned int j = 4; j <= 16; j *= 2) {
      copy.WidthInBytes = SIZE;
      copy.Height = 1;
      copy.srcXInBytes = j;
      copy.srcY = 0;
      copy.dstXInBytes = j;
      copy.dstY = 0;

      CU_ERROR_CHECK(cuMemAllocPitch(&copy.srcDevice, &copy.srcPitch,
                                     copy.srcXInBytes + copy.WidthInBytes, copy.Height, j));
      CU_ERROR_CHECK(cuMemAllocPitch(&copy.dstDevice, &copy.dstPitch,
                                     copy.dstXInBytes + copy.WidthInBytes, copy.Height, j));

      CU_ERROR_CHECK(cuEventRecord(start, 0));
      for (size_t it = 0; it < ITERATIONS; it++)
        CU_ERROR_CHECK(cuMemcpy2D(&copy));
      CU_ERROR_CHECK(cuEventRecord(stop, 0));
      CU_ERROR_CHECK(cuEventSynchronize(stop));
      CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));
      time /= ITERATIONS * 1.e3f;

      double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time;
      fprintf(stdout, "\tmisaligned copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n",
              j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0);

      CU_ERROR_CHECK(cuMemFree(copy.srcDevice));
      CU_ERROR_CHECK(cuMemFree(copy.dstDevice));
    }

    // Calculate stride-2 copy for 32, 64 and 128-bit word sizes
    for (unsigned int j = 4; j <= 16; j *= 2) {
      copy.WidthInBytes = SIZE / 2;
      copy.Height = 1;
      copy.srcXInBytes = 0;
      copy.srcY = 0;
      copy.dstXInBytes = 0;
      copy.dstY = 0;

      CU_ERROR_CHECK(cuMemAllocPitch(&copy.srcDevice, &copy.srcPitch,
                                     copy.srcXInBytes + copy.WidthInBytes, copy.Height, j));
      CU_ERROR_CHECK(cuMemAllocPitch(&copy.dstDevice, &copy.dstPitch,
                                     copy.dstXInBytes + copy.WidthInBytes, copy.Height, j));
      copy.srcPitch *= 2;
      copy.dstPitch *= 2;

      CU_ERROR_CHECK(cuEventRecord(start, 0));
      for (size_t it = 0; it < ITERATIONS; it++)
        CU_ERROR_CHECK(cuMemcpy2D(&copy));
      CU_ERROR_CHECK(cuEventRecord(stop, 0));
      CU_ERROR_CHECK(cuEventSynchronize(stop));
      CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));
      time /= ITERATIONS * 1.e3f;

      double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time;
      fprintf(stdout, "\tstride-2 copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n",
              j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0);

      CU_ERROR_CHECK(cuMemFree(copy.srcDevice));
      CU_ERROR_CHECK(cuMemFree(copy.dstDevice));
    }

    // Calculate stride-10 copy for 32, 64 and 128-bit word sizes
    for (unsigned int j = 4; j <= 16; j *= 2) {
      copy.WidthInBytes = SIZE / 10;
      copy.Height = 1;
      copy.srcXInBytes = 0;
      copy.srcY = 0;
      copy.dstXInBytes = 0;
      copy.dstY = 0;

      CU_ERROR_CHECK(cuMemAllocPitch(&copy.srcDevice, &copy.srcPitch,
                                     copy.srcXInBytes + copy.WidthInBytes, copy.Height, j));
      CU_ERROR_CHECK(cuMemAllocPitch(&copy.dstDevice, &copy.dstPitch,
                                     copy.dstXInBytes + copy.WidthInBytes, copy.Height, j));
      copy.srcPitch *= 10;
      copy.dstPitch *= 10;

      CU_ERROR_CHECK(cuEventRecord(start, 0));
      for (size_t it = 0; it < ITERATIONS; it++)
        CU_ERROR_CHECK(cuMemcpy2D(&copy));
      CU_ERROR_CHECK(cuEventRecord(stop, 0));
      CU_ERROR_CHECK(cuEventSynchronize(stop));
      CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));
      time /= ITERATIONS * 1.e3f;

      double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time;
      fprintf(stdout, "\tstride-10 copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n",
              j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0);

      CU_ERROR_CHECK(cuMemFree(copy.srcDevice));
      CU_ERROR_CHECK(cuMemFree(copy.dstDevice));
    }

    // Calculate stride-1000 copy for 32, 64 and 128-bit word sizes
    for (unsigned int j = 4; j <= 16; j *= 2) {
      copy.WidthInBytes = SIZE / 1000;
      copy.Height = 1;
      copy.srcXInBytes = 0;
      copy.srcY = 0;
      copy.dstXInBytes = 0;
      copy.dstY = 0;

      CU_ERROR_CHECK(cuMemAllocPitch(&copy.srcDevice, &copy.srcPitch,
                                     copy.srcXInBytes + copy.WidthInBytes, copy.Height, j));
      CU_ERROR_CHECK(cuMemAllocPitch(&copy.dstDevice, &copy.dstPitch,
                                     copy.dstXInBytes + copy.WidthInBytes, copy.Height, j));
      copy.srcPitch *= 1000;
      copy.dstPitch *= 1000;

      CU_ERROR_CHECK(cuEventRecord(start, 0));
      for (size_t it = 0; it < ITERATIONS; it++)
        CU_ERROR_CHECK(cuMemcpy2D(&copy));
      CU_ERROR_CHECK(cuEventRecord(stop, 0));
      CU_ERROR_CHECK(cuEventSynchronize(stop));
      CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));
      time /= ITERATIONS * 1.e3f;

      double bandwidth = (double)(2 * copy.WidthInBytes * copy.Height) / time;
      fprintf(stdout, "\tstride-1000 copy (%3u-bit): %6.2f GB/s (%5.2f%%)\n",
              j * CHAR_BIT, bandwidth / (1 << 30), (bandwidth / pinBandwidth) * 100.0);

      CU_ERROR_CHECK(cuMemFree(copy.srcDevice));
      CU_ERROR_CHECK(cuMemFree(copy.dstDevice));
    }

    CU_ERROR_CHECK(cuEventDestroy(start));
    CU_ERROR_CHECK(cuEventDestroy(stop));
    CU_ERROR_CHECK(cuCtxDestroy(context));
  }

  return 0;
}
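For readers checking the arithmetic above: cuEventElapsedTime returns the total milliseconds for ITERATIONS copies, so dividing by ITERATIONS * 1.e3f leaves seconds per copy, and each cuMemcpy2D both reads and writes WidthInBytes * Height bytes, hence the factor of two. A hypothetical helper making this explicit:

static double effectiveBandwidth(size_t widthInBytes, size_t height,
                                 float totalMs, size_t iterations) {
  double seconds = ((double)totalMs / (double)iterations) / 1.e3;
  double bytesMoved = 2.0 * (double)(widthInBytes * height); /* read + write */
  return bytesMoved / seconds; /* bytes/s; divide by (1 << 30) for GB/s */
}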
void GPUInterface::SetDevice(int deviceNumber, int paddedStateCount, int categoryCount,
                             int paddedPatternCount, int unpaddedPatternCount, int tipCount,
                             long flags) {
#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tEntering GPUInterface::SetDevice\n");
#endif

    SAFE_CUDA(cuDeviceGet(&cudaDevice, (*resourceMap)[deviceNumber]));

    unsigned int ctxFlags = CU_CTX_SCHED_AUTO;
    if (flags & BEAGLE_FLAG_SCALING_DYNAMIC) {
        ctxFlags |= CU_CTX_MAP_HOST;
    }

    CUresult error = cuCtxCreate(&cudaContext, ctxFlags, cudaDevice);
    if(error != CUDA_SUCCESS) {
        fprintf(stderr, "CUDA error: \"%s\" (%d) from file <%s>, line %i.\n",
                GetCUDAErrorDescription(error), error, __FILE__, __LINE__);
        if (error == CUDA_ERROR_INVALID_DEVICE) {
            fprintf(stderr, "(The requested CUDA device is likely set to compute exclusive mode. This mode prevents multiple processes from running on the device.)");
        }
        exit(-1);
    }

    InitializeKernelResource(paddedStateCount, flags & BEAGLE_FLAG_PRECISION_DOUBLE);

    if (!kernelResource) {
        fprintf(stderr,"Critical error: unable to find kernel code for %d states.\n",paddedStateCount);
        exit(-1);
    }
    kernelResource->categoryCount = categoryCount;
    kernelResource->patternCount = paddedPatternCount;
    kernelResource->unpaddedPatternCount = unpaddedPatternCount;
    kernelResource->flags = flags;

    SAFE_CUDA(cuModuleLoadData(&cudaModule, kernelResource->kernelCode));

    if ((paddedPatternCount < BEAGLE_MULTI_GRID_MAX || flags & BEAGLE_FLAG_PARALLELOPS_GRID)
        && !(flags & BEAGLE_FLAG_PARALLELOPS_STREAMS)) {
        numStreams = 1;
        cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams);
        cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1));
        cudaStreams[0] = NULL;
        CUevent event;
        for(int i=0; i<2; i++) {
            SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
            cudaEvents[i] = event;
        }
    } else {
        numStreams = tipCount/2 + 1;
        if (numStreams > BEAGLE_STREAM_COUNT) {
            numStreams = BEAGLE_STREAM_COUNT;
        }
        cudaStreams = (CUstream*) malloc(sizeof(CUstream) * numStreams);
        CUstream stream;
        cudaEvents = (CUevent*) malloc(sizeof(CUevent) * (numStreams + 1));
        CUevent event;
        for(int i=0; i<numStreams; i++) {
            SAFE_CUDA(cuStreamCreate(&stream, CU_STREAM_DEFAULT));
            cudaStreams[i] = stream;
            SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
            cudaEvents[i] = event;
        }
        SAFE_CUDA(cuEventCreate(&event, CU_EVENT_DISABLE_TIMING));
        cudaEvents[numStreams] = event;
    }

    SAFE_CUDA(cuCtxPopCurrent(&cudaContext));

#ifdef BEAGLE_DEBUG_FLOW
    fprintf(stderr,"\t\t\tLeaving GPUInterface::SetDevice\n");
#endif
}
int main(int argc, char * argv[]) {
  CBlasUplo uplo;
  size_t n;
  int d = 0;

  if (argc < 3 || argc > 4) {
    fprintf(stderr, "Usage: %s <uplo> <n> [device]\n"
                    "where:\n"
                    "  uplo is 'u' or 'U' for CBlasUpper or 'l' or 'L' for CBlasLower\n"
                    "  n is the size of the matrix\n"
                    "  device is the GPU to use (default 0)\n", argv[0]);
    return 1;
  }

  char u;
  if (sscanf(argv[1], "%c", &u) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[1]);
    return 1;
  }
  switch (u) {
    case 'U': case 'u': uplo = CBlasUpper; break;
    case 'L': case 'l': uplo = CBlasLower; break;
    default: fprintf(stderr, "Unknown uplo '%c'\n", u); return 1;
  }

  if (sscanf(argv[2], "%zu", &n) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[2]);
    return 2;
  }
  if (argc > 3) {
    if (sscanf(argv[3], "%d", &d) != 1) {
      fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]);
      return 3;
    }
  }

  srand(0);

  double * A, * refA;
  CUdeviceptr dA;
  size_t lda, dlda;
  long info, rInfo;

  CU_ERROR_CHECK(cuInit(0));

  CUdevice device;
  CU_ERROR_CHECK(cuDeviceGet(&device, d));

  CUcontext context;
  CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device));

  CULAPACKhandle handle;
  CU_ERROR_CHECK(cuLAPACKCreate(&handle));

  lda = (n + 1u) & ~1u;
  if ((A = malloc(lda * n * sizeof(double))) == NULL) {
    fputs("Unable to allocate A\n", stderr);
    return -1;
  }
  if ((refA = malloc(lda * n * sizeof(double))) == NULL) {
    fputs("Unable to allocate refA\n", stderr);
    return -2;
  }
  CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, n * sizeof(double), n, sizeof(double)));
  dlda /= sizeof(double);

  if (dlatmc(n, 2.0, A, lda) != 0) {
    fputs("Unable to initialise A\n", stderr);
    return -1;
  }

//   dpotrf(uplo, n, A, lda, &info);
//   if (info != 0) {
//     fputs("Failed to compute Cholesky decomposition of A\n", stderr);
//     return (int)info;
//   }

  for (size_t j = 0; j < n; j++)
    memcpy(&refA[j * lda], &A[j * lda], n * sizeof(double));

  CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double),
                         0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double),
                         n * sizeof(double), n };
  CU_ERROR_CHECK(cuMemcpy2D(&copy));

  dlauum_ref(uplo, n, refA, lda, &rInfo);
  CU_ERROR_CHECK(cuDlauum(handle, uplo, n, dA, dlda, &info));

  copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double),
                          0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double),
                          n * sizeof(double), n };
  CU_ERROR_CHECK(cuMemcpy2D(&copy));

  bool passed = (info == rInfo);
  double diff = 0.0;
  for (size_t j = 0; j < n; j++) {
    for (size_t i = 0; i < n; i++) {
      double d = fabs(A[j * lda + i] - refA[j * lda + i]);
      if (d > diff) diff = d;
    }
  }

  // Set A to identity so that repeated applications of the cholesky
  // decomposition while benchmarking do not exit early due to
  // non-positive-definite-ness.
  for (size_t j = 0; j < n; j++) {
    for (size_t i = 0; i < n; i++)
      A[j * lda + i] = (i == j) ? 1.0 : 0.0;
  }

  copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(double),
                          0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(double),
                          n * sizeof(double), n };
  CU_ERROR_CHECK(cuMemcpy2D(&copy));

  CUevent start, stop;
  CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC));
  CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC));

  CU_ERROR_CHECK(cuEventRecord(start, NULL));
  for (size_t i = 0; i < 20; i++)
    CU_ERROR_CHECK(cuDlauum(handle, uplo, n, dA, dlda, &info));
  CU_ERROR_CHECK(cuEventRecord(stop, NULL));
  CU_ERROR_CHECK(cuEventSynchronize(stop));

  float time;
  CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));
  time /= 20;

  CU_ERROR_CHECK(cuEventDestroy(start));
  CU_ERROR_CHECK(cuEventDestroy(stop));

  // n^3/3 + n^2/2 + n/6, computed in the closed form n(n+1)(2n+1)/6 so that
  // integer division does not truncate the individual terms
  const size_t flops = (n * (n + 1) * (2 * n + 1)) / 6;

  fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e\n%sED!\n", time * 1.e-3f,
          ((float)flops * 1.e-6f) / time, diff, (passed) ? "PASS" : "FAIL");

  free(A);
  free(refA);
  CU_ERROR_CHECK(cuMemFree(dA));

  CU_ERROR_CHECK(cuLAPACKDestroy(handle));
  CU_ERROR_CHECK(cuCtxDestroy(context));

  return (int)!passed;
}
int main(int argc, char* argv[])
{
    //int iTest = 2896;
    //while (iTest < 0x7fff)
    //{
    //    int iResult = iTest * iTest;
    //    float fTest = (float)iTest;
    //    int fResult = (int)(fTest * fTest);
    //    printf("i*i:%08x f*f:%08x\n", iResult, fResult);
    //    iTest += 0x0800;
    //}
    //exit(0);

    char deviceName[32];
    int devCount, ordinal, major, minor;
    CUdevice hDevice;

    // Initialize the Driver API and find a device
    CUDA_CHECK( cuInit(0) );
    CUDA_CHECK( cuDeviceGetCount(&devCount) );
    for (ordinal = 0; ordinal < devCount; ordinal++)
    {
        CUDA_CHECK( cuDeviceGet(&hDevice, ordinal) );
        CUDA_CHECK( cuDeviceGetAttribute (&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, hDevice) );
        CUDA_CHECK( cuDeviceGetAttribute (&minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, hDevice) );
        CUDA_CHECK( cuDeviceGetName(deviceName, sizeof(deviceName), hDevice) );
        if (major >= 5 && minor >= 2)
        {
            printf("Using: Id:%d %s (%d.%d)\n\n", ordinal, deviceName, major, minor);
            break;
        }
    }
    if (ordinal == devCount)
    {
        printf("No compute 5.2 device found, exiting.\n");
        exit(EXIT_FAILURE);
    }

    // First command line arg is the type: internal (CS2R) or external (cuEventElapsedTime) timing
    int internalTiming = 1;
    if (argc > 1)
        internalTiming = strcmp(argv[1], "i") == 0 ? 1 : 0;

    // Second command line arg is the number of blocks
    int blocks = 1;
    if (argc > 2)
        blocks = atoi(argv[2]);
    if (blocks < 1)
        blocks = 1;

    // Third command line arg is the number of threads
    int threads = 128;
    if (argc > 3)
        threads = atoi(argv[3]);
    if (threads > 1024 || threads < 32)
        threads = 128;
    threads &= -32;

    // Fourth command line arg:
    double fops = 1.0;
    int lanes = 1;
    if (argc > 4)
    {
        if (internalTiming)
        {
            // The number of lanes to print for each warp
            lanes = atoi(argv[4]);
            if (lanes > 32 || lanes < 1)
                lanes = 1;
        }
        else
            // The number of floating point operations in a full kernel launch
            fops = atof(argv[4]);
    }

    // Fifth command line arg is the repeat count for benchmarking
    int repeat = 1;
    if (argc > 5)
        repeat = atoi(argv[5]);
    if (repeat > 1000 || repeat < 1)
        repeat = 1;

    // threads = total number of threads
    size_t size = sizeof(int) * threads * blocks;

    // Setup our input and output buffers
    int* dataIn  = (int*)malloc(size);
    int* dataOut = (int*)malloc(size);
    int* clocks  = (int*)malloc(size);
    memset(dataIn, 0, size);

    CUmodule hModule;
    CUfunction hKernel;
    CUevent hStart, hStop;
    CUdeviceptr devIn, devOut, devClocks;

    // Init our context and device memory buffers
    CUDA_CHECK( cuCtxCreate(&hContext, 0, hDevice) );
    CUDA_CHECK( cuMemAlloc(&devIn, size) );
    CUDA_CHECK( cuMemAlloc(&devOut, size) );
    CUDA_CHECK( cuMemAlloc(&devClocks, size) );
    CUDA_CHECK( cuMemcpyHtoD(devIn, dataIn, size) );
    CUDA_CHECK( cuMemsetD8(devOut, 0, size) );
    CUDA_CHECK( cuMemsetD8(devClocks, 0, size) );

    CUDA_CHECK( cuEventCreate(&hStart, CU_EVENT_BLOCKING_SYNC) );
    CUDA_CHECK( cuEventCreate(&hStop, CU_EVENT_BLOCKING_SYNC) );

    // Load our kernel
    CUDA_CHECK( cuModuleLoad(&hModule, "microbench.cubin") );
    CUDA_CHECK( cuModuleGetFunction(&hKernel, hModule, "microbench") );

    // Setup the params
    void* params[] = { &devOut, &devClocks, &devIn };

    float ms = 0;

    // Warm up the clock (unless under nsight)
    if (!getenv("NSIGHT_LAUNCHED")) // NSIGHT_CUDA_ANALYSIS NSIGHT_CUDA_DEBUGGER
        for (int i = 0; i < repeat; i++)
            CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) );

    // Launch the kernel
    CUDA_CHECK( cuEventRecord(hStart, NULL) );
    //CUDA_CHECK( cuProfilerStart() );
    CUDA_CHECK( cuLaunchKernel(hKernel, blocks, 1, 1, threads, 1, 1, 0, 0, params, 0) );
    //CUDA_CHECK( cuProfilerStop() );
    CUDA_CHECK( cuEventRecord(hStop, NULL) );
    CUDA_CHECK( cuEventSynchronize(hStop) );
    CUDA_CHECK( cuEventElapsedTime(&ms, hStart, hStop) );
    //CUDA_CHECK( cuCtxSynchronize() );

    // Get back our results from each kernel
    CUDA_CHECK( cuMemcpyDtoH(dataOut, devOut, size) );
    CUDA_CHECK( cuMemcpyDtoH(clocks, devClocks, size) );

    // Cleanup and shutdown of cuda
    CUDA_CHECK( cuEventDestroy(hStart) );
    CUDA_CHECK( cuEventDestroy(hStop) );
    CUDA_CHECK( cuModuleUnload(hModule) );
    CUDA_CHECK( cuMemFree(devIn) );
    CUDA_CHECK( cuMemFree(devOut) );
    CUDA_CHECK( cuMemFree(devClocks) );
    CUDA_CHECK( cuCtxDestroy(hContext) );
    hContext = 0;

    // When using just one block, print out the internal timing data
    if (internalTiming)
    {
        int count = 0, total = 0, min = 999999, max = 0;

        int* clocks_p  = clocks;
        int* dataOut_p = dataOut;

        // Loop over and print results
        for (int blk = 0; blk < blocks; blk++)
        {
            float *fDataOut = reinterpret_cast<float*>(dataOut_p);

            for(int tid = 0; tid < threads; tid += 32)
            {
                // Sometimes we want data on each thread, sometimes just one sample per warp is fine
                for (int lane = 0; lane < lanes; lane++)
                    printf("b:%02d w:%03d t:%04d l:%02d clocks:%08d out:%08x\n",
                           blk, tid/32, tid, lane, clocks_p[tid+lane], dataOut_p[tid+lane]); // %04u

                count++;
                total += clocks_p[tid];
                if (clocks_p[tid] < min) min = clocks_p[tid];
                if (clocks_p[tid] > max) max = clocks_p[tid];
            }
            clocks_p  += threads;
            dataOut_p += threads;
        }
        printf("average: %.3f, min %d, max: %d\n", (float)total/count, min, max);
    }
    else
    {
        // For more than one block we're testing throughput and want external timing data
        printf("MilliSecs: %.3f, GFLOPS: %.3f\n", ms, fops / (ms * 1000000.0));
    }

    // And free up host memory
    free(dataIn);
    free(dataOut);
    free(clocks);

    return 0;
}
/**
 * Correctness and performance test for cuCgemm2 (single precision complex
 * GEMM via the CUDA driver API): compares the GPU result against a host
 * reference implementation, then times 20 repeat launches with CUDA events.
 */
int main(int argc, char * argv[]) {
  CBlasTranspose transA, transB;
  size_t m, n, k;
  int d = 0;

  if (argc < 6 || argc > 7) {
    fprintf(stderr, "Usage: %s <transA> <transB> <m> <n> <k> [device]\n"
                    "where:\n"
                    "  transA and transB are 'n' or 'N' for CBlasNoTrans, 't' or 'T' for CBlasTrans or 'c' or 'C' for CBlasConjTrans\n"
                    "  m, n and k are the sizes of the matrices\n"
                    "  device is the GPU to use (default 0)\n", argv[0]);
    return 1;
  }

  char t;
  if (sscanf(argv[1], "%c", &t) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[1]);
    return 1;
  }
  switch (t) {
    case 'N': case 'n': transA = CBlasNoTrans; break;
    case 'T': case 't': transA = CBlasTrans; break;
    case 'C': case 'c': transA = CBlasConjTrans; break;
    default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 1;
  }

  if (sscanf(argv[2], "%c", &t) != 1) {
    fprintf(stderr, "Unable to read character from '%s'\n", argv[2]);
    return 2;
  }
  switch (t) {
    case 'N': case 'n': transB = CBlasNoTrans; break;
    case 'T': case 't': transB = CBlasTrans; break;
    case 'C': case 'c': transB = CBlasConjTrans; break;
    default: fprintf(stderr, "Unknown transpose '%c'\n", t); return 2;
  }

  if (sscanf(argv[3], "%zu", &m) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[3]);
    return 3;
  }
  if (sscanf(argv[4], "%zu", &n) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[4]);
    return 4;
  }
  if (sscanf(argv[5], "%zu", &k) != 1) {
    fprintf(stderr, "Unable to parse number from '%s'\n", argv[5]);
    return 5;
  }
  if (argc > 6) {
    if (sscanf(argv[6], "%d", &d) != 1) {
      fprintf(stderr, "Unable to parse number from '%s'\n", argv[6]);
      return 6;
    }
  }

  srand(0);

  float complex alpha, beta, * A, * B, * C, * refC;
  CUdeviceptr dA, dB, dC, dD;
  size_t lda, ldb, ldc, dlda, dldb, dldc, dldd;

  CU_ERROR_CHECK(cuInit(0));

  CUdevice device;
  CU_ERROR_CHECK(cuDeviceGet(&device, d));

  CUcontext context;
  CU_ERROR_CHECK(cuCtxCreate(&context, CU_CTX_SCHED_BLOCKING_SYNC, device));

  CUBLAShandle handle;
  CU_ERROR_CHECK(cuBLASCreate(&handle));

  alpha = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I;
  beta  = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I;

  if (transA == CBlasNoTrans) {
    lda = (m + 1u) & ~1u;
    if ((A = malloc(lda * k * sizeof(float complex))) == NULL) {
      fputs("Unable to allocate A\n", stderr);
      return -1;
    }
    CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, m * sizeof(float complex), k, sizeof(float complex)));
    dlda /= sizeof(float complex);

    for (size_t j = 0; j < k; j++) {
      for (size_t i = 0; i < m; i++)
        A[j * lda + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I;
    }

    CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(float complex),
                           0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(float complex),
                           m * sizeof(float complex), k };
    CU_ERROR_CHECK(cuMemcpy2D(&copy));
  }
  else {
    lda = (k + 1u) & ~1u;
    if ((A = malloc(lda * m * sizeof(float complex))) == NULL) {
      fputs("Unable to allocate A\n", stderr);
      return -1;
    }
    CU_ERROR_CHECK(cuMemAllocPitch(&dA, &dlda, k * sizeof(float complex), m, sizeof(float complex)));
    dlda /= sizeof(float complex);

    for (size_t j = 0; j < m; j++) {
      for (size_t i = 0; i < k; i++)
        A[j * lda + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I;
    }

    CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, A, 0, NULL, lda * sizeof(float complex),
                           0, 0, CU_MEMORYTYPE_DEVICE, NULL, dA, NULL, dlda * sizeof(float complex),
                           k * sizeof(float complex), m };
    CU_ERROR_CHECK(cuMemcpy2D(&copy));
  }

  if (transB == CBlasNoTrans) {
    ldb = (k + 1u) & ~1u;
    if ((B = malloc(ldb * n * sizeof(float complex))) == NULL) {
      fputs("Unable to allocate B\n", stderr);
      return -2;
    }
    CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, k * sizeof(float complex), n, sizeof(float complex)));
    dldb /= sizeof(float complex);

    for (size_t j = 0; j < n; j++) {
      for (size_t i = 0; i < k; i++)
        B[j * ldb + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I;
    }

    CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(float complex),
                           0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(float complex),
                           k * sizeof(float complex), n };
    CU_ERROR_CHECK(cuMemcpy2D(&copy));
  }
  else {
    ldb = (n + 1u) & ~1u;
    if ((B = malloc(ldb * k * sizeof(float complex))) == NULL) {
      fputs("Unable to allocate B\n", stderr);
      return -2;
    }
    CU_ERROR_CHECK(cuMemAllocPitch(&dB, &dldb, n * sizeof(float complex), k, sizeof(float complex)));
    dldb /= sizeof(float complex);

    for (size_t j = 0; j < k; j++) {
      for (size_t i = 0; i < n; i++)
        B[j * ldb + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I;
    }

    CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, B, 0, NULL, ldb * sizeof(float complex),
                           0, 0, CU_MEMORYTYPE_DEVICE, NULL, dB, NULL, dldb * sizeof(float complex),
                           n * sizeof(float complex), k };
    CU_ERROR_CHECK(cuMemcpy2D(&copy));
  }

  ldc = (m + 1u) & ~1u;
  if ((C = malloc(ldc * n * sizeof(float complex))) == NULL) {
    fputs("Unable to allocate C\n", stderr);
    return -3;
  }
  if ((refC = malloc(ldc * n * sizeof(float complex))) == NULL) {
    fputs("Unable to allocate refC\n", stderr);
    return -4;
  }
  CU_ERROR_CHECK(cuMemAllocPitch(&dC, &dldc, m * sizeof(float complex), n, sizeof(float complex)));
  dldc /= sizeof(float complex);
  CU_ERROR_CHECK(cuMemAllocPitch(&dD, &dldd, m * sizeof(float complex), n, sizeof(float complex)));
  dldd /= sizeof(float complex);

  for (size_t j = 0; j < n; j++) {
    for (size_t i = 0; i < m; i++)
      refC[j * ldc + i] = C[j * ldc + i] = ((float)rand() / (float)RAND_MAX) + ((float)rand() / (float)RAND_MAX) * I;
  }

  CUDA_MEMCPY2D copy = { 0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(float complex),
                         0, 0, CU_MEMORYTYPE_DEVICE, NULL, dC, NULL, dldc * sizeof(float complex),
                         m * sizeof(float complex), n };
  CU_ERROR_CHECK(cuMemcpy2D(&copy));

  cgemm_ref(transA, transB, m, n, k, alpha, A, lda, B, ldb, beta, refC, ldc);
  CU_ERROR_CHECK(cuCgemm2(handle, transA, transB, m, n, k, alpha, dA, dlda, dB, dldb, beta, dC, dldc, dD, dldd, NULL));

  copy = (CUDA_MEMCPY2D){ 0, 0, CU_MEMORYTYPE_DEVICE, NULL, dD, NULL, dldd * sizeof(float complex),
                          0, 0, CU_MEMORYTYPE_HOST, C, 0, NULL, ldc * sizeof(float complex),
                          m * sizeof(float complex), n };
  CU_ERROR_CHECK(cuMemcpy2D(&copy));

  float rdiff = 0.0f, idiff = 0.0f;
  for (size_t j = 0; j < n; j++) {
    for (size_t i = 0; i < m; i++) {
      float d = fabsf(crealf(C[j * ldc + i]) - crealf(refC[j * ldc + i]));
      if (d > rdiff)
        rdiff = d;
      d = fabsf(cimagf(C[j * ldc + i]) - cimagf(refC[j * ldc + i]));
      if (d > idiff)
        idiff = d;
    }
  }

  CUevent start, stop;
  CU_ERROR_CHECK(cuEventCreate(&start, CU_EVENT_BLOCKING_SYNC));
  CU_ERROR_CHECK(cuEventCreate(&stop, CU_EVENT_BLOCKING_SYNC));

  CU_ERROR_CHECK(cuEventRecord(start, NULL));
  for (size_t i = 0; i < 20; i++)
    CU_ERROR_CHECK(cuCgemm2(handle, transA, transB, m, n, k, alpha, dA, dlda, dB, dldb, beta, dC, dldc, dD, dldd, NULL));
  CU_ERROR_CHECK(cuEventRecord(stop, NULL));
  CU_ERROR_CHECK(cuEventSynchronize(stop));

  float time;
  CU_ERROR_CHECK(cuEventElapsedTime(&time, start, stop));
  time /= 20;

  CU_ERROR_CHECK(cuEventDestroy(start));
  CU_ERROR_CHECK(cuEventDestroy(stop));

  size_t flops = k * 6 + (k - 1) * 2;               // k complex multiplies (6 flops) and k - 1 complex adds (2 flops) per element
  if (alpha != 1.0f + 0.0f * I)
    flops += 6;                                     // additional multiply by alpha
  if (beta != 0.0f + 0.0f * I)
    flops += 8;                                     // additional multiply and add by beta
  float error = (float)flops * 2.0f * FLT_EPSILON;  // maximum per element error
  flops *= m * n;                                   // m * n elements

  bool passed = (rdiff <= error) && (idiff <= error);
  fprintf(stdout, "%.3es %.3gGFlops/s Error: %.3e + %.3ei\n%sED!\n",
          time * 1.e-3f, ((float)flops * 1.e-6f) / time, rdiff, idiff, (passed) ? "PASS" : "FAIL");

  free(A);
  free(B);
  free(C);
  free(refC);
  CU_ERROR_CHECK(cuMemFree(dA));
  CU_ERROR_CHECK(cuMemFree(dB));
  CU_ERROR_CHECK(cuMemFree(dC));
  CU_ERROR_CHECK(cuMemFree(dD));

  CU_ERROR_CHECK(cuBLASDestroy(handle));
  CU_ERROR_CHECK(cuCtxDestroy(context));

  return (int)!passed;
}
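cgemm_ref is the project's host-side oracle and its implementation is not shown here. The following is a minimal sketch of what it presumably computes, assuming conventional column-major BLAS semantics, C := alpha*op(A)*op(B) + beta*C, where op(X) is X, X^T or X^H according to the transpose flag; the signature is taken from the call site above, everything else is illustrative.

#include <complex.h>
#include <stddef.h>

/* Sketch of a reference CGEMM. CBlasTranspose and its CBlasNoTrans /
 * CBlasTrans / CBlasConjTrans values are assumed to come from the
 * project's own headers. All matrices are column-major with leading
 * dimensions lda, ldb and ldc, matching the test's indexing. */
static void cgemm_ref(CBlasTranspose transA, CBlasTranspose transB,
                      size_t m, size_t n, size_t k, float complex alpha,
                      const float complex * A, size_t lda,
                      const float complex * B, size_t ldb,
                      float complex beta, float complex * C, size_t ldc) {
  for (size_t j = 0; j < n; j++) {
    for (size_t i = 0; i < m; i++) {
      float complex temp = 0.0f;
      for (size_t l = 0; l < k; l++) {
        /* op(A)(i,l): read A directly, transposed, or conjugate-transposed */
        float complex a = (transA == CBlasNoTrans) ? A[l * lda + i] :
                          (transA == CBlasTrans)   ? A[i * lda + l] :
                                                     conjf(A[i * lda + l]);
        /* op(B)(l,j): same three cases for B */
        float complex b = (transB == CBlasNoTrans) ? B[j * ldb + l] :
                          (transB == CBlasTrans)   ? B[l * ldb + j] :
                                                     conjf(B[l * ldb + j]);
        temp += a * b;
      }
      C[j * ldc + i] = alpha * temp + beta * C[j * ldc + i];
    }
  }
}

This accumulation pattern is also what grounds the error bound in the test above: each output element is the sum of k complex multiplies (6 flops each) and k - 1 complex adds (2 flops each), plus the alpha and beta terms, so the tolerated per-element rounding error scales with that flop count times FLT_EPSILON.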