// Constructs the timer by creating its two CUDA events, `inicio` and `fin`.
// The result of each cudaEventCreate call is stored in a local but is not
// inspected, so a creation failure is not reported to the caller.
cudaTimer::cudaTimer()
{
    cudaError_t status = cudaEventCreate(&inicio);
    status = cudaEventCreate(&fin);
}
double time_invocation_cuda(std::size_t num_trials, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3) { cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start); for(std::size_t i = 0; i < num_trials; ++i) { f(arg1,arg2,arg3); } cudaEventRecord(stop); cudaThreadSynchronize(); float msecs = 0; cudaEventElapsedTime(&msecs, start, stop); cudaEventDestroy(start); cudaEventDestroy(stop); // return mean msecs return msecs / num_trials; }
float bench::ClockBenchmark::_determineCycleTime() { cudaEvent_t start, end; check( cudaEventCreate(&start) ); check( cudaEventCreate(&end) ); unsigned long long elapsedCycles; unsigned long long* deviceElapsedCycles; long long int* deviceDummyMem; const dim3 grid(1,1,1), block(1,1,1); check( cudaMalloc((void**)&deviceElapsedCycles, sizeof(unsigned long long)) ); check( cudaMalloc((void**)&deviceDummyMem, sizeof(long long int)) ); check( cudaEventRecord(start) ); cudaDetermineCycleTimeWrapper(deviceElapsedCycles, deviceDummyMem, grid, block); check( cudaEventRecord(end) ); check( cudaDeviceSynchronize() ); check( cudaMemcpy(&elapsedCycles, deviceElapsedCycles, sizeof(unsigned long long), cudaMemcpyDeviceToHost) ); float elapsedTime = 0; check( cudaEventElapsedTime(&elapsedTime, start, end) ); report(util::Indents(2) << "elapsed time: " << elapsedTime << "ms"); report(util::Indents(2) << "elapsed cycles: " << elapsedCycles); return elapsedTime * 1000000.0 / (float)elapsedCycles; }
// Default constructor: zeroes `count` and `tf`, creates the `evstart` and
// `evforce` CUDA events (each creation wrapped in CUDA_CHECK), and finishes
// by calling _flush(true).
ProfilerDPD::ProfilerDPD()
    : count(0)
    , tf(0)
{
    CUDA_CHECK(cudaEventCreate(&evstart));
    CUDA_CHECK(cudaEventCreate(&evforce));

    _flush(true);
}
int main() { cudaEvent_t start; cudaEvent_t end; float duration; const float overestimateRate = 0.01f; const float errorRate = 0.01f; Tokenizer tokenizer( overestimateRate, errorRate ); /************** Test counting string tokens *************/ TextReader reader; cudaEventCreate( &start ); cudaEventRecord( start, 0 ); reader.Read(); tokenizer.StartTokenizing( reader.GetCharBuffer(), reader.GetOffsetBuffer(), reader.GetCharBufferSize(), reader.GetOffsetBufferSize() ); cudaEventCreate( &end ); cudaEventRecord( end, 0 ); cudaEventSynchronize( end ); cudaEventElapsedTime( &duration, start, end ); printf( "Time taken: %.3lf milliseconds\n", duration ); tokenizer.GetFrequency( "a" ); }
// Trains a copy of `ann` on `train` for `epochCount` epochs with the given
// training algorithm via gpuann_fann_parallel_train_on_data, and prints the
// time between two CUDA events recorded around that call (in milliseconds,
// formatted "%10.5f "). The copy is destroyed before returning; `ann` itself
// is only passed to fann_copy.
void trainMethodsSpeedTestGPU(fann *ann, fann_train_data* train, unsigned int trainingAlgorithm, unsigned int epochCount)
{
    fann *gpunn = fann_copy(ann);
    gpunn->training_algorithm = (fann_train_enum)trainingAlgorithm;

    cudaEvent_t begin;
    cudaEvent_t finish;
    float elapsedMs;

    cudaEventCreate(&begin);
    cudaEventCreate(&finish);

    // Bracket the training call with events on stream 0.
    cudaEventRecord(begin, 0);
    gpuann_fann_parallel_train_on_data(gpunn, train, epochCount);
    cudaEventRecord(finish, 0);

    // Wait for the closing event before reading the interval.
    cudaEventSynchronize(finish);
    cudaEventElapsedTime(&elapsedMs, begin, finish);

    cudaEventDestroy(begin);
    cudaEventDestroy(finish);

    printf("%10.5f ", elapsedMs);

    fann_destroy(gpunn);
}
// Creates the `timerStart` and `timerStop` events and records `timerStart`
// on stream 0. Always returns 0.
//
// NOTE(review): every call creates a fresh pair of events; whether the
// previous pair is destroyed elsewhere is not visible here.
unsigned int StartTimer()
{
    cudaEventCreate(&timerStart);
    cudaEventCreate(&timerStop);

    cudaEventRecord(timerStart, 0);

    return 0;
}
void runCuda() { ////////////////////// // Timing cuda call // ////////////////////// float time; cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0); // Map OpenGL buffer object for writing from CUDA on a single GPU // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer dptr=NULL; vbo = mesh->getVBO(); vbosize = mesh->getVBOsize(); nbo = mesh->getNBO(); nbosize = mesh->getNBOsize(); #if RGBONLY == 1 float newcbo[] = {0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0}; cbo = newcbo; cbosize = 9; #elif RGBONLY == 0 vec3 defaultColor(0.5f, 0.5f, 0.5f); mesh->changeColor(defaultColor); cbo = mesh->getCBO(); cbosize = mesh->getCBOsize(); #endif ibo = mesh->getIBO(); ibosize = mesh->getIBOsize(); cudaGLMapBufferObject((void**)&dptr, pbo); updateCamera(); cudaRasterizeCore(cam, dptr, glm::vec2(width, height), frame, vbo, vbosize, cbo, cbosize, ibo, ibosize, nbo, nbosize, lights, lightsize, alpha, beta, displayMode); cudaGLUnmapBufferObject(pbo); vbo = NULL; cbo = NULL; ibo = NULL; frame++; fpstracker++; ////////////////////// // Timing cuda call // ////////////////////// cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); printf("runCuda runtime: %3.1f ms \n", time); }
// Builds a timer bound to `stream`.
// Both event handles start out null and are then created, each creation
// wrapped in CUDA_CHECK. Nothing is recorded at construction time.
CudaTimer::CudaTimer(cudaStream_t stream)
    : start(nullptr), end(nullptr), stream(stream)
{
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&end));
}
// Default constructor: marks the timer as not running, zeroes the start,
// stop and elapsed fields, creates the three events (custart_, custop_,
// cubase_), and records cubase_ immediately.
CUDATimer()
{
    is_running_ = false;
    elapsed_ = 0.0;
    stop_ = 0.0;
    start_ = 0.0;

    cudaEventCreate(&custart_);
    cudaEventCreate(&custop_);
    cudaEventCreate(&cubase_);

    // cubase_ is recorded right away, on the default stream.
    cudaEventRecord(cubase_);
} // CUDATimer()
void contractTT(sTensorGPU *TT1, sTensorGPU *TT2, const int n, const int size) { cublasHandle_t handle; cublasCreate(&handle); type result=0; sTensorGPU temp1 = emptyTensor(size*size,2); sTensorGPU temp2 = emptyTensor(size*size*2,3); cudaEvent_t start; cudaEventCreate(&start); cudaEvent_t stop; cudaEventCreate(&stop); //printf("Start contractTT\n"); cudaEventRecord(start, NULL); int indA = TT1[0].size[0]; int indB = TT2[0].size[0]; sTensorCPU tt1start = copyToCPU(TT1[0]); sTensorCPU tt2start = copyToCPU(TT2[0]); sTensorCPU tt1end = copyToCPU(TT1[n - 1]); sTensorCPU tt2end = copyToCPU( TT2[n - 1]); for (int i = 0; i < indA; i++){ TT1[0] = prepareTensorStart(tt1start, i); TT1[n - 1] = prepareTensorEnd(tt1end, i); for (int j = 0; j < indB; j++){ TT2[0] = prepareTensorStart(tt2start, j); TT2[n - 1] = prepareTensorEnd(tt2end, j); contractTensor(handle, TT1[0], TT2[0], temp1); for (int i = 1; i < n; i++){ contractTensor(handle, temp1, TT1[i], temp2); contractTensor(handle, temp2, TT2[i], temp1, 2); } type add = 0; cudaMemcpy(&add, temp1.deviceData, sizeof(type), cudaMemcpyDeviceToHost); //printf("%e ", add); result += add; } } cudaEventRecord(stop, NULL); cudaEventSynchronize(stop); float msecTotal = 0.0f; cudaEventElapsedTime(&msecTotal, start, stop); printf("Time: %.3fms\n", msecTotal); printf("Ops: %.0f\n", bops); double gigaFlops = (bops * 1.0e-9f) / (msecTotal / 1000.0f); printf("Perf= %.2f GFlop/s\n", gigaFlops); cublasDestroy(handle); cudaDeviceReset(); printf("%.5e \n", result); exit(0); }
// One-time initialization of the timer.
// Does nothing if initted() already reports true. Otherwise, when Caffe is in
// GPU mode, creates the start/stop CUDA events; in either mode the timer is
// then flagged as initialized.
void Timer::Init() {
  if (initted()) {
    return;
  }

  if (Caffe::mode() == Caffe::GPU) {
    CUDA_CHECK(cudaEventCreate(&start_gpu_));
    CUDA_CHECK(cudaEventCreate(&stop_gpu_));
  }

  initted_ = true;
}
// startTest ------------------------------------------------------------------ // Initializes the cuda timer events and starts the timer. // @param start - Start time evet // @param end - End time evet //----------------------------------------------------------------------------- void startTest(cudaEvent_t &start, cudaEvent_t &stop, char* msg){ // Create Events cudaEventCreate( &start ); cudaEventCreate( &stop ); // Start Timer printf("%s\n", msg); cudaEventRecord( start, 0 ); }
void sobel1(int *h_result, unsigned int *h_pic, int xsize, int ysize, int thresh) { int *d_result; unsigned int *d_pic; int resultSize = xsize * ysize * 3 * sizeof(int); int picSize = xsize * ysize * sizeof(int); cudaMalloc( (void**)&d_result, resultSize); if( !d_result) { exit(-1); } cudaMalloc( (void**)&d_pic, picSize); if( !d_pic) { exit(-1); } cudaMemcpy(d_result, h_result, resultSize, cudaMemcpyHostToDevice); cudaMemcpy(d_pic, h_pic, picSize, cudaMemcpyHostToDevice); dim3 threadsPerBlock(BLOCKSIZE, BLOCKSIZE); dim3 numBlocks(ceil((float)ysize/(float)threadsPerBlock.x), ceil((float)xsize/(float)threadsPerBlock.y)); cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); { __set_CUDAConfig(numBlocks, threadsPerBlock ); d_sobel1 (d_result, d_pic, xsize, ysize, thresh);} cudaEventSynchronize(stop); float elapsedTime; cudaEventElapsedTime(&elapsedTime, start, stop); cudaEventDestroy(start); cudaEventDestroy(stop); cudaMemcpy(h_result, d_result, resultSize, cudaMemcpyDeviceToHost); cudaMemcpy(h_pic, d_pic, picSize, cudaMemcpyDeviceToHost); cudaFree(d_result); cudaFree(d_pic); }
// Starts a GPU timing interval on `mtime`.
//
// Tags the timer as GPU_TIME, lazily creates whichever of the two events is
// still unset, and records the start event on the default stream.
//
// @param mtime  timer to start; its gpustart/gpustop handles are created on
//               first use and reused on later calls
void startTimer_GPU( MTime* mtime )
{
    mtime->type = GPU_TIME;

    // Create each event independently. The previous combined test recreated
    // both events whenever either handle was unset, which overwrote (and
    // leaked) an event that already existed.
    if (mtime->gpustart == 0)
    {
        CHECK_ERROR(cudaEventCreate(&mtime->gpustart));
    }
    if (mtime->gpustop == 0)
    {
        CHECK_ERROR(cudaEventCreate(&mtime->gpustop));
    }

    CHECK_ERROR( cudaEventRecord(mtime->gpustart) );
}
// Constructs a time measurement bound to stream `csStreamID` and starts it:
// both events are created and the start event is recorded on that stream
// straight away. Every CUDA call is wrapped in cudaCheckError.
CCudaTimeMeasure::CCudaTimeMeasure(cudaStream_t csStreamID/* = 0*/)
    : m_ceStartEvent(NULL)
    , m_ceStopEvent(NULL)
    , m_csStreamID(csStreamID)
{
    cudaCheckError(cudaEventCreate(&m_ceStartEvent));
    cudaCheckError(cudaEventCreate(&m_ceStopEvent));

    // Measurement begins at construction.
    cudaCheckError(cudaEventRecord(m_ceStartEvent, m_csStreamID));
}
void Mark(){ // insert a "start" event into the command stream cudaEventCreate(&start); // instert a "stop" event into the command stream cudaEventCreate(&stop); // record the event cudaEventRecord( start, 0 ); status = 1; }
// execute kernel double dslashCUDA() { printfQuda("Executing %d kernel loops...\n", loops); fflush(stdout); if (test_type < 2) dirac->Tune(*cudaSpinorOut, *cudaSpinor, *tmp); else dirac->Tune(cudaSpinorOut->Even(), cudaSpinor->Even(), *tmp); cudaEvent_t start, end; cudaEventCreate(&start); cudaEventRecord(start, 0); cudaEventSynchronize(start); for (int i = 0; i < loops; i++) { switch (test_type) { case 0: if (transfer) { dslashQuda(spinorOut->V(), spinor->V(), &inv_param, parity); } else { dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); } break; case 1: case 2: if (transfer) { MatQuda(spinorOut->V(), spinor->V(), &inv_param); } else { dirac->M(*cudaSpinorOut, *cudaSpinor); } break; } } cudaEventCreate(&end); cudaEventRecord(end, 0); cudaEventSynchronize(end); float runTime; cudaEventElapsedTime(&runTime, start, end); cudaEventDestroy(start); cudaEventDestroy(end); double secs = runTime / 1000; //stopwatchReadSeconds(); // check for errors cudaError_t stat = cudaGetLastError(); if (stat != cudaSuccess) printf("with ERROR: %s\n", cudaGetErrorString(stat)); printf("done.\n\n"); return secs; }
//----------------------------------------------------------------------------// CUDAImpl::CUDAImpl() : ImageProcessor(CUDA), _cudaBuild(false) { if (cudaSetDevice(_cudaGetMaxGflopsDeviceId()) != cudaSuccess) { throw std::logic_error("gpuip::CUDAImpl() could not set device id"); }; cudaFree(0); //use runtime api to create a CUDA context implicitly cudaEventCreate(&_start); cudaEventCreate(&_stop); }
// execute kernel double dslashCUDA(int niter) { cudaEvent_t start, end; cudaEventCreate(&start); cudaEventCreate(&end); cudaEventRecord(start, 0); for (int i = 0; i < niter; i++) { switch (test_type) { case 0: if (transfer) { dslashQuda(spinorOut->V(), spinor->V(), &inv_param, parity); } else { //inv_param.input_location = QUDA_CUDA_FIELD_LOCATION; //inv_param.output_location = QUDA_CUDA_FIELD_LOCATION; //dslashQuda(cudaSpinorOut->V(), cudaSpinor->V(), &inv_param, parity); dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); } break; case 1: case 2: if (transfer) { MatQuda(spinorOut->V(), spinor->V(), &inv_param); } else { dirac->M(*cudaSpinorOut, *cudaSpinor); } break; case 3: case 4: if (transfer) { MatDagMatQuda(spinorOut->V(), spinor->V(), &inv_param); } else { dirac->MdagM(*cudaSpinorOut, *cudaSpinor); } break; } } cudaEventRecord(end, 0); cudaEventSynchronize(end); float runTime; cudaEventElapsedTime(&runTime, start, end); cudaEventDestroy(start); cudaEventDestroy(end); double secs = runTime / 1000; //stopwatchReadSeconds(); // check for errors cudaError_t stat = cudaGetLastError(); if (stat != cudaSuccess) printfQuda("with ERROR: %s\n", cudaGetErrorString(stat)); return secs; }
void Timer::Init() { if (!initted()) { if (Caffe::mode() == Caffe::GPU) { #ifndef CPU_ONLY CUDA_CHECK(cudaEventCreate(&start_gpu_)); CUDA_CHECK(cudaEventCreate(&stop_gpu_)); #else NO_GPU; #endif } initted_ = true; } }
int main(int argc, char **argv) { // device memory real *psi_d, *z_d; size_t fSize = sizeof(real); /* grid dimensions */ unsigned int Nx = 513, Ny = 513; // omitting boundaries unsigned int nGridPoints = (Nx-2)*(Ny-2); cudaMalloc((void **) &psi_d, (nGridPoints+1)*fSize); cudaMalloc((void **) &z_d, (nGridPoints+1)*fSize); /* initialization */ fillArray(psi_d, 0.0, nGridPoints+1); fillArray(z_d, 1.0, nGridPoints+1); checkCudaError("Initialization of grid"); // for timing purposes cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); // start timer cudaEventRecord(start,0); /* Call the poisson solver, right hand side * is stored on the device in z_d (make sure the data * is copied from CPU to GPU!), result is stored in * psi_d (on the GPU/device). * Here NX-2 is the width of the grid's interior * (without the boundaries). */ cuPoisson((Nx-2), psi_d, z_d); // stop timer cudaEventRecord(stop,0); cudaEventSynchronize(stop); float computationTime; cudaEventElapsedTime(&computationTime, start, stop); printf("Computation time was %.5f seconds.\n\n", computationTime/1000.0); printf("Writing result to disk...\n"); // write result to file writeBinaryFile(Nx, Ny, psi_d, "data.dat"); printf("done\n"); return EXIT_SUCCESS; }
double dslashCUDA(int niter) { cudaEvent_t start, end; cudaEventCreate(&start); cudaEventRecord(start, 0); cudaEventSynchronize(start); for (int i = 0; i < niter; i++) { switch (test_type) { case 0: parity = QUDA_EVEN_PARITY; if (transfer){ //dslashQuda(spinorOdd, spinorEven, &inv_param, parity); } else { dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); } break; case 1: parity = QUDA_ODD_PARITY; if (transfer){ //MatPCQuda(spinorOdd, spinorEven, &inv_param); } else { dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); } break; case 2: if (transfer){ //MatQuda(spinorGPU, spinor, &inv_param); } else { dirac->M(*cudaSpinorOut, *cudaSpinor); } } } cudaEventCreate(&end); cudaEventRecord(end, 0); cudaEventSynchronize(end); float runTime; cudaEventElapsedTime(&runTime, start, end); cudaEventDestroy(start); cudaEventDestroy(end); double secs = runTime / 1000; //stopwatchReadSeconds(); // check for errors cudaError_t stat = cudaGetLastError(); if (stat != cudaSuccess) errorQuda("with ERROR: %s\n", cudaGetErrorString(stat)); return secs; }
void Timer::Init() { if (!initted()) { if (Caffe::mode() == Caffe::GPU && Caffe::GetDefaultDevice()->backend() == BACKEND_CUDA) { #ifndef CPU_ONLY #ifdef USE_CUDA CUDA_CHECK(cudaEventCreate(&start_gpu_)); CUDA_CHECK(cudaEventCreate(&stop_gpu_)); #endif // USE_CUDA #else NO_GPU; #endif } initted_ = true; } }
/* Allocates the checkpoint table with `n` entries and creates one CUDA event
 * per entry, leaving every entry's tag NULL.
 *
 * If the table has already been initialized (max_cuda_checkpoints > 0), a
 * warning is issued and nothing is changed. An allocation failure or an event
 * creation failure is reported through error1().
 */
static void _init_cuda_checkpoints(QSP_ARG_DECL int n)
{
	cudaError_t status;
	int idx;

	/* Refuse to initialize twice. */
	if( max_cuda_checkpoints > 0 ){
		sprintf(ERROR_STRING,
"init_cuda_checkpoints(%d): already initialized with %d checpoints",
			n,max_cuda_checkpoints);
		warn(ERROR_STRING);
		return;
	}

	ckpt_tbl = (Cuda_Checkpoint *) getbuf( n * sizeof(*ckpt_tbl) );
	if( ckpt_tbl == NULL )
		error1("failed to allocate checkpoint table");

	max_cuda_checkpoints = n;

	for( idx = 0; idx < max_cuda_checkpoints; idx++ ){
		status = cudaEventCreate(&ckpt_tbl[idx].ckpt_event);
		if( status != cudaSuccess ){
			describe_cuda_driver_error2("init_cuda_checkpoints",
				"cudaEventCreate",status);
			error1("failed to initialize checkpoint table");
		}
		ckpt_tbl[idx].ckpt_tag=NULL;
	}
}
// Checks what cudaEventRecord does when given an event handle that has
// already been destroyed.
//
// With CUDART_VERSION >= 5000 the call (made without a stream argument) is
// expected to return cudaErrorUnknown. With older runtimes the call (made on
// the freshly created stream) is expected to kill the process with SIGSEGV,
// which is verified with a gtest death test.
TEST(EventRecord, RecordAfterDestroy) {
    // Death tests use the "threadsafe" style for this test.
    ::testing::FLAGS_gtest_death_test_style = "threadsafe";

    cudaError_t ret;
    cudaEvent_t event;
    cudaStream_t stream;

    // Create the event and destroy it straight away; `event` now holds a
    // stale handle.
    ret = cudaEventCreate(&event);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaEventDestroy(event);
    EXPECT_EQ(cudaSuccess, ret);

    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    #if CUDART_VERSION >= 5000
    // Newer runtimes: recording the stale handle must fail with an error code.
    ret = cudaEventRecord(event);
    EXPECT_EQ(cudaErrorUnknown, ret);
    #else
    // Older runtimes: recording the stale handle is expected to crash.
    EXPECT_EXIT(
        cudaEventRecord(event, stream),
        ::testing::KilledBySignal(SIGSEGV), "");
    #endif

    ret = cudaStreamDestroy(stream);
    EXPECT_EQ(cudaSuccess, ret);
}
void _init(int numBodies, int numDevices, int p, int q, bool bUsePBO, bool useHostMem, bool useCpu) { if (useCpu) { m_nbodyCpu = new BodySystemCPU<T>(numBodies); m_nbody = m_nbodyCpu; m_nbodyCuda = 0; } else { m_nbodyCuda = new BodySystemCUDA<T>(numBodies, numDevices, p, q, bUsePBO, useHostMem); m_nbody = m_nbodyCuda; m_nbodyCpu = 0; } // allocate host memory m_hPos = new T[numBodies*4]; m_hVel = new T[numBodies*4]; m_hColor = new float[numBodies*4]; m_nbody->setSoftening(activeParams.m_softening); m_nbody->setDamping(activeParams.m_damping); if (useCpu) { sdkCreateTimer(&timer); sdkStartTimer(&timer); } else { checkCudaErrors(cudaEventCreate(&startEvent)); checkCudaErrors(cudaEventCreate(&stopEvent)); checkCudaErrors(cudaEventCreate(&hostMemSyncEvent)); } if (!benchmark && !compareToCPU) { m_renderer = new ParticleRenderer; _resetRenderer(); } sdkCreateTimer(&demoTimer); sdkStartTimer(&demoTimer); }
// ---------------------------------------- void trace_gpu_end( int dev, int s ) { int t = dev*glog.nstream + s; int id = glog.gpu_id[t]; cudaEventCreate( &glog.gpu_end[t][id] ); cudaEventRecord( glog.gpu_end[t][id], glog.streams[t] ); if ( id+1 < MAX_EVENTS ) { glog.gpu_id[t] = id+1; } }
static void do_main() { Matrix A; double *x, *b; printf("#############################################################################################\n"); printf("** B I C G S T A B S O L V E R **\n"); printf("#############################################################################################\n\n"); cudaDeviceProp props; CUDA_SAFE_CALL( cudaGetDeviceProperties(&props, 0) ); printf("** DEVICE : %10s (ECC: %3s) **\n", props.name, props.ECCEnabled ? "ON" : "OFF"); printf("\n#############################################################################################\n\n"); Context ctx; ctx.read_from_file("config.txt"); read_system_from_file(&ctx, "res/matrix.inp", &A, &x, &b); cudaEvent_t start, stop; CUDA_SAFE_CALL( cudaEventCreate(&start) ); CUDA_SAFE_CALL( cudaEventCreate(&stop) ); CUDA_SAFE_CALL( cudaEventRecord(start) ); //Jacobi solver(0.6); Bicgstab solver; solver.setup(&ctx, &A); solver.solve(&ctx, &A, x, b); float elapsed_time = 0.0f; CUDA_SAFE_CALL( cudaEventRecord(stop) ); CUDA_SAFE_CALL( cudaEventSynchronize(stop) ); CUDA_SAFE_CALL( cudaEventElapsedTime(&elapsed_time, start, stop) ); printf("** ELAPSED TIME: %9.3fms **\n", elapsed_time); printf("\n#############################################################################################\n\n"); CUDA_SAFE_CALL( cudaEventDestroy(stop) ); CUDA_SAFE_CALL( cudaEventDestroy(start) ); CUDA_SAFE_CALL( cudaFree(x) ); CUDA_SAFE_CALL( cudaFree(b) ); }
/* Lua constructor for "cutorch.Event".
 * Allocates storage for a cudaEvent_t, creates the event, records it on the
 * current stream of the cutorch state, and pushes it onto the Lua stack as
 * userdata. Returns 1 (one value pushed).
 */
static int cutorch_Event_new(lua_State *L)
{
  THCState *state = cutorch_getstate(L);
  cudaEvent_t *ev = luaT_alloc(L, sizeof(cudaEvent_t));

  THCudaCheck(cudaEventCreate(ev));
  THCudaCheck(cudaEventRecord(*ev, THCState_getCurrentStream(state)));

  luaT_pushudata(L, ev, "cutorch.Event");
  return 1;
}