// Runs the n-body benchmark for `iterations` update steps, timed with CUDA
// events, and prints total time, interaction rate and GFLOP/s.
// Uses file-scope state: nbody, activeParams, startEvent, stopEvent, numBodies.
void runBenchmark(int iterations) {
    // once without timing to prime the GPU
    nbody->update(activeParams.m_timestep);

    cutilSafeCall(cudaEventRecord(startEvent, 0));
    for (int i = 0; i < iterations; ++i) {
        nbody->update(activeParams.m_timestep);
    }
    cutilSafeCall(cudaEventRecord(stopEvent, 0));
    // Check this call like every other CUDA call here; the original left
    // cudaEventSynchronize unwrapped, silently dropping any error.
    cutilSafeCall(cudaEventSynchronize(stopEvent));

    float milliseconds = 0;
    cutilSafeCall(cudaEventElapsedTime(&milliseconds, startEvent, stopEvent));

    double interactionsPerSecond = 0;
    double gflops = 0;
    computePerfStats(interactionsPerSecond, gflops, milliseconds, iterations);

    printf("%d bodies, total time for %d iterations: %0.3f ms\n",
           numBodies, iterations, milliseconds);
    printf("= %0.3f billion interactions per second\n", interactionsPerSecond);
    printf("= %0.3f GFLOP/s at %d flops per interaction\n", gflops, 20);
}
// Stops the GPU timer by recording the stop event on the timer's stream,
// waits for it to complete, and returns the start->stop interval in ms.
float TimerGPU::read() {
    cudaEventRecord(stop_, stream_);
    cudaEventSynchronize(stop_);
    float elapsedMs = 0.0f;
    cudaEventElapsedTime(&elapsedMs, start_, stop_);
    return elapsedMs;
}
//-----------------------------------------------------------------------------
// Records the stop event on the default stream, waits for it, stores the
// elapsed milliseconds in mTime, and transitions the timer to CT_STOPPED.
void CUDA::Timer::Stop ()
{
    cudaEventRecord(mStop, 0);
    cudaEventSynchronize(mStop);
    cudaEventElapsedTime(&mTime, mStart, mStop);
    mState = CT_STOPPED;
}
/* Accumulates elapsed GPU time for every completed async interval in the
 * timer set. Walks the async-marker list up to (but excluding) the last
 * marker, measuring the time between each consecutive pair of CUDA events
 * and crediting the owning timer (and matching labeled sub-timer, if any).
 * Consumed markers are invalidated. Returns the total async time.
 * Assumes that all recorded events have completed. */
static pb_Timestamp record_async_times(struct pb_TimerSet* tset)
{
  struct pb_async_time_marker_list * next_interval = NULL;
  struct pb_async_time_marker_list * last_marker = get_last_async(tset);
  pb_Timestamp total_async_time = 0;
  enum pb_TimerID timer;  /* NOTE(review): unused local */

  for(next_interval = tset->async_markers; next_interval != last_marker;
      next_interval = next_interval->next) {
    float interval_time_ms;
    /* Elapsed time between this marker's event and the following one's. */
    cudaEventElapsedTime(&interval_time_ms,
                         *((cudaEvent_t *)next_interval->marker),
                         *((cudaEvent_t *)next_interval->next->marker));
    /* Convert milliseconds to the pb_Timestamp unit (microseconds). */
    pb_Timestamp interval = (pb_Timestamp) (interval_time_ms * 1e3);
    tset->timers[next_interval->timerID].elapsed += interval;
    if (next_interval->label != NULL) {
      /* Also credit the sub-timer whose label matches this marker. */
      struct pb_SubTimer *subtimer =
        tset->sub_timer_list[next_interval->timerID]->subtimer_list;
      while (subtimer != NULL) {
        if ( strcmp(subtimer->label, next_interval->label) == 0) {
          subtimer->timer.elapsed += interval;
          break;
        }
        subtimer = subtimer->next;
      }
    }
    total_async_time += interval;
    /* Mark the interval as consumed. */
    next_interval->timerID = INVALID_TIMERID;
  }
  /* Invalidate the final (unpaired) marker as well. */
  if(next_interval != NULL)
    next_interval->timerID = INVALID_TIMERID;

  return total_async_time;
}
// Returns the elapsed time in microseconds between Start() and Stop().
// Stops the timer first if it is still running; warns and returns 0 if the
// timer has never been started. GPU path requires CUDA; the event-based
// reading presumably assumes Stop() already synchronized — verify.
float Timer::MicroSeconds() {
  if (!has_run_at_least_once()) {
    LOG(WARNING)<< "Timer has never been run before reading time.";
    return 0;
  }
  if (running()) {
    Stop();
  }
  if (Caffe::mode() == Caffe::GPU
      && Caffe::GetDefaultDevice()->backend() == BACKEND_CUDA) {
#ifndef CPU_ONLY
#ifdef USE_CUDA
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_,
                                    start_gpu_, stop_gpu_));
    // Cuda only measure milliseconds
    elapsed_microseconds_ = elapsed_milliseconds_ * 1000;
#endif  // USE_CUDA
#else
    NO_GPU;
#endif
  } else {
    // CPU path: boost/posix-style time points subtracted directly.
    elapsed_microseconds_ = (stop_cpu_ - start_cpu_).total_microseconds();
  }
  return elapsed_microseconds_;
}
// Stops the global timer pair and returns the elapsed time between
// timerStart and timerStop, truncated to whole milliseconds.
unsigned int GetTimeMillis () {
    cudaEventRecord(timerStop, 0);
    cudaEventSynchronize(timerStop);
    float ms = 0.0f;
    cudaEventElapsedTime(&ms, timerStart, timerStop);
    return (unsigned int)ms;
}
// Times num_trials invocations of f(arg1, arg2, arg3) with CUDA events and
// returns the mean time per trial in milliseconds.
double time_invocation_cuda(std::size_t num_trials, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3)
{
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start);
  for(std::size_t i = 0; i < num_trials; ++i)
  {
    f(arg1, arg2, arg3);
  }
  cudaEventRecord(stop);

  // Wait on the stop event itself instead of the deprecated
  // cudaThreadSynchronize(); this blocks until all work preceding the
  // event (i.e. every timed launch) has completed.
  cudaEventSynchronize(stop);

  float msecs = 0;
  cudaEventElapsedTime(&msecs, start, stop);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);

  // return mean msecs
  return msecs / num_trials;
}
// Reads text input, tokenizes it on the GPU, prints the event-timed
// duration of the read+tokenize phase, then queries one token frequency.
int main() {
    cudaEvent_t start;
    cudaEvent_t end;
    float duration;

    const float overestimateRate = 0.01f;
    const float errorRate = 0.01f;
    Tokenizer tokenizer( overestimateRate, errorRate );

    /************** Test counting string tokens *************/
    TextReader reader;

    // Create both events up front (the original created `end` only after
    // the timed work, obscuring the timing boundary) and record the start
    // before the work begins.
    cudaEventCreate( &start );
    cudaEventCreate( &end );
    cudaEventRecord( start, 0 );

    reader.Read();
    tokenizer.StartTokenizing(
        reader.GetCharBuffer(),
        reader.GetOffsetBuffer(),
        reader.GetCharBufferSize(),
        reader.GetOffsetBufferSize() );

    cudaEventRecord( end, 0 );
    cudaEventSynchronize( end );
    cudaEventElapsedTime( &duration, start, end );
    printf( "Time taken: %.3lf milliseconds\n", duration );

    // Release the timing events; the original leaked both.
    cudaEventDestroy( start );
    cudaEventDestroy( end );

    tokenizer.GetFrequency( "a" );
}
// Blocks until the stop event has completed on the device, then returns
// the milliseconds elapsed between the start and stop events.
float Elapsed() {
    float ms = 0.0f;
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&ms, start, stop);
    return ms;
}
// Runs one rasterization frame: maps the GL pixel buffer into CUDA,
// gathers mesh buffers, invokes cudaRasterizeCore, unmaps, and prints the
// event-timed duration of the whole call.
void runCuda() {
    //////////////////////
    // Timing cuda call //
    //////////////////////
    float time;
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    // Map OpenGL buffer object for writing from CUDA on a single GPU
    // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer
    dptr = NULL;
    vbo = mesh->getVBO();
    vbosize = mesh->getVBOsize();
    nbo = mesh->getNBO();
    nbosize = mesh->getNBOsize();

#if RGBONLY == 1
    // flat per-vertex colors (points at a function-local array; cleared below)
    float newcbo[] = {0.0, 1.0, 0.0,
                      0.0, 0.0, 1.0,
                      1.0, 0.0, 0.0};
    cbo = newcbo;
    cbosize = 9;
#elif RGBONLY == 0
    vec3 defaultColor(0.5f, 0.5f, 0.5f);
    mesh->changeColor(defaultColor);
    cbo = mesh->getCBO();
    cbosize = mesh->getCBOsize();
#endif

    ibo = mesh->getIBO();
    ibosize = mesh->getIBOsize();

    cudaGLMapBufferObject((void**)&dptr, pbo);
    updateCamera();
    cudaRasterizeCore(cam, dptr, glm::vec2(width, height), frame, vbo, vbosize,
                      cbo, cbosize, ibo, ibosize, nbo, nbosize, lights,
                      lightsize, alpha, beta, displayMode);
    cudaGLUnmapBufferObject(pbo);

    vbo = NULL;
    cbo = NULL;
    ibo = NULL;

    frame++;
    fpstracker++;

    //////////////////////
    // Timing cuda call //
    //////////////////////
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    // Destroy the per-call events; the original leaked a pair every frame.
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    printf("runCuda runtime: %3.1f ms \n", time);
}
float libcgt::cuda::Event::synchronizeAndGetMillisecondsElapsed()
{
    // Block until the stop event has been reached on the device, then
    // read back the start->stop interval in milliseconds.
    cudaEventSynchronize( m_stop );
    float elapsedMs;
    cudaEventElapsedTime( &elapsedMs, m_start, m_stop );
    return elapsedMs;
}
//----------------------------------------------------------------------------//
// Records the stop event on the default stream, waits for it, and returns
// the start->stop interval in milliseconds (widened to double).
double CUDAImpl::_StopTimer()
{
    cudaEventRecord(_stop, 0);
    cudaEventSynchronize(_stop);
    float elapsedMs = 0.0f;
    cudaEventElapsedTime(&elapsedMs, _start, _stop);
    return elapsedMs;
}
// Returns the seconds elapsed since the previous Split() (or the timer
// start), then swaps the events so the next Split() measures from now.
double CudaTimer::Split() {
    cudaEventRecord(end);
    // Wait only on the `end` event rather than the whole device:
    // cudaDeviceSynchronize() stalls every stream, while completion of the
    // event is all cudaEventElapsedTime needs.
    cudaEventSynchronize(end);
    float t;
    cudaEventElapsedTime(&t, start, end);
    start.Swap(end);
    return (t / 1000.0);
}
/* Contracts two tensor trains TT1 and TT2 of length n, accumulating the
 * scalar contraction over all boundary-index pairs (indA x indB), then
 * prints event-timed performance stats and the result.
 * NOTE: mutates TT1[0], TT1[n-1], TT2[0], TT2[n-1] in place and never
 * returns (calls exit(0)). `bops` is a file-scope operation counter. */
void contractTT(sTensorGPU *TT1, sTensorGPU *TT2, const int n, const int size) {
	cublasHandle_t handle;
	cublasCreate(&handle);
	type result=0;
	/* scratch tensors reused by every pairwise contraction */
	sTensorGPU temp1 = emptyTensor(size*size,2);
	sTensorGPU temp2 = emptyTensor(size*size*2,3);
	cudaEvent_t start;
	cudaEventCreate(&start);
	cudaEvent_t stop;
	cudaEventCreate(&stop);
	//printf("Start contractTT\n");
	cudaEventRecord(start, NULL);
	int indA = TT1[0].size[0];
	int indB = TT2[0].size[0];
	/* host copies of the boundary tensors; sliced per (i, j) below */
	sTensorCPU tt1start = copyToCPU(TT1[0]);
	sTensorCPU tt2start = copyToCPU(TT2[0]);
	sTensorCPU tt1end = copyToCPU(TT1[n - 1]);
	sTensorCPU tt2end = copyToCPU( TT2[n - 1]);
	for (int i = 0; i < indA; i++){
		TT1[0] = prepareTensorStart(tt1start, i);
		TT1[n - 1] = prepareTensorEnd(tt1end, i);
		for (int j = 0; j < indB; j++){
			TT2[0] = prepareTensorStart(tt2start, j);
			TT2[n - 1] = prepareTensorEnd(tt2end, j);
			contractTensor(handle, TT1[0], TT2[0], temp1);
			/* NOTE(review): this inner loop variable shadows the outer `i` */
			for (int i = 1; i < n; i++){
				contractTensor(handle, temp1, TT1[i], temp2);
				contractTensor(handle, temp2, TT2[i], temp1, 2);
			}
			type add = 0;
			/* first element of temp1 holds the scalar contraction result */
			cudaMemcpy(&add, temp1.deviceData, sizeof(type), cudaMemcpyDeviceToHost);
			//printf("%e ", add);
			result += add;
		}
	}
	cudaEventRecord(stop, NULL);
	cudaEventSynchronize(stop);
	float msecTotal = 0.0f;
	cudaEventElapsedTime(&msecTotal, start, stop);
	printf("Time: %.3fms\n", msecTotal);
	printf("Ops: %.0f\n", bops);
	double gigaFlops = (bops * 1.0e-9f) / (msecTotal / 1000.0f);
	printf("Perf= %.2f GFlop/s\n", gigaFlops);
	cublasDestroy(handle);
	cudaDeviceReset();
	printf("%.5e \n", result);
	exit(0);
}
// Records and waits on the stop event, then returns the elapsed
// milliseconds since the matching start event. Every CUDA call is routed
// through HANDLE_ERROR.
float gpuNUFFT::GpuNUFFTOperator::stopTiming()
{
  float elapsedMs = 0.0f;
  HANDLE_ERROR( cudaEventRecord(stop, 0) );
  HANDLE_ERROR( cudaEventSynchronize(stop) );
  HANDLE_ERROR( cudaEventElapsedTime(&elapsedMs, start, stop) );
  return elapsedMs;
}
// Runs this filter on pInputFrame, producing *pOutputFrameNum frames in
// ppOutputFrames. Optionally measures per-run GPU time with CUDA events
// when m_bCheckPerformance is set. Propagates timestamp/flags/picstruct
// from input to outputs according to m_nPathThrough.
NVENCSTATUS NVEncFilter::filter(FrameInfo *pInputFrame, FrameInfo **ppOutputFrames, int *pOutputFrameNum) {
    cudaError_t cudaerr = cudaSuccess;
    if (m_bCheckPerformance) {
        // mark the start of the filter run for perf measurement
        cudaerr = cudaEventRecord(*m_peFilterStart.get());
        if (cudaerr != cudaSuccess) {
            AddMessage(RGY_LOG_ERROR, _T("failed cudaEventRecord(m_peFilterStart): %s.\n"), char_to_tstring(cudaGetErrorString(cudaerr)).c_str());
        }
    }
    if (pInputFrame == nullptr) {
        *pOutputFrameNum = 0;
        ppOutputFrames[0] = nullptr;
    }
    if (m_pParam
        && m_pParam->bOutOverwrite // in-place (overwrite) filter?
        && pInputFrame != nullptr && pInputFrame->ptr != nullptr // input frame present?
        && ppOutputFrames != nullptr && ppOutputFrames[0] == nullptr) { // output slot free to set?
        ppOutputFrames[0] = pInputFrame;
        *pOutputFrameNum = 1;
    }
    const auto ret = run_filter(pInputFrame, ppOutputFrames, pOutputFrameNum);
    const int nOutFrame = *pOutputFrameNum;
    if (!m_pParam->bOutOverwrite && nOutFrame > 0) {
        if (m_nPathThrough & FILTER_PATHTHROUGH_TIMESTAMP) {
            // timestamp pass-through only makes sense for 1-in/1-out filters
            if (nOutFrame != 1) {
                AddMessage(RGY_LOG_ERROR, _T("timestamp path through can only be applied to 1-in/1-out filter.\n"));
                return NV_ENC_ERR_INVALID_CALL;
            } else {
                ppOutputFrames[0]->timestamp = pInputFrame->timestamp;
                ppOutputFrames[0]->duration  = pInputFrame->duration;
            }
        }
        for (int i = 0; i < nOutFrame; i++) {
            if (m_nPathThrough & FILTER_PATHTHROUGH_FLAGS)     ppOutputFrames[i]->flags     = pInputFrame->flags;
            if (m_nPathThrough & FILTER_PATHTHROUGH_PICSTRUCT) ppOutputFrames[i]->picstruct = pInputFrame->picstruct;
        }
    }
    if (m_bCheckPerformance) {
        // mark the end of the run, wait for it, and accumulate elapsed time
        cudaerr = cudaEventRecord(*m_peFilterFin.get());
        if (cudaerr != cudaSuccess) {
            AddMessage(RGY_LOG_ERROR, _T("failed cudaEventRecord(m_peFilterFin): %s.\n"), char_to_tstring(cudaGetErrorString(cudaerr)).c_str());
        }
        cudaerr = cudaEventSynchronize(*m_peFilterFin.get());
        if (cudaerr != cudaSuccess) {
            AddMessage(RGY_LOG_ERROR, _T("failed cudaEventSynchronize(m_peFilterFin): %s.\n"), char_to_tstring(cudaGetErrorString(cudaerr)).c_str());
        }
        float time_ms = 0.0f;
        cudaerr = cudaEventElapsedTime(&time_ms, *m_peFilterStart.get(), *m_peFilterFin.get());
        if (cudaerr != cudaSuccess) {
            AddMessage(RGY_LOG_ERROR, _T("failed cudaEventElapsedTime(m_peFilterStart - m_peFilterFin): %s.\n"), char_to_tstring(cudaGetErrorString(cudaerr)).c_str());
        }
        m_dFilterTimeMs += time_ms;
        m_nFilterRunCount++;
    }
    return ret;
}
// Runs the d_sobel1 kernel over an xsize x ysize image.
// h_result: output buffer (3 ints per pixel); h_pic: input pixels.
// Both buffers are copied to the device, the kernel is launched with
// BLOCKSIZE x BLOCKSIZE blocks, and the results are copied back.
void sobel1(int *h_result, unsigned int *h_pic, int xsize, int ysize, int thresh)
{
    int *d_result;
    unsigned int *d_pic;

    int resultSize = xsize * ysize * 3 * sizeof(int);
    int picSize = xsize * ysize * sizeof(unsigned int);

    // Check the cudaMalloc return codes directly: the pointer is not
    // guaranteed to be NULL on failure, so the original `if (!d_result)`
    // test could miss allocation errors entirely.
    if (cudaMalloc((void**)&d_result, resultSize) != cudaSuccess) { exit(-1); }
    if (cudaMalloc((void**)&d_pic, picSize) != cudaSuccess) { exit(-1); }

    cudaMemcpy(d_result, h_result, resultSize, cudaMemcpyHostToDevice);
    cudaMemcpy(d_pic, h_pic, picSize, cudaMemcpyHostToDevice);

    dim3 threadsPerBlock(BLOCKSIZE, BLOCKSIZE);
    dim3 numBlocks(ceil((float)ysize/(float)threadsPerBlock.x),
                   ceil((float)xsize/(float)threadsPerBlock.y));

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    // Record both events around the launch. The original never called
    // cudaEventRecord at all, so cudaEventSynchronize/cudaEventElapsedTime
    // operated on unrecorded events and the measured time was meaningless.
    cudaEventRecord(start, 0);
    {
        __set_CUDAConfig(numBlocks, threadsPerBlock);
        d_sobel1(d_result, d_pic, xsize, ysize, thresh);
    }
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);

    float elapsedTime;
    cudaEventElapsedTime(&elapsedTime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    cudaMemcpy(h_result, d_result, resultSize, cudaMemcpyDeviceToHost);
    cudaMemcpy(h_pic, d_pic, picSize, cudaMemcpyDeviceToHost);

    cudaFree(d_result);
    cudaFree(d_pic);
}
// execute kernel
// Times niter applications of the operator selected by the global
// test_type (Dslash / M / MdagM, host-transfer or device variants) using
// CUDA events; returns the elapsed wall time in seconds.
double dslashCUDA(int niter) {

  cudaEvent_t start, end;
  cudaEventCreate(&start);
  cudaEventCreate(&end);
  cudaEventRecord(start, 0);

  for (int iter = 0; iter < niter; ++iter) {
    switch (test_type) {
    case 0:
      if (transfer) {
        dslashQuda(spinorOut->V(), spinor->V(), &inv_param, parity);
      } else {
        //inv_param.input_location = QUDA_CUDA_FIELD_LOCATION;
        //inv_param.output_location = QUDA_CUDA_FIELD_LOCATION;
        //dslashQuda(cudaSpinorOut->V(), cudaSpinor->V(), &inv_param, parity);
        dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity);
      }
      break;
    case 1:
    case 2:
      if (transfer) {
        MatQuda(spinorOut->V(), spinor->V(), &inv_param);
      } else {
        dirac->M(*cudaSpinorOut, *cudaSpinor);
      }
      break;
    case 3:
    case 4:
      if (transfer) {
        MatDagMatQuda(spinorOut->V(), spinor->V(), &inv_param);
      } else {
        dirac->MdagM(*cudaSpinorOut, *cudaSpinor);
      }
      break;
    }
  }

  cudaEventRecord(end, 0);
  cudaEventSynchronize(end);
  float elapsedMs;
  cudaEventElapsedTime(&elapsedMs, start, end);
  cudaEventDestroy(start);
  cudaEventDestroy(end);

  double secs = elapsedMs / 1000; //stopwatchReadSeconds();

  // check for errors
  cudaError_t stat = cudaGetLastError();
  if (stat != cudaSuccess)
    printfQuda("with ERROR: %s\n", cudaGetErrorString(stat));

  return secs;
}
// Stops the timing pair (*start, *stop): records and waits on the stop
// event, prints the elapsed milliseconds tagged with info_str, then
// destroys both events (the pair cannot be reused afterwards).
void stop_timing_cuda(cudaEvent_t* start,cudaEvent_t* stop, char* info_str) {
  realw elapsed_ms;
  // stops events
  cudaEventRecord( *stop, 0 );
  cudaEventSynchronize( *stop );
  cudaEventElapsedTime( &elapsed_ms, *start, *stop );
  cudaEventDestroy( *start );
  cudaEventDestroy( *stop );
  // user output
  printf("%s: Execution Time = %f ms\n",info_str,elapsed_ms);
}
// execute kernel
// Tunes the operator (untimed), then times `loops` applications of the
// dslash/Mat operator selected by test_type; returns elapsed seconds.
double dslashCUDA() {

  printfQuda("Executing %d kernel loops...\n", loops);
  fflush(stdout);

  // untimed auto-tuning pass
  if (test_type < 2)
    dirac->Tune(*cudaSpinorOut, *cudaSpinor, *tmp);
  else
    dirac->Tune(cudaSpinorOut->Even(), cudaSpinor->Even(), *tmp);

  cudaEvent_t start, end;
  cudaEventCreate(&start);
  cudaEventRecord(start, 0);
  cudaEventSynchronize(start);

  for (int loop = 0; loop < loops; ++loop) {
    switch (test_type) {
    case 0:
      if (transfer) {
        dslashQuda(spinorOut->V(), spinor->V(), &inv_param, parity);
      } else {
        dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity);
      }
      break;
    case 1:
    case 2:
      if (transfer) {
        MatQuda(spinorOut->V(), spinor->V(), &inv_param);
      } else {
        dirac->M(*cudaSpinorOut, *cudaSpinor);
      }
      break;
    }
  }

  cudaEventCreate(&end);
  cudaEventRecord(end, 0);
  cudaEventSynchronize(end);
  float elapsedMs;
  cudaEventElapsedTime(&elapsedMs, start, end);
  cudaEventDestroy(start);
  cudaEventDestroy(end);

  double secs = elapsedMs / 1000; //stopwatchReadSeconds();

  // check for errors
  cudaError_t stat = cudaGetLastError();
  if (stat != cudaSuccess)
    printf("with ERROR: %s\n", cudaGetErrorString(stat));

  printf("done.\n\n");
  return secs;
}
// Measures host->device, device->host and device->device copy bandwidth
// for an N-float buffer using CUDA events, printing each rate in GiB/s.
int main(void){
    float a[N] = {0};
    float *d_a, *d_b;
    int size = N * sizeof(float);
    float cdiff;
    cudaEvent_t cstart, cstop;

    cudaMalloc((void **) &d_a, size);
    cudaMalloc((void **) &d_b, size);

    // Create the events once and reuse them for all three measurements;
    // the original recreated them before each phase, leaking the old pair.
    cudaEventCreate(&cstart);
    cudaEventCreate(&cstop);

    // Bandwidth = bytes / seconds; cdiff is in ms, hence the factor 1000.
    // The math is done in double so `1000 * size` cannot overflow int.
    const double gib = 1024.0 * 1024.0 * 1024.0;

    cudaEventRecord(cstart, 0);
    cudaMemcpy(d_a, a, size, cudaMemcpyHostToDevice);
    cudaEventRecord(cstop, 0);
    cudaEventSynchronize(cstop);
    cudaEventElapsedTime(&cdiff, cstart, cstop);
    printf("HOST TO DEVICE : %f Go/s\n", (1000.0 * size) / (gib * cdiff));

    cudaEventRecord(cstart, 0);
    cudaMemcpy(a, d_a, size, cudaMemcpyDeviceToHost);
    cudaEventRecord(cstop, 0);
    cudaEventSynchronize(cstop);
    cudaEventElapsedTime(&cdiff, cstart, cstop);
    printf("DEVICE TO HOST : %f Go/s\n", (1000.0 * size) / (gib * cdiff));

    cudaEventRecord(cstart, 0);
    cudaMemcpy(d_b, d_a, size, cudaMemcpyDeviceToDevice);
    cudaEventRecord(cstop, 0);
    cudaEventSynchronize(cstop);
    cudaEventElapsedTime(&cdiff, cstart, cstop);
    // factor 2: a device-to-device copy both reads and writes device memory
    printf("DEVICE TO DEVICE : %f Go/s\n", 2 * ((1000.0 * size) / (gib * cdiff)));

    cudaEventDestroy(cstart);
    cudaEventDestroy(cstop);
    cudaFree(d_a);
    cudaFree(d_b);
    system("pause");
    return 0;
}
// Returns the milliseconds elapsed between the start and stop events.
// Must be called only after the timer was stopped; asserts in debug
// builds and returns 0 otherwise.
float CudaTimer::ElapsedTime() const {
    assert(stopped);
    if (!stopped) {
        return 0.0f;
    }
    cudaEventSynchronize(stop);
    float ms;
    cudaEventElapsedTime(&ms, start, stop);
    return ms;
}
// Allocates a (Nx-2)x(Ny-2) interior grid on the device, runs the Poisson
// solver on it with an event-timed measurement, and writes the solution
// to "data.dat".
int main(int argc, char **argv)
{
  // device buffers
  real *psi_d, *z_d;
  size_t fSize = sizeof(real);

  /* grid dimensions */
  unsigned int Nx = 513, Ny = 513;
  unsigned int nGridPoints = (Nx - 2) * (Ny - 2);  // omitting boundaries

  cudaMalloc((void **) &psi_d, (nGridPoints + 1) * fSize);
  cudaMalloc((void **) &z_d, (nGridPoints + 1) * fSize);

  /* initialization: psi = 0 (solution), z = 1 (right-hand side) */
  fillArray(psi_d, 0.0, nGridPoints + 1);
  fillArray(z_d, 1.0, nGridPoints + 1);
  checkCudaError("Initialization of grid");

  // events for timing the solve
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start, 0);

  /* Call the poisson solver; the right hand side is stored on the device
   * in z_d, the result is written to psi_d (also on the device).
   * Nx-2 is the width of the grid's interior (without the boundaries). */
  cuPoisson((Nx - 2), psi_d, z_d);

  cudaEventRecord(stop, 0);
  cudaEventSynchronize(stop);
  float computationTime;
  cudaEventElapsedTime(&computationTime, start, stop);
  printf("Computation time was %.5f seconds.\n\n", computationTime/1000.0);

  printf("Writing result to disk...\n");
  writeBinaryFile(Nx, Ny, psi_d, "data.dat");
  printf("done\n");

  return EXIT_SUCCESS;
}
// Times niter iterations of the Dslash/Mat operator selected by the global
// test_type and returns the elapsed wall time in seconds.
// Side effect: sets the global `parity` for test types 0 and 1.
// The `transfer` branches are stubbed out (commented) for types 0-2.
double dslashCUDA(int niter) {

  cudaEvent_t start, end;
  cudaEventCreate(&start);
  cudaEventRecord(start, 0);
  cudaEventSynchronize(start);

  for (int i = 0; i < niter; i++) {
    switch (test_type) {
    case 0:
      parity = QUDA_EVEN_PARITY;
      if (transfer){
        //dslashQuda(spinorOdd, spinorEven, &inv_param, parity);
      } else {
        dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity);
      }
      break;
    case 1:
      parity = QUDA_ODD_PARITY;
      if (transfer){
        //MatPCQuda(spinorOdd, spinorEven, &inv_param);
      } else {
        dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity);
      }
      break;
    case 2:
      if (transfer){
        //MatQuda(spinorGPU, spinor, &inv_param);
      } else {
        dirac->M(*cudaSpinorOut, *cudaSpinor);
      }
    }
  }

  // `end` is created lazily, just before it is recorded
  cudaEventCreate(&end);
  cudaEventRecord(end, 0);
  cudaEventSynchronize(end);
  float runTime;
  cudaEventElapsedTime(&runTime, start, end);
  cudaEventDestroy(start);
  cudaEventDestroy(end);
  double secs = runTime / 1000; //stopwatchReadSeconds();

  // check for errors
  cudaError_t stat = cudaGetLastError();
  if (stat != cudaSuccess)
    errorQuda("with ERROR: %s\n", cudaGetErrorString(stat));

  return secs;
}
// use_cuda_time = 1: use cudaEventElapsedTime()
// or use getSystemTime()
// Copies `size` floats device-to-device CNT times and prints the elapsed
// time plus a MB/s rate (labelled as GPU id0 -> GPU id1).
void test_2gpu(float *d_send_data, float *d_recv_data, int size, int id0, int id1, bool use_cuda_time)
{
  if(use_cuda_time) {
    cudaEvent_t start_event, stop_event;
    float time_memcpy;

    // version I
    //cudaEventCreate(&start_event);
    //cudaEventCreate(&stop_event);
    //cudaEventRecord(start_event, 0);

    // version II: blocking-sync events so the CPU sleeps instead of spinning
    int eventflags = cudaEventBlockingSync;
    cudaEventCreateWithFlags(&start_event, eventflags);
    cudaEventCreateWithFlags(&stop_event, eventflags);
    cudaEventRecord(start_event, 0);

    for(int i=0; i<CNT; i++) {
      cudaMemcpy(d_recv_data, d_send_data, size*sizeof(float), cudaMemcpyDeviceToDevice);
    }
    std::cout << "hello, use_cuda_time" << std::endl;
    cudaEventRecord(stop_event, 0);
    cudaEventSynchronize(stop_event);
    cudaEventElapsedTime(&time_memcpy, start_event, stop_event); // ms
    std::cout << "Time is " << time_memcpy/1000. << "s" << std::endl;
    // NOTE(review): the rate uses WIDTH*HEIGHT, not `size` — presumably
    // they match; verify against the caller.
    std::cout << "GPU" << id0 << " ---> GPU" << id1 << " :" << WIDTH*HEIGHT*sizeof(float)*CNT*1000./(1024*1024*time_memcpy) << "MB/s" << std::endl;
    cudaEventDestroy(start_event);
    cudaEventDestroy(stop_event);
  } else {
    //cudaEvent_t start_event;
    //cudaEventCreate(&start_event);
    long long start = getSystemTime();
    for(int i=0; i<CNT; i++) {
      cudaMemcpy(d_recv_data, d_send_data, size*sizeof(float), cudaMemcpyDeviceToDevice);
      //cudaMemcpyPeer(d_recv_data, id1, d_send_data, id0, size*sizeof(float));
    }
    //cudaEventRecord(start_event, 0);
    //cudaEventSynchronize(start_event);
    // NOTE(review): no explicit device sync before reading the wall clock —
    // confirm the final copy has completed when interpreting this number.
    long long end = getSystemTime();
    std::cout << "Time is " << (end-start)/1000. << "s" << std::endl;
    // "+1" guards against division by zero for very fast runs
    std::cout << "GPU" << id0 << " ---> GPU" << id1 << " :" << WIDTH*HEIGHT*sizeof(float)*CNT*1000./(1024*1024*(end - start+1)) << "MB/s" << std::endl;
  }
  //WIDTH*HEIGHT*4.*CNT/(1000*(end - start)) << "Mb/s" << std::endl;
}
// finishTest ------------------------------------------------------------------
// Stops the CUDA timer: records and synchronizes the stop event, computes
// the elapsed time, and destroys both events (they cannot be reused).
// @param start - Start time event
// @param stop - Stop time event
// @returns the elapsed time in seconds.
//-----------------------------------------------------------------------------
float finishTest(cudaEvent_t &start, cudaEvent_t &stop){
	float time;
	cudaEventRecord( stop, 0 );
	cudaEventSynchronize( stop );
	cudaEventElapsedTime( &time, start, stop );
	cudaEventDestroy( start );
	cudaEventDestroy( stop );
	printf("Finished Test in %f s\n\n", time/1000.0f);
	// Check for errors
	checkCUDAError("test finished");
	// Return elapsed time
	return time/1000.0f;
}
// Records the stop event on the timer's stream, waits for it, and returns
// the elapsed milliseconds since the start event. When bResetStart is
// true, the start event is re-recorded so the next measurement begins now.
float CCudaTimeMeasure::GetTimeout(bool bResetStart/* = false*/)
{
    cudaCheckError(cudaEventRecord(m_ceStopEvent, m_csStreamID));
    cudaCheckError(cudaEventSynchronize(m_ceStopEvent));

    float elapsedMs = 0.0f;
    cudaCheckError(cudaEventElapsedTime(&elapsedMs, m_ceStartEvent, m_ceStopEvent));

    if (bResetStart) {
        cudaCheckError(cudaEventRecord(m_ceStartEvent, m_csStreamID));
    }

    return elapsedMs;
}
// Benchmark driver: primes the device with one untimed update, then times
// `iterations` n-body updates — CUDA events on the GPU path, the SDK timer
// on the CPU path — and prints rates computed by computePerfStats().
void _runBenchmark(int iterations) {
    // once without timing to prime the device
    if (!useCpu) {
        m_nbody->update(activeParams.m_timestep);
    }

    if (useCpu) {
        sdkCreateTimer(&timer);
        sdkStartTimer(&timer);
    } else {
        checkCudaErrors(cudaEventRecord(startEvent, 0));
    }

    for (int step = 0; step < iterations; ++step) {
        m_nbody->update(activeParams.m_timestep);
    }

    float elapsedMs = 0;
    if (useCpu) {
        sdkStopTimer(&timer);
        elapsedMs = sdkGetTimerValue(&timer);
        sdkStartTimer(&timer);
    } else {
        checkCudaErrors(cudaEventRecord(stopEvent, 0));
        checkCudaErrors(cudaEventSynchronize(stopEvent));
        checkCudaErrors(cudaEventElapsedTime(&elapsedMs, startEvent, stopEvent));
    }

    double interactionsPerSecond = 0;
    double gflops = 0;
    computePerfStats(interactionsPerSecond, gflops, elapsedMs, iterations);

    printf("%d bodies, total time for %d iterations: %.3f ms, mean %f\n",
           numBodies, iterations, elapsedMs, elapsedMs / iterations);
    printf("= %.3f billion interactions per second\n", interactionsPerSecond);
    printf("= %.3f %s-precision GFLOP/s at %d flops per interaction\n",
           gflops, (sizeof(T) > 4) ? "double" : "single", flopsPerInteraction);
}
// Returns the elapsed milliseconds between Start() and Stop(). Stops the
// timer first if it is still running; warns and returns 0 if the timer
// has never been started.
float Timer::MilliSeconds() {
  if (!has_run_at_least_once()) {
    LOG(WARNING) << "Timer has never been run before reading time.";
    return 0;
  }
  if (running()) {
    Stop();
  }
  if (Caffe::mode() != Caffe::GPU) {
    // CPU path: subtract the recorded time points directly.
    elapsed_milliseconds_ = (stop_cpu_ - start_cpu_).total_milliseconds();
  } else {
    CUDA_CHECK(cudaEventElapsedTime(&elapsed_milliseconds_,
                                    start_gpu_, stop_gpu_));
  }
  return elapsed_milliseconds_;
}
/*
 * Stops the CUDA timer started using the given CUDA event.
 *
 * Returns the elapsed time (in ms) or a negative value on error.
 *
 * No-op (always returns 0.0f) unless profiling support was compiled in.
 */
float stop_cuda_timer_ev( cudaEvent_t start_timing_event )
{

	float elapsed_time = 0.0f;

	#if NMFGPU_PROFILING_TRANSF || NMFGPU_PROFILING_KERNELS

		cudaError_t cuda_status = cudaSuccess;

		/* Shared stop event owned by the timing module. */
		cudaEvent_t stop_timing_event = timing_events[ STOP_EVENT ];

		// ----------------------

		// Records the current "timestamp" for ALL previous operations.
		cuda_status = cudaEventRecord( stop_timing_event, 0 );
		if ( cuda_status != cudaSuccess ) {
			print_error( sys_error_shown_by_all, "CUDA Error detected: %s\n", cudaGetErrorString(cuda_status) );
			return -1.0f;
		}

		/* Waits for the registered operations (all).
		 * NOTE: The CPU thread will block or spin according to flags
		 * specified in init_timing_events().
		 */
		cuda_status = cudaEventSynchronize( stop_timing_event );
		if ( cuda_status != cudaSuccess ) {
			print_error( sys_error_shown_by_all, "CUDA Error detected: %s\n", cudaGetErrorString(cuda_status) );
			return -1.0f;
		}

		cuda_status = cudaEventElapsedTime( &elapsed_time, start_timing_event, stop_timing_event );
		if ( cuda_status != cudaSuccess ) {
			print_error( sys_error_shown_by_all, "Error retrieving elapsed time: %s\n", cudaGetErrorString(cuda_status) );
			return -1.0f;
		}

		/* Guard against NaN/Inf from a mismatched or corrupted event pair. */
		if ( ! isfinite( elapsed_time ) ) {
			print_error( sys_error_shown_by_all, "Invalid elapsed time: %g\n", elapsed_time );
			return -1.0f;
		}

	#endif /* if NMFGPU_PROFILING_TRANSF || NMFGPU_PROFILING_KERNELS */

	return elapsed_time;

} // stop_cuda_timer_ev