void runBenchmark(int iterations) { // once without timing to prime the GPU nbody->update(activeParams.m_timestep); cutilSafeCall(cudaEventRecord(startEvent, 0)); for (int i = 0; i < iterations; ++i) { nbody->update(activeParams.m_timestep); } cutilSafeCall(cudaEventRecord(stopEvent, 0)); cudaEventSynchronize(stopEvent); float milliseconds = 0; cutilSafeCall( cudaEventElapsedTime(&milliseconds, startEvent, stopEvent)); double interactionsPerSecond = 0; double gflops = 0; computePerfStats(interactionsPerSecond, gflops, milliseconds, iterations); printf("%d bodies, total time for %d iterations: %0.3f ms\n", numBodies, iterations, milliseconds); printf("= %0.3f billion interactions per second\n", interactionsPerSecond); printf("= %0.3f GFLOP/s at %d flops per interaction\n", gflops, 20); }
/*
 * Records an event that has already been destroyed and checks the outcome:
 * an error code on CUDA runtime >= 5.0, a SIGSEGV death on older runtimes.
 */
TEST(EventRecord, RecordAfterDestroy) {
    // Select the "threadsafe" death-test style for the EXPECT_EXIT below.
    ::testing::FLAGS_gtest_death_test_style = "threadsafe";

    cudaError_t ret;

    // Create the event and immediately destroy it; `event` is stale from
    // here on.
    cudaEvent_t event;
    ret = cudaEventCreate(&event);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaEventDestroy(event);
    EXPECT_EQ(cudaSuccess, ret);

    // A valid stream is still needed for the pre-5.0 variant of the check.
    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

#if CUDART_VERSION >= 5000
    // Newer runtimes report the stale handle through the return code.
    ret = cudaEventRecord(event);
    EXPECT_EQ(cudaErrorUnknown, ret);
#else
    // Older runtimes are expected to crash the process instead.
    EXPECT_EXIT(
        cudaEventRecord(event, stream),
        ::testing::KilledBySignal(SIGSEGV), "");
#endif

    ret = cudaStreamDestroy(stream);
    EXPECT_EQ(cudaSuccess, ret);
}
double time_invocation_cuda(std::size_t num_trials, Function f, Arg1 arg1, Arg2 arg2, Arg3 arg3) { cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start); for(std::size_t i = 0; i < num_trials; ++i) { f(arg1,arg2,arg3); } cudaEventRecord(stop); cudaThreadSynchronize(); float msecs = 0; cudaEventElapsedTime(&msecs, start, stop); cudaEventDestroy(start); cudaEventDestroy(stop); // return mean msecs return msecs / num_trials; }
/**
 * Trains a copy of `ann` with gpuann_fann_parallel_train_on_data() and
 * prints the elapsed time reported by CUDA events.
 *
 * @param ann               network to copy; the copy is destroyed on exit
 * @param train             training data passed to the trainer
 * @param trainingAlgorithm value stored into the copy's training_algorithm
 * @param epochCount        epoch count passed to the trainer
 */
void trainMethodsSpeedTestGPU(fann *ann, fann_train_data* train, unsigned int trainingAlgorithm, unsigned int epochCount)
{
  // Train a private copy and select the requested algorithm on it.
  fann *netCopy = fann_copy(ann);
  netCopy->training_algorithm = (fann_train_enum)trainingAlgorithm;

  cudaEvent_t tBegin, tEnd;
  cudaEventCreate(&tBegin);
  cudaEventCreate(&tEnd);

  // Bracket the training call with two events on stream 0.
  cudaEventRecord(tBegin, 0);
  gpuann_fann_parallel_train_on_data(netCopy, train, epochCount);
  cudaEventRecord(tEnd, 0);
  cudaEventSynchronize(tEnd);

  float elapsedMs;
  cudaEventElapsedTime(&elapsedMs, tBegin, tEnd);

  cudaEventDestroy(tBegin);
  cudaEventDestroy(tEnd);

  printf("%10.5f ", elapsedMs);

  fann_destroy(netCopy);
}
int main() { cudaEvent_t start; cudaEvent_t end; float duration; const float overestimateRate = 0.01f; const float errorRate = 0.01f; Tokenizer tokenizer( overestimateRate, errorRate ); /************** Test counting string tokens *************/ TextReader reader; cudaEventCreate( &start ); cudaEventRecord( start, 0 ); reader.Read(); tokenizer.StartTokenizing( reader.GetCharBuffer(), reader.GetOffsetBuffer(), reader.GetCharBufferSize(), reader.GetOffsetBufferSize() ); cudaEventCreate( &end ); cudaEventRecord( end, 0 ); cudaEventSynchronize( end ); cudaEventElapsedTime( &duration, start, end ); printf( "Time taken: %.3lf milliseconds\n", duration ); tokenizer.GetFrequency( "a" ); }
float bench::ClockBenchmark::_determineCycleTime() { cudaEvent_t start, end; check( cudaEventCreate(&start) ); check( cudaEventCreate(&end) ); unsigned long long elapsedCycles; unsigned long long* deviceElapsedCycles; long long int* deviceDummyMem; const dim3 grid(1,1,1), block(1,1,1); check( cudaMalloc((void**)&deviceElapsedCycles, sizeof(unsigned long long)) ); check( cudaMalloc((void**)&deviceDummyMem, sizeof(long long int)) ); check( cudaEventRecord(start) ); cudaDetermineCycleTimeWrapper(deviceElapsedCycles, deviceDummyMem, grid, block); check( cudaEventRecord(end) ); check( cudaDeviceSynchronize() ); check( cudaMemcpy(&elapsedCycles, deviceElapsedCycles, sizeof(unsigned long long), cudaMemcpyDeviceToHost) ); float elapsedTime = 0; check( cudaEventElapsedTime(&elapsedTime, start, end) ); report(util::Indents(2) << "elapsed time: " << elapsedTime << "ms"); report(util::Indents(2) << "elapsed cycles: " << elapsedCycles); return elapsedTime * 1000000.0 / (float)elapsedCycles; }
void runCuda() { ////////////////////// // Timing cuda call // ////////////////////// float time; cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0); // Map OpenGL buffer object for writing from CUDA on a single GPU // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not use this buffer dptr=NULL; vbo = mesh->getVBO(); vbosize = mesh->getVBOsize(); nbo = mesh->getNBO(); nbosize = mesh->getNBOsize(); #if RGBONLY == 1 float newcbo[] = {0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0}; cbo = newcbo; cbosize = 9; #elif RGBONLY == 0 vec3 defaultColor(0.5f, 0.5f, 0.5f); mesh->changeColor(defaultColor); cbo = mesh->getCBO(); cbosize = mesh->getCBOsize(); #endif ibo = mesh->getIBO(); ibosize = mesh->getIBOsize(); cudaGLMapBufferObject((void**)&dptr, pbo); updateCamera(); cudaRasterizeCore(cam, dptr, glm::vec2(width, height), frame, vbo, vbosize, cbo, cbosize, ibo, ibosize, nbo, nbosize, lights, lightsize, alpha, beta, displayMode); cudaGLUnmapBufferObject(pbo); vbo = NULL; cbo = NULL; ibo = NULL; frame++; fpstracker++; ////////////////////// // Timing cuda call // ////////////////////// cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&time, start, stop); printf("runCuda runtime: %3.1f ms \n", time); }
void contractTT(sTensorGPU *TT1, sTensorGPU *TT2, const int n, const int size) { cublasHandle_t handle; cublasCreate(&handle); type result=0; sTensorGPU temp1 = emptyTensor(size*size,2); sTensorGPU temp2 = emptyTensor(size*size*2,3); cudaEvent_t start; cudaEventCreate(&start); cudaEvent_t stop; cudaEventCreate(&stop); //printf("Start contractTT\n"); cudaEventRecord(start, NULL); int indA = TT1[0].size[0]; int indB = TT2[0].size[0]; sTensorCPU tt1start = copyToCPU(TT1[0]); sTensorCPU tt2start = copyToCPU(TT2[0]); sTensorCPU tt1end = copyToCPU(TT1[n - 1]); sTensorCPU tt2end = copyToCPU( TT2[n - 1]); for (int i = 0; i < indA; i++){ TT1[0] = prepareTensorStart(tt1start, i); TT1[n - 1] = prepareTensorEnd(tt1end, i); for (int j = 0; j < indB; j++){ TT2[0] = prepareTensorStart(tt2start, j); TT2[n - 1] = prepareTensorEnd(tt2end, j); contractTensor(handle, TT1[0], TT2[0], temp1); for (int i = 1; i < n; i++){ contractTensor(handle, temp1, TT1[i], temp2); contractTensor(handle, temp2, TT2[i], temp1, 2); } type add = 0; cudaMemcpy(&add, temp1.deviceData, sizeof(type), cudaMemcpyDeviceToHost); //printf("%e ", add); result += add; } } cudaEventRecord(stop, NULL); cudaEventSynchronize(stop); float msecTotal = 0.0f; cudaEventElapsedTime(&msecTotal, start, stop); printf("Time: %.3fms\n", msecTotal); printf("Ops: %.0f\n", bops); double gigaFlops = (bops * 1.0e-9f) / (msecTotal / 1000.0f); printf("Perf= %.2f GFlop/s\n", gigaFlops); cublasDestroy(handle); cudaDeviceReset(); printf("%.5e \n", result); exit(0); }
/**
 * Runs this filter on one input frame and post-processes the outputs.
 *
 * Steps: optionally record a start event; set up in-place (overwrite)
 * output when applicable; call run_filter(); copy timestamp/duration,
 * flags and picstruct from the input onto the outputs according to
 * m_nPathThrough; optionally record the finish event and accumulate the
 * elapsed time into m_dFilterTimeMs / m_nFilterRunCount.
 *
 * @param pInputFrame     input frame, may be nullptr
 * @param ppOutputFrames  output frame slots
 * @param pOutputFrameNum receives the number of output frames
 * @return the status from run_filter(), or NV_ENC_ERR_INVALID_CALL when
 *         timestamp pass-through is requested for a filter that did not
 *         produce exactly one frame
 */
NVENCSTATUS NVEncFilter::filter(FrameInfo *pInputFrame, FrameInfo **ppOutputFrames, int *pOutputFrameNum) {
    cudaError_t cudaerr = cudaSuccess;
    if (m_bCheckPerformance) {
        cudaerr = cudaEventRecord(*m_peFilterStart.get());
        if (cudaerr != cudaSuccess) {
            AddMessage(RGY_LOG_ERROR, _T("failed cudaEventRecord(m_peFilterStart): %s.\n"), char_to_tstring(cudaGetErrorString(cudaerr)).c_str());
        }
    }

    if (pInputFrame == nullptr) {
        *pOutputFrameNum = 0;
        ppOutputFrames[0] = nullptr;
    }
    if (m_pParam
        && m_pParam->bOutOverwrite // overwrite (in-place) mode?
        && pInputFrame != nullptr && pInputFrame->ptr != nullptr // is there an input frame?
        && ppOutputFrames != nullptr && ppOutputFrames[0] == nullptr) { // can the output slot be set?
        ppOutputFrames[0] = pInputFrame;
        *pOutputFrameNum = 1;
    }

    const auto ret = run_filter(pInputFrame, ppOutputFrames, pOutputFrameNum);
    const int nOutFrame = *pOutputFrameNum;

    // m_pParam is null-checked above, so guard it here as well instead of
    // dereferencing it unconditionally.
    if (m_pParam && !m_pParam->bOutOverwrite && nOutFrame > 0) {
        if (m_nPathThrough & FILTER_PATHTHROUGH_TIMESTAMP) {
            if (nOutFrame != 1) {
                AddMessage(RGY_LOG_ERROR, _T("timestamp path through can only be applied to 1-in/1-out filter.\n"));
                return NV_ENC_ERR_INVALID_CALL;
            } else if (pInputFrame != nullptr) {
                ppOutputFrames[0]->timestamp = pInputFrame->timestamp;
                ppOutputFrames[0]->duration  = pInputFrame->duration;
            }
        }
        // Pass-through copies fields from the input frame, so it can only
        // happen when there is one (the input may legitimately be null).
        if (pInputFrame != nullptr) {
            for (int i = 0; i < nOutFrame; i++) {
                if (m_nPathThrough & FILTER_PATHTHROUGH_FLAGS)     ppOutputFrames[i]->flags     = pInputFrame->flags;
                if (m_nPathThrough & FILTER_PATHTHROUGH_PICSTRUCT) ppOutputFrames[i]->picstruct = pInputFrame->picstruct;
            }
        }
    }

    if (m_bCheckPerformance) {
        cudaerr = cudaEventRecord(*m_peFilterFin.get());
        if (cudaerr != cudaSuccess) {
            AddMessage(RGY_LOG_ERROR, _T("failed cudaEventRecord(m_peFilterFin): %s.\n"), char_to_tstring(cudaGetErrorString(cudaerr)).c_str());
        }
        cudaerr = cudaEventSynchronize(*m_peFilterFin.get());
        if (cudaerr != cudaSuccess) {
            AddMessage(RGY_LOG_ERROR, _T("failed cudaEventSynchronize(m_peFilterFin): %s.\n"), char_to_tstring(cudaGetErrorString(cudaerr)).c_str());
        }
        float time_ms = 0.0f;
        cudaerr = cudaEventElapsedTime(&time_ms, *m_peFilterStart.get(), *m_peFilterFin.get());
        if (cudaerr != cudaSuccess) {
            AddMessage(RGY_LOG_ERROR, _T("failed cudaEventElapsedTime(m_peFilterStart - m_peFilterFin): %s.\n"), char_to_tstring(cudaGetErrorString(cudaerr)).c_str());
        }
        m_dFilterTimeMs += time_ms;
        m_nFilterRunCount++;
    }
    return ret;
}
// execute kernel double dslashCUDA() { printfQuda("Executing %d kernel loops...\n", loops); fflush(stdout); if (test_type < 2) dirac->Tune(*cudaSpinorOut, *cudaSpinor, *tmp); else dirac->Tune(cudaSpinorOut->Even(), cudaSpinor->Even(), *tmp); cudaEvent_t start, end; cudaEventCreate(&start); cudaEventRecord(start, 0); cudaEventSynchronize(start); for (int i = 0; i < loops; i++) { switch (test_type) { case 0: if (transfer) { dslashQuda(spinorOut->V(), spinor->V(), &inv_param, parity); } else { dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); } break; case 1: case 2: if (transfer) { MatQuda(spinorOut->V(), spinor->V(), &inv_param); } else { dirac->M(*cudaSpinorOut, *cudaSpinor); } break; } } cudaEventCreate(&end); cudaEventRecord(end, 0); cudaEventSynchronize(end); float runTime; cudaEventElapsedTime(&runTime, start, end); cudaEventDestroy(start); cudaEventDestroy(end); double secs = runTime / 1000; //stopwatchReadSeconds(); // check for errors cudaError_t stat = cudaGetLastError(); if (stat != cudaSuccess) printf("with ERROR: %s\n", cudaGetErrorString(stat)); printf("done.\n\n"); return secs; }
// execute kernel double dslashCUDA(int niter) { cudaEvent_t start, end; cudaEventCreate(&start); cudaEventCreate(&end); cudaEventRecord(start, 0); for (int i = 0; i < niter; i++) { switch (test_type) { case 0: if (transfer) { dslashQuda(spinorOut->V(), spinor->V(), &inv_param, parity); } else { //inv_param.input_location = QUDA_CUDA_FIELD_LOCATION; //inv_param.output_location = QUDA_CUDA_FIELD_LOCATION; //dslashQuda(cudaSpinorOut->V(), cudaSpinor->V(), &inv_param, parity); dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); } break; case 1: case 2: if (transfer) { MatQuda(spinorOut->V(), spinor->V(), &inv_param); } else { dirac->M(*cudaSpinorOut, *cudaSpinor); } break; case 3: case 4: if (transfer) { MatDagMatQuda(spinorOut->V(), spinor->V(), &inv_param); } else { dirac->MdagM(*cudaSpinorOut, *cudaSpinor); } break; } } cudaEventRecord(end, 0); cudaEventSynchronize(end); float runTime; cudaEventElapsedTime(&runTime, start, end); cudaEventDestroy(start); cudaEventDestroy(end); double secs = runTime / 1000; //stopwatchReadSeconds(); // check for errors cudaError_t stat = cudaGetLastError(); if (stat != cudaSuccess) printfQuda("with ERROR: %s\n", cudaGetErrorString(stat)); return secs; }
int main(int argc, char **argv) { // device memory real *psi_d, *z_d; size_t fSize = sizeof(real); /* grid dimensions */ unsigned int Nx = 513, Ny = 513; // omitting boundaries unsigned int nGridPoints = (Nx-2)*(Ny-2); cudaMalloc((void **) &psi_d, (nGridPoints+1)*fSize); cudaMalloc((void **) &z_d, (nGridPoints+1)*fSize); /* initialization */ fillArray(psi_d, 0.0, nGridPoints+1); fillArray(z_d, 1.0, nGridPoints+1); checkCudaError("Initialization of grid"); // for timing purposes cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); // start timer cudaEventRecord(start,0); /* Call the poisson solver, right hand side * is stored on the device in z_d (make sure the data * is copied from CPU to GPU!), result is stored in * psi_d (on the GPU/device). * Here NX-2 is the width of the grid's interior * (without the boundaries). */ cuPoisson((Nx-2), psi_d, z_d); // stop timer cudaEventRecord(stop,0); cudaEventSynchronize(stop); float computationTime; cudaEventElapsedTime(&computationTime, start, stop); printf("Computation time was %.5f seconds.\n\n", computationTime/1000.0); printf("Writing result to disk...\n"); // write result to file writeBinaryFile(Nx, Ny, psi_d, "data.dat"); printf("done\n"); return EXIT_SUCCESS; }
double dslashCUDA(int niter) { cudaEvent_t start, end; cudaEventCreate(&start); cudaEventRecord(start, 0); cudaEventSynchronize(start); for (int i = 0; i < niter; i++) { switch (test_type) { case 0: parity = QUDA_EVEN_PARITY; if (transfer){ //dslashQuda(spinorOdd, spinorEven, &inv_param, parity); } else { dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); } break; case 1: parity = QUDA_ODD_PARITY; if (transfer){ //MatPCQuda(spinorOdd, spinorEven, &inv_param); } else { dirac->Dslash(*cudaSpinorOut, *cudaSpinor, parity); } break; case 2: if (transfer){ //MatQuda(spinorGPU, spinor, &inv_param); } else { dirac->M(*cudaSpinorOut, *cudaSpinor); } } } cudaEventCreate(&end); cudaEventRecord(end, 0); cudaEventSynchronize(end); float runTime; cudaEventElapsedTime(&runTime, start, end); cudaEventDestroy(start); cudaEventDestroy(end); double secs = runTime / 1000; //stopwatchReadSeconds(); // check for errors cudaError_t stat = cudaGetLastError(); if (stat != cudaSuccess) errorQuda("with ERROR: %s\n", cudaGetErrorString(stat)); return secs; }
// use_cuda_time = 1: use cudaEventElapsedTime() // or use getSystemTime() void test_2gpu(float *d_send_data, float *d_recv_data, int size, int id0, int id1, bool use_cuda_time) { if(use_cuda_time) { cudaEvent_t start_event, stop_event; float time_memcpy; // version I //cudaEventCreate(&start_event); //cudaEventCreate(&stop_event); //cudaEventRecord(start_event, 0); // version II int eventflags = cudaEventBlockingSync; cudaEventCreateWithFlags(&start_event, eventflags); cudaEventCreateWithFlags(&stop_event, eventflags); cudaEventRecord(start_event, 0); for(int i=0; i<CNT; i++) { cudaMemcpy(d_recv_data, d_send_data, size*sizeof(float), cudaMemcpyDeviceToDevice); } std::cout << "hello, use_cuda_time" << std::endl; cudaEventRecord(stop_event, 0); cudaEventSynchronize(stop_event); cudaEventElapsedTime(&time_memcpy, start_event, stop_event); // ms std::cout << "Time is " << time_memcpy/1000. << "s" << std::endl; std::cout << "GPU" << id0 << " ---> GPU" << id1 << " :" << WIDTH*HEIGHT*sizeof(float)*CNT*1000./(1024*1024*time_memcpy) << "MB/s" << std::endl; cudaEventDestroy(start_event); cudaEventDestroy(stop_event); } else { //cudaEvent_t start_event; //cudaEventCreate(&start_event); long long start = getSystemTime(); for(int i=0; i<CNT; i++) { cudaMemcpy(d_recv_data, d_send_data, size*sizeof(float), cudaMemcpyDeviceToDevice); //cudaMemcpyPeer(d_recv_data, id1, d_send_data, id0, size*sizeof(float)); } //cudaEventRecord(start_event, 0); //cudaEventSynchronize(start_event); long long end = getSystemTime(); std::cout << "Time is " << (end-start)/1000. << "s" << std::endl; std::cout << "GPU" << id0 << " ---> GPU" << id1 << " :" << WIDTH*HEIGHT*sizeof(float)*CNT*1000./(1024*1024*(end - start+1)) << "MB/s" << std::endl; } //WIDTH*HEIGHT*4.*CNT/(1000*(end - start)) << "Mb/s" << std::endl; }
/**
 * Records the stop event on m_csStreamID, waits for it, and returns the
 * time elapsed since the start event.
 *
 * @param bResetStart when true, the start event is re-recorded afterwards
 * @return elapsed time as reported by cudaEventElapsedTime()
 */
float CCudaTimeMeasure::GetTimeout(bool bResetStart/* = false*/)
{
	float fMeasured = 0.0f;

	cudaCheckError(cudaEventRecord(m_ceStopEvent, m_csStreamID));
	cudaCheckError(cudaEventSynchronize(m_ceStopEvent));
	cudaCheckError(cudaEventElapsedTime(&fMeasured, m_ceStartEvent, m_ceStopEvent));

	if (!bResetStart)
	{
		return fMeasured;
	}

	// Begin a new interval starting now.
	cudaCheckError(cudaEventRecord(m_ceStartEvent, m_csStreamID));
	return fMeasured;
}
void _runBenchmark(int iterations) { // once without timing to prime the device if (!useCpu) { m_nbody->update(activeParams.m_timestep); } if (useCpu) { sdkCreateTimer(&timer); sdkStartTimer(&timer); } else { checkCudaErrors(cudaEventRecord(startEvent, 0)); } for (int i = 0; i < iterations; ++i) { m_nbody->update(activeParams.m_timestep); } float milliseconds = 0; if (useCpu) { sdkStopTimer(&timer); milliseconds = sdkGetTimerValue(&timer); sdkStartTimer(&timer); } else { checkCudaErrors(cudaEventRecord(stopEvent, 0)); checkCudaErrors(cudaEventSynchronize(stopEvent)); checkCudaErrors(cudaEventElapsedTime(&milliseconds, startEvent, stopEvent)); } double interactionsPerSecond = 0; double gflops = 0; computePerfStats(interactionsPerSecond, gflops, milliseconds, iterations); printf("%d bodies, total time for %d iterations: %.3f ms, mean %f\n", numBodies, iterations, milliseconds, milliseconds/iterations); printf("= %.3f billion interactions per second\n", interactionsPerSecond); printf("= %.3f %s-precision GFLOP/s at %d flops per interaction\n", gflops, (sizeof(T) > 4) ? "double" : "single", flopsPerInteraction); }
void OneBodyJastrowOrbitalBspline::calcGradient (MCWalkerConfiguration &W, int iat, vector<GradType> &grad) { CudaReal sim_cell_radius = W.Lattice.SimulationCellRadius; vector<Walker_t*> &walkers = W.WalkerList; if (OneGradHost.size() < OHMMS_DIM*walkers.size()) { OneGradHost.resize (walkers.size()*OHMMS_DIM); OneGradGPU.resize (walkers.size()*OHMMS_DIM, 1.25); } bool zero = true; for (int group=0; group<NumCenterGroups; group++) { int first = CenterFirst[group]; int last = CenterLast[group]; if (GPUSplines[group]) { CudaSpline<CudaReal> &spline = *(GPUSplines[group]); if (UsePBC) one_body_gradient_PBC (W.RList_GPU.data(), iat, C.data(), first, last, spline.coefs.data(), spline.coefs.size(), spline.rMax, L.data(), Linv.data(), sim_cell_radius, zero, OneGradGPU.data(), walkers.size()); else one_body_gradient (W.RList_GPU.data(), iat, C.data(), first, last, spline.coefs.data(), spline.coefs.size(), spline.rMax, zero, OneGradGPU.data(), walkers.size()); zero = false; } } // Copy data back to CPU memory gpu::streamsSynchronize(); OneGradHost.asyncCopy(OneGradGPU); cudaEventRecord(gpu::gradientSyncOneBodyEvent, gpu::memoryStream); }
/*
 * Starts the CUDA timer for the given CUDA event: synchronizes the device,
 * then records the event on stream 0.
 *
 * Compiles to a no-op unless NMFGPU_PROFILING_TRANSF or
 * NMFGPU_PROFILING_KERNELS is enabled.
 *
 * Returns EXIT_SUCCESS or EXIT_FAILURE.
 */
int start_cuda_timer_ev( cudaEvent_t timing_event )
{

	#if NMFGPU_PROFILING_TRANSF || NMFGPU_PROFILING_KERNELS

		/* Waits for *ALL* operations.
		 * NOTE: The CPU thread will block or spin according to flags
		 *	specified in init_GPU().
		 */
		cudaError_t status = cudaDeviceSynchronize();
		if ( cudaSuccess != status ) {
			print_error( sys_error_shown_by_all, "CUDA Error detected: %s\n", cudaGetErrorString(status) );
			return EXIT_FAILURE;
		}

		// Registers the current "timestamp".
		status = cudaEventRecord( timing_event, 0 );
		if ( cudaSuccess != status ) {
			print_error( sys_error_shown_by_all, "Error recording a CUDA event: %s\n", cudaGetErrorString(status) );
			return EXIT_FAILURE;
		}

	#endif /* if NMFGPU_PROFILING_TRANSF || NMFGPU_PROFILING_KERNELS */

	return EXIT_SUCCESS;

} // start_cuda_timer_ev
void stop() { if(!is_running_) { std::cerr << "error: timer is not running" << std::endl; return; } // if cudaEventRecord(custop_); } // stop()
// Queues a device-to-host copy of numElements elements on m_fetchStream
// and records m_fetchCompleteEvent after it on the same stream.
// NOTE(review): `|| "message"` presumably relies on an overloaded operator
// that reports a failed CUDA call — confirm against its definition.
void GPUDataTransferer<ElemType>::CopyGPUToCPUAsync(ElemType* gpuBuffer, size_t numElements, ElemType* cpuBuffer)
{
    PrepareDevice(m_deviceId);

    const size_t byteCount = numElements * sizeof(ElemType);

    cudaMemcpyAsync(cpuBuffer, gpuBuffer, byteCount, cudaMemcpyDeviceToHost, m_fetchStream) || "cudaMemcpyAsync failed";
    cudaEventRecord(m_fetchCompleteEvent, m_fetchStream) || "cudaEventRecord failed";
}
// Queues a host-to-device copy of numElements elements on m_assignStream
// and records m_assignCompleteEvent after it on the same stream.
// NOTE(review): `|| "message"` presumably relies on an overloaded operator
// that reports a failed CUDA call — confirm against its definition.
void GPUDataTransferer<ElemType>::CopyCPUToGPUAsync(ElemType* cpuBuffer, size_t numElements, ElemType* gpuBuffer)
{
    PrepareDevice(m_deviceId);

    const size_t byteCount = numElements * sizeof(ElemType);

    cudaMemcpyAsync(gpuBuffer, cpuBuffer, byteCount, cudaMemcpyHostToDevice, m_assignStream) || "cudaMemcpyAsync failed";
    cudaEventRecord(m_assignCompleteEvent, m_assignStream) || "cudaEventRecord failed";
}
// Records stop_ on stream_, waits for it, and returns the time elapsed
// since start_ as reported by cudaEventElapsedTime().
float TimerGPU::read()
{
  float elapsedMs;

  cudaEventRecord(stop_, stream_);
  cudaEventSynchronize(stop_);
  cudaEventElapsedTime(&elapsedMs, start_, stop_);

  return elapsedMs;
}
// Records timerStop on stream 0, waits for it, and returns the time
// elapsed since timerStart, truncated to an unsigned integer.
unsigned int GetTimeMillis ()
{
  cudaEventRecord(timerStop, 0);
  cudaEventSynchronize(timerStop);

  float msSinceStart;
  cudaEventElapsedTime(&msSinceStart, timerStart, timerStop);

  // fractional part is discarded
  return static_cast<unsigned int>(msSinceStart);
}
// Records the stop event on `stream` and flips the timer state from
// started to stopped. Asserts that the timer had been started.
void CudaTimer::Stop(cudaStream_t stream)
{
    assert(started);

    cudaEventRecord(stop, stream);

    started = false;
    stopped = true;
}
// Creates the timerStart / timerStop events and records timerStart on
// stream 0. Always returns 0.
unsigned int StartTimer ()
{
  cudaEventCreate(&timerStart);
  cudaEventCreate(&timerStop);

  // the measured interval begins here
  cudaEventRecord(timerStart, 0);

  return 0u;
}
//----------------------------------------------------------------------------- void CUDA::Timer::Stop () { cudaEventRecord(mStop, 0); cudaEventSynchronize(mStop); cudaEventElapsedTime(&mTime, mStart, mStop); mState = CT_STOPPED; }
//----------------------------------------------------------------------------// double CUDAImpl::_StopTimer() { cudaEventRecord(_stop, 0); cudaEventSynchronize(_stop); float time; cudaEventElapsedTime(&time, _start, _stop); return time; }
// Records `end`, waits for the device, and returns the seconds elapsed
// since `start`. The two events are then swapped, so the event just
// recorded becomes the start of the next split.
double CudaTimer::Split()
{
    cudaEventRecord(end);
    cudaDeviceSynchronize();

    float elapsedMs;
    cudaEventElapsedTime(&elapsedMs, start, end);

    start.Swap(end);

    // milliseconds -> seconds
    const double elapsedSec = elapsedMs / 1000.0;
    return elapsedSec;
}
/**
 * Record this event in a device stream and remember which stream it was.
 *
 * Asserts that the event has not been recorded already.
 *
 * @param stream native cuda stream
 */
void recordEvent(cudaStream_t stream)
{
    /* disallow double recording */
    assert(!isRecorded);

    this->stream = stream;
    isRecorded = true;

    CUDA_CHECK(cudaEventRecord(event, stream));
}
// Zeroes the bookkeeping fields, creates the start/stop/base events, and
// records the base event immediately.
CUDATimer() {
	is_running_ = false;
	start_ = stop_ = elapsed_ = 0.0;

	cudaEventCreate(&custart_);
	cudaEventCreate(&custop_);
	cudaEventCreate(&cubase_);

	cudaEventRecord(cubase_);
} // CUDATimer()