/* Start cycle counting for counter ewc; no-op when wc is NULL. */
void wallcycle_start(gmx_wallcycle_t wc, int ewc)
{
    if (wc == NULL)
    {
        return;
    }

#ifdef GMX_MPI
    /* Optional barrier so ranks start the counter together. */
    if (wc->wc_barrier)
    {
        MPI_Barrier(wc->mpi_comm_mygroup);
    }
#endif

    const gmx_cycles_t now = gmx_cycles_read();
    wc->wcc[ewc].start     = now;

    if (wc->wcc_all != NULL)
    {
        wc->wc_depth++;
        if (ewc == ewcRUN)
        {
            wallcycle_all_start(wc, ewc, now);
        }
        else if (wc->wc_depth == 3)
        {
            wallcycle_all_stop(wc, ewc, now);
        }
    }
}
/* Stop cycle counting for counter ewc and accumulate the elapsed cycles.
 *
 * Returns the number of cycles elapsed since the matching
 * wallcycle_start() call, or 0 when wc is NULL or the counter went
 * backwards (see below).
 */
double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
{
    gmx_cycles_t cycle, last;

    if (wc == NULL)
    {
        return 0;
    }

#ifdef GMX_MPI
    /* Optional barrier so ranks stop the counter together. */
    if (wc->wc_barrier)
    {
        MPI_Barrier(wc->mpi_comm_mygroup);
    }
#endif

    cycle = gmx_cycles_read();
    /* When a thread migrates between cores with unsynchronized cycle
     * counters, the stop reading can be smaller than the start reading.
     * The unsigned subtraction would then wrap around to a huge bogus
     * count, so clamp negative differences to zero (matching the
     * guarded wallcycle_stop variant elsewhere in this file).
     */
    if (cycle >= wc->wcc[ewc].start)
    {
        last = cycle - wc->wcc[ewc].start;
    }
    else
    {
        last = 0;
    }
    wc->wcc[ewc].c += last;
    wc->wcc[ewc].n++;

    if (wc->wcc_all)
    {
        wc->wc_depth--;
        if (ewc == ewcRUN)
        {
            wallcycle_all_stop(wc, ewc, cycle);
        }
        else if (wc->wc_depth == 2)
        {
            wallcycle_all_start(wc, ewc, cycle);
        }
    }

    return last;
}
/* Record the start stamp for sub-counter ewcs; safe to call with wc == NULL. */
void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
{
    if (wc == NULL)
    {
        return;
    }
    wc->wcsc[ewcs].start = gmx_cycles_read();
}
/* Record the start stamp for sub-counter ewcs.
 * Does nothing when sub-counters are compiled out or wc is NULL. */
void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
{
    const bool counterActive = useCycleSubcounters && wc != NULL;
    if (counterActive)
    {
        wc->wcsc[ewcs].start = gmx_cycles_read();
    }
}
/* Close a DD load-balance region whose CPU part was already closed while
 * GPU work was still outstanding.
 *
 * Adds the CPU cycles of the region plus an estimate of the GPU wait time
 * to the force-load counter, and registers the total GPU wait time for
 * redistribution with GPU sharing.
 *
 * dd                        - domain decomposition data (source of the region)
 * waitGpuCyclesInCpuRegion  - GPU wait cycles already measured inside the
 *                             CPU part of the region
 * waitedForGpu              - whether the caller actually blocked on the GPU
 *                             at the end of the region
 */
void ddCloseBalanceRegionGpu(const gmx_domdec_t *dd, float waitGpuCyclesInCpuRegion, DdBalanceRegionWaitedForGpu waitedForGpu)
{
    BalanceRegion *reg = getBalanceRegion(dd);
    if (reg->isOpen)
    {
        GMX_ASSERT(reg->isOpenOnGpu, "Can not close a non-open GPU balance region");
        GMX_ASSERT(!reg->isOpenOnCpu, "The GPU region should be closed after closing the CPU region");

        /* Cycles elapsed since the CPU part of the region was closed;
         * this is an upper bound on the time spent waiting for the GPU. */
        float waitGpuCyclesEstimate = gmx_cycles_read() - reg->cyclesLastCpu;
        if (waitedForGpu == DdBalanceRegionWaitedForGpu::no)
        {
            /* The actual time could be anywhere between 0 and
             * waitCyclesEstimate. Using half is the best we can do. */
            const float unknownWaitEstimateFactor = 0.5f;
            waitGpuCyclesEstimate *= unknownWaitEstimateFactor;
        }

        /* CPU cycles spent between opening and closing the CPU part. */
        float cyclesCpu = reg->cyclesLastCpu - reg->cyclesOpenCpu;
        dd_cycles_add(dd, cyclesCpu + waitGpuCyclesEstimate, ddCyclF);

        /* Register the total GPU wait time, to redistribute with GPU sharing */
        dd_cycles_add(dd, waitGpuCyclesInCpuRegion + waitGpuCyclesEstimate, ddCyclWaitGPU);

        /* Close the region */
        reg->isOpenOnGpu = false;
        reg->isOpen      = false;
    }
}
/* Accumulate cycles elapsed since the matching wallcycle_sub_start()
 * into sub-counter ewcs and bump its call count.
 * Does nothing when sub-counters are compiled out or wc is NULL. */
void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
{
    if (!useCycleSubcounters || wc == NULL)
    {
        return;
    }
    wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
    wc->wcsc[ewcs].n++;
}
void ddReopenBalanceRegionCpu(const gmx_domdec_t *dd) { BalanceRegion *reg = getBalanceRegion(dd); /* If the GPU is busy, don't reopen as we are overlapping with work */ if (reg->isOpen && !reg->isOpenOnGpu) { reg->cyclesOpenCpu = gmx_cycles_read(); } }
/* Stop cycle counting for counter ewc and accumulate the elapsed cycles.
 *
 * Returns the number of cycles elapsed since the matching
 * wallcycle_start() call, or 0 when wc is NULL or an invalid (negative)
 * cycle difference was detected.
 */
double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
{
    gmx_cycles_t cycle, last;

    if (wc == NULL)
    {
        return 0;
    }

#if GMX_MPI
    /* Optional barrier so ranks stop the counter together. */
    if (wc->wc_barrier)
    {
        MPI_Barrier(wc->mpi_comm_mygroup);
    }
#endif

#ifdef DEBUG_WCYCLE
    debug_stop_check(wc, ewc);
#endif

    /* When processes or threads migrate between cores, the cycle counting
     * can get messed up if the cycle counter on different cores are not
     * synchronized. When this happens we expect both large negative and
     * positive cycle differences. We can detect negative cycle differences.
     * Detecting too large positive counts is difficult, since count can be
     * large, especially for ewcRUN. If we detect a negative count,
     * we will not print the cycle accounting table.
     */
    cycle = gmx_cycles_read();
    if (cycle >= wc->wcc[ewc].start)
    {
        last = cycle - wc->wcc[ewc].start;
    }
    else
    {
        /* Counter went backwards: record nothing and flag the run so the
         * cycle accounting table is suppressed. */
        last                 = 0;
        wc->haveInvalidCount = TRUE;
    }
    wc->wcc[ewc].c += last;
    wc->wcc[ewc].n++;
    if (wc->wcc_all)
    {
        wc->wc_depth--;
        if (ewc == ewcRUN)
        {
            wallcycle_all_stop(wc, ewc, cycle);
        }
        else if (wc->wc_depth == 2)
        {
            wallcycle_all_start(wc, ewc, cycle);
        }
    }

    return last;
}
void ddOpenBalanceRegionCpu(const gmx_domdec_t *dd, DdAllowBalanceRegionReopen gmx_unused allowReopen) { BalanceRegion *reg = getBalanceRegion(dd); if (dd->comm->bRecordLoad) { GMX_ASSERT(allowReopen == DdAllowBalanceRegionReopen::yes || !reg->isOpen, "Should not open an already opened region"); reg->cyclesOpenCpu = gmx_cycles_read(); reg->isOpen = true; reg->isOpenOnCpu = true; reg->isOpenOnGpu = false; } }
void ddCloseBalanceRegionCpu(const gmx_domdec_t *dd) { BalanceRegion *reg = getBalanceRegion(dd); if (reg->isOpen && reg->isOpenOnCpu) { GMX_ASSERT(reg->isOpenOnCpu, "Can only close an open region"); gmx_cycles_t cycles = gmx_cycles_read(); reg->isOpenOnCpu = false; if (reg->isOpenOnGpu) { /* Store the cycles for estimating the GPU/CPU overlap time */ reg->cyclesLastCpu = cycles; } else { /* We can close the region */ float cyclesCpu = cycles - reg->cyclesOpenCpu; dd_cycles_add(dd, cyclesCpu, ddCyclF); reg->isOpen = false; } } }
/* Return the cycles elapsed since the start stamp c was taken. */
static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
{
    const gmx_cycles_t now = gmx_cycles_read();
    return now - c;
}
/* Take a cycle-counter start stamp, to be paired with omp_cyc_end(). */
static gmx_cycles_t omp_cyc_start()
{
    const gmx_cycles_t stamp = gmx_cycles_read();
    return stamp;
}
/*! \brief Calculate number of seconds per cycle tick on host
 *
 * This routine runs a timer loop to calibrate the number of
 * seconds per the units returned from gmx_cycles_read().
 *
 * \param  sampletime Minimum real sample time. It takes some trial-and-error
 *                    to find the correct delay loop size, so the total runtime of
 *                    this routine is about twice this time.
 * \return Number of seconds per cycle unit. If it is not possible to
 *         calculate on this system (for whatever reason) the return value
 *         will be -1, so check that it is positive before using it.
 */
double gmx_cycles_calibrate(double sampletime)
{
#ifdef _MSC_VER

    /* Windows does not have gettimeofday, but it provides a special
     * routine that returns the cycle counter frequency.
     */
    LARGE_INTEGER i;

    QueryPerformanceFrequency(&i);

    return 1.0/static_cast<double>(i.QuadPart);
    /* end of MS Windows implementation */

#elif HAVE_GETTIMEOFDAY

    /*  generic implementation with gettimeofday() */
    struct timeval t1, t2;
    gmx_cycles_t   c1, c2;
    double         timediff, cyclediff;
    double         d = 0.1; /* Dummy variable so we don't optimize away delay loop */

    if (!gmx_cycles_have_counter())
    {
        return -1;
    }

#if (defined(__alpha__) || defined(__alpha))
    /* Alpha cannot count to more than 4e9, but I don't expect
     * that the architecture will go over 2GHz before it dies, so
     * up to 2.0 seconds of sampling should be safe.
     */
    if (sampletime > 2.0)
    {
        sampletime = 2.0;
    }
#endif

    /* Start a timing loop. We want this to be largely independent
     * of machine speed, so we need to start with a very small number
     * of iterations and repeat it until we reach the requested time.
     *
     * We call gettimeofday an extra time at the start to avoid cache misses.
     */
    gettimeofday(&t1, nullptr);
    gettimeofday(&t1, nullptr);
    c1 = gmx_cycles_read();

    do
    {
        /* just a delay loop. To avoid optimizing it away, we calculate a number
         * that will underflow to zero in most cases. By conditionally adding it
         * to a result at the end it cannot be removed. n=10000 is arbitrary...
         */
        for (int i = 0; i < 10000; i++)
        {
            d = d/(1.0+static_cast<double>(i));
        }
        /* Read the time again */
        gettimeofday(&t2, nullptr);
        c2       = gmx_cycles_read();
        timediff = static_cast<double>(t2.tv_sec-t1.tv_sec)+(t2.tv_usec-t1.tv_usec)*1e-6;
    }
    while (timediff < sampletime);

    cyclediff = c2-c1;

    /* Add a very small result so the delay loop cannot be optimized away */
    if (d < 1e-30)
    {
        timediff += d;
    }

    /* Return seconds per cycle */
    return timediff/cyclediff;

#else
    /* No timing function available */
    return -1;
#endif
}