Example #1
void wallcycle_start(gmx_wallcycle_t wc, int ewc)
{
    gmx_cycles_t cycle;

    if (wc == NULL)
    {
        return;
    }

#ifdef GMX_MPI
    if (wc->wc_barrier)
    {
        MPI_Barrier(wc->mpi_comm_mygroup);
    }
#endif

    cycle = gmx_cycles_read();
    wc->wcc[ewc].start = cycle;
    if (wc->wcc_all != NULL)
    {
        wc->wc_depth++;
        if (ewc == ewcRUN)
        {
            wallcycle_all_start(wc, ewc, cycle);
        }
        else if (wc->wc_depth == 3)
        {
            wallcycle_all_stop(wc, ewc, cycle);
        }
    }
}
Example #2
double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
{
    gmx_cycles_t cycle, last;
    
    if (wc == NULL)
    {
        return 0;
    }
    
#ifdef GMX_MPI
    if (wc->wc_barrier)
    {
        MPI_Barrier(wc->mpi_comm_mygroup);
    }
#endif
    
    cycle = gmx_cycles_read();
    last = cycle - wc->wcc[ewc].start;
    wc->wcc[ewc].c += last;
    wc->wcc[ewc].n++;
    if (wc->wcc_all)
    {
        wc->wc_depth--;
        if (ewc == ewcRUN)
        {
            wallcycle_all_stop(wc, ewc, cycle);
        }
        else if (wc->wc_depth == 2)
        {
            wallcycle_all_start(wc, ewc, cycle);
        }
    }

    return last;
}
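
Examples #1 and #2 are intended to be used as a bracketing pair around the region being timed. Below is a minimal usage sketch of the assumed call pattern; the counter index ewcFORCE and do_force_computation() are illustrative placeholders, not taken from the examples above.

/* Hypothetical usage sketch: bracket a code region with the start/stop
 * pair so that its cycles and call count accumulate in wc->wcc[ewcFORCE].
 */
wallcycle_start(wc, ewcFORCE);                 /* assumed counter index */
do_force_computation();                        /* hypothetical timed work */
double cycles = wallcycle_stop(wc, ewcFORCE);  /* cycles spent in the region */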
Example #3
void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
{
    if (wc != NULL)
    {
        wc->wcsc[ewcs].start = gmx_cycles_read();
    }
}
Example #4
void wallcycle_sub_start(gmx_wallcycle_t wc, int ewcs)
{
    if (useCycleSubcounters && wc != NULL)
    {
        wc->wcsc[ewcs].start = gmx_cycles_read();
    }
}
Example #5
void ddCloseBalanceRegionGpu(const gmx_domdec_t          *dd,
                             float                        waitGpuCyclesInCpuRegion,
                             DdBalanceRegionWaitedForGpu  waitedForGpu)
{
    BalanceRegion *reg = getBalanceRegion(dd);
    if (reg->isOpen)
    {
        GMX_ASSERT(reg->isOpenOnGpu, "Can not close a non-open GPU balance region");
        GMX_ASSERT(!reg->isOpenOnCpu, "The GPU region should be closed after closing the CPU region");

        float waitGpuCyclesEstimate = gmx_cycles_read() - reg->cyclesLastCpu;
        if (waitedForGpu == DdBalanceRegionWaitedForGpu::no)
        {
            /* The actual time could be anywhere between 0 and
             * waitGpuCyclesEstimate. Using half is the best we can do.
             */
            const float unknownWaitEstimateFactor = 0.5f;
            waitGpuCyclesEstimate *= unknownWaitEstimateFactor;
        }

        float cyclesCpu = reg->cyclesLastCpu - reg->cyclesOpenCpu;
        dd_cycles_add(dd, cyclesCpu + waitGpuCyclesEstimate, ddCyclF);

        /* Register the total GPU wait time, to redistribute with GPU sharing */
        dd_cycles_add(dd, waitGpuCyclesInCpuRegion + waitGpuCyclesEstimate, ddCyclWaitGPU);

        /* Close the region */
        reg->isOpenOnGpu = false;
        reg->isOpen      = false;
    }
}
Example #6
void wallcycle_sub_stop(gmx_wallcycle_t wc, int ewcs)
{
    if (useCycleSubcounters && wc != NULL)
    {
        wc->wcsc[ewcs].c += gmx_cycles_read() - wc->wcsc[ewcs].start;
        wc->wcsc[ewcs].n++;
    }
}
Example #7
void ddReopenBalanceRegionCpu(const gmx_domdec_t *dd)
{
    BalanceRegion *reg = getBalanceRegion(dd);
    /* If the GPU is busy, don't reopen as we are overlapping with work */
    if (reg->isOpen && !reg->isOpenOnGpu)
    {
        reg->cyclesOpenCpu = gmx_cycles_read();
    }
}
Example #8
double wallcycle_stop(gmx_wallcycle_t wc, int ewc)
{
    gmx_cycles_t cycle, last;

    if (wc == NULL)
    {
        return 0;
    }

#if GMX_MPI
    if (wc->wc_barrier)
    {
        MPI_Barrier(wc->mpi_comm_mygroup);
    }
#endif

#ifdef DEBUG_WCYCLE
    debug_stop_check(wc, ewc);
#endif

    /* When processes or threads migrate between cores, the cycle counting
     * can get messed up if the cycle counters on different cores are not
     * synchronized. When this happens we expect both large negative and
     * positive cycle differences. We can detect negative cycle differences.
     * Detecting too large positive counts is difficult, since counts can be
     * large, especially for ewcRUN. If we detect a negative count,
     * we will not print the cycle accounting table.
     */
    cycle                    = gmx_cycles_read();
    if (cycle >= wc->wcc[ewc].start)
    {
        last                 = cycle - wc->wcc[ewc].start;
    }
    else
    {
        last                 = 0;
        wc->haveInvalidCount = TRUE;
    }
    wc->wcc[ewc].c          += last;
    wc->wcc[ewc].n++;
    if (wc->wcc_all)
    {
        wc->wc_depth--;
        if (ewc == ewcRUN)
        {
            wallcycle_all_stop(wc, ewc, cycle);
        }
        else if (wc->wc_depth == 2)
        {
            wallcycle_all_start(wc, ewc, cycle);
        }
    }

    return last;
}
Example #9
void ddOpenBalanceRegionCpu(const gmx_domdec_t                    *dd,
                            DdAllowBalanceRegionReopen gmx_unused  allowReopen)
{
    BalanceRegion *reg = getBalanceRegion(dd);
    if (dd->comm->bRecordLoad)
    {
        GMX_ASSERT(allowReopen == DdAllowBalanceRegionReopen::yes || !reg->isOpen, "Should not open an already opened region");

        reg->cyclesOpenCpu = gmx_cycles_read();
        reg->isOpen        = true;
        reg->isOpenOnCpu   = true;
        reg->isOpenOnGpu   = false;
    }
}
Example #10
void ddCloseBalanceRegionCpu(const gmx_domdec_t *dd)
{
    BalanceRegion *reg = getBalanceRegion(dd);
    if (reg->isOpen && reg->isOpenOnCpu)
    {
        GMX_ASSERT(reg->isOpenOnCpu, "Can only close an open region");
        gmx_cycles_t cycles = gmx_cycles_read();
        reg->isOpenOnCpu    = false;

        if (reg->isOpenOnGpu)
        {
            /* Store the cycles for estimating the GPU/CPU overlap time */
            reg->cyclesLastCpu = cycles;
        }
        else
        {
            /* We can close the region */
            float cyclesCpu   = cycles - reg->cyclesOpenCpu;
            dd_cycles_add(dd, cyclesCpu, ddCyclF);
            reg->isOpen       = false;
        }
    }
}
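
Examples #9, #10, #7 and #5 together make up the lifecycle of a load-balancing region. The sketch below shows one assumed call order for a step where CPU work overlaps a GPU launch; launchGpuKernel(), doCpuWork(), waitForGpu(), and the code that marks the region open on the GPU are hypothetical placeholders not shown in these examples.

/* Hypothetical call order for one step with CPU/GPU overlap: */
ddOpenBalanceRegionCpu(dd, DdAllowBalanceRegionReopen::no);
launchGpuKernel();            /* hypothetical; reg->isOpenOnGpu is set elsewhere */
doCpuWork();                  /* hypothetical CPU work overlapping the GPU */
ddCloseBalanceRegionCpu(dd);  /* stores cyclesLastCpu while the GPU part stays open */
waitForGpu();                 /* hypothetical; Example #5 estimates this wait itself */
ddCloseBalanceRegionGpu(dd, 0.0f /* no GPU wait inside the CPU region */,
                        DdBalanceRegionWaitedForGpu::yes);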
Example #11
static gmx_cycles_t omp_cyc_end(gmx_cycles_t c)
{
    return gmx_cycles_read() - c;
}
Example #12
static gmx_cycles_t omp_cyc_start()
{
    return gmx_cycles_read();
}
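
The two helpers in Examples #11 and #12 are meant to be used as a pair around the code being timed; a minimal sketch (the timed work is an illustrative placeholder):

gmx_cycles_t c = omp_cyc_start();
/* ... e.g. an OpenMP-parallel loop whose cost we want to measure ... */
gmx_cycles_t elapsed = omp_cyc_end(c);  /* cycles spent in the region */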
Example #13
/*! \brief Calculate number of seconds per cycle tick on host
 *
 *  This routine runs a timer loop to calibrate the number of
 *  seconds per cycle unit as returned from gmx_cycles_read().
 *
 *  \param  sampletime Minimum real sample time. It takes some trial-and-error
 *          to find the correct delay loop size, so the total runtime of
 *          this routine is about twice this time.
 *  \return Number of seconds per cycle unit. If it is not possible to
 *          calculate on this system (for whatever reason) the return value
 *          will be -1, so check that it is positive before using it.
 */
double
gmx_cycles_calibrate(double sampletime)
{
#ifdef _MSC_VER

    /* Windows does not have gettimeofday, but it provides a special
     * routine that returns the cycle counter frequency.
     */
    LARGE_INTEGER i;

    QueryPerformanceFrequency(&i);

    return 1.0/static_cast<double>(i.QuadPart);
    /* end of MS Windows implementation */

#elif HAVE_GETTIMEOFDAY

    /*  generic implementation with gettimeofday() */
    struct timeval t1, t2;
    gmx_cycles_t   c1, c2;
    double         timediff, cyclediff;
    double         d = 0.1; /* Dummy variable so we don't optimize away delay loop */

    if (!gmx_cycles_have_counter())
    {
        return -1;
    }

#if (defined(__alpha__) || defined(__alpha))
    /* Alpha cannot count to more than 4e9, but I don't expect
     * that the architecture will go over 2GHz before it dies, so
     * up to 2.0 seconds of sampling should be safe.
     */
    if (sampletime > 2.0)
    {
        sampletime = 2.0;
    }
#endif

    /* Start a timing loop. We want this to be largely independent
     * of machine speed, so we need to start with a very small number
     * of iterations and repeat it until we reach the requested time.
     *
     * We call gettimeofday an extra time at the start to avoid cache misses.
     */
    gettimeofday(&t1, nullptr);
    gettimeofday(&t1, nullptr);
    c1 = gmx_cycles_read();

    do
    {
        /* just a delay loop. To avoid optimizing it away, we calculate a number
         * that will underflow to zero in most cases. By conditionally adding it
         * to a result at the end it cannot be removed. n=10000 is arbitrary...
         */
        for (int i = 0; i < 10000; i++)
        {
            d = d/(1.0+static_cast<double>(i));
        }
        /* Read the time again */
        gettimeofday(&t2, nullptr);
        c2       = gmx_cycles_read();
        timediff = static_cast<double>(t2.tv_sec-t1.tv_sec)+(t2.tv_usec-t1.tv_usec)*1e-6;
    }
    while (timediff < sampletime);

    cyclediff = c2-c1;

    /* Add a very small result so the delay loop cannot be optimized away */
    if (d < 1e-30)
    {
        timediff += d;
    }

    /* Return seconds per cycle */
    return timediff/cyclediff;

#else
    /* No timing function available */
    return -1;
#endif
}
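
With the calibration factor from Example #13, raw counts from gmx_cycles_read() can be converted into wall time. A minimal sketch, assuming a 0.1 s sample time is acceptable:

double secondsPerCycle = gmx_cycles_calibrate(0.1);
gmx_cycles_t c0 = gmx_cycles_read();
/* ... work to be timed ... */
gmx_cycles_t c1 = gmx_cycles_read();
if (secondsPerCycle > 0)  /* calibration returns -1 when it is not possible */
{
    /* Elapsed wall time of the timed region, in seconds */
    double elapsedSeconds = static_cast<double>(c1 - c0)*secondsPerCycle;
}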