Esempio n. 1
0
void * avx512_fma(void *args_in)
{
    /* Thread input */
    struct thread_args *args;

    const int n_avx512 = VFMAPS_LATENCY;
    const __m512 add0 = _mm512_set1_ps((float) 1e-6);
    const __m512 mul0 = _mm512_set1_ps((float) (1. + 1e-6));
    __m512 r[n_avx512];

    // Declare as volatile to prevent removal during optimisation
    volatile float result;

    long r_max, i;
    int j;
    double runtime, flops;
    Stopwatch *t;

    /* Read inputs */
    args = (struct thread_args *) args_in;

    t = stopwatch_create(args->timer_type);

    for (j = 0; j < n_avx512; j++) {
        r[j] = _mm512_set1_ps((float) j);
    }

    /* Add over registers r0-r4, multiply over r5-r9, and rely on pipelining,
     * OOO execution, and latency difference (3 vs 5 cycles) for 2x FLOPs
     */

    runtime_flag = 0;
    r_max = 1;
    do {
        pthread_barrier_wait(&timer_barrier);
        t->start(t);
        for (i = 0; i < r_max; i++) {
            #pragma unroll(n_avx512)
            for (j = 0; j < n_avx512; j++)
                r[j] = _mm512_fmadd_ps(r[j], mul0, add0);
        }
        t->stop(t);
        runtime = t->runtime(t);

        /* Set runtime flag if any thread exceeds runtime limit */
        if (runtime > args->min_runtime) {
            pthread_mutex_lock(&runtime_mutex);
            runtime_flag = 1;
            pthread_mutex_unlock(&runtime_mutex);
        }

        pthread_barrier_wait(&timer_barrier);
        if (!runtime_flag) r_max *= 2;

    } while (!runtime_flag);

    /* In order to prevent removal of the prior loop by optimisers,
     * sum the register values and save the result as volatile. */

    for (j = 0; j < n_avx512; j++)
        r[0] = _mm512_add_ps(r[0], r[j]);
    result = reduce_AVX512(r[0]);

    /* (iter) * (16 instr / reg) * (2 flops / instr) * (n_avx512 reg / iter) */
    flops = r_max * 16 * 2 * n_avx512 / runtime;

    /* Cleanup */
    t->destroy(t);

    /* Thread output */
    args->runtime = runtime;
    args->flops = flops;
    args->bw_load = 0.;
    args->bw_store = 0.;

    pthread_exit(NULL);
}