예제 #1
0
파일: main.cpp 프로젝트: VcDevel/Vc
// Measures the callable f and prints its best observed throughput to std::cout.
//
// f is executed in 100 timed batches of 10 calls each and only the smallest cycle count of any
// batch is kept, so slower outlier batches do not influence the result. The value printed is
// N * N * (N + N - 1) * 10 divided by that minimum, i.e. FLOP per cycle when a single call of f is
// counted as N * N * (N + N - 1) floating-point operations. No newline or label is written; the
// caller is responsible for the surrounding output.
// NOTE(review): that operation count matches an N x N matrix product - confirm against the callers.
template <size_t N, typename F> Vc_ALWAYS_INLINE void benchmark(F &&f)
{
    const int batchCount = 100;
    const int callsPerBatch = 10;

    TimeStampCounter tsc;
    // The initial query only fixes the type of the counter value; it is immediately replaced by a
    // large sentinel from which the search for the minimum starts.
    auto fewestCycles = tsc.cycles();
    fewestCycles = 0x7fffffff;

    for (int batch = 0; batch != batchCount; ++batch) {
        tsc.start();
        for (int call = 0; call != callsPerBatch; ++call) {
            auto result = f();
            // presumably marks the result as used so the call is not flagged/optimized away -
            // TODO confirm against the definition of unused()
            unused(result);
        }
        tsc.stop();

        // keep the fastest batch seen so far
        const auto elapsed = tsc.cycles();
        if (elapsed < fewestCycles) {
            fewestCycles = elapsed;
        }
    }

    // operations executed inside one timed batch
    const size_t flopPerBatch = N * N * (N + N - 1) * callsPerBatch;
    std::cout << std::setw(19) << std::setprecision(3) << double(flopPerBatch) / fewestCycles;
}
예제 #2
0
파일: mandel.cpp 프로젝트: vitillo/vc
void MandelBase::run()
{
    while (!m_abort) {
        // first we copy the parameters to our local data so that the main main thread can give a
        // new task while we're working
        m_mutex.lock();
        // destination image, RGB is good - no need for alpha
        QImage image(m_size, QImage::Format_RGB32);
        float x = m_x;
        float y = m_y;
        float scale = m_scale;
        m_mutex.unlock();

        // benchmark the number of cycles it takes
        TimeStampCounter timer;
        timer.Start();

        // calculate the mandelbrot set/image
        mandelMe(image, x, y, scale, 255);

        timer.Stop();

        // if no new set was requested in the meantime - return the finished image
        if (!m_restart) {
            emit ready(image, timer.Cycles());
        }

        // wait for more work
        m_mutex.lock();
        if (!m_restart) {
            m_wait.wait(&m_mutex);
        }
        m_restart = false;
        m_mutex.unlock();
    }
}
예제 #3
0
// Benchmarks a central finite-difference derivative of the sampled function fu: first a plain
// scalar loop ("classical"), then a hand-vectorized Vc version. For each variant the cycle count and
// the derived FLOP/cycle and Byte/cycle figures are printed, followed by the speedup of the
// vectorized variant over the classical one. Always returns 0.
//
// NOTE(review): x_points, y_points, dy_points, N, h, fu and printResults are not declared inside
// this function; they are expected to be defined elsewhere in the file.
int main()
{
    {
      // Fill the sample points one vector at a time: x_i starts as the index vector (0, 1, 2, ...)
      // and advances by float_v::Size per iteration, so x = index * h and y = fu(x).
      float_v x_i(float_v::IndexType::IndexesFromZero());
      for ( unsigned int i = 0; i < x_points.vectorsCount(); ++i, x_i += float_v::Size ) {
        const float_v x = x_i * h;
        x_points.vector(i) = x;
        y_points.vector(i) = fu(x);
      }
    }

    // The result array is allocated by hand and the returned pointer is advanced by
    // float_v::Size - 1 entries. With an allocation that starts on a vector boundary (presumably
    // what Vc::AlignOnVector requests - TODO confirm) this places &dy_points[1] on a vector
    // boundary, which is where the streaming stores of the vectorized loop below begin. The offset
    // is undone again before the memory is freed at the end of main.
    dy_points = Vc::malloc<float, Vc::AlignOnVector>(N + float_v::Size - 1) + (float_v::Size - 1);

    // First holds the cycle count of the classical run, later divided by the vectorized cycle count
    double speedup;
    TimeStampCounter timer;

    { ///////// ignore this part - it only wakes up the CPU ////////////////////////////
        // Untimed warm-up: the same computation as the timed classical run below, whose stores
        // overwrite everything written here.
        const float oneOver2h = 0.5f / h;

        // set the borders explicitly, using one-sided (forward/backward) differences
        dy_points[0] = (y_points[1] - y_points[0]) / h;
        // GCC auto-vectorizes the following loop. It is interesting to see that both Vc::Scalar and
        // Vc::SSE are faster, though.
        for ( int i = 1; i < N - 1; ++i) {
            dy_points[i] = (y_points[i + 1] - y_points[i - 1]) * oneOver2h;
        }
        dy_points[N - 1] = (y_points[N - 1] - y_points[N - 2]) / h;
    } //////////////////////////////////////////////////////////////////////////////////

    {
        std::cout << "\n" << std::setw(60) << "Classical finite difference method" << std::endl;
        timer.Start();

        // 1/2h is computed once instead of dividing inside the loop
        const float oneOver2h = 0.5f / h;

        // set the borders explicitly, using one-sided (forward/backward) differences
        dy_points[0] = (y_points[1] - y_points[0]) / h;
        // GCC auto-vectorizes the following loop. It is interesting to see that both Vc::Scalar and
        // Vc::SSE are faster, though.
        for ( int i = 1; i < N - 1; ++i) {
            dy_points[i] = (y_points[i + 1] - y_points[i - 1]) * oneOver2h;
        }
        dy_points[N - 1] = (y_points[N - 1] - y_points[N - 2]) / h;

        timer.Stop();
        // Result output happens after timer.Stop() and therefore is not part of the measurement.
        printResults();
        // The throughput figures count 2 FLOP and 2 * sizeof(float) bytes per point.
        std::cout << "cycle count: " << timer.Cycles()
            << " | " << static_cast<double>(N * 2) / timer.Cycles() << " FLOP/cycle"
            << " | " << static_cast<double>(N * 2 * sizeof(float)) / timer.Cycles() << " Byte/cycle"
            << "\n";
    }

    // Remember the classical cycle count; the same timer object is reused for the vectorized run.
    speedup = timer.Cycles();
    {
        std::cout << std::setw(60) << "Vectorized finite difference method" << std::endl;
        timer.Start();

        // All the differentials require to calculate (r - l) / 2h, where we calculate 1/2h as a
        // constant before the loop to avoid unnecessary calculations. Note that a good compiler can
        // already do this for you.
        const float_v oneOver2h = 0.5f / h;

        // Calculate the left border
        dy_points[0] = (y_points[1] - y_points[0]) / h;

        // Calculate the differentials streaming through the y and dy memory. The picture below
        // should give an idea of what values in y get read and what values are written to dy in
        // each iteration:
        //
        // y  [...................................]
        //     00001111222233334444555566667777
        //       00001111222233334444555566667777
        // dy [...................................]
        //      00001111222233334444555566667777
        //
        // The loop is manually unrolled four times to improve instruction level parallelism and
        // prefetching on architectures where four vectors fill one cache line. (Note that this
        // unrolling breaks auto-vectorization of the Vc::Scalar implementation when compiling with
        // GCC.)
        // NOTE(review): i counts vectors and advances by 4 while the body touches vectors i..i+3;
        // if the loop bound is not a multiple of 4 the last iteration reaches past it - confirm
        // that N is chosen accordingly.
        for (unsigned int i = 0; i < (y_points.entriesCount() - 2) / float_v::Size; i += 4) {
            // Prefetches make sure the data which is going to be used in 24/4 iterations is already
            // in the L1 cache. The prefetchForOneRead additionally instructs the CPU to not evict
            // these cache lines to L2/L3.
            Vc::prefetchForOneRead(&y_points[(i + 24) * float_v::Size]);

            // calculate float_v::Size differentials per (right - left) / 2h; the second argument
            // of vector() presumably shifts the load by that many entries, so vector(i, 2) holds
            // the right-hand neighbours of the entries centred between it and vector(i) - TODO
            // confirm against the Vc::Memory documentation
            const float_v dy0 = (y_points.vector(i + 0, 2) - y_points.vector(i + 0)) * oneOver2h;
            const float_v dy1 = (y_points.vector(i + 1, 2) - y_points.vector(i + 1)) * oneOver2h;
            const float_v dy2 = (y_points.vector(i + 2, 2) - y_points.vector(i + 2)) * oneOver2h;
            const float_v dy3 = (y_points.vector(i + 3, 2) - y_points.vector(i + 3)) * oneOver2h;

            // Use streaming stores to reduce the required memory bandwidth. Without streaming
            // stores the CPU would first have to load the cache line, where the store occurs, from
            // memory into L1, then overwrite the data, and finally write it back to memory. But
            // since we never actually need the data that the CPU fetched from memory we'd like to
            // keep that bandwidth free for real work. Streaming stores allow us to issue stores
            // which the CPU gathers in store buffers to form full cache lines, which then get
            // written back to memory directly without the costly read. Thus we make better use of
            // the available memory bandwidth.
            // The "+ 1" accounts for dy[k + 1] being the derivative centred between y[k] and
            // y[k + 2].
            dy0.store(&dy_points[(i + 0) * float_v::Size + 1], Vc::Streaming);
            dy1.store(&dy_points[(i + 1) * float_v::Size + 1], Vc::Streaming);
            dy2.store(&dy_points[(i + 2) * float_v::Size + 1], Vc::Streaming);
            dy3.store(&dy_points[(i + 3) * float_v::Size + 1], Vc::Streaming);
        }

        // Process the last vector. Note that this works for any N because Vc::Memory adds padding
        // to y_points and dy_points such that the last scalar value is somewhere inside lastVector.
        // The correct right border value for dy_points is overwritten in the last step unless N is
        // a multiple of float_v::Size + 2.
        // y  [...................................]
        //                                  8888
        //                                    8888
        // dy [...................................]
        //                                   8888
        {
            // Here the vector shifted by -2 supplies the left neighbours and the last vector the
            // right neighbours; the result starts one entry before the last vector and is thus
            // written with an unaligned store.
            const size_t i = y_points.vectorsCount() - 1;
            const float_v left = y_points.vector(i, -2);
            const float_v right = y_points.lastVector();
            ((right - left) * oneOver2h).store(&dy_points[i * float_v::Size - 1], Vc::Unaligned);
        }

        // ... and finally the right border
        dy_points[N - 1] = (y_points[N - 1] - y_points[N - 2]) / h;

        timer.Stop();
        // As above, the output is produced after the timer was stopped.
        printResults();
        std::cout << "cycle count: " << timer.Cycles()
            << " | " << static_cast<double>(N * 2) / timer.Cycles() << " FLOP/cycle"
            << " | " << static_cast<double>(N * 2 * sizeof(float)) / timer.Cycles() << " Byte/cycle"
            << "\n";
    }
    // classical cycles / vectorized cycles
    speedup /= timer.Cycles();
    std::cout << "Speedup: " << speedup << "\n";

    // Undo the float_v::Size - 1 offset applied after allocation to recover the original pointer.
    Vc::free(dy_points - float_v::Size + 1);
    return 0;
}
예제 #4
0
파일: main.cpp 프로젝트: lduhem/Vc
// Renders the image into m_image and saves it to disk.
//
// A grid of starting points c is sampled over a fixed rectangle of the upper complex half-plane.
// For every c that leaves the cut-off radius within the configured iteration window, the orbit of
// c and of its mirror point (real, -imag) is replayed and each visited position is added as a dot
// to a Canvas; the iteration index decides which colour channels a dot contributes to. The Canvas
// is then converted into m_image, the elapsed cycles are logged via qDebug, and the image is saved
// under m_filename (a name derived from the parameters is generated if m_filename is empty).
//
// Two implementations are provided: a scalar one (when the Scalar macro is defined) and a Vc
// vectorized one that handles float_v::Size imaginary steps at once.
void Baker::createImage()
{
    const int iHeight = m_image.height();
    const int iWidth  = m_image.width();

    // Parameters Begin
    // cut-off value compared against fastNorm(z)
    const float S = 4.f;
    // number of sample steps along the real [0] and imaginary [1] axis; an option value of -1
    // selects the default sqrt(size) * size
    const float nSteps[2]   = {
        static_cast<float>(m_opt.steps[0] == -1 ? std::sqrt(iWidth) * iWidth : m_opt.steps[0]),
        static_cast<float>(m_opt.steps[1] == -1 ? std::sqrt(iHeight) * iHeight : m_opt.steps[1])
    };
    // per colour channel (red, green, blue): the inclusive range of iteration indices whose dots
    // contribute to that channel
    const int upperBound[3] = { m_opt.red[1], m_opt.green[1], m_opt.blue[1] };
    const int lowerBound[3] = { m_opt.red[0], m_opt.green[0], m_opt.blue[0] };
    // a starting point is only drawn if it needed at least overallLowerBound and at most
    // maxIterations iterations to reach the cut-off
    int overallLowerBound = m_opt.it[0];
    int maxIterations = m_opt.it[1];// maxOf(maxOf(overallLowerBound, upperBound[0]), maxOf(upperBound[1], upperBound[2]));
    // sampled rectangle; only imag >= 0 is sampled, the lower half is covered via the mirror point
    float realMin = -2.102613f;
    float realMax =  1.200613f;
    float imagMin = 0.f;
    float imagMax = 1.23971f;
    // Parameters End

    // the measurement covers the whole computation including the conversion to a QImage
    TimeStampCounter timer;
    timer.start();

    // helper constants
    // no dot is replayed beyond the largest per-channel upper bound
    const int overallUpperBound = maxOf(upperBound[0], maxOf(upperBound[1], upperBound[2]));
    const float maxX = static_cast<float>(iWidth ) - 1.f;
    const float maxY = static_cast<float>(iHeight) - 1.f;
    // scale factors from the complex plane (offset by m_x / m_y) to pixel coordinates
    const float xFact = iWidth / m_width;
    const float yFact = iHeight / m_height;
    const float realStep = (realMax - realMin) / nSteps[0];
    const float imagStep = (imagMax - imagMin) / nSteps[1];

    Canvas canvas(iHeight, iWidth);
#ifdef Scalar
    // Scalar implementation: one starting point per inner iteration.
    for (float real = realMin; real <= realMax; real += realStep) {
        // progress is reported per column of starting points and tops out at 99
        m_progress.setValue(99.f * (real - realMin) / (realMax - realMin));
        for (float imag = imagMin; imag <= imagMax; imag += imagStep) {
            Z c(real, imag);
            Z c2 = Z(1.08f * real + 0.15f, imag);
            // cheap rejection of starting points inside two fixed regions - presumably
            // approximations of the largest parts of the Mandelbrot set, where the escape test
            // below would never succeed (TODO confirm)
            if (fastNorm(Z(real + 1.f, imag)) < 0.06f || (std::real(c2) < 0.42f && fastNorm(c2) < 0.417f)) {
                continue;
            }
            // first pass: count the iterations n until the cut-off is reached
            // (P is presumably the iteration step z -> z * z + c; verify against its definition)
            Z z = c;
            int n;
            for (n = 0; n <= maxIterations && fastNorm(z) < S; ++n) {
                z = P(z, c);
            }
            if (n <= maxIterations && n >= overallLowerBound) {
                // point is outside of the Mandelbrot set and required enough (overallLowerBound)
                // iterations to reach the cut-off value S
                // second pass: replay the orbit of c and of its mirror point cn and draw both
                Z cn(real, -imag);
                Z zn = cn;
                z = c;
                for (int i = 0; i <= overallUpperBound; ++i) {
                    const float y2 = (std::imag(z) - m_y) * yFact;
                    const float yn2 = (std::imag(zn) - m_y) * yFact;
                    // both dots are only drawn if both of them fall inside the image
                    if (y2 >= 0.f && y2 < maxY && yn2 >= 0.f && yn2 < maxY) {
                        const float x2 = (std::real(z) - m_x) * xFact;
                        if (x2 >= 0.f && x2 < maxX) {
                            // the iteration index selects the colour channels of the dot
                            const int red   = (i >= lowerBound[0] && i <= upperBound[0]) ? 1 : 0;
                            const int green = (i >= lowerBound[1] && i <= upperBound[1]) ? 1 : 0;
                            const int blue  = (i >= lowerBound[2] && i <= upperBound[2]) ? 1 : 0;
                            canvas.addDot(x2, y2 , red, green, blue);
                            canvas.addDot(x2, yn2, red, green, blue);
                        }
                    }
                    z = P(z, c);
                    zn = P(zn, cn);
                    if (fastNorm(z) >= S) { // optimization: skip some useless looping
                        break;
                    }
                }
            }
        }
    }
#else
    // Vectorized implementation: every lane of a float_v handles a different imaginary value.
    // The lanes start at imagMin + imagStep * (0, 1, 2, ...) and advance by float_v::Size steps.
    const float imagStep2 = imagStep * float_v::Size;
    const float_v imagMin2 = imagMin + imagStep * static_cast<float_v>(int_v::IndexesFromZero());
    for (float real = realMin; real <= realMax; real += realStep) {
        // progress is reported per column of starting points and tops out at 99
        m_progress.setValue(99.f * (real - realMin) / (realMax - realMin));
        // the loop stops as soon as any lane has passed imagMax
        for (float_v imag = imagMin2; all_of(imag <= imagMax); imag += imagStep2) {
            // FIXME: extra "tracks" if nSteps[1] is not a multiple of float_v::Size
            Z c(float_v(real), imag);
            Z c2 = Z(float_v(1.08f * real + 0.15f), imag);
            // same cheap rejection as in the scalar code, but the vector is skipped only if all of
            // its lanes are rejected
            if (all_of(fastNorm(Z(float_v(real + 1.f), imag)) < 0.06f ||
                       (std::real(c2) < 0.42f && fastNorm(c2) < 0.417f))) {
                continue;
            }
            // first pass: per-lane iteration count n; "inside" marks the lanes that have not yet
            // reached the cut-off, and only those lanes get their counter incremented
            // (presumably n(inside) is Vc's write-masked access - TODO confirm)
            Z z = c;
            int_v n(Vc::Zero);
            int_m inside = fastNorm(z) < S;
            while (!(inside && n <= maxIterations).isEmpty()) {
                z = P(z, c);
                ++n(inside);
                inside &= fastNorm(z) < S;
            }
            // from here on "inside" marks every lane that must not be drawn: lanes that never
            // reached the cut-off as well as lanes that reached it too early
            inside |= n < overallLowerBound;
            if (inside.isFull()) {
                continue;
            }
            // second pass: replay the orbit of c and of its mirror point cn and draw both
            Z cn(float_v(real), -imag);
            Z zn = cn;
            z = c;
            for (int i = 0; i <= overallUpperBound; ++i) {
                // pixel coordinates of the current positions, taken before the orbit is advanced
                const float_v y2 = (std::imag(z) - m_y) * yFact;
                const float_v yn2 = (std::imag(zn) - m_y) * yFact;
                const float_v x2 = (std::real(z) - m_x) * xFact;
                z = P(z, c);
                zn = P(zn, cn);
                // lanes to draw: accepted by the first pass and both dots inside the image
                const float_m drawMask = !inside && y2 >= 0.f && x2 >= 0.f && y2 < maxY && x2 < maxX && yn2 >= 0.f && yn2 < maxY;

                // the iteration index selects the colour channels of the dots
                const int red   = (i >= lowerBound[0] && i <= upperBound[0]) ? 1 : 0;
                const int green = (i >= lowerBound[1] && i <= upperBound[1]) ? 1 : 0;
                const int blue  = (i >= lowerBound[2] && i <= upperBound[2]) ? 1 : 0;

                // the dots are added one lane at a time for the lanes selected by drawMask
                for(int j : where(drawMask)) {
                    canvas.addDot(x2[j], y2 [j], red, green, blue);
                    canvas.addDot(x2[j], yn2[j], red, green, blue);
                }
                if (all_of(fastNorm(z) >= S)) {  // optimization: skip some useless looping
                    break;
                }
            }
        }
    }
#endif
    canvas.toQImage(&m_image);

    timer.stop();
    m_progress.done();
    qDebug() << timer.cycles() << "cycles";

    // Without an explicit file name, generate one that encodes the colour bounds, the step counts,
    // the iteration window and the image size.
    if (m_filename.isEmpty()) {
        m_filename = QString("r%1-%2_g%3-%4_b%5-%6_s%7-%8_i%9-%10_%11x%12.png")
            .arg(lowerBound[0]).arg(upperBound[0])
            .arg(lowerBound[1]).arg(upperBound[1])
            .arg(lowerBound[2]).arg(upperBound[2])
            .arg(nSteps[0]).arg(nSteps[1])
            .arg(overallLowerBound).arg(maxIterations)
            .arg(m_image.width()).arg(m_image.height());
    }

    // NOTE(review): the return value of save() is ignored, so a failed write goes unnoticed.
    m_image.save(m_filename);
}