Exemplo n.º 1
0
 // Adds rhs to this matrix in place, one vector of the underlying storage at a time.
 // For every vector index the local vector is copied into a temporary V, the
 // matching vector of rhs is added to it, and the sum is written back.
 // Returns a reference to this matrix.
 Matrix &operator+=(const Matrix &rhs) {
     unsigned int idx = 0;
     while (idx < m_mem.vectorsCount()) {
         V sum(m_mem.vector(idx));
         const V addend(rhs.m_mem.vector(idx));
         sum += addend;
         m_mem.vector(idx) = sum;
         ++idx;
     }
     return *this;
 }
Exemplo n.º 2
0
 // Assigns a V constructed from val to every vector of the matrix storage.
 // The V is built once, before the loop, and reused for each store.
 // Returns a reference to this matrix.
 Matrix &operator=(const T &val) {
     V fill(val);
     unsigned int idx = 0;
     while (idx < m_mem.vectorsCount()) {
         m_mem.vector(idx) = fill;
         ++idx;
     }
     return *this;
 }
Exemplo n.º 3
0
// Checks Vec::sorted() in two phases.
//
// Phase 1: for every perm in [0, Vec::Size!) a vector `a` with pairwise distinct
// entries is constructed (the collision loops below guarantee distinctness) and
// a.sorted() is compared against `ref`, which is built from IndexesFromZero.
//
// Phase 2: 1000 random vectors are sorted with sorted() and compared against
// std::sort applied to the same entries stored in a Vc::Memory.
template<typename Vec> void testSort()
{
    typedef typename Vec::IndexType IndexType;

    const IndexType _ref(IndexesFromZero);
    Vec ref(_ref);
    Vec a;

    // maxPerm = Vec::Size! (number of orderings that are enumerated below).
    int maxPerm = 1;
    for (int x = Vec::Size; x > 0; --x) {
        maxPerm *= x;
    }
    for (int perm = 0; perm < maxPerm; ++perm) {
        // `rest` is consumed digit by digit: entry i takes rest % (Vec::Size - i).
        int rest = perm;
        for (int i = 0; i < Vec::Size; ++i) {
            // Start from the smallest value not yet used by a[0..i-1]. Whenever a
            // collision is found the candidate is bumped and the scan restarts
            // (j = -1 becomes 0 after the loop increment).
            a[i] = 0;
            for (int j = 0; j < i; ++j) {
                if (a[i] == a[j]) {
                    ++(a[i]);
                    j = -1;
                }
            }
            // Offset the candidate by the current digit of perm ...
            a[i] += rest % (Vec::Size - i);
            rest /= (Vec::Size - i);
            // ... and again skip forward past every value already in use.
            for (int j = 0; j < i; ++j) {
                if (a[i] == a[j]) {
                    ++(a[i]);
                    j = -1;
                }
            }
        }
        //std::cout << a << a.sorted() << std::endl;
        COMPARE(ref, a.sorted()) << ", a: " << a;
    }

    for (int repetition = 0; repetition < 1000; ++repetition) {
        Vec test = Vec::Random();
        Vc::Memory<Vec, Vec::Size> reference;
        reference.vector(0) = test;
        // Build the end iterator with pointer arithmetic instead of indexing the
        // one-past-the-end element via operator[].
        std::sort(&reference[0], &reference[0] + Vec::Size);
        ref = reference.vector(0);
        COMPARE(ref, test.sorted());
    }
}
Exemplo n.º 4
0
// Benchmark driver: computes a finite-difference derivative of sampled function
// values twice - once with a plain scalar loop and once with a manually
// vectorized loop - and prints cycle counts, FLOP/cycle, Byte/cycle and the
// resulting speedup.
//
// x_points, y_points, dy_points, h, N, fu() and printResults() are not declared
// in this function; they come from the surrounding file.
int main()
{
    {
      // Fill x_points with i * h and y_points with fu(x), one vector per iteration.
      // x_i starts as IndexesFromZero and advances by float_v::Size per step.
      float_v x_i(float_v::IndexType::IndexesFromZero());
      for ( unsigned int i = 0; i < x_points.vectorsCount(); ++i, x_i += float_v::Size ) {
        const float_v x = x_i * h;
        x_points.vector(i) = x;
        y_points.vector(i) = fu(x);
      }
    }

    // The returned pointer is advanced by float_v::Size - 1 entries; the matching
    // Vc::free call at the end of main undoes this offset.
    // NOTE(review): presumably this places &dy_points[1] on a vector boundary so that
    // the streaming stores below (to indices k * float_v::Size + 1) are aligned -
    // assumes Vc::malloc with AlignOnVector returns a vector-aligned pointer; confirm.
    dy_points = Vc::malloc<float, Vc::AlignOnVector>(N + float_v::Size - 1) + (float_v::Size - 1);

    // speedup first holds the scalar cycle count and is later divided by the
    // vectorized cycle count.
    double speedup;
    TimeStampCounter timer;

    { ///////// ignore this part - it only wakes up the CPU ////////////////////////////
        const float oneOver2h = 0.5f / h;

        // Set the borders explicitly using one-sided (forward / backward) differences.
        dy_points[0] = (y_points[1] - y_points[0]) / h;
        // GCC auto-vectorizes the following loop. It is interesting to see that both Vc::Scalar and
        // Vc::SSE are faster, though.
        for ( int i = 1; i < N - 1; ++i) {
            dy_points[i] = (y_points[i + 1] - y_points[i - 1]) * oneOver2h;
        }
        dy_points[N - 1] = (y_points[N - 1] - y_points[N - 2]) / h;
    } //////////////////////////////////////////////////////////////////////////////////

    {
        // Timed scalar reference implementation (same code as the warm-up above).
        std::cout << "\n" << std::setw(60) << "Classical finite difference method" << std::endl;
        timer.Start();

        const float oneOver2h = 0.5f / h;

        // Set the borders explicitly using one-sided (forward / backward) differences.
        dy_points[0] = (y_points[1] - y_points[0]) / h;
        // GCC auto-vectorizes the following loop. It is interesting to see that both Vc::Scalar and
        // Vc::SSE are faster, though.
        for ( int i = 1; i < N - 1; ++i) {
            // Central difference: (y[i + 1] - y[i - 1]) / 2h.
            dy_points[i] = (y_points[i + 1] - y_points[i - 1]) * oneOver2h;
        }
        dy_points[N - 1] = (y_points[N - 1] - y_points[N - 2]) / h;

        timer.Stop();
        printResults();
        // The throughput figures count 2 floating-point operations and
        // 2 * sizeof(float) bytes per point.
        std::cout << "cycle count: " << timer.Cycles()
            << " | " << static_cast<double>(N * 2) / timer.Cycles() << " FLOP/cycle"
            << " | " << static_cast<double>(N * 2 * sizeof(float)) / timer.Cycles() << " Byte/cycle"
            << "\n";
    }

    // Remember the scalar cycle count; it becomes the numerator of the speedup.
    speedup = timer.Cycles();
    {
        std::cout << std::setw(60) << "Vectorized finite difference method" << std::endl;
        timer.Start();

        // All the differentials require calculating (r - l) / 2h, where we calculate 1/2h as a
        // constant before the loop to avoid unnecessary calculations. Note that a good compiler can
        // already do this for you.
        const float_v oneOver2h = 0.5f / h;

        // Calculate the left border
        dy_points[0] = (y_points[1] - y_points[0]) / h;

        // Calculate the differentials streaming through the y and dy memory. The picture below
        // should give an idea of what values in y get read and what values are written to dy in
        // each iteration:
        //
        // y  [...................................]
        //     00001111222233334444555566667777
        //       00001111222233334444555566667777
        // dy [...................................]
        //      00001111222233334444555566667777
        //
        // The loop is manually unrolled four times to improve instruction level parallelism and
        // prefetching on architectures where four vectors fill one cache line. (Note that this
        // unrolling breaks auto-vectorization of the Vc::Scalar implementation when compiling with
        // GCC.)
        for (unsigned int i = 0; i < (y_points.entriesCount() - 2) / float_v::Size; i += 4) {
            // Prefetches make sure the data which is going to be used in 24/4 iterations is already
            // in the L1 cache. The prefetchForOneRead additionally instructs the CPU to not evict
            // these cache lines to L2/L3.
            Vc::prefetchForOneRead(&y_points[(i + 24) * float_v::Size]);

            // Calculate float_v::Size differentials per (right - left) / 2h.
            // NOTE(review): assumes vector(i, 2) reads vector i shifted by two entries, as the
            // diagram above depicts - confirm against the Vc::Memory documentation.
            const float_v dy0 = (y_points.vector(i + 0, 2) - y_points.vector(i + 0)) * oneOver2h;
            const float_v dy1 = (y_points.vector(i + 1, 2) - y_points.vector(i + 1)) * oneOver2h;
            const float_v dy2 = (y_points.vector(i + 2, 2) - y_points.vector(i + 2)) * oneOver2h;
            const float_v dy3 = (y_points.vector(i + 3, 2) - y_points.vector(i + 3)) * oneOver2h;

            // Use streaming stores to reduce the required memory bandwidth. Without streaming
            // stores the CPU would first have to load the cache line, where the store occurs, from
            // memory into L1, then overwrite the data, and finally write it back to memory. But
            // since we never actually need the data that the CPU fetched from memory we'd like to
            // keep that bandwidth free for real work. Streaming stores allow us to issue stores
            // which the CPU gathers in store buffers to form full cache lines, which then get
            // written back to memory directly without the costly read. Thus we make better use of
            // the available memory bandwidth.
            // The "+ 1" shifts the destination by one entry: the value computed from y[k] and
            // y[k + 2] is the differential at index k + 1.
            dy0.store(&dy_points[(i + 0) * float_v::Size + 1], Vc::Streaming);
            dy1.store(&dy_points[(i + 1) * float_v::Size + 1], Vc::Streaming);
            dy2.store(&dy_points[(i + 2) * float_v::Size + 1], Vc::Streaming);
            dy3.store(&dy_points[(i + 3) * float_v::Size + 1], Vc::Streaming);
        }

        // Process the last vector. Note that this works for any N because Vc::Memory adds padding
        // to y_points and dy_points such that the last scalar value is somewhere inside lastVector.
        // The correct right border value for dy_points is overwritten in the last step unless N is
        // a multiple of float_v::Size + 2.
        // y  [...................................]
        //                                  8888
        //                                    8888
        // dy [...................................]
        //                                   8888
        {
            const size_t i = y_points.vectorsCount() - 1;
            const float_v left = y_points.vector(i, -2);
            const float_v right = y_points.lastVector();
            // The destination index i * float_v::Size - 1 is stored with Vc::Unaligned.
            ((right - left) * oneOver2h).store(&dy_points[i * float_v::Size - 1], Vc::Unaligned);
        }

        // ... and finally the right border
        dy_points[N - 1] = (y_points[N - 1] - y_points[N - 2]) / h;

        timer.Stop();
        printResults();
        std::cout << "cycle count: " << timer.Cycles()
            << " | " << static_cast<double>(N * 2) / timer.Cycles() << " FLOP/cycle"
            << " | " << static_cast<double>(N * 2 * sizeof(float)) / timer.Cycles() << " Byte/cycle"
            << "\n";
    }
    // speedup = scalar cycles / vectorized cycles.
    speedup /= timer.Cycles();
    std::cout << "Speedup: " << speedup << "\n";

    // Undo the (float_v::Size - 1) offset applied after Vc::malloc before freeing.
    Vc::free(dy_points - float_v::Size + 1);
    return 0;
}