Ejemplo n.º 1
0
    inline static void updateLineX(HOOD_NEW& hoodNew, int indexEnd, HOOD_OLD& hoodOld, unsigned /* nanoStep */)
    {
        typedef LibFlatArray::short_vec<double, C> ShortVec;

        for (; hoodNew.index() < indexEnd; hoodNew += C, ++hoodOld) {
            ShortVec x = &hoodOld->x();
            ShortVec y = &hoodOld->y();
            ShortVec cReal = &hoodOld->cReal();
            ShortVec cImag = &hoodOld->cImag();

            for (int i = 0; i < ITERATIONS; ++i) {
                ShortVec cRealOld = cReal;
                cReal = cReal * cReal - cImag * cImag;
                cImag = ShortVec(2.0) * cImag * cRealOld;
            }

            for (const auto& j: hoodOld.weights(0)) {
                ShortVec weights;
                ShortVec otherX;
                ShortVec otherY;
                weights.load_aligned(j.second());
                otherX.gather(&hoodOld->x(), j.first());
                otherY.gather(&hoodOld->y(), j.first());
                cReal += otherX * weights;
                cImag += otherY * weights;
            }

            &hoodNew->x() << x;
            &hoodNew->y() << y;
            &hoodNew->cReal() << cReal;
            &hoodNew->cImag() << cImag;
        }
    }
Ejemplo n.º 2
0
 static void updateLineX(HOOD_NEW& hoodNew, int indexEnd, HOOD_OLD& hoodOld, unsigned /* nanoStep */)
 {
     for (int i = hoodOld.index(); i < indexEnd; ++i, ++hoodOld) {
         ShortVec tmp;
         tmp.load_aligned(&hoodNew->sum() + i * C);
         for (const auto& j: hoodOld.weights(0)) {
             ShortVec weights;
             ShortVec values;
             weights.load_aligned(j.second());
             values.gather(&hoodOld->value(), j.first());
             tmp += values * weights;
         }
         tmp.store_aligned(&hoodNew->sum() + i * C);
     }
 }
Ejemplo n.º 3
0
 static void updateLineX(HOOD_NEW& hoodNew, int indexEnd, HOOD_OLD& hoodOld, unsigned /* nanoStep */)
 {
     for (int i = hoodOld.index(); i < indexEnd; ++i, ++hoodOld) {
         ShortVec tmp;
         tmp.load_aligned(&hoodNew->sum() + i * C);
         for (const auto& j: hoodOld.weights(0)) {
             ShortVec weights;
             ShortVec values;
             weights.load_aligned(j.second());
             // fixme: is this gahter actually correct? shouldn't we use offset 0 for the gather? see also hpxperformancetests/main.cpp
             values.gather(&hoodOld->value(), j.first());
             tmp += values * weights;
         }
         tmp.store_aligned(&hoodNew->sum() + i * C);
     }
 }
Ejemplo n.º 4
0
void testImplementationInt()
{
    typedef short_vec<CARGO, ARITY> ShortVec;
    const int numElements = ShortVec::ARITY * 10;

    std::vector<CARGO> vec1(numElements);
    std::vector<CARGO> vec2(numElements, 4711);

    // init vec1:
    for (int i = 0; i < numElements; ++i) {
        vec1[i] = i;
    }

    // test default c-tor:
    for (int i = 0; i < numElements; ++i) {
        BOOST_TEST(4711 == vec2[i]);
    }
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v;
        &vec2[i] << v;
    }
    for (int i = 0; i < numElements; ++i) {
        BOOST_TEST(0 == vec2[i]);
    }

    // tests vector load/store:
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        &vec2[i] << v;
    }
    for (int i = 0; i < numElements; ++i) {
        BOOST_TEST_EQ(i, vec2[i]);
    }

    // tests scalar load, vector add:
    ShortVec w = vec1[1];

    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        &vec2[i] << (v + w);
    }
    for (int i = 0; i < numElements; ++i) {
        BOOST_TEST_EQ((i + 1), vec2[i]);
    }

    // tests +=
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        ShortVec w = &vec2[i];
        v += w;
        &vec2[i] << v;
    }
    for (int i = 0; i < numElements; ++i) {
        BOOST_TEST_EQ((2 * i + 1), vec2[i]);
    }

    // test -
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        ShortVec w = &vec2[i];
        &vec2[i] << (v - w);
    }
    for (int i = 0; i < numElements; ++i) {
        BOOST_TEST_EQ((-i - 1), vec2[i]);
    }

    // test -=
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        ShortVec w = &vec2[i];
        v -= w;
        &vec2[i] << v;
    }
    for (int i = 0; i < numElements; ++i) {
        BOOST_TEST_EQ((2 * i + 1), vec2[i]);
    }

    // test *
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        ShortVec w = &vec2[i];
        &vec2[i] << (v * w);
    }
    for (int i = 0; i < numElements; ++i) {
        int reference = (i * (2 * i + 1));
        BOOST_TEST_EQ(reference, vec2[i]);
    }

    // test *=
    for (int i = 0; i < numElements; ++i) {
        vec2[i] = i + 2;
    }
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        ShortVec w = &vec2[i];
        v *= w;
        &vec2[i] << v;
    }
    for (int i = 0; i < numElements; ++i) {
        BOOST_TEST_EQ(i * (i + 2), vec2[i]);
    }

    // test /
    for (int i = 0; i < numElements; ++i) {
        vec1[i] = 4 * (i + 1);
        vec2[i] = (i + 1);
    }
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        ShortVec w = &vec2[i];
        &vec2[i] << (v / w);
    }
    for (int i = 0; i < numElements; ++i) {
        BOOST_TEST_EQ(4, vec2[i]);
    }

    // test /=
    for (int i = 0; i < numElements; ++i) {
        vec1[i] = 4 * (i + 1);
        vec2[i] = (i + 1);
    }
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        ShortVec w = &vec2[i];
        v /= w;
        &vec2[i] << v;
    }
    for (int i = 0; i < numElements; ++i) {
        BOOST_TEST_EQ(4, vec2[i]);
    }

    // test sqrt()
    for (int i = 0; i < numElements; ++i) {
        vec1[i] = i * i;
    }
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        &vec2[i] << sqrt(v);
    }
    for (int i = 0; i < numElements; ++i) {
        BOOST_TEST_EQ(i, vec2[i]);
    }

    // test "/ sqrt()"
    for (int i = 0; i < numElements; ++i) {
        vec1[i] = (i + 1) * (i + 1);
        vec2[i] = (i + 1) * 2;
    }
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        ShortVec w = &vec2[i];
        &vec2[i] << w / sqrt(v);
    }
    for (int i = 0; i < numElements; ++i) {
        BOOST_TEST_EQ(2, vec2[i]);
    }

    // test string conversion
    for (int i = 0; i < ShortVec::ARITY; ++i) {
        vec1[i] = i + 5;
    }
    ShortVec v(&vec1[0]);
    std::ostringstream buf1;
    buf1 << v;

    std::ostringstream buf2;
    buf2 << "[";
    for (int i = 0; i < (ShortVec::ARITY - 1); ++i) {
        buf2 << (i + 5) << ", ";
    }
    buf2 << (ShortVec::ARITY - 1 + 5) << "]";

    BOOST_TEST(buf1.str() == buf2.str());

    // test gather
    {
        CARGO array[ARITY * 10];
        std::vector<int, aligned_allocator<int, 64> > indices(ARITY);
        CARGO actual[ARITY];
        CARGO expected[ARITY];
        std::memset(array, '\0', sizeof(CARGO) * ARITY * 10);

        for (int i = 0; i < ARITY * 10; ++i) {
            if (i % 10 == 0) {
                array[i] = i + 5;
            }
        }

        for (int i = 0; i < ARITY; ++i) {
            indices[i] = i * 10;
            expected[i] = (i * 10) + 5;
        }

        ShortVec vec;
        vec.gather(array, &indices[0]);
        actual << vec;

        for (int i = 0; i < ARITY; ++i) {
            BOOST_TEST_EQ(actual[i], expected[i]);
        }
    }

#ifdef LIBFLATARRAY_WITH_CPP14
    // test gather via initializer_list
    {
        CARGO actual1[ARITY];
        CARGO actual2[ARITY];
        CARGO expected[ARITY];
        for (int i = 0; i < ARITY; ++i) {
            expected[i] = (i * 10) + 5;
        }

        // max: 32
        ShortVec vec1 = { 5, 15, 25, 35, 45, 55, 65, 75,
                          85, 95, 105, 115, 125, 135, 145, 155,
                          165, 175, 185, 195, 205, 215, 225, 235,
                          245, 255, 265, 275, 285, 295, 305, 315 };
        ShortVec vec2;
        vec2 = { 5, 15, 25, 35, 45, 55, 65, 75,
                 85, 95, 105, 115, 125, 135, 145, 155,
                 165, 175, 185, 195, 205, 215, 225, 235,
                 245, 255, 265, 275, 285, 295, 305, 315 };
        actual1 << vec1;
        actual2 << vec2;
        for (int i = 0; i < ARITY; ++i) {
            BOOST_TEST_EQ(actual1[i], expected[i]);
            BOOST_TEST_EQ(actual2[i], expected[i]);
        }
    }
#endif

    // test scatter
    {
        ShortVec vec;
        CARGO array[ARITY * 10];
        CARGO expected[ARITY * 10];
        std::vector<int, aligned_allocator<int, 64> > indices(ARITY);
        std::memset(array,    '\0', sizeof(CARGO) * ARITY * 10);
        std::memset(expected, '\0', sizeof(CARGO) * ARITY * 10);
        for (int i = 0; i < ARITY * 10; ++i) {
            if (i % 10 == 0) {
                expected[i] = i + 5;
            }
        }
        for (int i = 0; i < ARITY; ++i) {
            indices[i] = i * 10;
        }

        vec.gather(expected, &indices[0]);
        vec.scatter(array, &indices[0]);
        for (int i = 0; i < ARITY * 10; ++i) {
            BOOST_TEST_EQ(array[i], expected[i]);
        }
    }

    // test non temporal stores
    {
        std::vector<CARGO, aligned_allocator<CARGO, 64> > array(ARITY);
        std::vector<CARGO, aligned_allocator<CARGO, 64> > expected(ARITY);

        for (int i = 0; i < ARITY; ++i) {
            expected[i] = 5;
        }
        ShortVec v1 = 5;
        v1.store_nt(&array[0]);
        for (int i = 0; i < ARITY; ++i) {
            BOOST_TEST_EQ(array[i], expected[i]);
        }

        for (int i = 0; i < ARITY; ++i) {
            expected[i] = i;
        }
        ShortVec v2 = &expected[0];
        v2.store_nt(&array[0]);
        for (int i = 0; i < ARITY; ++i) {
            BOOST_TEST_EQ(array[i], expected[i]);
        }
    }

    // test aligned stores
    {
        std::vector<CARGO, aligned_allocator<CARGO, 64> > array(ARITY);
        std::vector<CARGO, aligned_allocator<CARGO, 64> > expected(ARITY);

        for (int i = 0; i < ARITY; ++i) {
            expected[i] = 5;
        }
        ShortVec v1 = 5;
        v1.store_aligned(&array[0]);
        for (int i = 0; i < ARITY; ++i) {
            BOOST_TEST_EQ(array[i], expected[i]);
        }

        for (int i = 0; i < ARITY; ++i) {
            expected[i] = i;
        }
        ShortVec v2 = &expected[0];
        v2.store_aligned(&array[0]);
        for (int i = 0; i < ARITY; ++i) {
            BOOST_TEST_EQ(array[i], expected[i]);
        }
    }

    // test aligned loads
    {
        std::vector<CARGO, aligned_allocator<CARGO, 64> > array(ARITY);
        std::vector<CARGO, aligned_allocator<CARGO, 64> > expected(ARITY);

        for (int i = 0; i < ARITY; ++i) {
            array[i]    = i;
            expected[i] = 0;
        }
        ShortVec v1;
        v1.load_aligned(&array[0]);
        v1.store(&expected[0]);
        for (int i = 0; i < ARITY; ++i) {
            BOOST_TEST_EQ(array[i], expected[i]);
        }
    }
}
Ejemplo n.º 5
0
void testImplementationReal()
{
    typedef short_vec<CARGO, ARITY> ShortVec;
    int numElements = ShortVec::ARITY * 10;

    std::vector<CARGO> vec1(numElements);
    std::vector<CARGO> vec2(numElements, 4711);

    // init vec1:
    for (int i = 0; i < numElements; ++i) {
        vec1[i] = i + 0.1;
    }

    // test default c-tor:
    for (int i = 0; i < numElements; ++i) {
        BOOST_TEST(4711 == vec2[i]);
    }
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v;
        &vec2[i] << v;
    }
    for (int i = 0; i < numElements; ++i) {
        BOOST_TEST(0 == vec2[i]);
    }

    // tests vector load/store:
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        &vec2[i] << v;
    }
    for (int i = 0; i < numElements; ++i) {
        TEST_REAL((i + 0.1), vec2[i]);
    }

    // tests scalar load, vector add:
    ShortVec w = vec1[0];

    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        &vec2[i] << (v + w);
    }
    for (int i = 0; i < numElements; ++i) {
        TEST_REAL((i + 0.2), vec2[i]);
    }

    // tests +=
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        ShortVec w = &vec2[i];
        v += w;
        &vec2[i] << v;
    }
    for (int i = 0; i < numElements; ++i) {
        TEST_REAL((2 * i + 0.3), vec2[i]);
    }

    // test -
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        ShortVec w = &vec2[i];
        &vec2[i] << (v - w);
    }
    for (int i = 0; i < numElements; ++i) {
        TEST_REAL((-i - 0.2), vec2[i]);
    }

    // test -=
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        ShortVec w = &vec2[i];
        v -= w;
        &vec2[i] << v;
    }
    for (int i = 0; i < numElements; ++i) {
        TEST_REAL((2 * i + 0.3), vec2[i]);
    }

    // test *
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        ShortVec w = &vec2[i];
        &vec2[i] << (v * w);
    }
    for (int i = 0; i < numElements; ++i) {
        double reference = ((i + 0.1) * (2 * i + 0.3));
        TEST_REAL(reference, vec2[i]);
    }

    // test *=
    for (int i = 0; i < numElements; ++i) {
        vec2[i] = i + 0.2;
    }
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        ShortVec w = &vec2[i];
        v *= w;
        &vec2[i] << v;
    }
    for (int i = 0; i < numElements; ++i) {
        TEST_REAL((i + 0.1) * (i + 0.2), vec2[i]);
    }

    // test /
    for (int i = 0; i < numElements; ++i) {
        vec2[i] = i + 0.2;
    }
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        ShortVec w = &vec2[i];
        &vec2[i] << (v / w);
    }
    for (int i = 0; i < numElements; ++i) {
        // accept lower accuracy for estimated division, really low
        // accuracy accepted because of results from ARM NEON:
        TEST_REAL_ACCURACY((i + 0.1) / (i + 0.2), vec2[i], 0.0025);
    }

    // test /=
    for (int i = 0; i < numElements; ++i) {
        vec2[i] = i + 0.2;
    }
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        ShortVec w = &vec2[i];
        v /= w;
        &vec2[i] << v;
    }
    for (int i = 0; i < numElements; ++i) {
        // here, too, lower accuracy is acceptable. As with divisions,
        // ARM NEON costs us an order of magnitude here compared to X86.
        TEST_REAL_ACCURACY((i + 0.1) / (i + 0.2), vec2[i], 0.0025);
    }

    // test sqrt()
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        &vec2[i] << sqrt(v);
    }
    for (int i = 0; i < numElements; ++i) {
        // lower accuracy, mainly for ARM NEON
        TEST_REAL_ACCURACY(std::sqrt(double(i + 0.1)), vec2[i], 0.0025);
    }

    // test "/ sqrt()"
    for (int i = 0; i < numElements; ++i) {
        vec2[i] = i + 0.2;
    }
    for (int i = 0; i < (numElements - ShortVec::ARITY + 1); i += ShortVec::ARITY) {
        ShortVec v = &vec1[i];
        ShortVec w = &vec2[i];
        &vec2[i] << w / sqrt(v);
    }
    for (int i = 0; i < numElements; ++i) {
        // the expression "foo / sqrt(bar)" will again result in an
        // estimated result for single precision floats, so lower accuracy is acceptable:
        TEST_REAL_ACCURACY((i + 0.2) / std::sqrt(double(i + 0.1)), vec2[i], 0.0035);
    }

    // test string conversion
    for (int i = 0; i < ShortVec::ARITY; ++i) {
        vec1[i] = i + 0.1;
    }
    ShortVec v(&vec1[0]);
    std::ostringstream buf1;
    buf1 << v;

    std::ostringstream buf2;
    buf2 << "[";
    for (int i = 0; i < (ShortVec::ARITY - 1); ++i) {
        buf2 << (i + 0.1) << ", ";
    }
    buf2 << (ShortVec::ARITY - 1 + 0.1) << "]";

    BOOST_TEST(buf1.str() == buf2.str());

    // test gather
    {
        CARGO array[ARITY * 10];
        std::vector<int, aligned_allocator<int, 64> > indices(ARITY);
        CARGO actual[ARITY];
        CARGO expected[ARITY];
        std::memset(array, '\0', sizeof(CARGO) * ARITY * 10);

        for (int i = 0; i < ARITY * 10; ++i) {
            if (i % 10 == 0) {
                array[i] = i * 0.75;
            }
        }

        for (int i = 0; i < ARITY; ++i) {
            indices[i] = i * 10;
            expected[i] = (i * 10) * 0.75;
        }

        ShortVec vec;
        vec.gather(array, &indices[0]);
        actual << vec;

        for (int i = 0; i < ARITY; ++i) {
            TEST_REAL_ACCURACY(actual[i], expected[i], 0.001);
        }
    }

#ifdef LIBFLATARRAY_WITH_CPP14
    // test gather via initializer_list
    {
        CARGO actual1[ARITY];
        CARGO actual2[ARITY];
        CARGO expected[ARITY];
        for (int i = 0; i < ARITY; ++i) {
            expected[i] = (i * 10) * 0.75;
        }

        // max: 32
        ShortVec vec1 = { 0.0, 7.5, 15.0, 22.50, 30.0, 37.5, 45.0, 52.5,
                          60.0, 67.5, 75.0, 82.5, 90.0, 97.5, 105.0, 112.5,
                          120.0, 127.5, 135.0, 142.5, 150.0, 157.5, 165.0, 172.5,
                          180.0, 187.5, 195.0, 202.5, 210.0, 217.5, 225.0, 232.5 };
        ShortVec vec2;
        vec2 = { 0.0, 7.5, 15.0, 22.50, 30.0, 37.5, 45.0, 52.5,
                 60.0, 67.5, 75.0, 82.5, 90.0, 97.5, 105.0, 112.5,
                 120.0, 127.5, 135.0, 142.5, 150.0, 157.5, 165.0, 172.5,
                 180.0, 187.5, 195.0, 202.5, 210.0, 217.5, 225.0, 232.5 };
        actual1 << vec1;
        actual2 << vec2;
        for (int i = 0; i < ARITY; ++i) {
            TEST_REAL_ACCURACY(actual1[i], expected[i], 0.001);
            TEST_REAL_ACCURACY(actual2[i], expected[i], 0.001);
        }
    }
#endif

    // test scatter
    {
        ShortVec vec;
        CARGO array[ARITY * 10];
        CARGO expected[ARITY * 10];
        std::vector<int, aligned_allocator<int, 64> > indices(ARITY);
        std::memset(array,    '\0', sizeof(CARGO) * ARITY * 10);
        std::memset(expected, '\0', sizeof(CARGO) * ARITY * 10);
        for (int i = 0; i < ARITY * 10; ++i) {
            if (i % 10 == 0) {
                expected[i] = i * 0.75;
            }
        }
        for (int i = 0; i < ARITY; ++i) {
            indices[i] = i * 10;
        }

        vec.gather(expected, &indices[0]);
        vec.scatter(array, &indices[0]);
        for (int i = 0; i < ARITY * 10; ++i) {
            TEST_REAL_ACCURACY(array[i], expected[i], 0.001);
        }
    }

    // test non temporal stores
    {
        std::vector<CARGO, aligned_allocator<CARGO, 64> > array(ARITY);
        std::vector<CARGO, aligned_allocator<CARGO, 64> > expected(ARITY);

        for (int i = 0; i < ARITY; ++i) {
            expected[i] = 5.0;
        }
        ShortVec v1 = 5.0;
        v1.store_nt(&array[0]);
        for (int i = 0; i < ARITY; ++i) {
            TEST_REAL_ACCURACY(array[i], expected[i], 0.001);
        }

        for (int i = 0; i < ARITY; ++i) {
            expected[i] = i + 0.1;
        }
        ShortVec v2 = &expected[0];
        v2.store_nt(&array[0]);
        for (int i = 0; i < ARITY; ++i) {
            TEST_REAL_ACCURACY(array[i], expected[i], 0.001);
        }
    }

    // test aligned stores
    {
        std::vector<CARGO, aligned_allocator<CARGO, 64> > array(ARITY);
        std::vector<CARGO, aligned_allocator<CARGO, 64> > expected(ARITY);

        for (int i = 0; i < ARITY; ++i) {
            expected[i] = 5.0;
        }
        ShortVec v1 = 5.0;
        v1.store_aligned(&array[0]);
        for (int i = 0; i < ARITY; ++i) {
            TEST_REAL_ACCURACY(array[i], expected[i], 0.001);
        }

        for (int i = 0; i < ARITY; ++i) {
            expected[i] = i + 0.1;
        }
        ShortVec v2 = &expected[0];
        v2.store_aligned(&array[0]);
        for (int i = 0; i < ARITY; ++i) {
            TEST_REAL_ACCURACY(array[i], expected[i], 0.001);
        }
    }

    // test aligned loads
    {
        std::vector<CARGO, aligned_allocator<CARGO, 64> > array(ARITY);
        std::vector<CARGO, aligned_allocator<CARGO, 64> > expected(ARITY);

        for (int i = 0; i < ARITY; ++i) {
            array[i]    = i + 0.1;
            expected[i] = 0;
        }
        ShortVec v1;
        v1.load_aligned(&array[0]);
        v1.store(&expected[0]);
        for (int i = 0; i < ARITY; ++i) {
            TEST_REAL_ACCURACY(array[i], expected[i], 0.001);
        }
    }
}