Exemple #1
0
// Test driver for "roundf4": declares the input/expected-result pairs x1..x10
// and runs DO_TEST on each of them.
// NOTE(review): the TEST_* / DEFINE_DATA* / DO_TEST macros are defined outside
// this chunk; the (name, input, expected result) reading of DEFINE_DATA used in
// the comments below is inferred from the values and the original comments --
// confirm against the macro definitions.
int main()
{
   // Not referenced by name in this function; presumably used inside DO_TEST.
   vec_float4 res_v;

   TEST_SET_START("164260798500","RUD", "roundf4");
 

 
   // Binary exponent 0: |x| = 2^0, already integral.
   DEFINE_DATA(x1, 1.0, 1.0f)
     DEFINE_DATA(x2, -1.0,-1.0f)
     // Binary exponent -1: |x| = 0.5, expected result is +/-1.0.
     DEFINE_DATA(x3, 0.5, 1.0f)
     DEFINE_DATA(x4, -0.5, -1.0f)
     // Binary exponent -2: 0.25 --> 0.0
     DEFINE_DATA(x5, 0.25, 0.0f)
     // Binary exponent -3: 0.125 --> 0.0
     DEFINE_DATA(x6, 0.125, 0.0f)
     // Bit pattern sign=0, e=128, f=0x7fffff: expected to come back unchanged.
     DEFINE_DATA_UNSIGNED(x7,0x7fffffff,0x7fffffff)
     // Bit pattern sign=0, e=-126, f=0: expected result is 0.
     DEFINE_DATA_UNSIGNED(x8, 0x800000,0x0)
     // Values just below 0.5 and just below 1.0.
     DEFINE_DATA(x9, 0.4999, 0.f)
     DEFINE_DATA(x10, 0.9999, 1.f)

     // Run every case. The second DO_TEST argument looks like a per-case id
     // (test-set id + sequence number + "RUD" tag) -- TODO confirm.
     TEST_START("roundf4");
     DO_TEST(x1,164260798501RUD)
     DO_TEST(x2,164260798502RUD)
     DO_TEST(x3,164260798503RUD)
     DO_TEST(x4,164260798504RUD)
     DO_TEST(x5,164260798505RUD)
     DO_TEST(x6,164260798506RUD)
     DO_TEST(x7,164260798507RUD)
     DO_TEST(x8,164260798508RUD)
     DO_TEST(x9,164260798509RUD)
     DO_TEST(x10,164260798510RUD)
     TEST_SET_DONE();

   
   TEST_EXIT();
}
Exemple #2
0
// Test driver for "irintf4": declares the input/expected-result pairs x1..x8
// and runs DO_TEST on each of them.
// NOTE(review): the TEST_* / DEFINE_DATA* / DO_TEST macros are defined outside
// this chunk; the (name, input, expected result) reading of DEFINE_DATA used in
// the comments below is inferred from the values and the original comments --
// confirm against the macro definitions.
int main()
{
   // Not referenced by name in this function; presumably used inside DO_TEST.
   vec_int4  res_v;

   TEST_SET_START("921537538600","RNT", "irintf4");

   /*
       Define the input values and their expected integer results.
   */
     // Binary exponent 0: |x| = 2^0, already integral.
   DEFINE_DATA(x1, 1.0, 1)
     DEFINE_DATA(x2, -1.0,-1)
     
     // Binary exponent -1: |x| = 0.5, expected result is 0 for both signs.
     DEFINE_DATA(x3, 0.5, 0)
     DEFINE_DATA(x4, -0.5, 0)
     // Binary exponent -2: 0.25 --> 0
     DEFINE_DATA(x5, 0.25, 0)
     // Binary exponent -3: 0.125 --> 0
     DEFINE_DATA(x6, 0.125, 0)

     // Bit pattern sign=0, e=27, f=0 (i.e. 2^27) --> 134217728
     DEFINE_DATA_UNSIGNED(x7, 0x4d000000,134217728)
     // Bit pattern sign=0, e=-126, f=0 --> 0
     DEFINE_DATA_UNSIGNED(x8, 0x800000,0)

   /* Run every case. The second DO_TEST argument looks like a per-case id
      (test-set id + sequence number + "RNT" tag) -- TODO confirm. */
   TEST_START("irintf4");
   
   DO_TEST(x1,921537538601RNT)
     DO_TEST(x2,921537538602RNT)
     DO_TEST(x3,921537538603RNT)
     DO_TEST(x4,921537538604RNT)
     DO_TEST(x5,921537538605RNT)
     DO_TEST(x6,921537538606RNT)
     DO_TEST(x7,921537538607RNT)
     DO_TEST(x8,921537538608RNT)
     
   TEST_SET_DONE();
 
   TEST_EXIT();
}
Exemple #3
0
// Computes the sine of both double-precision lanes of `x`, following the
// Cephes scheme: octant-based range reduction, then a sine or cosine
// polynomial chosen independently for each lane.
//
// The input must be in domain [-1686629712, 1686629712].
//
// A `magic`-number based double to int conversion was tried as a replacement
// for `_mm_cvttpd_epi32()`, but it was slower and did not offer a greater
// domain for `x`.
static SIMD_INLINE __m128d sin_cephes_pd(__m128d x) {
    SIMD_CONST_SQ(sign     , SIMD_UINT64_C(0x8000000000000000));
    SIMD_CONST_SQ(inv_sign , SIMD_UINT64_C(0x7FFFFFFFFFFFFFFF));
    SIMD_CONST_SI(int32_one, 1);
    SIMD_CONST_SD(4_DIV_PI , 1.27323954473516268615107010698);
    SIMD_CONST_SD(DP1      , 7.85398125648498535156e-1);
    SIMD_CONST_SD(DP2      , 3.77489470793079817668e-8);
    SIMD_CONST_SD(DP3      , 2.69515142907905952645e-15);

// Builds a table of four rows, 16 doubles (eight `__m128d`) each. Row `r` is
// the coefficient set for lane mask `r`: a set bit picks the `y*` values for
// that lane, a clear bit picks the `x*` values.
#define DEFINE_DATA(name, x0, x1, x2, x3, x4, x5, xm, xa, y0, y1, y2, y3, y4, y5, ym, ya) \
  SIMD_ALIGN_VAR(static const double, name[], 16) = { \
    x0, x0, x1, x1, x2, x2, x3, x3, x4, x4, x5, x5, xm, xm, xa, xa, \
    y0, x0, y1, x1, y2, x2, y3, x3, y4, x4, y5, x5, ym, xm, ya, xa, \
    x0, y0, x1, y1, x2, y2, x3, y3, x4, y4, x5, y5, xm, ym, xa, ya, \
    y0, y0, y1, y1, y2, y2, y3, y3, y4, y4, y5, y5, ym, ym, ya, ya  \
  }

    // First eight values: sine set (six polynomial terms, multiplier, addend).
    // Last eight values: cosine set in the same layout.
    DEFINE_DATA(sincos_coeff,
                1.58962301576546568060e-10,-2.50507477628578072866e-8,
                2.75573136213857245213e-6 ,-1.98412698295895385996e-4,
                8.33333333332211858878e-3 ,-1.66666666666666307295e-1, 1.0, 0.0,

                -1.13585365213876817300e-11, 2.08757008419747316778e-9,
                -2.75573141792967388112e-7 , 2.48015872888517045348e-5,
                -1.38888888888730564116e-3 , 4.16666666666665929218e-2,-0.5, 1.0);

    // |x| and |x| * 4 / PI.
    const __m128d abs_x  = _mm_and_pd(x, SIMD_GET_PD(inv_sign));
    const __m128d scaled = _mm_mul_pd(abs_x, SIMD_GET_PD(4_DIV_PI));

    // Octant index j = trunc(scaled), rounded up to an even value:
    // j = (j + 1) & ~1.
    const __m128i one = SIMD_GET_PI(int32_one);
    const __m128i j32 = _mm_andnot_si128(
                            one,
                            _mm_add_epi32(_mm_cvttpd_epi32(scaled), one));

    // `j` as doubles (for the reduction) and duplicated into both halves of
    // each 64-bit lane (for the bit tricks below).
    const __m128d j_pd = _mm_cvtepi32_pd(j32);
    const __m128i j64  = _mm_unpacklo_epi32(j32, j32);

    // Result sign: the input's sign, flipped when `j & 4` (shifting by 61
    // moves bit 2 into bit 63), with everything but the sign bit cleared.
    const __m128d flip      = _mm_castsi128_pd(_mm_slli_epi64(j64, 61));
    const __m128d sign_bits = _mm_and_pd(_mm_xor_pd(x, flip), SIMD_GET_PD(sign));

    // Polynomial selection mask from `j & 2`:
    //   `0x0000000000000000` => `0    <= x <= PI/4` (sine polynomial)
    //   `0xFFFFFFFFFFFFFFFF` => `PI/4 <  x <= PI/2` (cosine polynomial)
    const __m128d poly_mask = _mm_castsi128_pd(
                                  _mm_srai_epi32(_mm_slli_epi32(j64, 30), 31));

    // Extended precision modular arithmetic:
    //   r = ((|x| - j * DP1) - j * DP2) - j * DP3
    __m128d r = _mm_sub_pd(abs_x, _mm_mul_pd(j_pd, SIMD_GET_PD(DP1)));
    r = _mm_sub_pd(r, _mm_mul_pd(j_pd, SIMD_GET_PD(DP2)));
    r = _mm_sub_pd(r, _mm_mul_pd(j_pd, SIMD_GET_PD(DP3)));

    // Pick the table row matching this pair of lanes; a row is 8 `__m128d`.
    const uintptr_t row = static_cast<uintptr_t>(_mm_movemask_pd(poly_mask));
    const __m128d* coeff = reinterpret_cast<const __m128d*>(sincos_coeff) + row * 8;

    // Evaluate the six-term polynomial in r^2.
    // NOTE(review): assumes `Simd128::mad(a, b, c)` computes `a * b + c` --
    // confirm against its definition.
    const __m128d rr = _mm_mul_pd(r, r);
    __m128d acc = coeff[0];
    acc = Simd128::mad(acc, rr, coeff[1]);
    acc = Simd128::mad(acc, rr, coeff[2]);
    acc = Simd128::mad(acc, rr, coeff[3]);
    acc = Simd128::mad(acc, rr, coeff[4]);
    acc = Simd128::mad(acc, rr, coeff[5]);
    acc = _mm_mul_pd(acc, rr);

    // Final factor: `r` on sine lanes, `r^2` on cosine lanes.
    const __m128d r_or_rr = _mm_or_pd(
                                _mm_and_pd(rr, poly_mask),
                                _mm_andnot_pd(poly_mask, r));

    // sine:   r^3 * P(r^2) + r   * 1.0 + 0.0
    // cosine: r^4 * Q(r^2) + r^2 * -0.5 + 1.0
    acc = _mm_mul_pd(acc, r_or_rr);
    acc = _mm_add_pd(acc, _mm_mul_pd(r_or_rr, coeff[6]));
    acc = _mm_add_pd(acc, coeff[7]);

    // Apply the sign computed above.
    return _mm_xor_pd(acc, sign_bits);
}
Exemple #4
0
// Test driver for "nextafterf4": declares the cases x1..x24 and runs DO_TEST
// on each of them.
// NOTE(review): the TEST_* / DEFINE_DATA* / DO_TEST macros are defined outside
// this chunk; the (name, start value, direction value, expected result) reading
// of DEFINE_DATA used in the comments below is inferred from the values and the
// original comments -- confirm against the macro definitions.
// Bit-pattern comments use the (sign, exponent, fraction) notation.
int main()
{
   // Not referenced by name in this function; presumably used inside DO_TEST.
   vec_float4 res_v;

   TEST_SET_START("958726589700","NAR", "nextafterf4");
 


   // * Equal operands: the value is expected to come back unchanged *
   // 1.0f --> 1.0f
   DEFINE_DATA(x1, 1.0f, 1.0f, 0x3f800000)
     // 0.0f --> 0.0f
     DEFINE_DATA(x2, 0.0f, 0.0f, 0x0)
     
     // * Increment *

     // -FLT_MAX --> next value toward zero (0xfffffffe)
     DEFINE_DATA_UNSIGNED(x3,0xffffffff, 0x0, 0xfffffffe)
     //(1, 40, 0) --> (1, 39, 7fffff)
     DEFINE_DATA_UNSIGNED(x4,0xd3800000, 0x0, 0xd37fffff)
     // (1,-40,0 ) --> (1,-41,0x7fffff)
     DEFINE_DATA_UNSIGNED(x5,0xab800000, 0x0, 0xab7fffff)
     //-FLT_MIN --> 0
     DEFINE_DATA_UNSIGNED(x6,0x80800000, 0x0, 0x0)
     //0.0f --> FLT_MIN
     DEFINE_DATA(x7, 0.0f, 1.0f, 0x800000)
     //-0.0f --> FLT_MIN
     DEFINE_DATA_UNSIGNED(x8, 0x80000000,  0x7fffffff, 0x800000)
     //FLT_MIN --> next value above it (0x800001)
     DEFINE_DATA_UNSIGNED(x9, 0x800000,  0x7fffffff, 0x800001)
     // (0, -41, 7fffff) --> (0, -40, 0)
     DEFINE_DATA_UNSIGNED(x10, 0x2b7fffff, 0x7fffffff, 0x2b800000)
     // (0, 40, 7fffff) --> (0, 41, 0)
     DEFINE_DATA_UNSIGNED(x11, 0x53ffffff, 0x7fffffff, 0x54000000)
     // FLT_MAX --> FLT_MAX (start already equals the direction value)
     DEFINE_DATA_UNSIGNED(x12,0x7fffffff,0x7fffffff,0x7fffffff)
     
     // * Decrement *

     // FLT_MAX --> FLT_MAX (same data as x12)
     DEFINE_DATA_UNSIGNED(x13,0x7fffffff,0x7fffffff,0x7fffffff)
     // FLT_MAX --> next value below it (0x7ffffffe)
     DEFINE_DATA_UNSIGNED(x14,0x7fffffff,0x0,0x7ffffffe)
     // (0, 41, 0) -->  (0, 40, 7fffff)
     DEFINE_DATA_UNSIGNED(x15,  0x54000000, 0x0, 0x53ffffff)
     // (0, -40, 0) -->  (0, -41, 7fffff)
     DEFINE_DATA_UNSIGNED(x16, 0x2b800000,0x0, 0x2b7fffff)
     // value just above FLT_MIN (0x800001) --> FLT_MIN
     DEFINE_DATA_UNSIGNED(x17,  0x800001, 0x800000, 0x800000)
     // FLT_MIN --> 0
     DEFINE_DATA_UNSIGNED(x18, 0x800000, 0x0, 0x0)
     // 0.0 --> -FLT_MIN
     DEFINE_DATA_UNSIGNED(x19, 0x0, 0xffffffff, 0x80800000)
     // -0.0 --> -FLT_MIN
     DEFINE_DATA_UNSIGNED(x20, 0x80000000, 0xffffffff, 0x80800000)
     //-FLT_MIN --> next value further from zero (0x80800001)
     DEFINE_DATA_UNSIGNED(x21, 0x80800000, 0xffffffff, 0x80800001)
     //  (1,-41,0x7fffff) --> (1,-40,0 ) 
     DEFINE_DATA_UNSIGNED(x22, 0xab7fffff, 0xffffffff, 0xab800000)
     // (1, 39, 7fffff) --> (1, 40, 0)
     DEFINE_DATA_UNSIGNED(x23, 0xd37fffff, 0xffffffff, 0xd3800000)
     // value next to -FLT_MAX (0xfffffffe) --> -FLT_MAX
     DEFINE_DATA_UNSIGNED(x24,0xfffffffe, 0xffffffff, 0xffffffff)
     

     // Run every case. The second DO_TEST argument looks like a per-case id
     // (test-set id + sequence number + "NAR" tag) -- TODO confirm.
     TEST_START("nextafterf4");
     DO_TEST(x1,958726589701NAR)
     DO_TEST(x2,958726589702NAR)
     DO_TEST(x3,958726589703NAR)
     DO_TEST(x4,958726589704NAR)
     DO_TEST(x5,958726589705NAR)
     DO_TEST(x6,958726589706NAR)
     DO_TEST(x7,958726589707NAR)
     DO_TEST(x8,958726589708NAR)
     DO_TEST(x9,958726589709NAR)
     DO_TEST(x10,958726589710NAR)
     DO_TEST(x11,958726589711NAR)
     DO_TEST(x12,958726589712NAR)
     DO_TEST(x13,958726589713NAR)
     DO_TEST(x14,958726589714NAR)
     DO_TEST(x15,958726589715NAR)
     DO_TEST(x16,958726589716NAR)
     DO_TEST(x17,958726589717NAR)
     DO_TEST(x18,958726589718NAR)
     DO_TEST(x19,958726589719NAR)
     DO_TEST(x20,958726589720NAR)
     DO_TEST(x21,958726589721NAR)
     DO_TEST(x22,958726589722NAR)
     DO_TEST(x23,958726589723NAR)
     DO_TEST(x24,958726589724NAR)

       TEST_SET_DONE();

   
   TEST_EXIT();
}