int main() { vec_float4 res_v; TEST_SET_START("164260798500","RUD", "roundf4"); //s=0 DEFINE_DATA(x1, 1.0, 1.0f) DEFINE_DATA(x2, -1.0,-1.0f) //s=-1 DEFINE_DATA(x3, 0.5, 1.0f) DEFINE_DATA(x4, -0.5, -1.0f) //s=-2 DEFINE_DATA(x5, 0.25, 0.0f) //s=-3 DEFINE_DATA(x6, 0.125, 0.0f) //s=0, e=128, f=7fffff --> s=0, e=128, f=7fffff DEFINE_DATA_UNSIGNED(x7,0x7fffffff,0x7fffffff) //s=0, e=-126, f=0 --> 0 DEFINE_DATA_UNSIGNED(x8, 0x800000,0x0) DEFINE_DATA(x9, 0.4999, 0.f) DEFINE_DATA(x10, 0.9999, 1.f) //TEST TEST_START("roundf4"); DO_TEST(x1,164260798501RUD) DO_TEST(x2,164260798502RUD) DO_TEST(x3,164260798503RUD) DO_TEST(x4,164260798504RUD) DO_TEST(x5,164260798505RUD) DO_TEST(x6,164260798506RUD) DO_TEST(x7,164260798507RUD) DO_TEST(x8,164260798508RUD) DO_TEST(x9,164260798509RUD) DO_TEST(x10,164260798510RUD) TEST_SET_DONE(); TEST_EXIT(); }
int main() { vec_int4 res_v; TEST_SET_START("921537538600","RNT", "irintf4"); /* Define original values and the results */ //s=0 DEFINE_DATA(x1, 1.0, 1) DEFINE_DATA(x2, -1.0,-1) //s=-1 DEFINE_DATA(x3, 0.5, 0) DEFINE_DATA(x4, -0.5, 0) //s=-2 DEFINE_DATA(x5, 0.25, 0) //s=-3 DEFINE_DATA(x6, 0.125, 0) //s=0, e=27, f=0 -> 134217728 DEFINE_DATA_UNSIGNED(x7, 0x4d000000,134217728) //s=0, e=-126, f=0 --> 0 DEFINE_DATA_UNSIGNED(x8, 0x800000,0) /* TEST */ TEST_START("irintf4"); DO_TEST(x1,921537538601RNT) DO_TEST(x2,921537538602RNT) DO_TEST(x3,921537538603RNT) DO_TEST(x4,921537538604RNT) DO_TEST(x5,921537538605RNT) DO_TEST(x6,921537538606RNT) DO_TEST(x7,921537538607RNT) DO_TEST(x8,921537538608RNT) TEST_SET_DONE(); TEST_EXIT(); }
// The input must be in domain [-1686629712, 1686629712]. // // I tried to optimize the double to int conversion by using `magic`, but // it was actually slower than using `_mm_cvttpd_epi32()` and it didn't // offer greater domain for `x`. static SIMD_INLINE __m128d sin_cephes_pd(__m128d x) { SIMD_CONST_SQ(sign , SIMD_UINT64_C(0x8000000000000000)); SIMD_CONST_SQ(inv_sign , SIMD_UINT64_C(0x7FFFFFFFFFFFFFFF)); SIMD_CONST_SI(int32_one, 1); SIMD_CONST_SD(4_DIV_PI , 1.27323954473516268615107010698); SIMD_CONST_SD(DP1 , 7.85398125648498535156e-1); SIMD_CONST_SD(DP2 , 3.77489470793079817668e-8); SIMD_CONST_SD(DP3 , 2.69515142907905952645e-15); #define DEFINE_DATA(name, x0, x1, x2, x3, x4, x5, xm, xa, y0, y1, y2, y3, y4, y5, ym, ya) \ SIMD_ALIGN_VAR(static const double, name[], 16) = { \ x0, x0, x1, x1, x2, x2, x3, x3, x4, x4, x5, x5, xm, xm, xa, xa, \ y0, x0, y1, x1, y2, x2, y3, x3, y4, x4, y5, x5, ym, xm, ya, xa, \ x0, y0, x1, y1, x2, y2, x3, y3, x4, y4, x5, y5, xm, ym, xa, ya, \ y0, y0, y1, y1, y2, y2, y3, y3, y4, y4, y5, y5, ym, ym, ya, ya \ } DEFINE_DATA(sincos_coeff, 1.58962301576546568060e-10,-2.50507477628578072866e-8, 2.75573136213857245213e-6 ,-1.98412698295895385996e-4, 8.33333333332211858878e-3 ,-1.66666666666666307295e-1, 1.0, 0.0, -1.13585365213876817300e-11, 2.08757008419747316778e-9, -2.75573141792967388112e-7 , 2.48015872888517045348e-5, -1.38888888888730564116e-3 , 4.16666666666665929218e-2,-0.5, 1.0); __m128d y; __m128d sign = x; // Sign bit. x = _mm_and_pd(x, SIMD_GET_PD(inv_sign)); // Take the absolute value. y = _mm_mul_pd(x, SIMD_GET_PD(4_DIV_PI)); // Integer part of `x * 4 / PI`. __m128i ival = _mm_cvttpd_epi32(y); // Extract the integer part of y. __m128i ione = SIMD_GET_PI(int32_one); ival = _mm_add_epi32(ival, ione); // j += 1. ival = _mm_andnot_si128(ione, ival); // j &=~1. y = _mm_cvtepi32_pd(ival); ival = _mm_unpacklo_epi32(ival, ival); sign = _mm_xor_pd(sign, // Swap the sign bit if `j & 4`. _mm_castsi128_pd(_mm_slli_epi64(ival, 61))); sign = _mm_and_pd(sign, SIMD_GET_PD(sign)); // Keep only the sign bit. // Get the polynom selection mask (j & 2): // 1. `0x0000000000000000` => `0 <= x <= PI/4` // 2. `0xFFFFFFFFFFFFFFFF` => `PI/4 < x <= PI/2` ival = _mm_slli_epi32(ival, 30); ival = _mm_srai_epi32(ival, 31); // Extended precision modular arithmetic: // x = ((x - y * DP1) - y * DP2) - y * DP3 x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP1))); x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP2))); x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP3))); // Get the polynom coefficients for each lane (sin/cos). __m128d poly_mask = _mm_castsi128_pd(ival); const __m128d* coeff = reinterpret_cast<const __m128d*>(sincos_coeff) + static_cast<uintptr_t>(_mm_movemask_pd(poly_mask)) * 8; __m128d xx = _mm_mul_pd(x, x); y = coeff[0]; y = Simd128::mad(y, xx, coeff[1]); y = Simd128::mad(y, xx, coeff[2]); y = Simd128::mad(y, xx, coeff[3]); y = Simd128::mad(y, xx, coeff[4]); y = Simd128::mad(y, xx, coeff[5]); y = _mm_mul_pd(y, xx); __m128d x_or_xx = _mm_or_pd( _mm_and_pd(xx, poly_mask), _mm_andnot_pd(poly_mask, x)); y = _mm_mul_pd(y, x_or_xx); y = _mm_add_pd(y, _mm_mul_pd(x_or_xx, coeff[6])); y = _mm_add_pd(y, coeff[7]); return _mm_xor_pd(y, sign); }
int main() { vec_float4 res_v; TEST_SET_START("958726589700","NAR", "nextafterf4"); // == // 1.0f --> 1.0f DEFINE_DATA(x1, 1.0f, 1.0f, 0x3f800000) DEFINE_DATA(x2, 0.0f, 0.0f, 0x0) // * Icrement * // -FLT_MAX -> DEFINE_DATA_UNSIGNED(x3,0xffffffff, 0x0, 0xfffffffe) //(1, 40, 0) --> (1, 39, 7fffff) DEFINE_DATA_UNSIGNED(x4,0xd3800000, 0x0, 0xd37fffff) // (1,-40,0 ) --> (1,-41,0x7fffff) DEFINE_DATA_UNSIGNED(x5,0xab800000, 0x0, 0xab7fffff) //-FLT_MIN --> 0 DEFINE_DATA_UNSIGNED(x6,0x80800000, 0x0, 0x0) //0.0f --> FLT_MIN DEFINE_DATA(x7, 0.0f, 1.0f, 0x800000) //-0.0f --> FLT_MIN DEFINE_DATA_UNSIGNED(x8, 0x80000000, 0x7fffffff, 0x800000) //FLT_MIN --> DEFINE_DATA_UNSIGNED(x9, 0x800000, 0x7fffffff, 0x800001) // (0, -41, 7fffff) --> (0, -40, 0) DEFINE_DATA_UNSIGNED(x10, 0x2b7fffff, 0x7fffffff, 0x2b800000) // (0, 40, 7fffff) --> (0, 41, 0) DEFINE_DATA_UNSIGNED(x11, 0x53ffffff, 0x7fffffff, 0x54000000) // FLT_MAX --> DEFINE_DATA_UNSIGNED(x12,0x7fffffff,0x7fffffff,0x7fffffff) // * Decrement * // FLT_MAX --> FLT_MAX DEFINE_DATA_UNSIGNED(x13,0x7fffffff,0x7fffffff,0x7fffffff) // FLT_MAX --> DEFINE_DATA_UNSIGNED(x14,0x7fffffff,0x0,0x7ffffffe) // (0, 41, 0) --> (0, 40, 7fffff) DEFINE_DATA_UNSIGNED(x15, 0x54000000, 0x0, 0x53ffffff) // (0, -40, 0) --> (0, -41, 7fffff) DEFINE_DATA_UNSIGNED(x16, 0x2b800000,0x0, 0x2b7fffff) // -> FLT_MIN DEFINE_DATA_UNSIGNED(x17, 0x800001, 0x800000, 0x800000) // FLT_MIN --> 0 DEFINE_DATA_UNSIGNED(x18, 0x800000, 0x0, 0x0) // 0.0 -> -FLT_MIN DEFINE_DATA_UNSIGNED(x19, 0x0, 0xffffffff, 0x80800000) // -0.0 -> FLT_MIN DEFINE_DATA_UNSIGNED(x20, 0x80000000, 0xffffffff, 0x80800000) //-FLT_MIN --> DEFINE_DATA_UNSIGNED(x21, 0x80800000, 0xffffffff, 0x80800001) // (1,-41,0x7fffff) --> (1,-40,0 ) DEFINE_DATA_UNSIGNED(x22, 0xab7fffff, 0xffffffff, 0xab800000) //(1, 40, 0) --> (1, 39, 7fffff) DEFINE_DATA_UNSIGNED(x23, 0xd37fffff, 0xffffffff, 0xd3800000) // --> -FLT_MAX DEFINE_DATA_UNSIGNED(x24,0xfffffffe, 0xffffffff, 0xffffffff) //TEST TEST_START("nextafterf4"); DO_TEST(x1,958726589701NAR) DO_TEST(x2,958726589702NAR) DO_TEST(x3,958726589703NAR) DO_TEST(x4,958726589704NAR) DO_TEST(x5,958726589705NAR) DO_TEST(x6,958726589706NAR) DO_TEST(x7,958726589707NAR) DO_TEST(x8,958726589708NAR) DO_TEST(x9,958726589709NAR) DO_TEST(x10,958726589710NAR) DO_TEST(x11,958726589711NAR) DO_TEST(x12,958726589712NAR) DO_TEST(x13,958726589713NAR) DO_TEST(x14,958726589714NAR) DO_TEST(x15,958726589715NAR) DO_TEST(x16,958726589716NAR) DO_TEST(x17,958726589717NAR) DO_TEST(x18,958726589718NAR) DO_TEST(x19,958726589719NAR) DO_TEST(x20,958726589720NAR) DO_TEST(x21,958726589721NAR) DO_TEST(x22,958726589722NAR) DO_TEST(x23,958726589723NAR) DO_TEST(x24,958726589724NAR) TEST_SET_DONE(); TEST_EXIT(); }