void check_special_values() { V4SF vx; vx.f[0] = -1000; vx.f[1] = -100; vx.f[2] = 100; vx.f[3] = 1000; printf("exp("); print4(vx.v); printf(") = "); print4(exp_ps(vx.v)); printf("\n"); vx.f[0] = QNAN.f; vx.f[1] = PINF.f; vx.f[2] = MINF.f; vx.f[3] = QNAN2.f; printf("exp("); print4(vx.v); printf(") = "); print4(exp_ps(vx.v)); printf("\n"); vx.f[0] = 0; vx.f[1] = -10; vx.f[2] = 1e30f; vx.f[3] = 1e-42f; printf("log("); print4(vx.v); printf(") = "); print4(log_ps(vx.v)); printf("\n"); vx.f[0] = QNAN.f; vx.f[1] = PINF.f; vx.f[2] = MINF.f; vx.f[3] = QNAN2.f; printf("log("); print4(vx.v); printf(") = "); print4(log_ps(vx.v)); printf("\n"); printf("sin("); print4(vx.v); printf(") = "); print4(sin_ps(vx.v)); printf("\n"); printf("cos("); print4(vx.v); printf(") = "); print4(cos_ps(vx.v)); printf("\n"); vx.f[0] = -1e30; vx.f[1] = -100000; vx.f[2] = 1e30; vx.f[3] = 100000; printf("sin("); print4(vx.v); printf(") = "); print4(sin_ps(vx.v)); printf("\n"); printf("cos("); print4(vx.v); printf(") = "); print4(cos_ps(vx.v)); printf("\n"); }
// SIMD cos __SIMD _SIMD_cos_ps(__SIMD a) { #ifdef USE_SSE return cos_ps(a); #elif defined USE_AVX return cos256_ps(a); #endif }
/// Returns the cosine of angleRadians (an angle in radians).
/// The implementation is picked at compile time:
///  - MATH_USE_SINCOS_LOOKUPTABLE: cos_lookuptable()
///  - MATH_SSE2: cos_ps() applied to the angle after reduction modulo pi2
///  - otherwise: the C library cosf()
float Cos(float angleRadians)
{
#if defined(MATH_USE_SINCOS_LOOKUPTABLE)
	return cos_lookuptable(angleRadians);
#elif defined(MATH_SSE2)
	// Range-reduce by 2*pi before calling cos_ps; this greatly enhances its precision.
	return s4f_x(cos_ps(modf_ps(setx_ps(angleRadians), pi2)));
#else
	return cosf(angleRadians);
#endif
}
//----------------------------------------------------------------------------------- ArrayReal MathlibNEON::Cos4( ArrayReal x ) { // Map arbitrary angle x to the range [-pi; +pi] without using division. // Code taken from MSDN's HLSL trick. Architectures with fused mad (i.e. NEON) // can replace the add, the sub, & the two muls for two mad ArrayReal integralPart; x = vaddq_f32( vmulq_f32( x, ONE_DIV_2PI ), HALF ); x = Modf4( x, integralPart ); x = vsubq_f32( vmulq_f32( x, TWO_PI ), PI ); return cos_ps( x ); }
// Returns cos_ps() of every lane of x after wrapping the angle.
ArrayFloat MathlibSSE2::Cos4( ArrayFloat x )
{
    // Wrap an arbitrary angle into the range [-pi; +pi] without any division
    // (trick taken from MSDN's HLSL documentation). On architectures with a
    // fused multiply-add (e.g. NEON) the add, the sub and the two multiplies
    // could be replaced by two mads.
    ArrayFloat integralPart;

    // x / (2*pi) + 0.5
    ArrayFloat turns = _mm_add_ps( _mm_mul_ps( x, ONE_DIV_2PI ), HALF );
    // Result of Modf4; integralPart is filled in as a by-product and not used.
    ArrayFloat fraction = Modf4( turns, integralPart );
    // fraction * (2*pi) - pi
    ArrayFloat wrapped = _mm_sub_ps( _mm_mul_ps( fraction, TWO_PI ), PI );

    return cos_ps( wrapped );
}
int check_sincos_precision(float xmin, float xmax) { unsigned nb_trials = 100000; printf("checking sines on [%g*Pi, %g*Pi]\n", xmin, xmax); float max_err_sin_ref = 0, max_err_sin_cep = 0, max_err_sin_x = 0; float max_err_cos_ref = 0, max_err_cos_cep = 0, max_err_cos_x = 0; float max_err_sum_sqr_test = 0; float max_err_sum_sqr_ref = 0; xmin *= M_PI; xmax *= M_PI; unsigned i; for (i=0; i < nb_trials; ++i) { V4SF vx, sin4, cos4, sin4_2, cos4_2; vx.f[0] = i*(xmax-xmin)/(nb_trials-1) + xmin; vx.f[1] = (i+.5)*(xmax-xmin)/(nb_trials-1) + xmin; vx.f[2] = frand()*(xmax-xmin); vx.f[3] = (i / 32)*M_PI/((i%32)+1); if (vx.f[3] < xmin || vx.f[3] > xmax) vx.f[3] = frand()*(xmax-xmin); /* vx.f[0] = M_PI/2; vx.f[1] = M_PI; vx.f[2] = M_PI/3; vx.f[3] = M_PI/4; */ sin4.v = sin_ps(vx.v); cos4.v = cos_ps(vx.v); sincos_ps(vx.v, &sin4_2.v, &cos4_2.v); unsigned j; for (j=0; j < 4; ++j) { float x = vx.f[j]; float sin_test = sin4.f[j]; float cos_test = cos4.f[j]; if (sin_test != sin4_2.f[j]) { printf("sin / sincos mismatch at x=%g\n", x); exit(1); return 1; } if (cos_test != cos4_2.f[j]) { printf("cos / sincos mismatch at x=%g\n", x); return 1; } float sin_ref = sinf(x); float sin_cep = cephes_sinf(x); float err_sin_ref = fabs(sin_ref - sin_test); float err_sin_cep = fabs(sin_cep - sin_test); if (err_sin_ref > max_err_sin_ref) { max_err_sin_ref = err_sin_ref; max_err_sin_x = x; } max_err_sin_cep = MAX(max_err_sin_cep, err_sin_cep); float cos_ref = cosf(x); float cos_cep = cephes_cosf(x); float err_cos_ref = fabs(cos_ref - cos_test); float err_cos_cep = fabs(cos_cep - cos_test); if (err_cos_ref > max_err_cos_ref) { max_err_cos_ref = err_cos_ref; max_err_cos_x = x; } max_err_cos_cep = MAX(max_err_cos_cep, err_cos_cep); float err_sum_sqr_test = fabs(1 - cos_test*cos_test - sin_test*sin_test); float err_sum_sqr_ref = fabs(1 - cos_ref*cos_ref - sin_ref*sin_ref); max_err_sum_sqr_ref = MAX(max_err_sum_sqr_ref, err_sum_sqr_ref); max_err_sum_sqr_test = MAX(max_err_sum_sqr_test, err_sum_sqr_test); 
//printf("sin(%g) = %g %g err=%g\n", x, sin_ref, sin_test, err_sin_ref); } } printf("max deviation from sinf(x): %g at %14.12g*Pi, max deviation from cephes_sin(x): %g\n", max_err_sin_ref, max_err_sin_x/M_PI, max_err_sin_cep); printf("max deviation from cosf(x): %g at %14.12g*Pi, max deviation from cephes_cos(x): %g\n", max_err_cos_ref, max_err_cos_x/M_PI, max_err_cos_cep); printf("deviation of sin(x)^2+cos(x)^2-1: %g (ref deviation is %g)\n", max_err_sum_sqr_test, max_err_sum_sqr_ref); if (max_err_sum_sqr_ref < 2e-7 && max_err_sin_ref < 2e-7 && max_err_cos_ref < 2e-7) { printf(" ->> precision OK for the sin_ps / cos_ps / sincos_ps <<-\n\n"); return 0; } else { printf("\n WRONG PRECISION !! there is a problem\n\n"); return 1; } }