//----------------------------------------------------------------------------------- void MathlibNEON::SinCos4( ArrayReal x, ArrayReal &outSin, ArrayReal &outCos ) { // TODO: Improve accuracy by mapping to the range [-pi/4, pi/4] and swap // between cos & sin depending on which quadrant it fell: // Quadrant | sin | cos // n = 0 -> sin( x ), cos( x ) // n = 1 -> cos( x ), -sin( x ) // n = 2 -> -sin( x ), -cos( x ) // n = 3 -> -sin( x ), sin( x ) // See ARGUMENT REDUCTION FOR HUGE ARGUMENTS: // Good to the Last Bit // K. C. Ng and themembers of the FP group of SunPro // http://www.derekroconnor.net/Software/Ng--ArgReduction.pdf // -- Perhaps we can leave this to GSoC students? -- // Map arbitrary angle x to the range [-pi; +pi] without using division. // Code taken from MSDN's HLSL trick. Architectures with fused mad (i.e. NEON) // can replace the add, the sub, & the two muls for two mad ArrayReal integralPart; x = vaddq_f32( vmulq_f32( x, ONE_DIV_2PI ), HALF ); x = Modf4( x, integralPart ); x = vsubq_f32( vmulq_f32( x, TWO_PI ), PI ); sincos_ps( x, &outSin, &outCos ); }
//----------------------------------------------------------------------------------- ArrayReal MathlibNEON::Cos4( ArrayReal x ) { // Map arbitrary angle x to the range [-pi; +pi] without using division. // Code taken from MSDN's HLSL trick. Architectures with fused mad (i.e. NEON) // can replace the add, the sub, & the two muls for two mad ArrayReal integralPart; x = vaddq_f32( vmulq_f32( x, ONE_DIV_2PI ), HALF ); x = Modf4( x, integralPart ); x = vsubq_f32( vmulq_f32( x, TWO_PI ), PI ); return cos_ps( x ); }
// Range-reduces the angle(s) in x and returns the result of cos_ps on the
// reduced value.
//   x  angle(s) to evaluate (presumably radians, given the 2*pi scaling)
ArrayFloat MathlibSSE2::Cos4( ArrayFloat x )
{
    // Wrap an arbitrary angle into [-pi; +pi] without a division (trick taken
    // from MSDN's HLSL documentation). Architectures with a fused mad
    // (i.e. NEON) could fold each mul+add / mul+sub pair into a single mad;
    // here they are issued as separate SSE multiply and add/sub instructions.
    // NOTE(review): the [-pi; +pi] bound relies on Modf4 returning a fraction
    // in [0; 1) -- confirm how Modf4 treats negative input.

    // Angle expressed in turns, shifted by half a turn: x / (2*pi) + 0.5
    const ArrayFloat shiftedTurns = _mm_add_ps( _mm_mul_ps( x, ONE_DIV_2PI ), HALF );

    // Keep only the fractional part; the integral part is required by the
    // Modf4 signature but is not used afterwards.
    ArrayFloat wholeTurns;
    const ArrayFloat turnFraction = Modf4( shiftedTurns, wholeTurns );

    // Back to an angle and undo the half-turn shift: fraction * 2*pi - pi
    const ArrayFloat wrappedAngle = _mm_sub_ps( _mm_mul_ps( turnFraction, TWO_PI ), PI );

    return cos_ps( wrappedAngle );
}