/*
 * Exercises the vrsqrteq_f32 intrinsic on a four-lane float vector.
 *
 * The result is stored in a local and never read.
 * NOTE(review): the input vector is never initialized; presumably only the
 * fact that the intrinsic call compiles matters here -- confirm before
 * giving it a value.
 */
void test_vrsqrteQf32 (void)
{
  float32x4_t input;
  float32x4_t estimate = vrsqrteq_f32 (input);
}
/*
 * Per-lane reciprocal square root of four packed floats.
 *
 * Takes the initial vrsqrteq_f32 estimate and refines it twice; each
 * refinement multiplies the estimate by vrsqrtsq_f32(estimate^2, val).
 *
 * val: input lanes.
 * Returns the refined estimate for every lane.
 */
inline float32x4_t cv_vrsqrtq_f32(float32x4_t val)
{
    float32x4_t estimate = vrsqrteq_f32(val);

    for (int step = 0; step < 2; ++step)
    {
        const float32x4_t squared = vmulq_f32(estimate, estimate);
        const float32x4_t correction = vrsqrtsq_f32(squared, val);
        estimate = vmulq_f32(correction, estimate);
    }

    return estimate;
}
// Computes sqrt(s) for each of the four lanes as s * (1 / sqrt(s)).
//
// The reciprocal square root starts from the vrsqrteq_f32 estimate and is
// refined with two Newton-Raphson steps. Lanes whose estimate is an infinity
// (zero inputs) are forced to zero first so that the final multiply yields a
// zero instead of 0 * inf = NaN.
static float32x4_t vsqrtq_f32(float32x4_t s) {
  float32x4_t x = vrsqrteq_f32(s);

  // Code to handle sqrt(0).
  // If the input to sqrtf() is zero, a zero will be returned.
  // If the input to vrsqrteq_f32() is zero, an infinity is returned; it
  // carries the sign of the zero, so the comparison below ignores the sign
  // bit in order to catch -0.0 as well as +0.0.
  const uint32x4_t vec_p_inf = vdupq_n_u32(0x7F800000);
  const uint32x4_t vec_abs_mask = vdupq_n_u32(0x7FFFFFFF);
  const uint32x4_t x_bits = vreinterpretq_u32_f32(x);
  // check for divide by zero
  const uint32x4_t div_by_zero =
      vceqq_u32(vec_p_inf, vandq_u32(x_bits, vec_abs_mask));
  // zero out the infinity results
  x = vreinterpretq_f32_u32(vandq_u32(vmvnq_u32(div_by_zero), x_bits));

  // from arm documentation
  // The Newton-Raphson iteration:
  //     x[n+1] = x[n] * (3 - d * (x[n] * x[n])) / 2
  // converges to (1/√d) if x0 is the result of VRSQRTE applied to d.
  //
  // Note: The precision did not improve after 2 iterations.
  for (int i = 0; i < 2; i++) {
    x = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, x), s), x);
  }

  // sqrt(s) = s * 1/sqrt(s)
  return vmulq_f32(s, x);
}
// Per-lane reciprocal square root of x.
//
// Begins with the vrsqrteq_f32 estimate of x.val and applies two refinement
// passes, each scaling the estimate by vrsqrtsq_f32(x.val * estimate, estimate).
// The refined lanes are returned wrapped in a v_float32x4.
inline v_float32x4 v_invsqrt(const v_float32x4& x)
{
    const float32x4_t src = x.val;
    float32x4_t est = vrsqrteq_f32(src);

    for (int pass = 0; pass < 2; ++pass)
    {
        const float32x4_t scaled = vmulq_f32(src, est);
        const float32x4_t step = vrsqrtsq_f32(scaled, est);
        est = vmulq_f32(step, est);
    }

    return v_float32x4(est);
}
// Per-lane square root of x, computed as x * (1 / sqrt(x)).
//
// The reciprocal square root is estimated on a copy of the input whose lanes
// are raised to at least FLT_MIN, then refined with two passes of
// vrsqrtsq_f32. The final product uses the original, unclamped x.val.
// NOTE(review): the clamp presumably keeps a zero lane from reaching the
// estimate instruction -- confirm the intent.
inline v_float32x4 v_sqrt(const v_float32x4& x)
{
    const float32x4_t lower_bound = vdupq_n_f32(FLT_MIN);
    const float32x4_t clamped = vmaxq_f32(x.val, lower_bound);

    float32x4_t rsqrt = vrsqrteq_f32(clamped);
    for (int pass = 0; pass < 2; ++pass)
    {
        const float32x4_t scaled = vmulq_f32(clamped, rsqrt);
        const float32x4_t step = vrsqrtsq_f32(scaled, rsqrt);
        rsqrt = vmulq_f32(step, rsqrt);
    }

    const float32x4_t root = vmulq_f32(x.val, rsqrt);
    return v_float32x4(root);
}
// Returns the vrsqrteq_f32 result for `in` without further processing.
// The two directive comments inside the body are kept verbatim, one per
// line, so that they no longer hide the return statement and closing brace.
float32x4_t test_vrsqrteq_f32(float32x4_t in) {
  // CHECK-LABEL: @test_vrsqrteq_f32
  // CHECK: call <4 x float> @llvm.arm64.neon.frsqrte.v4f32(<4 x float> %in)
  return vrsqrteq_f32(in);
}