/*
 * Exercises the vrecpeq_f32 intrinsic (per-lane reciprocal estimate) on a
 * float32x4_t operand.
 *
 * The operand is never initialized, so its lanes hold indeterminate values,
 * and the result is stored but never read. Presumably only successful
 * compilation / instruction emission is being checked here -- TODO confirm
 * against the test harness before "fixing" either of those.
 */
void test_vrecpeQf32 (void)
{
  float32x4_t operand;
  float32x4_t estimate;

  estimate = vrecpeq_f32 (operand);
}
/*
 * Approximates the per-lane reciprocal 1/val of a four-float NEON vector.
 *
 * vrecpeq_f32 supplies a coarse initial estimate; it is then refined with two
 * Newton-Raphson steps, x <- x * (2 - val * x), where vrecpsq_f32(val, x)
 * provides the (2 - val * x) factor.
 *
 * Returns the refined estimate. This is an approximation and is not
 * guaranteed to be correctly rounded.
 */
inline float32x4_t cv_vrecpq_f32(float32x4_t val)
{
    float32x4_t estimate = vrecpeq_f32(val);

    for (int step = 0; step < 2; ++step)
    {
        const float32x4_t correction = vrecpsq_f32(val, estimate);
        estimate = vmulq_f32(correction, estimate);
    }

    return estimate;
}
/*
 * Lane-wise division of two v_float32x4 values, computed as a * (1/b).
 *
 * 1/b starts from the vrecpeq_f32 estimate and is refined by two
 * Newton-Raphson steps (vrecpsq_f32 supplies the 2 - b*x factor), so the
 * quotient is an approximation rather than a correctly rounded division.
 *
 * Returns a new v_float32x4 wrapping the product; neither operand is modified.
 */
inline v_float32x4 operator / (const v_float32x4& a, const v_float32x4& b)
{
    float32x4_t inv_b = vrecpeq_f32(b.val);

    for (int pass = 0; pass < 2; ++pass)
    {
        inv_b = vmulq_f32(vrecpsq_f32(b.val, inv_b), inv_b);
    }

    const float32x4_t quotient = vmulq_f32(a.val, inv_b);
    return v_float32x4(quotient);
}
/*
 * In-place lane-wise division: a.val becomes a.val * (1/b.val).
 *
 * The reciprocal of b is fully computed before a.val is written: a
 * vrecpeq_f32 seed followed by two Newton-Raphson refinements
 * (x <- x * vrecpsq_f32(b, x)). The result is therefore an approximation,
 * not a correctly rounded quotient.
 *
 * Returns a reference to the updated left-hand operand.
 */
inline v_float32x4& operator /= (v_float32x4& a, const v_float32x4& b)
{
    const float32x4_t seed = vrecpeq_f32(b.val);
    const float32x4_t refined_once = vmulq_f32(vrecpsq_f32(b.val, seed), seed);
    const float32x4_t refined_twice = vmulq_f32(vrecpsq_f32(b.val, refined_once), refined_once);

    a.val = vmulq_f32(a.val, refined_twice);
    return a;
}
/*
 * Lane-wise approximate division a / b for four-float NEON vectors,
 * computed as a * (1/b).
 *
 * From the ARM documentation, the Newton-Raphson iteration
 *
 *     x[n+1] = x[n] * (2 - d * x[n])
 *
 * converges to (1/d) if x0 is the result of VRECPE applied to d.
 * vrecpsq_f32(b, x) supplies the (2 - b * x) factor of each step.
 *
 * Note: The precision did not improve after 2 iterations, hence the fixed
 * iteration count below.
 *
 * NOTE(review): AArch64 <arm_neon.h> already provides an intrinsic named
 * vdivq_f32; this definition presumably targets 32-bit ARM only -- confirm
 * it is guarded accordingly wherever it is compiled.
 */
static float32x4_t vdivq_f32(float32x4_t a, float32x4_t b)
{
    /* Coarse initial estimate of 1/b. */
    float32x4_t x = vrecpeq_f32(b);

    for (int i = 0; i < 2; i++) {
        x = vmulq_f32(vrecpsq_f32(b, x), x);
    }

    /* a/b = a*(1/b) */
    return vmulq_f32(a, x);
}
// http://stackoverflow.com/questions/6759897/ static inline __attribute__((always_inline)) float32x4_t reciprocal(float32x4_t x) { float32x4_t recip = vrecpeq_f32(x); recip = vmulq_f32(vrecpsq_f32(x, recip), recip); recip = vmulq_f32(vrecpsq_f32(x, recip), recip); return recip; }