// Checks the floating-point math intrinsics (v_round, v_trunc, v_floor, v_ceil,
// v_magnitude, v_sqr_magnitude, v_muladd) lane by lane against scalar reference
// expressions. Returns *this so calls can be chained.
TheTest & test_float_math()
{
    typedef typename V_RegTrait128<LaneType>::int_reg IntReg;

    // Three input data sets; the first is scaled by 1.1 and the second is offset by 10.
    Data<R> srcA, srcB, srcC;
    srcA *= 1.1;
    srcB += 10;

    R va = srcA;
    R vb = srcB;
    R vc = srcC;

    // Float -> integer conversions of the first operand.
    Data<IntReg> rounded = v_round(va);
    Data<IntReg> truncated = v_trunc(va);
    Data<IntReg> floored = v_floor(va);
    Data<IntReg> ceiled = v_ceil(va);

    // Two- and three-operand floating-point operations.
    Data<R> magnitude = v_magnitude(va, vb);
    Data<R> sqrMagnitude = v_sqr_magnitude(va, vb);
    Data<R> mulAdd = v_muladd(va, vb, vc);

    for (int lane = 0; lane < R::nlanes; ++lane)
    {
        EXPECT_EQ(cvRound(srcA[lane]), rounded[lane]);
        EXPECT_EQ((typename IntReg::lane_type)srcA[lane], truncated[lane]);
        EXPECT_EQ(cvFloor(srcA[lane]), floored[lane]);
        EXPECT_EQ(cvCeil(srcA[lane]), ceiled[lane]);
        EXPECT_COMPARE_EQ(std::sqrt(srcA[lane]*srcA[lane] + srcB[lane]*srcB[lane]), magnitude[lane]);
        EXPECT_COMPARE_EQ(srcA[lane]*srcA[lane] + srcB[lane]*srcB[lane], sqrMagnitude[lane]);
        EXPECT_COMPARE_EQ(srcA[lane]*srcB[lane] + srcC[lane], mulAdd[lane]);
    }

    return *this;
}
void exp64f( const double *_x, double *y, int n ) { CV_INSTRUMENT_REGION(); const double* const expTab = cv::details::getExpTab64f(); const double A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0, A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0, A3 = .24022650695886477918181338054308 / EXPPOLY_32F_A0, A2 = .55504108793649567998466049042729e-1 / EXPPOLY_32F_A0, A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0, A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0; int i = 0; const Cv64suf* x = (const Cv64suf*)_x; double minval = (-exp_max_val/exp_prescale); double maxval = (exp_max_val/exp_prescale); #if CV_SIMD_64F const int VECSZ = v_float64::nlanes; const v_float64 vprescale = vx_setall_f64(exp_prescale); const v_float64 vpostscale = vx_setall_f64(exp_postscale); const v_float64 vminval = vx_setall_f64(minval); const v_float64 vmaxval = vx_setall_f64(maxval); const v_float64 vA1 = vx_setall_f64(A1); const v_float64 vA2 = vx_setall_f64(A2); const v_float64 vA3 = vx_setall_f64(A3); const v_float64 vA4 = vx_setall_f64(A4); const v_float64 vA5 = vx_setall_f64(A5); const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK); bool y_aligned = (size_t)(void*)y % 32 == 0; for( ; i < n; i += VECSZ*2 ) { if( i + VECSZ*2 > n ) { if( i == 0 || _x == y ) break; i = n - VECSZ*2; y_aligned = false; } v_float64 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f); xf0 = v_min(v_max(xf0, vminval), vmaxval); xf1 = v_min(v_max(xf1, vminval), vmaxval); xf0 *= vprescale; xf1 *= vprescale; v_int32 xi0 = v_round(xf0); v_int32 xi1 = v_round(xf1); xf0 = (xf0 - v_cvt_f64(xi0))*vpostscale; xf1 = (xf1 - v_cvt_f64(xi1))*vpostscale; v_float64 yf0 = v_lut(expTab, xi0 & vidxmask); v_float64 yf1 = v_lut(expTab, xi1 & vidxmask); v_int32 v0 = vx_setzero_s32(), v1023 = vx_setall_s32(1023), v2047 = vx_setall_s32(2047); xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v1023, v0), v2047); xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v1023, v0), v2047); v_int64 xq0, xq1, dummy; 
v_expand(xi0, xq0, dummy); v_expand(xi1, xq1, dummy); yf0 *= v_reinterpret_as_f64(v_shl<52>(xq0)); yf1 *= v_reinterpret_as_f64(v_shl<52>(xq1)); v_float64 zf0 = xf0 + vA1; v_float64 zf1 = xf1 + vA1; zf0 = v_fma(zf0, xf0, vA2); zf1 = v_fma(zf1, xf1, vA2); zf0 = v_fma(zf0, xf0, vA3); zf1 = v_fma(zf1, xf1, vA3); zf0 = v_fma(zf0, xf0, vA4); zf1 = v_fma(zf1, xf1, vA4); zf0 = v_fma(zf0, xf0, vA5); zf1 = v_fma(zf1, xf1, vA5); zf0 *= yf0; zf1 *= yf1; if( y_aligned ) { v_store_aligned(y + i, zf0); v_store_aligned(y + i + VECSZ, zf1); } else { v_store(y + i, zf0); v_store(y + i + VECSZ, zf1); } } vx_cleanup(); #endif for( ; i < n; i++ ) { double x0 = x[i].f; x0 = std::min(std::max(x0, minval), maxval); x0 *= exp_prescale; Cv64suf buf; int xi = saturate_cast<int>(x0); x0 = (x0 - xi)*exp_postscale; int t = (xi >> EXPTAB_SCALE) + 1023; t = !(t & ~2047) ? t : t < 0 ? 0 : 2047; buf.i = (int64)t << 52; y[i] = buf.f * expTab[xi & EXPTAB_MASK] * (((((A0*x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4)*x0 + A5); } }
void exp32f( const float *_x, float *y, int n ) { CV_INSTRUMENT_REGION(); const float* const expTab_f = cv::details::getExpTab32f(); const float A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0), A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0), A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0), A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0); int i = 0; const Cv32suf* x = (const Cv32suf*)_x; float minval = (float)(-exp_max_val/exp_prescale); float maxval = (float)(exp_max_val/exp_prescale); float postscale = (float)exp_postscale; #if CV_SIMD const int VECSZ = v_float32::nlanes; const v_float32 vprescale = vx_setall_f32((float)exp_prescale); const v_float32 vpostscale = vx_setall_f32((float)exp_postscale); const v_float32 vminval = vx_setall_f32(minval); const v_float32 vmaxval = vx_setall_f32(maxval); const v_float32 vA1 = vx_setall_f32((float)A1); const v_float32 vA2 = vx_setall_f32((float)A2); const v_float32 vA3 = vx_setall_f32((float)A3); const v_float32 vA4 = vx_setall_f32((float)A4); const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK); bool y_aligned = (size_t)(void*)y % 32 == 0; for( ; i < n; i += VECSZ*2 ) { if( i + VECSZ*2 > n ) { if( i == 0 || _x == y ) break; i = n - VECSZ*2; y_aligned = false; } v_float32 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f); xf0 = v_min(v_max(xf0, vminval), vmaxval); xf1 = v_min(v_max(xf1, vminval), vmaxval); xf0 *= vprescale; xf1 *= vprescale; v_int32 xi0 = v_round(xf0); v_int32 xi1 = v_round(xf1); xf0 = (xf0 - v_cvt_f32(xi0))*vpostscale; xf1 = (xf1 - v_cvt_f32(xi1))*vpostscale; v_float32 yf0 = v_lut(expTab_f, xi0 & vidxmask); v_float32 yf1 = v_lut(expTab_f, xi1 & vidxmask); v_int32 v0 = vx_setzero_s32(), v127 = vx_setall_s32(127), v255 = vx_setall_s32(255); xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v127, v0), v255); xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v127, v0), v255); yf0 *= 
v_reinterpret_as_f32(v_shl<23>(xi0)); yf1 *= v_reinterpret_as_f32(v_shl<23>(xi1)); v_float32 zf0 = xf0 + vA1; v_float32 zf1 = xf1 + vA1; zf0 = v_fma(zf0, xf0, vA2); zf1 = v_fma(zf1, xf1, vA2); zf0 = v_fma(zf0, xf0, vA3); zf1 = v_fma(zf1, xf1, vA3); zf0 = v_fma(zf0, xf0, vA4); zf1 = v_fma(zf1, xf1, vA4); zf0 *= yf0; zf1 *= yf1; if( y_aligned ) { v_store_aligned(y + i, zf0); v_store_aligned(y + i + VECSZ, zf1); } else { v_store(y + i, zf0); v_store(y + i + VECSZ, zf1); } } vx_cleanup(); #endif for( ; i < n; i++ ) { float x0 = x[i].f; x0 = std::min(std::max(x0, minval), maxval); x0 *= (float)exp_prescale; Cv32suf buf; int xi = saturate_cast<int>(x0); x0 = (x0 - xi)*postscale; int t = (xi >> EXPTAB_SCALE) + 127; t = !(t & ~255) ? t : t < 0 ? 0 : 255; buf.i = t << 23; y[i] = buf.f * expTab_f[xi & EXPTAB_MASK] * ((((x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4); } }