template<typename _Ts, typename _Td> inline void
cvt_64f( const _Ts* src, size_t sstep, _Td* dst, size_t dstep,
         Size size, double a, double b )
{
#if CV_SIMD_64F
    v_float64 va = vx_setall_f64(a), vb = vx_setall_f64(b);
    const int VECSZ = v_float64::nlanes*2;
#endif
    sstep /= sizeof(src[0]);
    dstep /= sizeof(dst[0]);

    for( int i = 0; i < size.height; i++, src += sstep, dst += dstep )
    {
        int j = 0;
#if CV_SIMD_64F
        for( ; j < size.width; j += VECSZ )
        {
            // Handle the tail by stepping back to the last full vector and
            // re-processing a few overlapping elements. This is only safe
            // out-of-place and after at least one full vector iteration;
            // otherwise fall through to the scalar loop below.
            if( j > size.width - VECSZ )
            {
                if( j == 0 || src == (_Ts*)dst )
                    break;
                j = size.width - VECSZ;
            }
            v_float64 v0, v1;
            vx_load_pair_as(src + j, v0, v1);
            v0 = v_fma(v0, va, vb);
            v1 = v_fma(v1, va, vb);
            v_store_pair_as(dst + j, v0, v1);
        }
#endif
        for( ; j < size.width; j++ )
            dst[j] = saturate_cast<_Td>(src[j]*a + b);
    }
}
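// A minimal standalone sketch (not part of the original source) of the
// tail-handling strategy used above: rather than finishing the row with a
// scalar loop, the last partial vector steps back to n - VECSZ and
// re-processes a few already-converted elements. That is harmless
// out-of-place, since the overlapped elements are simply rewritten with the
// same values, but is disabled for in-place operation. The inner k-loop is a
// hypothetical scalar stand-in for one SIMD iteration.
static void scale_overlapped_tail(const double* src, double* dst, int n,
                                  double a, double b)
{
    const int VECSZ = 4;  // pretend vector width
    int j = 0;
    for( ; j < n; j += VECSZ )
    {
        if( j > n - VECSZ )
        {
            if( j == 0 || src == dst )
                break;            // tiny input or in-place: use scalar loop
            j = n - VECSZ;        // overlap with already-processed elements
        }
        for( int k = 0; k < VECSZ; k++ )  // stands in for one vector op
            dst[j + k] = src[j + k]*a + b;
    }
    for( ; j < n; j++ )           // scalar tail
        dst[j] = src[j]*a + b;
}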
void log64f( const double *x, double *y, int n )
{
    CV_INSTRUMENT_REGION();

    const double* const logTab = cv::details::getLogTab64f();

    const int64 LOGTAB_MASK2_64F = ((int64)1 << (52 - LOGTAB_SCALE)) - 1;
    // Polynomial coefficients for log(1 + t) on the residual interval
    const double
    A7 = 1.0,
    A6 = -0.5,
    A5 = 0.333333333333333314829616256247390992939472198486328125,
    A4 = -0.25,
    A3 = 0.2,
    A2 = -0.1666666666666666574148081281236954964697360992431640625,
    A1 = 0.1428571428571428769682682968777953647077083587646484375,
    A0 = -0.125;

    int i = 0;

#if CV_SIMD_64F
    const int VECSZ = v_float64::nlanes;
    const v_float64 vln2 = vx_setall_f64(ln_2);

    const v_float64
        vA0 = vx_setall_f64(A0), vA1 = vx_setall_f64(A1),
        vA2 = vx_setall_f64(A2), vA3 = vx_setall_f64(A3),
        vA4 = vx_setall_f64(A4), vA5 = vx_setall_f64(A5),
        vA6 = vx_setall_f64(A6), vA7 = vx_setall_f64(A7);

    for( ; i < n; i += VECSZ )
    {
        if( i + VECSZ > n )
        {
            if( i == 0 || x == y )
                break;
            i = n - VECSZ;
        }

        // Split the argument bits into the unbiased exponent (yi0) and the
        // mantissa renormalized into [1, 2) (xi0)
        v_int64 h0 = vx_load((const int64*)x + i);
        v_int32 yi0 = v_pack(v_shr<52>(h0), vx_setzero_s64());
        yi0 = (yi0 & vx_setall_s32(0x7ff)) - vx_setall_s32(1023);

        v_int64 xi0 = (h0 & vx_setall_s64(LOGTAB_MASK2_64F)) | vx_setall_s64((int64)1023 << 52);
        h0 = v_shr<52 - LOGTAB_SCALE - 1>(h0);
        v_int32 idx = v_pack(h0, h0) & vx_setall_s32(LOGTAB_MASK*2);

        // logTab interleaves log(m) and 1/m for each table node m
        v_float64 xf0, yf0;
        v_lut_deinterleave(logTab, idx, yf0, xf0);

        yf0 = v_fma(v_cvt_f64(yi0), vln2, yf0);

        v_float64 delta = v_cvt_f64(idx == vx_setall_s32(510))*vx_setall_f64(1./512);
        xf0 = v_fma(v_reinterpret_as_f64(xi0) - vx_setall_f64(1.), xf0, delta);

        // Evaluate the polynomial as two interleaved Horner chains in xq = xf0^2
        v_float64 xq = xf0*xf0;
        v_float64 zf0 = v_fma(xq, vA0, vA2);
        v_float64 zf1 = v_fma(xq, vA1, vA3);
        zf0 = v_fma(zf0, xq, vA4);
        zf1 = v_fma(zf1, xq, vA5);
        zf0 = v_fma(zf0, xq, vA6);
        zf1 = v_fma(zf1, xq, vA7);
        zf1 = v_fma(zf1, xf0, yf0);
        zf0 = v_fma(zf0, xq, zf1);

        v_store(y + i, zf0);
    }
#endif

    for( ; i < n; i++ )
    {
        Cv64suf buf;
        int64 i0 = ((const int64*)x)[i];

        buf.i = (i0 & LOGTAB_MASK2_64F) | ((int64)1023 << 52);
        int idx = (int)(i0 >> (52 - LOGTAB_SCALE - 1)) & (LOGTAB_MASK*2);

        double y0 = (((int)(i0 >> 52) & 0x7ff) - 1023) * ln_2 + logTab[idx];
        double x0 = (buf.f - 1.)*logTab[idx + 1] + (idx == 510 ? -1./512 : 0.);

        double xq = x0*x0;
        y[i] = (((A0*xq + A2)*xq + A4)*xq + A6)*xq +
               (((A1*xq + A3)*xq + A5)*xq + A7)*x0 + y0;
    }
}
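// A standalone illustration (an assumption-level sketch, not OpenCV code) of
// the range reduction behind log64f: the exponent field of the double gives
// e, the mantissa is renormalized into [1, 2) by splicing in the zero biased
// exponent (1023), and log(x) = e*ln(2) + log(m). Here std::log stands in
// for the table-plus-polynomial evaluation of log(m); positive normal
// inputs only.
#include <cmath>
#include <cstdint>
#include <cstring>

static double log_via_reduction(double x)
{
    std::uint64_t bits;
    std::memcpy(&bits, &x, sizeof(bits));

    int e = (int)((bits >> 52) & 0x7ff) - 1023;        // unbiased exponent

    std::uint64_t mbits = (bits & (((std::uint64_t)1 << 52) - 1))
                        | ((std::uint64_t)1023 << 52); // mantissa m in [1, 2)
    double m;
    std::memcpy(&m, &mbits, sizeof(m));

    return e*0.69314718055994530942 + std::log(m);     // e*ln2 + log(m)
}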
void exp64f( const double *_x, double *y, int n )
{
    CV_INSTRUMENT_REGION();

    const double* const expTab = cv::details::getExpTab64f();

    // Polynomial coefficients for exp on the residual interval,
    // pre-divided by EXPPOLY_32F_A0
    const double
    A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0,
    A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0,
    A3 = .24022650695886477918181338054308 / EXPPOLY_32F_A0,
    A2 = .55504108793649567998466049042729e-1 / EXPPOLY_32F_A0,
    A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0,
    A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0;

    int i = 0;
    const Cv64suf* x = (const Cv64suf*)_x;
    double minval = (-exp_max_val/exp_prescale);
    double maxval = (exp_max_val/exp_prescale);

#if CV_SIMD_64F
    const int VECSZ = v_float64::nlanes;
    const v_float64 vprescale = vx_setall_f64(exp_prescale);
    const v_float64 vpostscale = vx_setall_f64(exp_postscale);
    const v_float64 vminval = vx_setall_f64(minval);
    const v_float64 vmaxval = vx_setall_f64(maxval);

    const v_float64 vA1 = vx_setall_f64(A1);
    const v_float64 vA2 = vx_setall_f64(A2);
    const v_float64 vA3 = vx_setall_f64(A3);
    const v_float64 vA4 = vx_setall_f64(A4);
    const v_float64 vA5 = vx_setall_f64(A5);

    const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
    bool y_aligned = (size_t)(void*)y % 32 == 0;

    for( ; i < n; i += VECSZ*2 )
    {
        if( i + VECSZ*2 > n )
        {
            if( i == 0 || _x == y )
                break;
            i = n - VECSZ*2;
            y_aligned = false;
        }

        // Clamp the argument, then split it into an integer part xi
        // (table index plus power-of-two exponent) and a small residual xf
        v_float64 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f);

        xf0 = v_min(v_max(xf0, vminval), vmaxval);
        xf1 = v_min(v_max(xf1, vminval), vmaxval);

        xf0 *= vprescale;
        xf1 *= vprescale;

        v_int32 xi0 = v_round(xf0);
        v_int32 xi1 = v_round(xf1);
        xf0 = (xf0 - v_cvt_f64(xi0))*vpostscale;
        xf1 = (xf1 - v_cvt_f64(xi1))*vpostscale;

        v_float64 yf0 = v_lut(expTab, xi0 & vidxmask);
        v_float64 yf1 = v_lut(expTab, xi1 & vidxmask);

        // Build 2^k by writing the clamped biased exponent straight into
        // the exponent field of a double
        v_int32 v0 = vx_setzero_s32(), v1023 = vx_setall_s32(1023), v2047 = vx_setall_s32(2047);
        xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v1023, v0), v2047);
        xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v1023, v0), v2047);

        v_int64 xq0, xq1, dummy;
        v_expand(xi0, xq0, dummy);
        v_expand(xi1, xq1, dummy);

        yf0 *= v_reinterpret_as_f64(v_shl<52>(xq0));
        yf1 *= v_reinterpret_as_f64(v_shl<52>(xq1));

        v_float64 zf0 = xf0 + vA1;
        v_float64 zf1 = xf1 + vA1;

        zf0 = v_fma(zf0, xf0, vA2);
        zf1 = v_fma(zf1, xf1, vA2);

        zf0 = v_fma(zf0, xf0, vA3);
        zf1 = v_fma(zf1, xf1, vA3);

        zf0 = v_fma(zf0, xf0, vA4);
        zf1 = v_fma(zf1, xf1, vA4);

        zf0 = v_fma(zf0, xf0, vA5);
        zf1 = v_fma(zf1, xf1, vA5);

        zf0 *= yf0;
        zf1 *= yf1;

        if( y_aligned )
        {
            v_store_aligned(y + i, zf0);
            v_store_aligned(y + i + VECSZ, zf1);
        }
        else
        {
            v_store(y + i, zf0);
            v_store(y + i + VECSZ, zf1);
        }
    }
    vx_cleanup();
#endif

    for( ; i < n; i++ )
    {
        double x0 = x[i].f;
        x0 = std::min(std::max(x0, minval), maxval);
        x0 *= exp_prescale;
        Cv64suf buf;

        int xi = saturate_cast<int>(x0);
        x0 = (x0 - xi)*exp_postscale;

        int t = (xi >> EXPTAB_SCALE) + 1023;
        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;

        buf.i = (int64)t << 52;

        y[i] = buf.f * expTab[xi & EXPTAB_MASK] *
               (((((A0*x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4)*x0 + A5);
    }
}
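// A standalone sketch (an illustrative assumption, not the library
// implementation) of the core trick in exp64f: split x = k*ln(2) + r, build
// 2^k by writing the clamped biased exponent k + 1023 straight into a
// double's exponent field (the same (int64)t << 52 construction as above),
// then multiply by exp(r). std::exp stands in for the table-and-polynomial
// evaluation of exp(r).
#include <cmath>
#include <cstdint>
#include <cstring>

static double exp_via_reduction(double x)
{
    const double ln2 = 0.69314718055994530942;
    int k = (int)std::lrint(x/ln2);        // nearest integer multiple of ln2
    double r = x - k*ln2;                  // residual, |r| <= ln2/2

    int t = k + 1023;                      // biased exponent
    t = t < 0 ? 0 : t > 2047 ? 2047 : t;   // clamp, like the original kernel

    std::uint64_t bits = (std::uint64_t)t << 52;  // 2^k as raw double bits
    double p2k;
    std::memcpy(&p2k, &bits, sizeof(p2k));

    return p2k * std::exp(r);              // exp(x) = 2^k * exp(r)
}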