// Element-wise square root of a double array: dst[i] = sqrt(src[i]).
// Vectorized with universal intrinsics when 64-bit float SIMD is available;
// the scalar loop below handles whatever the vector loop leaves behind.
void sqrt64f(const double* src, double* dst, int len)
{
    CV_INSTRUMENT_REGION();
    int pos = 0;
#if CV_SIMD_64F
    const int VECSZ = v_float64::nlanes;
    for( ; pos < len; pos += VECSZ*2 )
    {
        if( pos + VECSZ*2 > len )
        {
            // Fewer than two full vectors remain: either fall back to the
            // scalar loop (short array, or in-place where re-reading freshly
            // written results would be wrong) or redo the last full window.
            if( pos == 0 || src == dst )
                break;
            pos = len - VECSZ*2;
        }
        v_float64 lo = vx_load(src + pos);
        v_float64 hi = vx_load(src + pos + VECSZ);
        lo = v_sqrt(lo);
        hi = v_sqrt(hi);
        v_store(dst + pos, lo);
        v_store(dst + pos + VECSZ, hi);
    }
    vx_cleanup();
#endif
    for( ; pos < len; pos++ )
        dst[pos] = std::sqrt(src[pos]);
}
// Element-wise reciprocal square root: dst[i] = 1/sqrt(src[i]).
// Uses the SIMD v_invsqrt path when available, with a scalar fallback.
void invSqrt32f(const float* src, float* dst, int len)
{
    CV_INSTRUMENT_REGION();
    int pos = 0;
#if CV_SIMD
    const int VECSZ = v_float32::nlanes;
    for( ; pos < len; pos += VECSZ*2 )
    {
        if( pos + VECSZ*2 > len )
        {
            // Fewer than two full vectors remain: bail out to the scalar
            // loop (short array or in-place) or redo the last full window.
            if( pos == 0 || src == dst )
                break;
            pos = len - VECSZ*2;
        }
        v_float32 lo = vx_load(src + pos);
        v_float32 hi = vx_load(src + pos + VECSZ);
        lo = v_invsqrt(lo);
        hi = v_invsqrt(hi);
        v_store(dst + pos, lo);
        v_store(dst + pos + VECSZ, hi);
    }
    vx_cleanup();
#endif
    for( ; pos < len; pos++ )
        dst[pos] = 1/std::sqrt(src[pos]);
}
// Element-wise 2-D magnitude: mag[i] = sqrt(x[i]^2 + y[i]^2).
// The vector path keeps the exact v_muladd(x, x, y*y) operation order so
// results are bit-identical to the scalar fallback's rounding behavior
// on FMA-capable targets.
void magnitude64f(const double* x, const double* y, double* mag, int len)
{
    CV_INSTRUMENT_REGION();
    int pos = 0;
#if CV_SIMD_64F
    const int VECSZ = v_float64::nlanes;
    for( ; pos < len; pos += VECSZ*2 )
    {
        if( pos + VECSZ*2 > len )
        {
            // Tail handling: reprocess the last full window unless the
            // output aliases an input or the array is too short.
            if( pos == 0 || mag == x || mag == y )
                break;
            pos = len - VECSZ*2;
        }
        v_float64 a0 = vx_load(x + pos);
        v_float64 a1 = vx_load(x + pos + VECSZ);
        v_float64 b0 = vx_load(y + pos);
        v_float64 b1 = vx_load(y + pos + VECSZ);
        a0 = v_sqrt(v_muladd(a0, a0, b0*b0));
        a1 = v_sqrt(v_muladd(a1, a1, b1*b1));
        v_store(mag + pos, a0);
        v_store(mag + pos + VECSZ, a1);
    }
    vx_cleanup();
#endif
    for( ; pos < len; pos++ )
    {
        double px = x[pos], py = y[pos];
        mag[pos] = std::sqrt(px*px + py*py);
    }
}
// Fast natural logarithm for a float array: y[i] = log(_x[i]).
// Table-based reduction on the raw IEEE-754 bits: the exponent gives the
// integer multiple of ln(2), the top LOGTAB_SCALE mantissa bits index an
// interleaved (log value, reciprocal) table, and a cubic polynomial
// A0..A2 refines the small residual.
void log32f( const float *_x, float *y, int n )
{
    CV_INSTRUMENT_REGION();

    const float* const logTab_f = cv::details::getLogTab32f();
    // Mask selecting the mantissa bits below the table-index bits.
    const int LOGTAB_MASK2_32F = (1 << (23 - LOGTAB_SCALE)) - 1;
    // Cubic polynomial coefficients for log(1 + t), t small.
    const float
    A0 = 0.3333333333333333333333333f,
    A1 = -0.5f,
    A2 = 1.f;

    int i = 0;
    const int* x = (const int*)_x;   // reinterpret floats as raw IEEE-754 bits

#if CV_SIMD
    const int VECSZ = v_float32::nlanes;
    const v_float32 vln2 = vx_setall_f32((float)ln_2);
    const v_float32 v1 = vx_setall_f32(1.f);
    // Correction used when the bucket index is the last one (510); the
    // table entry there pairs with an offset of -1/512.
    const v_float32 vshift = vx_setall_f32(-1.f/512);

    const v_float32 vA0 = vx_setall_f32(A0);
    const v_float32 vA1 = vx_setall_f32(A1);
    const v_float32 vA2 = vx_setall_f32(A2);

    for( ; i < n; i += VECSZ )
    {
        if( i + VECSZ > n )
        {
            // Tail: redo the last full vector unless the buffers overlap
            // (in-place) or the array is shorter than one vector.
            if( i == 0 || _x == y )
                break;
            i = n - VECSZ;
        }

        v_int32 h0 = vx_load(x + i);
        // Biased exponent -> integer part of log2(x).
        v_int32 yi0 = (v_shr<23>(h0) & vx_setall_s32(255)) - vx_setall_s32(127);
        // Residual mantissa bits with exponent forced to 0 => value in [1, 2).
        v_int32 xi0 = (h0 & vx_setall_s32(LOGTAB_MASK2_32F)) | vx_setall_s32(127 << 23);

        // Table index: top LOGTAB_SCALE mantissa bits, doubled because the
        // table interleaves (log value, reciprocal) pairs.
        h0 = v_shr<23 - LOGTAB_SCALE - 1>(h0) & vx_setall_s32(LOGTAB_MASK*2);
        v_float32 yf0, xf0;
        v_lut_deinterleave(logTab_f, h0, yf0, xf0);

        // y = e*ln(2) + logTab[idx]
        yf0 = v_fma(v_cvt_f32(yi0), vln2, yf0);

        // delta = -1/512 where the index hit the last bucket (510), else 0.
        v_float32 delta = v_reinterpret_as_f32(h0 == vx_setall_s32(510)) & vshift;
        // t = (m - 1) * (1/m_bucket) + delta
        xf0 = v_fma((v_reinterpret_as_f32(xi0) - v1), xf0, delta);

        // Horner evaluation of the cubic correction, then add the base log.
        v_float32 zf0 = v_fma(xf0, vA0, vA1);
        zf0 = v_fma(zf0, xf0, vA2);
        zf0 = v_fma(zf0, xf0, yf0);

        v_store(y + i, zf0);
    }
    vx_cleanup();
#endif

    // Scalar fallback / tail, mirroring the vector path exactly.
    for( ; i < n; i++ )
    {
        Cv32suf buf;
        int i0 = x[i];

        buf.i = (i0 & LOGTAB_MASK2_32F) | (127 << 23);
        int idx = (i0 >> (23 - LOGTAB_SCALE - 1)) & (LOGTAB_MASK*2);

        float y0 = (((i0 >> 23) & 0xff) - 127) * (float)ln_2 + logTab_f[idx];
        float x0 = (buf.f - 1.f)*logTab_f[idx + 1] + (idx == 510 ? -1.f/512 : 0.f);
        y[i] = ((A0*x0 + A1)*x0 + A2)*x0 + y0;
    }
}
// Fast exponential for a double array: y[i] = exp(_x[i]).
// Range-reduce each input into an integer part (table lookup plus direct
// injection of a power of two into the IEEE-754 exponent field) and a small
// fractional remainder evaluated with a polynomial (coefficients A1..A5 on
// the vector path; the scalar path also uses A0).
void exp64f( const double *_x, double *y, int n )
{
    CV_INSTRUMENT_REGION();

    const double* const expTab = cv::details::getExpTab64f();

    // Polynomial coefficients, pre-divided by EXPPOLY_32F_A0 (that factor
    // is folded into the table entries).
    const double
    A5 = .99999999999999999998285227504999 / EXPPOLY_32F_A0,
    A4 = .69314718055994546743029643825322 / EXPPOLY_32F_A0,
    A3 = .24022650695886477918181338054308 / EXPPOLY_32F_A0,
    A2 = .55504108793649567998466049042729e-1 / EXPPOLY_32F_A0,
    A1 = .96180973140732918010002372686186e-2 / EXPPOLY_32F_A0,
    A0 = .13369713757180123244806654839424e-2 / EXPPOLY_32F_A0;

    int i = 0;
    const Cv64suf* x = (const Cv64suf*)_x;
    // Inputs are clamped to this range before scaling by exp_prescale.
    double minval = (-exp_max_val/exp_prescale);
    double maxval = (exp_max_val/exp_prescale);

#if CV_SIMD_64F
    const int VECSZ = v_float64::nlanes;
    const v_float64 vprescale = vx_setall_f64(exp_prescale);
    const v_float64 vpostscale = vx_setall_f64(exp_postscale);
    const v_float64 vminval = vx_setall_f64(minval);
    const v_float64 vmaxval = vx_setall_f64(maxval);

    const v_float64 vA1 = vx_setall_f64(A1);
    const v_float64 vA2 = vx_setall_f64(A2);
    const v_float64 vA3 = vx_setall_f64(A3);
    const v_float64 vA4 = vx_setall_f64(A4);
    const v_float64 vA5 = vx_setall_f64(A5);

    const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
    // 32-byte-aligned destinations can take the aligned-store path.
    bool y_aligned = (size_t)(void*)y % 32 == 0;

    for( ; i < n; i += VECSZ*2 )
    {
        if( i + VECSZ*2 > n )
        {
            // Tail: reprocess the last full double-vector window unless the
            // operation is in-place or the array is shorter than one window.
            if( i == 0 || _x == y )
                break;
            i = n - VECSZ*2;
            y_aligned = false;   // the shifted window loses alignment
        }

        v_float64 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f);

        xf0 = v_min(v_max(xf0, vminval), vmaxval);
        xf1 = v_min(v_max(xf1, vminval), vmaxval);

        xf0 *= vprescale;
        xf1 *= vprescale;

        v_int32 xi0 = v_round(xf0);
        v_int32 xi1 = v_round(xf1);
        // Fractional remainder after removing the rounded integer part.
        xf0 = (xf0 - v_cvt_f64(xi0))*vpostscale;
        xf1 = (xf1 - v_cvt_f64(xi1))*vpostscale;

        // Table factor selected by the low EXPTAB_MASK bits.
        v_float64 yf0 = v_lut(expTab, xi0 & vidxmask);
        v_float64 yf1 = v_lut(expTab, xi1 & vidxmask);

        // Build 2^k by writing the biased exponent, clamped to [0, 2047],
        // into the IEEE-754 double exponent field.
        v_int32 v0 = vx_setzero_s32(), v1023 = vx_setall_s32(1023), v2047 = vx_setall_s32(2047);
        xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v1023, v0), v2047);
        xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v1023, v0), v2047);

        v_int64 xq0, xq1, dummy;
        v_expand(xi0, xq0, dummy);
        v_expand(xi1, xq1, dummy);
        yf0 *= v_reinterpret_as_f64(v_shl<52>(xq0));
        yf1 *= v_reinterpret_as_f64(v_shl<52>(xq1));

        // Horner evaluation of the polynomial in the remainder.
        v_float64 zf0 = xf0 + vA1;
        v_float64 zf1 = xf1 + vA1;

        zf0 = v_fma(zf0, xf0, vA2);
        zf1 = v_fma(zf1, xf1, vA2);

        zf0 = v_fma(zf0, xf0, vA3);
        zf1 = v_fma(zf1, xf1, vA3);

        zf0 = v_fma(zf0, xf0, vA4);
        zf1 = v_fma(zf1, xf1, vA4);

        zf0 = v_fma(zf0, xf0, vA5);
        zf1 = v_fma(zf1, xf1, vA5);

        zf0 *= yf0;
        zf1 *= yf1;

        if( y_aligned )
        {
            v_store_aligned(y + i, zf0);
            v_store_aligned(y + i + VECSZ, zf1);
        }
        else
        {
            v_store(y + i, zf0);
            v_store(y + i + VECSZ, zf1);
        }
    }
    vx_cleanup();
#endif

    // Scalar fallback / tail.
    for( ; i < n; i++ )
    {
        double x0 = x[i].f;
        x0 = std::min(std::max(x0, minval), maxval);
        x0 *= exp_prescale;
        Cv64suf buf;

        int xi = saturate_cast<int>(x0);
        x0 = (x0 - xi)*exp_postscale;

        int t = (xi >> EXPTAB_SCALE) + 1023;
        t = !(t & ~2047) ? t : t < 0 ? 0 : 2047;   // clamp biased exponent to [0, 2047]
        buf.i = (int64)t << 52;                    // assemble 2^k directly in the bits

        y[i] = buf.f * expTab[xi & EXPTAB_MASK] * (((((A0*x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4)*x0 + A5);
    }
}
// Fast exponential for a float array: y[i] = exp(_x[i]).
// Same scheme as exp64f: range reduction into a table lookup plus direct
// injection of a power of two into the IEEE-754 exponent field, with a
// polynomial (A1..A4) for the fractional remainder.
void exp32f( const float *_x, float *y, int n )
{
    CV_INSTRUMENT_REGION();

    const float* const expTab_f = cv::details::getExpTab32f();

    // Polynomial coefficients, pre-divided by EXPPOLY_32F_A0 (that factor
    // is folded into the table entries).
    const float
    A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0),
    A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0),
    A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0),
    A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0);

    int i = 0;
    const Cv32suf* x = (const Cv32suf*)_x;
    // Inputs are clamped to this range before scaling by exp_prescale.
    float minval = (float)(-exp_max_val/exp_prescale);
    float maxval = (float)(exp_max_val/exp_prescale);
    float postscale = (float)exp_postscale;

#if CV_SIMD
    const int VECSZ = v_float32::nlanes;
    const v_float32 vprescale = vx_setall_f32((float)exp_prescale);
    const v_float32 vpostscale = vx_setall_f32((float)exp_postscale);
    const v_float32 vminval = vx_setall_f32(minval);
    const v_float32 vmaxval = vx_setall_f32(maxval);

    const v_float32 vA1 = vx_setall_f32((float)A1);
    const v_float32 vA2 = vx_setall_f32((float)A2);
    const v_float32 vA3 = vx_setall_f32((float)A3);
    const v_float32 vA4 = vx_setall_f32((float)A4);

    const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
    // 32-byte-aligned destinations can take the aligned-store path.
    bool y_aligned = (size_t)(void*)y % 32 == 0;

    for( ; i < n; i += VECSZ*2 )
    {
        if( i + VECSZ*2 > n )
        {
            // Tail: reprocess the last full window unless the operation is
            // in-place or the array is shorter than one window.
            if( i == 0 || _x == y )
                break;
            i = n - VECSZ*2;
            y_aligned = false;   // the shifted window loses alignment
        }

        v_float32 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f);

        xf0 = v_min(v_max(xf0, vminval), vmaxval);
        xf1 = v_min(v_max(xf1, vminval), vmaxval);

        xf0 *= vprescale;
        xf1 *= vprescale;

        v_int32 xi0 = v_round(xf0);
        v_int32 xi1 = v_round(xf1);
        // Fractional remainder after removing the rounded integer part.
        xf0 = (xf0 - v_cvt_f32(xi0))*vpostscale;
        xf1 = (xf1 - v_cvt_f32(xi1))*vpostscale;

        // Table factor selected by the low EXPTAB_MASK bits.
        v_float32 yf0 = v_lut(expTab_f, xi0 & vidxmask);
        v_float32 yf1 = v_lut(expTab_f, xi1 & vidxmask);

        // Build 2^k by writing the biased exponent, clamped to [0, 255],
        // into the IEEE-754 float exponent field.
        v_int32 v0 = vx_setzero_s32(), v127 = vx_setall_s32(127), v255 = vx_setall_s32(255);
        xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v127, v0), v255);
        xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v127, v0), v255);

        yf0 *= v_reinterpret_as_f32(v_shl<23>(xi0));
        yf1 *= v_reinterpret_as_f32(v_shl<23>(xi1));

        // Horner evaluation of the polynomial in the remainder.
        v_float32 zf0 = xf0 + vA1;
        v_float32 zf1 = xf1 + vA1;

        zf0 = v_fma(zf0, xf0, vA2);
        zf1 = v_fma(zf1, xf1, vA2);

        zf0 = v_fma(zf0, xf0, vA3);
        zf1 = v_fma(zf1, xf1, vA3);

        zf0 = v_fma(zf0, xf0, vA4);
        zf1 = v_fma(zf1, xf1, vA4);

        zf0 *= yf0;
        zf1 *= yf1;

        if( y_aligned )
        {
            v_store_aligned(y + i, zf0);
            v_store_aligned(y + i + VECSZ, zf1);
        }
        else
        {
            v_store(y + i, zf0);
            v_store(y + i + VECSZ, zf1);
        }
    }
    vx_cleanup();
#endif

    // Scalar fallback / tail.
    for( ; i < n; i++ )
    {
        float x0 = x[i].f;
        x0 = std::min(std::max(x0, minval), maxval);
        x0 *= (float)exp_prescale;
        Cv32suf buf;

        int xi = saturate_cast<int>(x0);
        x0 = (x0 - xi)*postscale;

        int t = (xi >> EXPTAB_SCALE) + 127;
        t = !(t & ~255) ? t : t < 0 ? 0 : 255;   // clamp biased exponent to [0, 255]
        buf.i = t << 23;                          // assemble 2^k directly in the bits

        y[i] = buf.f * expTab_f[xi & EXPTAB_MASK] * ((((x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4);
    }
}
/*
  The trick with STORE_UNALIGNED/STORE_ALIGNED_NOCACHE is the following:
  on IA there are instructions movntps and such to which
  v_store_interleave(...., STORE_ALIGNED_NOCACHE) is mapped. Those
  instructions write directly into memory w/o touching cache, which results
  in dramatic speed improvements, especially on large arrays (FullHD, 4K
  etc.). Those intrinsics require the destination address to be aligned by
  16/32 bytes (with SSE2 and AVX2, respectively). So we potentially split
  the processing into 3 stages:
  1) the optional prefix part [0:i0), where we use simple unaligned stores.
  2) the optional main part [i0:len - VECSZ], where we use the "nocache"
     mode. In some cases we still have to use unaligned stores in this part.
  3) the optional suffix part (the tail) (len - VECSZ:len), where we switch
     back to "unaligned" mode to process the remaining (fewer than VECSZ)
     elements.
  In principle there can be very poorly aligned data where there is no main
  part. For that case we set i0=0 and use unaligned stores for the whole
  array.
*/
// Merge cn separate source planes src[0..cn-1] into one interleaved
// destination array dst of len elements per plane, using the 3-stage
// (unaligned prefix / nocache main / unaligned tail) store strategy
// described in the comment above.
template<typename T, typename VecT> static void
vecmerge_( const T** src, T* dst, int len, int cn )
{
    const int VECSZ = VecT::nlanes;
    int i, i0 = 0;
    const T* src0 = src[0];
    const T* src1 = src[1];
    const int dstElemSize = cn * sizeof(T);

    // r = misalignment of dst, in bytes, relative to the vector store size.
    int r = (int)((size_t)(void*)dst % (VECSZ*sizeof(T)));
    hal::StoreMode mode = hal::STORE_ALIGNED_NOCACHE;
    if( r != 0 )
    {
        mode = hal::STORE_UNALIGNED;
        // If dst becomes aligned after i0 whole interleaved elements,
        // handle the first i0 elements unaligned, then switch to nocache.
        if (r % dstElemSize == 0 && len > VECSZ*2)
            i0 = VECSZ - (r / dstElemSize);
    }

    if( cn == 2 )
    {
        for( i = 0; i < len; i += VECSZ )
        {
            if( i > len - VECSZ )
            {
                // Tail: redo the last full vector with unaligned stores.
                i = len - VECSZ;
                mode = hal::STORE_UNALIGNED;
            }
            VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
            v_store_interleave(dst + i*cn, a, b, mode);
            if( i < i0 )
            {
                // Prefix finished: jump so the next iteration starts at i0,
                // where dst + i0*cn is aligned for nocache stores.
                i = i0 - VECSZ;
                mode = hal::STORE_ALIGNED_NOCACHE;
            }
        }
    }
    else if( cn == 3 )
    {
        const T* src2 = src[2];
        for( i = 0; i < len; i += VECSZ )
        {
            if( i > len - VECSZ )
            {
                // Tail: redo the last full vector with unaligned stores.
                i = len - VECSZ;
                mode = hal::STORE_UNALIGNED;
            }
            VecT a = vx_load(src0 + i), b = vx_load(src1 + i), c = vx_load(src2 + i);
            v_store_interleave(dst + i*cn, a, b, c, mode);
            if( i < i0 )
            {
                // Prefix finished: switch to the aligned nocache main part.
                i = i0 - VECSZ;
                mode = hal::STORE_ALIGNED_NOCACHE;
            }
        }
    }
    else
    {
        CV_Assert( cn == 4 );
        const T* src2 = src[2];
        const T* src3 = src[3];
        for( i = 0; i < len; i += VECSZ )
        {
            if( i > len - VECSZ )
            {
                // Tail: redo the last full vector with unaligned stores.
                i = len - VECSZ;
                mode = hal::STORE_UNALIGNED;
            }
            VecT a = vx_load(src0 + i), b = vx_load(src1 + i);
            VecT c = vx_load(src2 + i), d = vx_load(src3 + i);
            v_store_interleave(dst + i*cn, a, b, c, d, mode);
            if( i < i0 )
            {
                // Prefix finished: switch to the aligned nocache main part.
                i = i0 - VECSZ;
                mode = hal::STORE_ALIGNED_NOCACHE;
            }
        }
    }
    vx_cleanup();
}
int main (int argc, char *argv[]) { vx_entry_t entry; char modeldir[CMLEN]; vx_zmode_t zmode; int use_gtl = True; int use_scec = False; int opt; zmode = VX_ZMODE_ELEVOFF; strcpy(modeldir, "."); /* Parse options */ while ((opt = getopt(argc, argv, "gm:sz:h")) != -1) { switch (opt) { case 'g': use_gtl = False; break; case 'm': strcpy(modeldir, optarg); break; case 's': use_scec = True; break; case 'z': if (strcasecmp(optarg, "dep") == 0) { zmode = VX_ZMODE_DEPTH; } else if (strcasecmp(optarg, "elev") == 0) { zmode = VX_ZMODE_ELEV; } else if (strcasecmp(optarg, "off") == 0) { zmode = VX_ZMODE_ELEVOFF; } else { fprintf(stderr, "Invalid coord type %s", optarg); usage(); exit(0); } break; case 'h': usage(); exit(0); break; default: /* '?' */ usage(); exit(1); } } /* Perform setup */ if (vx_setup(modeldir) != 0) { fprintf(stderr, "Failed to init vx\n"); exit(1); } /* Register SCEC 1D background model */ if (use_scec) { vx_register_scec(); } /* Set GTL */ vx_setgtl(use_gtl); /* Set zmode */ vx_setzmode(zmode); /* now let's start with searching .... */ while (!feof(stdin)) { if (fscanf(stdin,"%lf %lf %lf", &entry.coor[0],&entry.coor[1],&entry.coor[2]) == 3) { if (entry.coor[1]<10000000) { printf("%14.6f %15.6f %9.2f ", entry.coor[0], entry.coor[1], entry.coor[2]); } /* In case we got anything like degrees */ if ((entry.coor[0]<360.) 
&& (fabs(entry.coor[1])<90)) { entry.coor_type = VX_COORD_GEO; } else { entry.coor_type = VX_COORD_UTM; } /* Query the point */ vx_getcoord(&entry); /*** Prevent all to obvious bad coordinates from being displayed */ if (entry.coor[1]<10000000) { //printf("%14.6f %15.6f %9.2f ", // entry.coor[0], entry.coor[1], entry.coor[2]); /* AP: Let's provide the computed UTM coordinates as well */ printf("%10.2f %11.2f ", entry.coor_utm[0], entry.coor_utm[1]); printf("%10.2f %11.2f ", entry.elev_cell[0], entry.elev_cell[1]); printf("%9.2f ", entry.topo); printf("%9.2f ", entry.mtop); printf("%9.2f ", entry.base); printf("%9.2f ", entry.moho); printf("%s %10.2f %11.2f %9.2f ", VX_SRC_NAMES[entry.data_src], entry.vel_cell[0], entry.vel_cell[1], entry.vel_cell[2]); printf("%9.2f %9.2f %9.2f ", entry.provenance, entry.vp, entry.vs); printf("%9.2f\n", entry.rho); } } } /* Perform cleanup */ vx_cleanup(); return 0; }