Example #1
0
// Accumulate a 3-channel 8-bit interleaved source into a 3-channel float
// accumulator, skipping pixels whose mask byte is zero (PowerPC VSX path).
// Processes 16 pixels per iteration; pixels in the tail (len % 16) are left
// untouched here -- presumably a scalar tail loop elsewhere handles them
// (TODO confirm against the caller).
//
// src  - interleaved u8 pixels, 3 channels (len*3 bytes read)
// dst  - interleaved f32 accumulator, 3 channels (len*3 floats read/written)
// mask - per-pixel u8 mask; 0 means "skip this pixel"
// len  - number of pixels
void acc_simd_(const unsigned char* src, float* dst, const unsigned char* mask, int len)
{
    int x = 0;
    const int cVectorWidth = 16;  // pixels per iteration: 16 u8 lanes per vector

            for ( ; x <= len - cVectorWidth; x += cVectorWidth)
            {
                // Build a byte mask that is 0xFF where mask[x+i] != 0 and 0x00
                // where mask[x+i] == 0 (compare-to-zero, then bitwise invert).
                vector unsigned char v_mask = vec_xl(0, mask + x);
                v_mask = (vector unsigned char)vec_cmpeq(vec_splats((unsigned char)0), v_mask);
                v_mask = (vector unsigned char)vec_nor(v_mask, v_mask);
                vector unsigned char v_src0, v_src1, v_src2;
                v_load_deinterleave_u8((unsigned char *)(src + (x * 3)), &v_src0, &v_src1, &v_src2);
                // Zero the masked-off pixels so they add nothing to the accumulator.
                v_src0 = v_src0 & v_mask;
                v_src1 = v_src1 & v_mask;
                v_src2 = v_src2 & v_mask;

                /* expand each channel's 16 uchars to four vectors of four uints
                   (u8 -> u16 -> u32), so they can be converted to float below */
                vector unsigned short v_src00, v_src01, v_src10, v_src11, v_src20, v_src21;
                v_expand_u8(&v_src0, &v_src00, &v_src01);
                v_expand_u8(&v_src1, &v_src10, &v_src11);
                v_expand_u8(&v_src2, &v_src20, &v_src21);
                vector unsigned int v_src000, v_src001, v_src010, v_src011;
                vector unsigned int v_src100, v_src101, v_src110, v_src111;
                vector unsigned int v_src200, v_src201, v_src210, v_src211;
                v_expand_u16(&v_src00, &v_src000, &v_src001);
                v_expand_u16(&v_src01, &v_src010, &v_src011);
                v_expand_u16(&v_src10, &v_src100, &v_src101);
                v_expand_u16(&v_src11, &v_src110, &v_src111);
                v_expand_u16(&v_src20, &v_src200, &v_src201);
                v_expand_u16(&v_src21, &v_src210, &v_src211);

                // Load the current accumulator values (4 pixels per deinterleave),
                // add the converted source, and store back in place.
                vector float v_dst000, v_dst001, v_dst010, v_dst011;
                vector float v_dst100, v_dst101, v_dst110, v_dst111;
                vector float v_dst200, v_dst201, v_dst210, v_dst211;
                v_load_deinterleave_f32(dst + (x * 3),        &v_dst000, &v_dst100, &v_dst200);
                v_load_deinterleave_f32(dst + ((x + 4) * 3),  &v_dst001, &v_dst101, &v_dst201);
                v_load_deinterleave_f32(dst + ((x + 8) * 3),  &v_dst010, &v_dst110, &v_dst210);
                v_load_deinterleave_f32(dst + ((x + 12) * 3), &v_dst011, &v_dst111, &v_dst211);

                v_store_interleave_f32(dst + (x * 3),        vec_add(v_dst000, v_cvt_f32(v_src000)), vec_add(v_dst100, v_cvt_f32(v_src100)), vec_add(v_dst200, v_cvt_f32(v_src200)));
                v_store_interleave_f32(dst + ((x + 4) * 3),  vec_add(v_dst001, v_cvt_f32(v_src001)), vec_add(v_dst101, v_cvt_f32(v_src101)), vec_add(v_dst201, v_cvt_f32(v_src201)));
                v_store_interleave_f32(dst + ((x + 8) * 3),  vec_add(v_dst010, v_cvt_f32(v_src010)), vec_add(v_dst110, v_cvt_f32(v_src110)), vec_add(v_dst210, v_cvt_f32(v_src210)));
                v_store_interleave_f32(dst + ((x + 12) * 3), vec_add(v_dst011, v_cvt_f32(v_src011)), vec_add(v_dst111, v_cvt_f32(v_src111)), vec_add(v_dst211, v_cvt_f32(v_src211)));
            }
    return;
}
Example #2
0
 // Convert a register of lane type R to float32 and verify that each
 // converted lane equals the source lane cast to the float lane type.
 TheTest & test_float_cvt32()
 {
     typedef v_float32x4 Rt;
     Data<R> src;
     src *= 1.1;                      // make the data non-trivial
     R a = src;
     Rt converted = v_cvt_f32(a);
     Data<Rt> res = converted;
     const int lanes = std::min<int>(Rt::nlanes, R::nlanes);
     for (int j = 0; j < lanes; ++j)
         EXPECT_EQ((typename Rt::lane_type)src[j], res[j]);
     return *this;
 }
Example #3
0
    // Round-trip test for fp32 <-> fp16 conversion intrinsics.
    // Only exercises the conversion when both FP16 and SIMD128 support are
    // compiled in; otherwise it is a no-op.
    TheTest & test_float_cvt_fp16()
    {
#if CV_FP16 && CV_SIMD128
        AlignedData<v_float32x4> data;

        if(1 /* checkHardwareSupport(CV_CPU_FP16) */)
        {
            // check conversion: f32 -> f16 -> f32 must reproduce the input
            v_float32x4 r1 = v_load(data.a.d);
            v_float16x4 r2 = v_cvt_f16(r1);
            v_float32x4 r3 = v_cvt_f32(r2);
            // 0x3c00 is 1.0 in IEEE half -- assumes data.a.d[0] == 1.0f (TODO confirm)
            EXPECT_EQ(0x3c00, r2.get0());
            EXPECT_EQ(r3.get0(), r1.get0());
        }
#endif
        // Fix: the return used to live inside the #if block, so with the
        // macros disabled this non-void function fell off the end (UB).
        return *this;
    }
Example #4
0
// Compute the natural logarithm of n floats: y[i] = ln(_x[i]).
// Table-driven algorithm on the IEEE-754 bit pattern: the unbiased exponent
// contributes exponent*ln(2), the top mantissa bits index logTab_f (pairs of
// {log value, reciprocal slope}), and the residual is refined with a cubic
// polynomial A0..A2. Behavior for non-positive or non-finite inputs follows
// from the bit manipulation (not specially handled here).
void log32f( const float *_x, float *y, int n )
{
    CV_INSTRUMENT_REGION();

    const float* const logTab_f = cv::details::getLogTab32f();

    // Mask of the low mantissa bits not covered by the table index.
    const int LOGTAB_MASK2_32F = (1 << (23 - LOGTAB_SCALE)) - 1;
    // Cubic polynomial coefficients for the residual term.
    const float
    A0 = 0.3333333333333333333333333f,
    A1 = -0.5f,
    A2 = 1.f;

    int i = 0;
    const int* x = (const int*)_x;  // view the float bits as ints

#if CV_SIMD
    const int VECSZ = v_float32::nlanes;
    const v_float32 vln2 = vx_setall_f32((float)ln_2);
    const v_float32 v1 = vx_setall_f32(1.f);
    const v_float32 vshift = vx_setall_f32(-1.f/512);  // correction for the last table entry

    const v_float32 vA0 = vx_setall_f32(A0);
    const v_float32 vA1 = vx_setall_f32(A1);
    const v_float32 vA2 = vx_setall_f32(A2);

    for( ; i < n; i += VECSZ )
    {
        // Tail handling: re-process the last full vector (overlapping
        // already-written elements) instead of dropping to scalar code.
        // Only safe when src and dst do not alias and at least one full
        // vector has been processed; otherwise fall through to the scalar loop.
        if( i + VECSZ > n )
        {
            if( i == 0 || _x == y )
                break;
            i = n - VECSZ;
        }

        v_int32 h0 = vx_load(x + i);
        // yi0 = unbiased exponent; xi0 = mantissa with exponent forced to 0
        // (i.e. the value remapped into [1, 2)).
        v_int32 yi0 = (v_shr<23>(h0) & vx_setall_s32(255)) - vx_setall_s32(127);
        v_int32 xi0 = (h0 & vx_setall_s32(LOGTAB_MASK2_32F)) | vx_setall_s32(127 << 23);

        // h0 becomes the (even) index into the {log, slope} pair table.
        h0 = v_shr<23 - LOGTAB_SCALE - 1>(h0) & vx_setall_s32(LOGTAB_MASK*2);
        v_float32 yf0, xf0;

        v_lut_deinterleave(logTab_f, h0, yf0, xf0);

        // yf0 = exponent * ln(2) + table log value
        yf0 = v_fma(v_cvt_f32(yi0), vln2, yf0);

        // For the last table entry (index 510) shift the residual by -1/512,
        // mirroring the scalar "idx == 510" branch below.
        v_float32 delta = v_reinterpret_as_f32(h0 == vx_setall_s32(510)) & vshift;
        xf0 = v_fma((v_reinterpret_as_f32(xi0) - v1), xf0, delta);

        // Cubic correction: ((A0*x + A1)*x + A2)*x + y, via fused multiply-adds.
        v_float32 zf0 = v_fma(xf0, vA0, vA1);
        zf0 = v_fma(zf0, xf0, vA2);
        zf0 = v_fma(zf0, xf0, yf0);

        v_store(y + i, zf0);
    }
    vx_cleanup();
#endif

    // Scalar fallback / tail loop, mirroring the vector path step by step.
    for( ; i < n; i++ )
    {
        Cv32suf buf;
        int i0 = x[i];

        buf.i = (i0 & LOGTAB_MASK2_32F) | (127 << 23);
        int idx = (i0 >> (23 - LOGTAB_SCALE - 1)) & (LOGTAB_MASK*2);

        float y0 = (((i0 >> 23) & 0xff) - 127) * (float)ln_2 + logTab_f[idx];
        float x0 = (buf.f - 1.f)*logTab_f[idx + 1] + (idx == 510 ? -1.f/512 : 0.f);
        y[i] = ((A0*x0 + A1)*x0 + A2)*x0 + y0;
    }
}
Example #5
0
// Compute the exponential of n floats: y[i] = exp(_x[i]).
// Table-driven algorithm: the input is clamped to a safe range and scaled by
// exp_prescale; the integer part of the scaled value selects both a table
// entry (expTab_f, low EXPTAB_SCALE bits) and a power-of-two factor built by
// writing directly into the IEEE-754 exponent field; the fractional remainder
// is handled by a degree-4 polynomial A1..A4.
void exp32f( const float *_x, float *y, int n )
{
    CV_INSTRUMENT_REGION();

    const float* const expTab_f = cv::details::getExpTab32f();

    // Polynomial coefficients (pre-divided by EXPPOLY_32F_A0).
    const float
    A4 = (float)(1.000000000000002438532970795181890933776 / EXPPOLY_32F_A0),
    A3 = (float)(.6931471805521448196800669615864773144641 / EXPPOLY_32F_A0),
    A2 = (float)(.2402265109513301490103372422686535526573 / EXPPOLY_32F_A0),
    A1 = (float)(.5550339366753125211915322047004666939128e-1 / EXPPOLY_32F_A0);

    int i = 0;
    const Cv32suf* x = (const Cv32suf*)_x;
    // Clamp bounds in the *unscaled* domain, preventing overflow after prescale.
    float minval = (float)(-exp_max_val/exp_prescale);
    float maxval = (float)(exp_max_val/exp_prescale);
    float postscale = (float)exp_postscale;

#if CV_SIMD
    const int VECSZ = v_float32::nlanes;
    const v_float32 vprescale = vx_setall_f32((float)exp_prescale);
    const v_float32 vpostscale = vx_setall_f32((float)exp_postscale);
    const v_float32 vminval = vx_setall_f32(minval);
    const v_float32 vmaxval = vx_setall_f32(maxval);

    const v_float32 vA1 = vx_setall_f32((float)A1);
    const v_float32 vA2 = vx_setall_f32((float)A2);
    const v_float32 vA3 = vx_setall_f32((float)A3);
    const v_float32 vA4 = vx_setall_f32((float)A4);

    const v_int32 vidxmask = vx_setall_s32(EXPTAB_MASK);
    // Aligned stores are possible only if y starts on a 32-byte boundary.
    bool y_aligned = (size_t)(void*)y % 32 == 0;

    // Two vectors per iteration (unrolled x2).
    for( ; i < n; i += VECSZ*2 )
    {
        // Tail handling: re-process the last 2*VECSZ elements (overlapping
        // already-written ones) instead of dropping to scalar code. Only safe
        // when src and dst do not alias and at least one step was done;
        // the backed-up index is generally unaligned, so disable aligned stores.
        if( i + VECSZ*2 > n )
        {
            if( i == 0 || _x == y )
                break;
            i = n - VECSZ*2;
            y_aligned = false;
        }

        v_float32 xf0 = vx_load(&x[i].f), xf1 = vx_load(&x[i + VECSZ].f);

        // Clamp, then scale into the table/polynomial domain.
        xf0 = v_min(v_max(xf0, vminval), vmaxval);
        xf1 = v_min(v_max(xf1, vminval), vmaxval);

        xf0 *= vprescale;
        xf1 *= vprescale;

        // Split into integer part (xi) and scaled fractional remainder (xf).
        v_int32 xi0 = v_round(xf0);
        v_int32 xi1 = v_round(xf1);
        xf0 = (xf0 - v_cvt_f32(xi0))*vpostscale;
        xf1 = (xf1 - v_cvt_f32(xi1))*vpostscale;

        // Table lookup on the low EXPTAB_SCALE bits of the integer part.
        v_float32 yf0 = v_lut(expTab_f, xi0 & vidxmask);
        v_float32 yf1 = v_lut(expTab_f, xi1 & vidxmask);

        // Build 2^k by writing a biased, clamped exponent into bits 23..30.
        v_int32 v0 = vx_setzero_s32(), v127 = vx_setall_s32(127), v255 = vx_setall_s32(255);
        xi0 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi0) + v127, v0), v255);
        xi1 = v_min(v_max(v_shr<EXPTAB_SCALE>(xi1) + v127, v0), v255);

        yf0 *= v_reinterpret_as_f32(v_shl<23>(xi0));
        yf1 *= v_reinterpret_as_f32(v_shl<23>(xi1));

        // Degree-4 polynomial in the remainder: (((x+A1)*x+A2)*x+A3)*x+A4.
        v_float32 zf0 = xf0 + vA1;
        v_float32 zf1 = xf1 + vA1;

        zf0 = v_fma(zf0, xf0, vA2);
        zf1 = v_fma(zf1, xf1, vA2);

        zf0 = v_fma(zf0, xf0, vA3);
        zf1 = v_fma(zf1, xf1, vA3);

        zf0 = v_fma(zf0, xf0, vA4);
        zf1 = v_fma(zf1, xf1, vA4);

        zf0 *= yf0;
        zf1 *= yf1;

        if( y_aligned )
        {
            v_store_aligned(y + i, zf0);
            v_store_aligned(y + i + VECSZ, zf1);
        }
        else
        {
            v_store(y + i, zf0);
            v_store(y + i + VECSZ, zf1);
        }
    }
    vx_cleanup();
#endif

    // Scalar fallback / tail loop, mirroring the vector path step by step.
    for( ; i < n; i++ )
    {
        float x0 = x[i].f;
        x0 = std::min(std::max(x0, minval), maxval);
        x0 *= (float)exp_prescale;
        Cv32suf buf;

        int xi = saturate_cast<int>(x0);
        x0 = (x0 - xi)*postscale;

        // Clamp the biased exponent to [0, 255] without branching on overflow.
        int t = (xi >> EXPTAB_SCALE) + 127;
        t = !(t & ~255) ? t : t < 0 ? 0 : 255;
        buf.i = t << 23;

        y[i] = buf.f * expTab_f[xi & EXPTAB_MASK] * ((((x0 + A1)*x0 + A2)*x0 + A3)*x0 + A4);
    }
}