Ejemplo n.º 1
0
void ORDERED_PARTIAL_CONVOLVE_SIMD(int n, float * a, float const * b)
{
  int i;
  float ab0;
  v4sf       *   RESTRICT   va = (v4sf       *)a;
  v4sf const *   RESTRICT   vb = (v4sf const *)b;
  assert(VALIGNED(a) && VALIGNED(b));
  ab0 = a[0] * b[0];
  for (i = 0; i < n / 4; i += 2) {
    v4sf a1r = va[i+0], a1i = va[i+1];
    v4sf b1r = vb[i+0], b1i = vb[i+1];
    UNINTERLEAVE2(a1r, a1i, a1r, a1i);
    UNINTERLEAVE2(b1r, b1i, b1r, b1i);
    VCPLXMUL(a1r, a1i, b1r, b1i);
    INTERLEAVE2(a1r, a1i, a1r, a1i);
    va[i+0] = a1r, va[i+1] = a1i;
  }
  a[0] = ab0;
  a[1] = b[n] * a[n] - b[n+1] * a[n+1];
}
Ejemplo n.º 2
0
void ORDERED_CONVOLVE_SIMD(int n, void * not_used, float * a, float const * b)
{
  int i;
  float ab0, ab1;
  v4sf       *   RESTRICT   va = (v4sf       *)a;
  v4sf const *   RESTRICT   vb = (v4sf const *)b;
  assert(VALIGNED(a) && VALIGNED(b));
  ab0 = a[0] * b[0], ab1 = a[1] * b[1];
  for (i = 0; i < n / 4; i += 2) {
    v4sf a1r = va[i+0], a1i = va[i+1];
    v4sf b1r = vb[i+0], b1i = vb[i+1];
    UNINTERLEAVE2(a1r, a1i, a1r, a1i);
    UNINTERLEAVE2(b1r, b1i, b1r, b1i);
    VCPLXMUL(a1r, a1i, b1r, b1i);
    INTERLEAVE2(a1r, a1i, a1r, a1i);
    va[i+0] = a1r, va[i+1] = a1i;
  }
  a[0] = ab0, a[1] = ab1;
  (void)not_used;
}
Ejemplo n.º 3
0
/* ---------------------------------------------------------------- */
/*
 * Convolve an image along its columns with a 1-D filter, optionally
 * subsampling the rows and/or writing the result transposed.
 *
 * For every column x and every y in {0, step, 2*step, ...} below src_height,
 * one output value is accumulated as a sum of source samples from column x
 * multiplied by filter taps, walking the filter from its last tap to its
 * first.
 *
 * Parameters:
 *   dst         output buffer; one element is written per (x, y) visited.
 *   dst_stride  element stride of dst: between successive y outputs in the
 *               normal layout, between successive x columns when transposed.
 *   src         input image; consecutive rows are src_stride elements apart.
 *   src_width   number of columns processed.
 *   src_height  number of input rows.
 *   src_stride  row stride of src in elements; must pass VALIGNED() for the
 *               vectorized path to be considered.
 *   filt        filter taps; filt_end - filt_begin + 1 of them are used.
 *   filt_begin,
 *   filt_end    filter support [filt_begin, filt_end].
 *   step        row subsampling step (y advances by step).
 *   flags       VL_TRANSPOSE selects the transposed output layout; the
 *               VL_PAD_MASK bits select padding: VL_PAD_BY_ZERO uses zeros
 *               outside the image, any other value reuses the first row
 *               above the image and the last sample read below it.
 *
 * Columns are handled VSIZE at a time through the vector macros when the
 * alignment tests pass, and one at a time by the scalar loop otherwise.
 */
void
VL_XCAT3(_vl_imconvcol_v, SFX, _sse2)
(T* dst, int dst_stride,
 T const* src,
 int src_width, int src_height, int src_stride,
 T const* filt, int filt_begin, int filt_end,
 int step, unsigned int flags)
{
    int x = 0 ;
    int y ;
    /* number of output rows produced per column */
    int dheight = (src_height - 1) / step + 1 ;
    vl_bool use_simd  = VALIGNED(src_stride) ;
    vl_bool transp    = flags & VL_TRANSPOSE ;
    vl_bool zeropad   = (flags & VL_PAD_MASK) == VL_PAD_BY_ZERO ;
    /* Column counters: updated below but never read in this function
     * (NOTE(review): presumably left over from profiling -- confirm). */
    double totcol = 0 ;
    double simdcol = 0 ;

    /* let filt point to the last sample of the filter */
    filt += filt_end - filt_begin ;

    while (x < src_width) {
        /* Calculate dest[x,y] = sum_p image[x,p] filt[y - p]
         * where supp(filt) = [filt_begin, filt_end] = [fb,fe].
         *
         * CHUNK_A: y - fe <= p < 0
         *          completes VL_MAX(fe - y, 0) samples
         * CHUNK_B: VL_MAX(y - fe, 0) <= p < VL_MIN(y - fb, height - 1)
         *          completes fe - VL_MAX(fb, height - y) + 1 samples
         * CHUNK_C: completes all samples
         */

        T const *filti ;
        int stop ;

        /* Take the vector path only if more than VSIZE columns remain (the
         * comparison is strict, so a final group of exactly VSIZE columns
         * goes through the scalar path), the column address is aligned and
         * the row stride passed the alignment test. Bitwise & is used, so
         * all three tests are always evaluated. */
        if ((x + VSIZE < src_width) & VALIGNED(src + x) & use_simd)
        {
            /* ----------------------------------------------  Vectorized */
            for (y = 0 ; y < src_height ; y += step)  {
                /* the union gives per-lane scalar access to the vector
                 * accumulator when storing the result */
                union {
                    VTYPE v ;
                    T x [VSIZE] ;
                } acc ;
                VTYPE v, c ;
                T const *srci ;
                acc.v = VSTZ () ;
                v = VSTZ() ;

                filti = filt ;
                /* stop > 0 means that many taps fall above the first row */
                stop = filt_end - y ;
                srci = src + x - stop * src_stride ;

                /* CHUNK_A: taps that fall above the image use the padding
                 * value (zero, or the first row of these columns). */
                if (stop > 0) {
                    if (zeropad) {
                        v = VSTZ () ;
                    } else {
                        v = * (VTYPE*) (src + x) ;
                    }
                    while (filti > filt - stop) {
                        c = VLD1 (filti--) ;
                        acc.v = VADD (acc.v,  VMUL (v, c)) ;
                        /* srci is not read here; it is only advanced so that
                         * it reaches the first image row when the loop ends */
                        srci += src_stride ;
                    }
                }

                /* CHUNK_B: taps that overlap the image read real samples,
                 * one row per tap. */
                stop = filt_end - VL_MAX(filt_begin, y - src_height + 1) + 1 ;
                while (filti > filt - stop) {
                    v = * (VTYPE*) srci ;
                    c = VLD1 (filti--) ;
                    acc.v = VADD (acc.v, VMUL (v, c)) ;
                    srci += src_stride ;
                }

                /* CHUNK_C: any taps left fall below the image; v is either
                 * reset to zero or keeps the last value it was assigned. */
                if (zeropad) v = VSTZ () ;

                stop = filt_end - filt_begin + 1;
                while (filti > filt - stop) {
                    c = VLD1 (filti--) ;
                    acc.v = VADD (acc.v, VMUL (v, c)) ;
                }

                /* Scatter the VSIZE lanes to dst. Transposed: lanes go
                 * dst_stride apart, then dst steps one element forward for
                 * the next y. Normal: lanes are contiguous, then dst steps
                 * one dst_stride forward for the next y. */
                if (transp) {
                    *dst = acc.x[0] ;
                    dst += dst_stride ;
                    *dst = acc.x[1] ;
                    dst += dst_stride ;
#if(VSIZE == 4)
                    *dst = acc.x[2] ;
                    dst += dst_stride ;
                    *dst = acc.x[3] ;
                    dst += dst_stride ;
#endif
                    dst += 1 * 1 - VSIZE * dst_stride ;
                } else {
                    *dst = acc.x[0] ;
                    dst += 1 ;
                    *dst = acc.x[1] ;
                    dst += 1 ;
#if(VSIZE == 4)
                    *dst = acc.x[2] ;
                    dst += 1 ;
                    *dst = acc.x[3] ;
                    dst += 1 ;
#endif
                    dst += 1 * dst_stride - VSIZE * 1 ;
                }
            } /* next y */
            /* rewind the dheight steps taken along y and advance dst by
             * VSIZE columns */
            if (transp) {
                dst += VSIZE * dst_stride - dheight * 1 ;
            } else {
                dst += VSIZE * 1 - dheight * dst_stride ;
            }
            x       += VSIZE ;
            simdcol += VSIZE ;
            totcol  += VSIZE ;
        } else {
            /* -------------------------------------------------  Vanilla */
            /* Scalar version of the same three-chunk accumulation, one
             * column at a time. */
            for (y = 0 ; y < src_height ; y += step) {
                T acc = 0 ;
                T v = 0, c ;
                T const* srci ;

                filti = filt ;
                /* stop > 0 means that many taps fall above the first row */
                stop = filt_end - y ;
                srci = src + x - stop * src_stride ;

                /* CHUNK_A: taps above the image use the padding value */
                if (stop > 0) {
                    if (zeropad) {
                        v = 0 ;
                    } else {
                        v = *(src + x) ;
                    }
                    while (filti > filt - stop) {
                        c = *filti-- ;
                        acc += v * c ;
                        /* advanced only; reaches the first row at loop end */
                        srci += src_stride ;
                    }
                }

                /* CHUNK_B: taps overlapping the image read real samples */
                stop = filt_end - VL_MAX(filt_begin, y - src_height + 1) + 1 ;
                while (filti > filt - stop) {
                    v = *srci ;
                    c = *filti-- ;
                    acc += v * c ;
                    srci += src_stride ;
                }

                /* CHUNK_C: remaining taps below the image use zero or the
                 * last value assigned to v */
                if (zeropad) v = 0 ;

                stop = filt_end - filt_begin + 1 ;
                while (filti > filt - stop) {
                    c = *filti-- ;
                    acc += v * c ;
                }

                /* store one output and step to the next y position */
                if (transp) {
                    *dst = acc ;
                    dst += 1 ;
                } else {
                    *dst = acc ;
                    dst += dst_stride ;
                }
            } /* next y */
            /* rewind the dheight steps taken along y and advance dst by
             * one column */
            if (transp) {
                dst += 1 * dst_stride - dheight * 1 ;
            } else {
                dst += 1 * 1 - dheight * dst_stride ;
            }
            x      += 1 ;
            totcol += 1 ;
        } /* next x */
    }
}