void ORDERED_PARTIAL_CONVOLVE_SIMD(int n, float * a, float const * b) { int i; float ab0; v4sf * RESTRICT va = (v4sf *)a; v4sf const * RESTRICT vb = (v4sf const *)b; assert(VALIGNED(a) && VALIGNED(b)); ab0 = a[0] * b[0]; for (i = 0; i < n / 4; i += 2) { v4sf a1r = va[i+0], a1i = va[i+1]; v4sf b1r = vb[i+0], b1i = vb[i+1]; UNINTERLEAVE2(a1r, a1i, a1r, a1i); UNINTERLEAVE2(b1r, b1i, b1r, b1i); VCPLXMUL(a1r, a1i, b1r, b1i); INTERLEAVE2(a1r, a1i, a1r, a1i); va[i+0] = a1r, va[i+1] = a1i; } a[0] = ab0; a[1] = b[n] * a[n] - b[n+1] * a[n+1]; }
void ORDERED_CONVOLVE_SIMD(int n, void * not_used, float * a, float const * b) { int i; float ab0, ab1; v4sf * RESTRICT va = (v4sf *)a; v4sf const * RESTRICT vb = (v4sf const *)b; assert(VALIGNED(a) && VALIGNED(b)); ab0 = a[0] * b[0], ab1 = a[1] * b[1]; for (i = 0; i < n / 4; i += 2) { v4sf a1r = va[i+0], a1i = va[i+1]; v4sf b1r = vb[i+0], b1i = vb[i+1]; UNINTERLEAVE2(a1r, a1i, a1r, a1i); UNINTERLEAVE2(b1r, b1i, b1r, b1i); VCPLXMUL(a1r, a1i, b1r, b1i); INTERLEAVE2(a1r, a1i, a1r, a1i); va[i+0] = a1r, va[i+1] = a1i; } a[0] = ab0, a[1] = ab1; (void)not_used; }
/* ---------------------------------------------------------------- */
/*
 * Convolve every column of an image with a 1-D filter.
 *
 * For each column x and each sampled row y = 0, step, 2*step, ... below
 * src_height, computes
 *
 *     dest[x,y] = sum_p image[x,p] filt[y - p]
 *
 * where supp(filt) = [filt_begin, filt_end] = [fb,fe].
 *
 * dst, dst_stride  Output buffer and its stride, in elements. Without
 *                  VL_TRANSPOSE, consecutive x are 1 element apart and
 *                  consecutive output rows are dst_stride apart; with
 *                  VL_TRANSPOSE the two roles are swapped.
 * src, src_width, src_height, src_stride
 *                  Input image; row p of column x is src[x + p*src_stride].
 * filt             First sample of the filter (fe - fb + 1 samples).
 * filt_begin, filt_end
 *                  Bounds fb and fe of the filter support.
 * step             Row subsampling step; (src_height - 1) / step + 1
 *                  output rows are produced per column.
 * flags            VL_TRANSPOSE selects the transposed output layout.
 *                  If (flags & VL_PAD_MASK) == VL_PAD_BY_ZERO, samples
 *                  outside the image count as zero; otherwise the first
 *                  row is reused above the image and the last row that
 *                  was read is reused below it.
 *
 * Columns are processed VSIZE at a time when more than VSIZE columns
 * remain and both src + x and src_stride pass VALIGNED; otherwise they
 * are processed one at a time with scalar code.
 *
 * The filter position and the source row are tracked with integer
 * counters, so no pointer outside the source or filter arrays is formed.
 */
void
VL_XCAT3(_vl_imconvcol_v, SFX, _sse2)
(T* dst, int dst_stride,
 T const* src,
 int src_width, int src_height, int src_stride,
 T const* filt, int filt_begin, int filt_end,
 int step, unsigned int flags)
{
  int x = 0 ;
  int y ;
  int dheight = (src_height - 1) / step + 1 ;
  int filt_len = filt_end - filt_begin + 1 ;
  vl_bool use_simd = VALIGNED(src_stride) ;
  vl_bool transp = flags & VL_TRANSPOSE ;
  vl_bool zeropad = (flags & VL_PAD_MASK) == VL_PAD_BY_ZERO ;

  /* let filt point to the last sample of the filter */
  filt += filt_end - filt_begin ;

  while (x < src_width) {
    /* Each output sample is accumulated in three chunks, walking the
     * filter from its last sample (filt - 0) towards its first
     * (filt - (filt_len - 1)):
     *
     * CHUNK_A: y - fe <= p < 0  (rows above the image, padded)
     *          completes VL_MAX(fe - y, 0) samples
     * CHUNK_B: VL_MAX(y - fe, 0) <= p < VL_MIN(y - fb, height - 1)
     *          (rows inside the image)
     *          completes fe - VL_MAX(fb, height - y) + 1 samples
     * CHUNK_C: completes all samples (rows below the image, padded)
     */
    T const *col = src + x ;  /* top sample of the current column(s) */
    T const *filti ;          /* filter sample being applied */
    int taps ;                /* number of filter samples consumed so far */
    int row ;                 /* next source row to read in CHUNK_B */
    int stop ;                /* consume filter samples until taps == stop */

    if ((x + VSIZE < src_width) & VALIGNED(src + x) & use_simd) {
      /* ----------------------------------------------  Vectorized */
      for (y = 0 ; y < src_height ; y += step) {
        union { VTYPE v ; T x [VSIZE] ; } acc ;
        VTYPE v, c ;
        acc.v = VSTZ () ;
        v = VSTZ () ;

        taps = 0 ;
        stop = filt_end - y ;

        if (stop > 0) {
          /* CHUNK_A: the filter sticks out above row 0 */
          if (zeropad) {
            v = VSTZ () ;
          } else {
            v = * (VTYPE const*) col ;
          }
          while (taps < stop) {
            filti = filt - taps ;
            c = VLD1 (filti) ;
            acc.v = VADD (acc.v, VMUL (v, c)) ;
            ++ taps ;
          }
          row = 0 ;
        } else {
          /* the filter starts inside the image, at row y - fe */
          row = - stop ;
        }

        /* CHUNK_B: rows inside the image */
        stop = filt_end - VL_MAX(filt_begin, y - src_height + 1) + 1 ;
        while (taps < stop) {
          v = * (VTYPE const*) (col + row * src_stride) ;
          filti = filt - taps ;
          c = VLD1 (filti) ;
          acc.v = VADD (acc.v, VMUL (v, c)) ;
          ++ taps ;
          ++ row ;
        }

        /* CHUNK_C: below the image; without zero padding v still holds
         * the last row that was read */
        if (zeropad) v = VSTZ () ;

        stop = filt_len ;
        while (taps < stop) {
          filti = filt - taps ;
          c = VLD1 (filti) ;
          acc.v = VADD (acc.v, VMUL (v, c)) ;
          ++ taps ;
        }

        /* scatter the VSIZE results, then move dst to the next y */
        if (transp) {
          *dst = acc.x[0] ; dst += dst_stride ;
          *dst = acc.x[1] ; dst += dst_stride ;
#if(VSIZE == 4)
          *dst = acc.x[2] ; dst += dst_stride ;
          *dst = acc.x[3] ; dst += dst_stride ;
#endif
          dst += 1 * 1 - VSIZE * dst_stride ;
        } else {
          *dst = acc.x[0] ; dst += 1 ;
          *dst = acc.x[1] ; dst += 1 ;
#if(VSIZE == 4)
          *dst = acc.x[2] ; dst += 1 ;
          *dst = acc.x[3] ; dst += 1 ;
#endif
          dst += 1 * dst_stride - VSIZE * 1 ;
        }
      } /* next y */

      /* rewind the dheight rows just written, advance VSIZE columns */
      if (transp) {
        dst += VSIZE * dst_stride - dheight * 1 ;
      } else {
        dst += VSIZE * 1 - dheight * dst_stride ;
      }
      x += VSIZE ;
    } else {
      /* -------------------------------------------------  Vanilla */
      for (y = 0 ; y < src_height ; y += step) {
        T acc = 0 ;
        T v = 0, c ;

        taps = 0 ;
        stop = filt_end - y ;

        if (stop > 0) {
          /* CHUNK_A: the filter sticks out above row 0 */
          if (zeropad) {
            v = 0 ;
          } else {
            v = *col ;
          }
          while (taps < stop) {
            filti = filt - taps ;
            c = *filti ;
            acc += v * c ;
            ++ taps ;
          }
          row = 0 ;
        } else {
          /* the filter starts inside the image, at row y - fe */
          row = - stop ;
        }

        /* CHUNK_B: rows inside the image */
        stop = filt_end - VL_MAX(filt_begin, y - src_height + 1) + 1 ;
        while (taps < stop) {
          v = col [row * src_stride] ;
          filti = filt - taps ;
          c = *filti ;
          acc += v * c ;
          ++ taps ;
          ++ row ;
        }

        /* CHUNK_C: below the image; without zero padding v still holds
         * the last row that was read */
        if (zeropad) v = 0 ;

        stop = filt_len ;
        while (taps < stop) {
          filti = filt - taps ;
          c = *filti ;
          acc += v * c ;
          ++ taps ;
        }

        if (transp) {
          *dst = acc ; dst += 1 ;
        } else {
          *dst = acc ; dst += dst_stride ;
        }
      } /* next y */

      /* rewind the dheight rows just written, advance one column */
      if (transp) {
        dst += 1 * dst_stride - dheight * 1 ;
      } else {
        dst += 1 * 1 - dheight * dst_stride ;
      }
      x += 1 ;
    } /* next x */
  }
}