/*
 * 3x3 separable convolution for MLIB_SHORT images, 1 channel, no edge
 * processing.  hkernel/vkernel are the two 1-D halves of the separable
 * kernel; scalef_expon is the fixed-point scale exponent (both consumed
 * by GET_KERN()).  Each inner iteration produces one __m64 (4 s16
 * pixels); a partial last group is merged into dst through a mask so
 * pixels right of the destination region are left untouched.
 */
mlib_status
mlib_m_sconv3x3_16nw_1(
	mlib_image *dst,
	mlib_image *src,
	mlib_s32 *hkernel,
	mlib_s32 *vkernel,
	mlib_s32 scalef_expon)
{
	GET_SRC_DST_PARAMETERS(mlib_s16);
	__m64 hker0, hker1, hker2, vker0, vker1, vker2;
	__m64 s0, s1, s2, v0, v1, aa, bb, rr, rh, rl;
	__m64 *sp0, *sp1, *sp2, *dp;
	__m64 zero, _rnd;
	mlib_s32 shift, kerh_sum;
	mlib_s32 i, j;

	/* a 3x3 kernel cannot fill the 1-pixel border */
	width -= 2;
	height -= 2;
	width *= NCHAN;
	/* dst starts one row down, one pixel right of src origin */
	dl += dll + NCHAN;

	GET_KERN();

	zero = _mm_setzero_si64();

	for (j = 0; j < height; j++) {
		/* three consecutive source rows feed one output row */
		sp0 = (__m64 *) sl;
		sp1 = (__m64 *) (sl + sll);
		sp2 = (__m64 *) (sl + 2 * sll);
		dp = (__m64 *) dl;

		PREP_V();

		/* full groups of 4 s16 pixels */
		for (i = 0; i < width / 4; i++) {
			CONV_3x3();
			dp[i] = rr;
		}

		/* tail: blend the last partial group through a mask */
		if (width & 3) {
			__m64 mask =
			    ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

			CONV_3x3();
			dp[i] = _mm_or_si64(_mm_and_si64(mask, rr),
			    _mm_andnot_si64(mask, dp[i]));
		}

		sl += sll;
		dl += dll;
	}

	/* leave MMX state so subsequent FPU code works */
	_mm_empty();

	return (MLIB_SUCCESS);
}
/*
 * 5x5 separable convolution for MLIB_USHORT images, 4 channels, no edge
 * processing.  With 4 u16 channels one pixel occupies a whole __m64, so
 * the inner loop runs once per pixel and no partial-group masking is
 * needed (unlike the 1-channel variants).
 */
mlib_status
mlib_m_sconv5x5_u16nw_4(
	mlib_image *dst,
	mlib_image *src,
	mlib_s32 *hkernel,
	mlib_s32 *vkernel,
	mlib_s32 scalef_expon)
{
	GET_SRC_DST_PARAMETERS(mlib_s16);
	__m64 hker0, hker1, hker2, hker3, hker4;
	__m64 vker0, vker1, vker2, vker3, vker4;
	__m64 s0, s1, s2, s3, s4, v0, v1, v2, v3, v4, rr, rh, rl;
	__m64 zero, ker_off, mask8000;
	__m64 *sp0, *sp1, *sp2, *sp3, *sp4, *dp;
	mlib_s32 shift, ker_sum, kerh_sum = 0, kerv_sum = 0;
	mlib_s32 i, j;

	/* a 5x5 kernel cannot fill the 2-pixel border */
	width -= 4;
	height -= 4;
	/* dst starts two rows down, two pixels right of src origin */
	dl += 2 * (dll + NCHAN);

	GET_KERN();

	zero = _mm_setzero_si64();

	for (j = 0; j < height; j++) {
		/* five consecutive source rows feed one output row */
		sp0 = (__m64 *) sl;
		sp1 = (__m64 *) (sl + sll);
		sp2 = (__m64 *) (sl + 2 * sll);
		sp3 = (__m64 *) (sl + 3 * sll);
		sp4 = (__m64 *) (sl + 4 * sll);
		dp = (__m64 *) dl;

		PREP_V();

		/* one __m64 (= one 4-channel pixel) per iteration */
		for (i = 0; i < width; i++) {
			CONV_5x5();
			dp[i] = rr;
		}

		sl += sll;
		dl += dll;
	}

	/* leave MMX state so subsequent FPU code works */
	_mm_empty();

	return (MLIB_SUCCESS);
}
/*
 * 7x7 separable convolution for MLIB_SHORT images, 4 channels, no edge
 * processing.  KSIZE/KSIZE1 are presumably 7/6 from the enclosing
 * translation unit.  Six vertical partial sums (v1..v6) are primed per
 * row with PREP_V before the horizontal pass.
 */
mlib_status
mlib_m_sconv7x7_16nw_4(
	mlib_image *dst,
	mlib_image *src,
	mlib_s32 *hkernel,
	mlib_s32 *vkernel,
	mlib_s32 scalef_expon)
{
	GET_SRC_DST_PARAMETERS(mlib_s16);
	__m64 hker0, hker1, hker2, hker3, hker4, hker5, hker6;
	__m64 vker0, vker1, vker2, vker3, vker4, vker5, vker6;
	__m64 s0, s1, s2, s3, s4, s5, s6, v0, v1, v2, v3, v4, v5, v6, rr, rh, rl;
	__m64 zero, _rnd;
	__m64 *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *sp6, *dp;
	mlib_s32 shift, kerh_sum;
	mlib_s32 i, j;

	/* the kernel cannot fill the (KSIZE/2)-pixel border */
	width -= KSIZE1;
	height -= KSIZE1;
	width *= NCHAN;
	dl += (KSIZE / 2) * (dll + NCHAN);

	GET_KERN();

	zero = _mm_setzero_si64();

	for (j = 0; j < height; j++) {
		/* seven consecutive source rows feed one output row */
		sp0 = (__m64 *) sl;
		sp1 = (__m64 *) (sl + sll);
		sp2 = (__m64 *) (sl + 2 * sll);
		sp3 = (__m64 *) (sl + 3 * sll);
		sp4 = (__m64 *) (sl + 4 * sll);
		sp5 = (__m64 *) (sl + 5 * sll);
		sp6 = (__m64 *) (sl + 6 * sll);
		dp = (__m64 *) dl;

		/* prime the vertical accumulators for this row */
		PREP_V(v1);
		PREP_V(v2);
		PREP_V(v3);
		PREP_V(v4);
		PREP_V(v5);
		PREP_V(v6);

		/* full groups of 4 s16 values */
		for (i = 0; i < width / 4; i++) {
			CONV_7x7();
			dp[i] = rr;
		}

		/* tail: blend the last partial group through a mask */
		if (width & 3) {
			__m64 mask =
			    ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

			CONV_7x7();
			dp[i] = _mm_or_si64(_mm_and_si64(mask, rr),
			    _mm_andnot_si64(mask, dp[i]));
		}

		sl += sll;
		dl += dll;
	}

	/* leave MMX state so subsequent FPU code works */
	_mm_empty();

	return (MLIB_SUCCESS);
}
/* *********************************************************** */

/*
 * 3x3 separable convolution for MLIB_BYTE images, 1 channel, no edge
 * processing.  Two line buffers (buff0/buff1) hold the vertically
 * pre-convolved previous rows and are swapped each row; results are
 * packed back to u8 with unsigned saturation.
 *
 * FIX: the mlib_malloc() result was previously used without a NULL
 * check; allocation failure now returns MLIB_FAILURE (matching the
 * error handling style used by the other convolution entry points).
 */
mlib_status
mlib_m_sconv3x3_8nw_1(
	mlib_image *dst,
	mlib_image *src,
	mlib_s32 *hkernel,
	mlib_s32 *vkernel,
	mlib_s32 scalef_expon)
{
	__m64 buff_loc[3 * BUFF_LINE], *pbuff = buff_loc;
	__m64 *buff0, *buff1, *buffT;

	GET_SRC_DST_PARAMETERS(mlib_u8);
	__m64 hker0, hker1, hker2, vker0, vker1, vker2;
	__m64 s0, d0, d1, sum0, sum1, sum2, aa, bb, res_hi, res_lo;
	__m64 zero = _m_zero;
	mlib_s32 shift;
	mlib_s32 *sp;
	mlib_s32 row, wid4, i, j;

	/* a 3x3 kernel cannot fill the 1-pixel border */
	width -= 2;
	height -= 2;
	dl += dll + 1;

	wid4 = (width + 7) / 4;

	/* fall back to heap buffers when the image is too wide */
	if (wid4 > BUFF_LINE) {
		pbuff = mlib_malloc(sizeof (__m64) * 3 * wid4);

		if (pbuff == NULL)
			return (MLIB_FAILURE);
	}

	GET_KERN();

	buff0 = pbuff;
	buff1 = buff0 + wid4;

	/* prime the two line buffers from the first two source rows */
	for (j = 0; j < 2; j++) {
		sp = (mlib_s32 *)sl;

		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d1, lo);

		for (i = 0; i < wid4; i++) {
			*(mlib_s32 *)&s0 = sp[i];
			PREP_3x3_1ch(lo, i);
		}

		sl += sll;

		buffT = buff1;
		buff1 = buff0;
		buff0 = buffT;
	}

	for (row = 0; row < height; row++) {
		__m64 *sp = (__m64 *) sl;
		__m64 *dp = (__m64 *) dl;

		s0 = (*sp++);
		UNPACK_SRC(d1, lo);

		/* 8 u8 pixels (hi + lo halves) per iteration */
		for (i = 0; i < width / 8; i++) {
			CONV_3x3_1ch(hi, 2 * i);
			s0 = sp[i];
			CONV_3x3_1ch(lo, 2 * i + 1);
			dp[i] = _mm_packs_pu16(res_hi, res_lo);
		}

		/* tail: blend the last partial group through a mask */
		if (width & 7) {
			__m64 mask;

			mask = ((__m64 *) mlib_mask64_arr)[width & 7];

			CONV_3x3_1ch(hi, 2 * i);
			s0 = sp[i];
			CONV_3x3_1ch(lo, 2 * i + 1);
			res_hi = _mm_packs_pu16(res_hi, res_lo);

			dp[i] = _mm_or_si64(_mm_and_si64(mask, res_hi),
			    _mm_andnot_si64(mask, dp[i]));
		}

		/* rotate the line buffers for the next row */
		buffT = buff1;
		buff1 = buff0;
		buff0 = buffT;

		sl += sll;
		dl += dll;
	}

	/* leave MMX state so subsequent FPU code works */
	_mm_empty();

	if (pbuff != buff_loc)
		mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
/*
 * Integer MxN convolution (element type fixed by DTYPE/IMG_TYPE; final
 * clamp/store performed by STORE_RES).  The kernel is pre-shifted right
 * by shift1 so that products fit in 32-bit arithmetic; per-row results
 * accumulate in the mlib_s32 buffer buffd.  The kernel is walked in
 * horizontal strips of at most MAX_KER taps (kw); only the strip that
 * completes the whole kernel (l == n-1 && off == m) adds buffd and
 * stores to dst, all earlier strips only accumulate.
 */
mlib_status CONV_FUNC_I(MxN)(mlib_image *dst,
                             const mlib_image *src,
                             const mlib_s32 *kernel,
                             mlib_s32 m, mlib_s32 n,
                             mlib_s32 dm, mlib_s32 dn,
                             mlib_s32 scale,
                             mlib_s32 cmask)
{
  mlib_s32 buff[BUFF_SIZE], *buffd = buff;
  mlib_s32 l, off, kw;
  mlib_s32 d0, d1, shift1, shift2;
  mlib_s32 k0, k1, k2, k3, k4, k5, k6;
  mlib_s32 p0, p1, p2, p3, p4, p5, p6, p7;
  DTYPE    *adr_src, *sl, *sp = NULL;
  DTYPE    *adr_dst, *dl, *dp = NULL;
  mlib_s32 wid, hgt, sll, dll;
  mlib_s32 nchannel, chan1;
  mlib_s32 i, j, c;
  mlib_s32 chan2;
  mlib_s32 k_locl[MAX_N*MAX_N], *k = k_locl;

  GET_SRC_DST_PARAMETERS(DTYPE);

  /* pre-shift amount: 16 for 16-bit types, 8 for 8-bit */
#if IMG_TYPE != 1
  shift1 = 16;
#else
  shift1 = 8;
#endif /* IMG_TYPE != 1 */
  shift2 = scale - shift1;

  chan1 = nchannel;
  chan2 = chan1 + chan1;

  /* shrink to the region the full kernel can cover */
  wid -= (m - 1);
  hgt -= (n - 1);
  adr_dst += dn*dll + dm*nchannel;

  if (wid > BUFF_SIZE) {
    buffd = mlib_malloc(sizeof(mlib_s32)*wid);

    if (buffd == NULL) return MLIB_FAILURE;
  }

  if (m*n > MAX_N*MAX_N) {
    k = mlib_malloc(sizeof(mlib_s32)*(m*n));

    if (k == NULL) {
      if (buffd != buff) mlib_free(buffd);
      return MLIB_FAILURE;
    }
  }

  /* pre-shift the kernel into 16-/24-bit range */
  for (i = 0; i < m*n; i++) {
    k[i] = kernel[i] >> shift1;
  }

  for (c = 0; c < nchannel; c++) {
    if (!(cmask & (1 << (nchannel - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < wid; i++) buffd[i] = 0;

    for (j = 0; j < hgt; j++) {
      mlib_s32 *pk = k;

      for (l = 0; l < n; l++) {
        DTYPE *sp0 = sl + l*sll;

        for (off = 0; off < m;) {
          sp = sp0 + off*chan1;
          dp = dl;

          /* strip width: at most MAX_KER taps; strips of
           * MAX_KER+1 .. 2*MAX_KER taps are split in half */
          kw = m - off;

          if (kw > 2*MAX_KER) kw = MAX_KER;
          else if (kw > MAX_KER) kw = kw/2;
          off += kw;

          /* NOTE(review): p2..p7 and k0..k6 are preloaded
           * unconditionally even when kw < 7; presumably the
           * unused reads stay inside valid rows / the kernel
           * array slack — verify against the callers. */
          p2 = sp[0]; p3 = sp[chan1]; p4 = sp[chan2]; p5 = sp[chan2 + chan1];
          p6 = sp[chan2 + chan2]; p7 = sp[5*chan1];

          k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
          k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
          pk += kw;

          sp += (kw - 1)*chan1;

          /* two output pixels per iteration in every branch below;
           * "accumulate" path while more strips remain, "store"
           * path when this strip completes the kernel */
          if (kw == 7) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
                p6 = sp[0]; p7 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
                p6 = sp[0]; p7 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 6) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
                p5 = sp[0]; p6 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
                p5 = sp[0]; p6 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 5) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;
                p4 = sp[0]; p5 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;
                p4 = sp[0]; p5 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 4) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;
                p3 = sp[0]; p4 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;
                p3 = sp[0]; p4 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 3) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;
                p2 = sp[0]; p3 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;
                p2 = sp[0]; p3 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 2) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;
                p1 = sp[0]; p2 = sp[chan1];

                buffd[i    ] += p0*k0 + p1*k1;
                buffd[i + 1] += p1*k0 + p2*k1;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;
                p1 = sp[0]; p2 = sp[chan1];

                d0 = (p0*k0 + p1*k1 + buffd[i    ]);
                d1 = (p1*k0 + p2*k1 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else /*if (kw == 1)*/ {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = sp[0]; p1 = sp[chan1];

                buffd[i    ] += p0*k0;
                buffd[i + 1] += p1*k0;

                sp += chan2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = sp[0]; p1 = sp[chan1];

                d0 = (p0*k0 + buffd[i    ]);
                d1 = (p1*k0 + buffd[i + 1]);

                STORE_RES(dp[0    ], d0);
                STORE_RES(dp[chan1], d1);

                buffd[i    ] = 0;
                buffd[i + 1] = 0;

                sp += chan2;
                dp += chan2;
              }
            }
          }
        }
      }

      /* last pixels: direct (unstripped) convolution for the
       * odd trailing column left by the 2-at-a-time loops */
      for (; i < wid; i++) {
        mlib_s32 *pk = k, s = 0;
        mlib_s32 x;

        for (l = 0; l < n; l++) {
          sp = sl + l*sll + i*chan1;

          for (x = 0; x < m; x++) {
            s += sp[0] * pk[0];
            sp += chan1;
            pk ++;
          }
        }

        STORE_RES(dp[0], s);

        sp += chan1;
        dp += chan1;
      }

      sl += sll;
      dl += dll;
    }
  }

  if (buffd != buff) mlib_free(buffd);
  if (k != k_locl) mlib_free(k);

  return MLIB_SUCCESS;
}
/*
 * Floating-point MxN convolution (element type DTYPE, working type
 * FTYPE, result conversion by D2I/FROM_S32).  Source rows are copied
 * into a ring of n+1 line buffers (buffs); while the final kernel strip
 * of a row is computed, the next source row is simultaneously loaded
 * into buffn.  The kernel is scaled into FTYPE by fscale = DSCALE /
 * 2^scale (split in two steps to avoid overflowing the 1 << scale
 * shift when scale > 30).
 */
mlib_status CONV_FUNC(MxN)(mlib_image *dst,
                           const mlib_image *src,
                           const mlib_s32 *kernel,
                           mlib_s32 m, mlib_s32 n,
                           mlib_s32 dm, mlib_s32 dn,
                           mlib_s32 scale,
                           mlib_s32 cmask)
{
  FTYPE buff[BUFF_SIZE], *buffs_arr[2*(MAX_N + 1)];
  FTYPE **buffs = buffs_arr, *buffd;
  FTYPE akernel[256], *k = akernel, fscale = DSCALE;
  mlib_s32 mn, l, off, kw, bsize, buff_ind;
  mlib_s32 d0, d1;
  FTYPE k0, k1, k2, k3, k4, k5, k6;
  FTYPE p0, p1, p2, p3, p4, p5, p6, p7;
  d64_2x32 dd;
  DEF_VARS(DTYPE);
  mlib_s32 chan2;
  mlib_s32 *buffo, *buffi;
  mlib_status status = MLIB_SUCCESS;

  GET_SRC_DST_PARAMETERS(DTYPE);

  /* split the 2^scale division to keep the shift in range */
  if (scale > 30) {
    fscale *= 1.0/(1 << 30);
    scale -= 30;
  }

  fscale /= (1 << scale);

  mn = m*n;

  if (mn > 256) {
    k = mlib_malloc(mn*sizeof(mlib_d64));

    if (k == NULL) return MLIB_FAILURE;
  }

  for (i = 0; i < mn; i++) {
    k[i] = kernel[i]*fscale;
  }

  /* 1xN kernels have a dedicated vertical-only path */
  if (m == 1) {
    status = mlib_ImageConv1xN(dst, src, k, n, dn, cmask);
    FREE_AND_RETURN_STATUS;
  }

  bsize = (n + 3)*wid;

  if ((bsize > BUFF_SIZE) || (n > MAX_N)) {
    pbuff = mlib_malloc(sizeof(FTYPE)*bsize + sizeof(FTYPE *)*2*(n + 1));

    if (pbuff == NULL) {
      status = MLIB_FAILURE;
      FREE_AND_RETURN_STATUS;
    }

    buffs = (FTYPE **)(pbuff + bsize);
  }

  /* ring of n+1 line buffers, duplicated so buffc[l] works
   * without modular arithmetic */
  for (l = 0; l < (n + 1); l++) buffs[l] = pbuff + l*wid;
  for (l = 0; l < (n + 1); l++) buffs[l + (n + 1)] = buffs[l];
  buffd = buffs[n] + wid;
  buffo = (mlib_s32*)(buffd + wid);
  buffi = buffo + (wid &~ 1);

  chan1 = nchannel;
  chan2 = chan1 + chan1;

  /* shrink to the region the full kernel can cover */
  wid -= (m - 1);
  hgt -= (n - 1);
  adr_dst += dn*dll + dm*nchannel;

  for (c = 0; c < nchannel; c++) {
    if (!(cmask & (1 << (chan1 - 1 - c)))) continue;

    sl = adr_src + c;
    dl = adr_dst + c;

    /* preload the first n source rows into the ring */
    for (l = 0; l < n; l++) {
      FTYPE *buff = buffs[l];

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
      for (i = 0; i < wid + (m - 1); i++) {
        buff[i] = (FTYPE)sl[i*chan1];
      }

      sl += sll;
    }

    buff_ind = 0;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < wid; i++) buffd[i] = 0.0;

    for (j = 0; j < hgt; j++) {
      FTYPE **buffc = buffs + buff_ind;
      FTYPE *buffn = buffc[n];
      FTYPE *pk = k;

      for (l = 0; l < n; l++) {
        FTYPE *buff_l = buffc[l];

        for (off = 0; off < m;) {
          FTYPE *buff = buff_l + off;

          /* strip width: at most MAX_KER taps; strips of
           * MAX_KER+1 .. 2*MAX_KER taps are split in half */
          kw = m - off;

          if (kw > 2*MAX_KER) kw = MAX_KER;
          else if (kw > MAX_KER) kw = kw/2;
          off += kw;

          sp = sl;
          dp = dl;

          /* NOTE(review): p2..p7 and k0..k6 are preloaded
           * unconditionally even when kw < 7 — presumably
           * harmless over-reads inside the buffers; verify. */
          p2 = buff[0]; p3 = buff[1]; p4 = buff[2];
          p5 = buff[3]; p6 = buff[4]; p7 = buff[5];

          k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3];
          k4 = pk[4]; k5 = pk[5]; k6 = pk[6];
          pk += kw;

          /* in every "store" branch below the next source row is
           * loaded into buffn while results are written to dp */
          if (kw == 7) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
                p6 = buff[i + 6]; p7 = buff[i + 7];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6; p5 = p7;
                p6 = buff[i + 6]; p7 = buff[i + 7];

                LOAD_BUFF(buffi);

                dd.d64 = *(FTYPE *)(buffi + i);
                buffn[i    ] = (FTYPE)dd.i32s.i0;
                buffn[i + 1] = (FTYPE)dd.i32s.i1;

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + p6*k6 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + p7*k6 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 6) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
                p5 = buff[i + 5]; p6 = buff[i + 6];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5; p4 = p6;
                p5 = buff[i + 5]; p6 = buff[i + 6];

                buffn[i    ] = (FTYPE)sp[0];
                buffn[i + 1] = (FTYPE)sp[chan1];

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + p5*k5 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + p6*k5 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 5) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;
                p4 = buff[i + 4]; p5 = buff[i + 5];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4; p3 = p5;
                p4 = buff[i + 4]; p5 = buff[i + 5];

                buffn[i    ] = (FTYPE)sp[0];
                buffn[i + 1] = (FTYPE)sp[chan1];

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + p4*k4 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + p5*k4 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 4) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;
                p3 = buff[i + 3]; p4 = buff[i + 4];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2 + p3*k3;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3; p2 = p4;
                p3 = buff[i + 3]; p4 = buff[i + 4];

                buffn[i    ] = (FTYPE)sp[0];
                buffn[i + 1] = (FTYPE)sp[chan1];

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else if (kw == 3) {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;
                p2 = buff[i + 2]; p3 = buff[i + 3];

                buffd[i    ] += p0*k0 + p1*k1 + p2*k2;
                buffd[i + 1] += p1*k0 + p2*k1 + p3*k2;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2; p1 = p3;
                p2 = buff[i + 2]; p3 = buff[i + 3];

                buffn[i    ] = (FTYPE)sp[0];
                buffn[i + 1] = (FTYPE)sp[chan1];

                d0 = D2I(p0*k0 + p1*k1 + p2*k2 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + p3*k2 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }

          } else /*if (kw == 2)*/ {

            if (l < (n - 1) || off < m) {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;
                p1 = buff[i + 1]; p2 = buff[i + 2];

                buffd[i    ] += p0*k0 + p1*k1;
                buffd[i + 1] += p1*k0 + p2*k1;
              }

            } else {
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
              for (i = 0; i <= (wid - 2); i += 2) {
                p0 = p2;
                p1 = buff[i + 1]; p2 = buff[i + 2];

                buffn[i    ] = (FTYPE)sp[0];
                buffn[i + 1] = (FTYPE)sp[chan1];

                d0 = D2I(p0*k0 + p1*k1 + buffd[i    ]);
                d1 = D2I(p1*k0 + p2*k1 + buffd[i + 1]);

                dp[0    ] = FROM_S32(d0);
                dp[chan1] = FROM_S32(d1);

                buffd[i    ] = 0.0;
                buffd[i + 1] = 0.0;

                sp += chan2;
                dp += chan2;
              }
            }
          }
        }
      }

      /* last pixels: direct convolution over the ring buffers for
       * the odd trailing column left by the 2-at-a-time loops */
      for (; i < wid; i++) {
        FTYPE *pk = k, s = 0;
        mlib_s32 x, d0;

        for (l = 0; l < n; l++) {
          FTYPE *buff = buffc[l] + i;

          for (x = 0; x < m; x++) s += buff[x] * (*pk++);
        }

        d0 = D2I(s);
        dp[0] = FROM_S32(d0);

        buffn[i] = (FTYPE)sp[0];

        sp += chan1;
        dp += chan1;
      }

      /* copy the (m-1)-pixel right apron of the next row */
      for (l = 0; l < (m - 1); l++) buffn[wid + l] = sp[l*chan1];

      /* next line */
      sl += sll;
      dl += dll;
      buff_ind++;

      if (buff_ind >= n + 1) buff_ind = 0;
    }
  }

  FREE_AND_RETURN_STATUS;
}
static mlib_status mlib_ImageConv1xN(mlib_image *dst, const mlib_image *src, const mlib_d64 *k, mlib_s32 n, mlib_s32 dn, mlib_s32 cmask) { FTYPE buff[BUFF_SIZE]; mlib_s32 off, kh; mlib_s32 d0, d1; const FTYPE *pk; FTYPE k0, k1, k2, k3; FTYPE p0, p1, p2, p3, p4; DEF_VARS(DTYPE); DTYPE *sl_c, *dl_c, *sl0; mlib_s32 l, hsize, max_hsize; GET_SRC_DST_PARAMETERS(DTYPE); hgt -= (n - 1); adr_dst += dn*dll; max_hsize = (CACHE_SIZE/sizeof(DTYPE))/sll; if (!max_hsize) max_hsize = 1; if (max_hsize > BUFF_SIZE) { pbuff = mlib_malloc(sizeof(FTYPE)*max_hsize); } chan1 = nchannel; sl_c = adr_src; dl_c = adr_dst; for (l = 0; l < hgt; l += hsize) { hsize = hgt - l; if (hsize > max_hsize) hsize = max_hsize; for (c = 0; c < nchannel; c++) { if (!(cmask & (1 << (chan1 - 1 - c)))) continue; sl = sl_c + c; dl = dl_c + c; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (j = 0; j < hsize; j++) pbuff[j] = 0.0; for (i = 0; i < wid; i++) { sl0 = sl; for (off = 0; off < (n - 4); off += 4) { pk = k + off; sp = sl0; k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll]; sp += 3*sll; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (j = 0; j < hsize; j += 2) { p0 = p2; p1 = p3; p2 = p4; p3 = sp[0]; p4 = sp[sll]; pbuff[j ] += p0*k0 + p1*k1 + p2*k2 + p3*k3; pbuff[j + 1] += p1*k0 + p2*k1 + p3*k2 + p4*k3; sp += 2*sll; } sl0 += 4*sll; } pk = k + off; sp = sl0; k0 = pk[0]; k1 = pk[1]; k2 = pk[2]; k3 = pk[3]; p2 = sp[0]; p3 = sp[sll]; p4 = sp[2*sll]; dp = dl; kh = n - off; if (kh == 4) { sp += 3*sll; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (j = 0; j <= (hsize - 2); j += 2) { p0 = p2; p1 = p3; p2 = p4; p3 = sp[0]; p4 = sp[sll]; d0 = D2I(p0*k0 + p1*k1 + p2*k2 + p3*k3 + pbuff[j]); d1 = D2I(p1*k0 + p2*k1 + p3*k2 + p4*k3 + pbuff[j + 1]); dp[0 ] = FROM_S32(d0); dp[dll] = FROM_S32(d1); pbuff[j] = 0; pbuff[j + 1] = 0; sp += 2*sll; dp += 2*dll; } if (j < hsize) { p0 = p2; p1 = p3; p2 = p4; p3 = sp[0]; d0 = D2I(p0*k0 + 
p1*k1 + p2*k2 + p3*k3 + pbuff[j]); pbuff[j] = 0; dp[0] = FROM_S32(d0); } } else if (kh == 3) { sp += 2*sll; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (j = 0; j <= (hsize - 2); j += 2) { p0 = p2; p1 = p3; p2 = sp[0]; p3 = sp[sll]; d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]); d1 = D2I(p1*k0 + p2*k1 + p3*k2 + pbuff[j + 1]); dp[0 ] = FROM_S32(d0); dp[dll] = FROM_S32(d1); pbuff[j] = 0; pbuff[j + 1] = 0; sp += 2*sll; dp += 2*dll; } if (j < hsize) { p0 = p2; p1 = p3; p2 = sp[0]; d0 = D2I(p0*k0 + p1*k1 + p2*k2 + pbuff[j]); pbuff[j] = 0; dp[0] = FROM_S32(d0); } } else if (kh == 2) { sp += sll; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (j = 0; j <= (hsize - 2); j += 2) { p0 = p2; p1 = sp[0]; p2 = sp[sll]; d0 = D2I(p0*k0 + p1*k1 + pbuff[j]); d1 = D2I(p1*k0 + p2*k1 + pbuff[j + 1]); dp[0 ] = FROM_S32(d0); dp[dll] = FROM_S32(d1); pbuff[j] = 0; pbuff[j + 1] = 0; sp += 2*sll; dp += 2*dll; } if (j < hsize) { p0 = p2; p1 = sp[0]; d0 = D2I(p0*k0 + p1*k1 + pbuff[j]); pbuff[j] = 0; dp[0] = FROM_S32(d0); } } else /* if (kh == 1) */ { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (j = 0; j < hsize; j++) { p0 = sp[0]; d0 = D2I(p0*k0 + pbuff[j]); dp[0] = FROM_S32(d0); pbuff[j] = 0; sp += sll; dp += dll; } } sl += chan1; dl += chan1; } } sl_c += max_hsize*sll; dl_c += max_hsize*dll; } if (pbuff != buff) mlib_free(pbuff); return MLIB_SUCCESS; }
/*
 * 5x5 general (non-separable) convolution for MLIB_USHORT images,
 * 2 channels, no edge processing.  Four rows of horizontal partial
 * sums are kept in a rotating set of heap buffers (buff0..buff3,
 * tracked via pbuff_arr); u16 data is biased to s16 with the 0x8000
 * mask before MMX arithmetic and biased back afterwards (inside the
 * CONV/PREP macros).
 *
 * FIX: the mlib_malloc() result was previously used without a NULL
 * check; allocation failure now returns MLIB_FAILURE.
 */
mlib_status
mlib_m_conv5x5_u16nw_2(
	mlib_image *dst,
	mlib_image *src,
	mlib_s32 *kern,
	mlib_s32 scalef_expon)
{
	__m64 *pbuff, *buff_arr[20], **pbuff_arr = buff_arr;
	__m64 *buff0, *buff1, *buff2, *buff3;

	GET_SRC_DST_PARAMETERS(mlib_s16);
	__m64 ker[5][5];
	__m64 d0, d1, d2, aa, bb, rr, tmpa, tmpb, ker_off, mask8000;
	__m64 prev0h, prev1h, prev2h, prev3h, sum0h, sum1h, sum2h, sum3h,
	    sum4h, tmph;
	__m64 prev0l, prev1l, prev2l, prev3l, sum0l, sum1l, sum2l, sum3l,
	    sum4l, tmpl;
	__m64 *sp, *dp;
	mlib_s32 shift, ind, ker_sum = 0;
	mlib_s32 row, wid4, i, j;

	/* a 5x5 kernel cannot fill the 2-pixel border */
	width -= 4;
	height -= 4;
	width *= NCHAN;
	dl += 2 * (dll + NCHAN);

	wid4 = (width + 7) / 4;
	pbuff = mlib_malloc(sizeof (__m64) * 20 * wid4);

	if (pbuff == NULL)
		return (MLIB_FAILURE);

	GET_KERN();

	for (i = 0; i < 10; i++) {
		buff_arr[i] = pbuff + i * 2 * wid4;
	}

	/* prime the partial-sum buffers from the first four rows */
	ind = 0;
	for (j = 1; j <= 4; j++) {
		buff0 = buff_arr[ind];
		buff1 = buff_arr[ind + 1];
		buff2 = buff_arr[ind + 2];
		buff3 = buff_arr[ind + 3];

		sp = (__m64 *) sl;
		d1 = (*sp++);
		d1 = _mm_xor_si64(d1, mask8000);	/* u16 -> biased s16 */
		d2 = (*sp++);
		d2 = _mm_xor_si64(d2, mask8000);

		for (i = 0; i < wid4; i++) {
			PREP_5x5();
		}

		sl += sll;
		ind += j;
	}

	for (row = 0; row < height; row++) {
		sp = (__m64 *) sl;
		dp = (__m64 *) dl;

		/* the 4 most recent rows of partial sums */
		buff0 = pbuff_arr[0];
		buff1 = pbuff_arr[2];
		buff2 = pbuff_arr[5];
		buff3 = pbuff_arr[9];

		d1 = (*sp++);
		d1 = _mm_xor_si64(d1, mask8000);
		d2 = (*sp++);
		d2 = _mm_xor_si64(d2, mask8000);

		/* full groups of 4 s16 values */
		for (i = 0; i < width / 4; i++) {
			CONV_5x5(hi, i);
			dp[i] = rr;
		}

		/* tail: blend the last partial group through a mask */
		if (width & 3) {
			__m64 mask =
			    ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

			CONV_5x5(hi, i);

			dp[i] = _mm_or_si64(_mm_and_si64(mask, rr),
			    _mm_andnot_si64(mask, dp[i]));
		}

		/* rotate the buffer-pointer table by flipping between the
		 * two halves of pbuff_arr */
		ind = (pbuff_arr == buff_arr) ? 10 : -10;
		pbuff_arr[ind + 0] = pbuff_arr[1];
		pbuff_arr[ind + 1] = pbuff_arr[3];
		pbuff_arr[ind + 2] = pbuff_arr[4];
		pbuff_arr[ind + 3] = pbuff_arr[6];
		pbuff_arr[ind + 4] = pbuff_arr[7];
		pbuff_arr[ind + 5] = pbuff_arr[8];
		pbuff_arr[ind + 6] = pbuff_arr[0];
		pbuff_arr[ind + 7] = pbuff_arr[2];
		pbuff_arr[ind + 8] = pbuff_arr[5];
		pbuff_arr[ind + 9] = pbuff_arr[9];
		pbuff_arr += ind;

		sl += sll;
		dl += dll;
	}

	/* leave MMX state so subsequent FPU code works */
	_mm_empty();

	mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
/*
 * 2x2 convolution for MLIB_USHORT images with a channel mask (cmask),
 * no edge processing, VIS implementation.  Two intermediate u16 line
 * buffers (sbuf1/sbuf2) are swapped each row; the main loop is software
 * pipelined so that each iteration stores the result of the previous
 * iteration and pre-loads the next source row.  u16 data is biased to
 * s16 with mask8000 before the fixed-point VIS arithmetic and biased
 * back on store.  The final row is handled separately since no further
 * row needs loading.
 */
mlib_status
mlib_v_conv2x2_u16nw_mask(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon,
    mlib_s32 cmask)
{
	/* pointers to dst row */
	mlib_u16 *da, *d_a;

	/* pointers to src, dst data */
	mlib_u16 *adr_dst, *adr_src, *dend;

	/* pointers to src rows */
	mlib_u16 *sa, *sa1, *sa2, *sa_2;

	/* pointers to rows in interm. src buf */
	mlib_u16 *buff_src, *sbuf1, *sbuf2, *prow;
	mlib_u16 *s_buf1;

	/* mlib_d64 pointers to rows in interm. src buf */
	mlib_d64 *s1, *s2;

	/* src, dst and interm. buf. strides */
	mlib_s32 dlb, slb, buf_slb;
	mlib_s32 dh, dw;
	mlib_d64 out0, out1, tmp0, tmp1, tmp2, tmp3;

	/* data */
	mlib_d64 d1, d2, d_1, d_2;

	/* shifted data */
	mlib_d64 d21, d22;

	/* coefficients */
	mlib_f32 k1, k2, k3, k4;
	int gsr_scale, i, j, nchannel, chan, testchan;
	mlib_u16 t1, t2, t3, t4, t5, t6, t7, t8;
	type_mlib_d64 str;
	mlib_d64 ker_off, mask8000 = vis_to_double_dup(0x80008000);

	nchannel = mlib_ImageGetChannels(src);
	GET_SRC_DST_PARAMETERS();
	LOAD_KERNEL_INTO_FLOAT();

	/* GSR: pack scale plus align offset of 2 bytes (one u16) */
	gsr_scale = 32 - scalef_expon;
	vis_write_gsr((gsr_scale << 3) + 2);

	/* buf_slb - 8-byte aligned */
	buf_slb = (2 * dw + 26) & (~7);

	/* alloc. interm. src buffer */
	buff_src =
	    (mlib_u16 *)__mlib_malloc(2 * buf_slb * sizeof (mlib_u8) + 8);

	if (buff_src == NULL)
		return (MLIB_FAILURE);

	buf_slb >>= 1;

	sbuf1 = (mlib_u16 *)((mlib_addr)(buff_src + 8) & (~7));
	sbuf2 = sbuf1 + buf_slb;

	dw -= 1;
	/* edge - no write */
	dh -= 1;

	testchan = 1;

	for (chan = nchannel - 1; chan >= 0; chan--) {
		/* skip channels excluded by the mask */
		if ((cmask & testchan) == 0) {
			testchan <<= 1;
			continue;
		}

		testchan <<= 1;
		sa = adr_src + chan;
		sa1 = sa + slb;
		sa_2 = sa2 = sa1 + slb;
		d_a = adr_dst + chan;

		/* load interm. src buff */
		for (i = 0, j = 0; j < (dw + 1); i += nchannel, j++) {
			sbuf1[j] = sa1[i];
			sbuf2[j] = sa[i];
		}

		for (j = 0; j < dh - 1; j++) {
			da = d_a;
			prow = sbuf1;
			sbuf1 = sbuf2;
			sbuf2 = prow;
			s1 = (mlib_d64 *)sbuf1;
			s2 = (mlib_d64 *)sbuf2;
			dend = da + (dw - 1) * nchannel;
			s_buf1 = sbuf1;

			/* prologue: compute the first result group */
			d1 = *s1;
			d2 = *s2;
			d1 = vis_fxor(d1, mask8000);
			d2 = vis_fxor(d2, mask8000);
			d_1 = *(s1 + 1);
			d_2 = *(s2 + 1);
			d_1 = vis_fxor(d_1, mask8000);
			d_2 = vis_fxor(d_2, mask8000);
			CONV_16_BEGIN(d1, k1);
			CONV_16(d2, k3);
			d21 = vis_faligndata(d1, d_1);
			d22 = vis_faligndata(d2, d_2);
			CONV_16(d21, k2);
			CONV_16(d22, k4);
			str.value =
			    vis_fxor(vis_fpackfix_pair(out0, out1), mask8000);
			d1 = d_1;
			d2 = d_2;
			s1++;
			s2++;

/*
 * in each iteration store result from prev. iterat.
 * and load data for processing next row
 */
#pragma pipeloop(0)
			for (i = 0; i < dw - 4; i += 4) {
				t1 = *sa_2;
				sa_2 += nchannel;
				t2 = *sa_2;
				sa_2 += nchannel;
				d_1 = *(s1 + 1);
				d_2 = *(s2 + 1);
				d_1 = vis_fxor(d_1, mask8000);
				d_2 = vis_fxor(d_2, mask8000);
				CONV_16_BEGIN(d1, k1);
				t3 = *sa_2;
				sa_2 += nchannel;
				t4 = *sa_2;
				sa_2 += nchannel;
				CONV_16(d2, k3);
				t5 = str.forshort.ushort0;
				t6 = str.forshort.ushort1;
				d21 = vis_faligndata(d1, d_1);
				t7 = str.forshort.ushort2;
				d22 = vis_faligndata(d2, d_2);
				t8 = str.forshort.ushort3;
				CONV_16(d21, k2);
				(*s_buf1++) = t1;
				(*s_buf1++) = t2;
				CONV_16(d22, k4);
				(*s_buf1++) = t3;
				(*s_buf1++) = t4;
				*da = t5;
				da += nchannel;
				str.value =
				    vis_fxor(vis_fpackfix_pair(out0, out1),
				    mask8000);
				*da = t6;
				da += nchannel;
				d1 = d_1;
				d2 = d_2;
				*da = t7;
				da += nchannel;
				s1++;
				s2++;
				*da = t8;
				da += nchannel;
			}

			/* finish loading the next row into the buffer */
			for (; i < dw + 1; i++) {
				(*s_buf1++) = *sa_2;
				sa_2 += nchannel;
			}

			/* epilogue: store the remaining 1..4 results */
			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = str.forshort.ushort0;
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = str.forshort.ushort1;
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = str.forshort.ushort2;
				da += nchannel;
			}

			if ((mlib_addr)da <= (mlib_addr)dend) {
				*da = str.forshort.ushort3;
			}

			sa_2 = sa2 = sa2 + slb;
			d_a += dlb;
		}

		/* process last row - no need to load data */
		da = d_a;
		prow = sbuf1;
		sbuf1 = sbuf2;
		sbuf2 = prow;
		s1 = (mlib_d64 *)sbuf1;
		s2 = (mlib_d64 *)sbuf2;
		dend = da + (dw - 1) * nchannel;
		d1 = *s1;
		d2 = *s2;
		d1 = vis_fxor(d1, mask8000);
		d2 = vis_fxor(d2, mask8000);
		d_1 = *(s1 + 1);
		d_2 = *(s2 + 1);
		d_1 = vis_fxor(d_1, mask8000);
		d_2 = vis_fxor(d_2, mask8000);
		CONV_16_BEGIN(d1, k1);
		CONV_16(d2, k3);
		d21 = vis_faligndata(d1, d_1);
		d22 = vis_faligndata(d2, d_2);
		CONV_16(d21, k2);
		CONV_16(d22, k4);
		d1 = d_1;
		d2 = d_2;
		s1++;
		s2++;

#pragma pipeloop(0)
		for (i = 4; i < dw; i += 4) {
			str.value =
			    vis_fxor(vis_fpackfix_pair(out0, out1), mask8000);
			d_1 = *(s1 + 1);
			d_2 = *(s2 + 1);
			d_1 = vis_fxor(d_1, mask8000);
			d_2 = vis_fxor(d_2, mask8000);
			CONV_16_BEGIN(d1, k1);
			t5 = str.forshort.ushort0;
			CONV_16(d2, k3);
			d21 = vis_faligndata(d1, d_1);
			t6 = str.forshort.ushort1;
			d22 = vis_faligndata(d2, d_2);
			CONV_16(d21, k2);
			t7 = str.forshort.ushort2;
			CONV_16(d22, k4);
			t8 = str.forshort.ushort3;
			*da = t5;
			da += nchannel;
			*da = t6;
			da += nchannel;
			*da = t7;
			da += nchannel;
			d1 = d_1;
			d2 = d_2;
			*da = t8;
			da += nchannel;
			s1++;
			s2++;
		}

		str.value = vis_fxor(vis_fpackfix_pair(out0, out1), mask8000);

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = str.forshort.ushort0;
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = str.forshort.ushort1;
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = str.forshort.ushort2;
			da += nchannel;
		}

		if ((mlib_addr)da <= (mlib_addr)dend) {
			*da = str.forshort.ushort3;
		}
	}

	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}
/*
 * 2x2 convolution for MLIB_USHORT images, 4 channels, no edge
 * processing, VIS implementation.  With 4 u16 channels one pixel is
 * one mlib_d64, so no channel masking is needed; results go through an
 * intermediate dst buffer (dbuf/ddst) that the COPY_* macros merge
 * into the possibly unaligned destination.  u16 data is biased to s16
 * with mask8000 around the fixed-point arithmetic.
 */
mlib_status
mlib_v_conv2x2_u16nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon)
{
	/* pointers to dst row */
	mlib_u16 *da, *d_a;

	/* pointers to src, dst data */
	mlib_u16 *adr_dst, *adr_src, *dend;

	/* pointers to src rows */
	mlib_u16 *sa, *sa1;

	/* pointers to rows in interm. src buf */
	mlib_d64 *buff_src, *sbuf1, *sbuf2, *prow;

	/* pointer to row in interm. dst buf */
	mlib_d64 *dbuf;

	/* mlib_d64 pointers to rows in interm. src buf */
	mlib_d64 *s1, *s2;

	/* mlib_d64 pointer to row in interm. dst buf */
	mlib_d64 *ddst;

	/* data */
	mlib_d64 d1, d2, d_1, d_2;
	mlib_f32 k1, k2, k3, k4;

	/* src, dst and interm. buf. strides */
	mlib_s32 dlb, slb, buf_slb;
	mlib_s32 dh, dw;
	mlib_d64 out0, out1, tmp0, tmp1, tmp2, tmp3;
	mlib_d64 *dsa, *dp;
	mlib_d64 sd0, sd1;
	mlib_s32 emask;
	int gsr_scale, i, j;
	mlib_d64 ker_off, mask8000 = vis_to_double_dup(0x80008000);

	GET_SRC_DST_PARAMETERS();
	LOAD_KERNEL_INTO_FLOAT();

	/* fixed-point pack scale for vis_fpackfix_pair */
	gsr_scale = 32 - scalef_expon;
	vis_write_gsr((gsr_scale << 3));

	buf_slb = (8 * dw + 16) >> 3;
	PREPARE_INTERM_BUFFERS();

	/* a 2x2 kernel cannot fill the 1-pixel right/bottom border */
	dw -= 1;
	/* 4 u16 values per pixel */
	dw *= 4;
	dh -= 1;

	sa = adr_src;
	sa1 = sa + slb;
	d_a = adr_dst;

	/* load interm. src buff */
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(sbuf2, sa, 4);

#pragma pipeloop(0)
	for (j = 0; j < dh; j++) {
		LOOP_INI();

#pragma pipeloop(0)
		LOAD_LINE_INTO_BUFFER(sbuf2, sa1, 4);

		d1 = *s1;
		d2 = *s2;
		d1 = vis_fxor(d1, mask8000);	/* u16 -> biased s16 */
		d2 = vis_fxor(d2, mask8000);

#pragma pipeloop(0)
		for (i = 0; i < dw; i += 4) {
			d_1 = *(s1 + 1);
			d_2 = *(s2 + 1);
			d_1 = vis_fxor(d_1, mask8000);
			d_2 = vis_fxor(d_2, mask8000);
			CONV_16_BEGIN(d1, k1);
			CONV_16(d2, k3);
			CONV_16(d_1, k2);
			CONV_16(d_2, k4);
			(*ddst++) =
			    vis_fxor(vis_fpackfix_pair(out0, out1), mask8000);
			d1 = d_1;
			d2 = d_2;
			s1++;
			s2++;
		}

		/* merge the intermediate row into the real destination */
		PREPARE_TO_COPY_INTERM_BUF_TO_DST();

#pragma pipeloop(0)
		COPY_INTERM_BUF_TO_DST();
		COPY_TAIL();

		sa1 = sa1 + slb;
		d_a += dlb;
	}

	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}
/*
 * 5x5 separable convolution for MLIB_BYTE images, 2 channels, no edge
 * processing.  Five line buffers hold vertically pre-convolved rows and
 * rotate each output row; results are packed back to u8 with unsigned
 * saturation.
 *
 * FIXES:
 *  - the mlib_malloc() result was used without a NULL check;
 *  - `ind` was incremented while uninitialized (undefined behavior);
 *    it is now initialized to 0.  Its value is never read in this
 *    function's visible code (the priming loop indexes with `j`).
 */
mlib_status
mlib_m_sconv5x5_8nw_2(
	mlib_image *dst,
	mlib_image *src,
	mlib_s32 *hkernel,
	mlib_s32 *vkernel,
	mlib_s32 scalef_expon)
{
	__m64 *pbuff, *buff_arr[5];
	__m64 *buff0, *buff1, *buff2, *buff3, *buff4, *buffT;

	GET_SRC_DST_PARAMETERS(mlib_u8);
	__m64 hker0, hker1, hker2, hker3, hker4;
	__m64 vker0, vker1, vker2, vker3, vker4;
	__m64 s0, d0, d1, d2, prev0;
	__m64 sum0, sum1, sum2, sum3, sum4, aa, bb, res_hi, res_lo;
	__m64 zero = _m_zero;
	mlib_s32 shift, ind = 0;
	mlib_s32 *sp;
	mlib_s32 row, wid4, i, j;

	/* a 5x5 kernel cannot fill the 2-pixel border */
	width -= 4;
	height -= 4;
	width *= NCHAN;
	dl += 2 * (dll + NCHAN);

	wid4 = 2 * ((width + 7) / 8);
	pbuff = mlib_malloc(sizeof (__m64) * 5 * wid4);

	if (pbuff == NULL)
		return (MLIB_FAILURE);

	GET_KERN();

	for (i = 0; i < 5; i++) {
		buff_arr[i] = pbuff + i * wid4;
	}

	/* prime four line buffers from the first four source rows */
	for (j = 0; j < 4; j++) {
		buff4 = buff_arr[j];

		sp = (mlib_s32 *)sl;
		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d1, lo);
		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d2, lo);

		for (i = 0; i < wid4; i++) {
			*(mlib_s32 *)&s0 = sp[i];
			PREP_5x5(lo, i);
		}

		sl += sll;
		ind++;
	}

	buff0 = buff_arr[0];
	buff1 = buff_arr[1];
	buff2 = buff_arr[2];
	buff3 = buff_arr[3];
	buff4 = buff_arr[4];

	for (row = 0; row < height; row++) {
		__m64 *sp = (__m64 *) sl;
		__m64 *dp = (__m64 *) dl;

		s0 = (*sp++);
		UNPACK_SRC(d1, lo);
		UNPACK_SRC(d2, hi);

		/* 8 u8 values (lo + hi halves) per iteration */
		for (i = 0; i < width / 8; i++) {
			s0 = sp[i];
			CONV_5x5(lo, 2 * i);
			CONV_5x5(hi, 2 * i + 1);
			dp[i] = _mm_packs_pu16(res_lo, res_hi);
		}

		/* tail: blend the last partial group through a mask */
		if (width & 7) {
			__m64 mask =
			    ((__m64 *) mlib_mask64_arr)[width & 7];

			s0 = sp[i];
			CONV_5x5(lo, 2 * i);
			CONV_5x5(hi, 2 * i + 1);
			res_hi = _mm_packs_pu16(res_lo, res_hi);

			dp[i] = _mm_or_si64(_mm_and_si64(mask, res_hi),
			    _mm_andnot_si64(mask, dp[i]));
		}

		/* rotate the five line buffers */
		buffT = buff0;
		buff0 = buff1;
		buff1 = buff2;
		buff2 = buff3;
		buff3 = buff4;
		buff4 = buffT;

		sl += sll;
		dl += dll;
	}

	/* leave MMX state so subsequent FPU code works */
	_mm_empty();

	mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
/*
 * 3x3 convolution over a MLIB_BYTE (u8), 4-channel image, no edge
 * processing, with a channel mask (cmask selects which channels are
 * written).  SPARC VIS implementation: three source rows are staged in
 * intermediate buffers, convolved 8 bytes at a time with faligndata used
 * to form the shifted (neighbour-column) operands, and the result row is
 * copied to the destination under a combined edge/channel byte mask.
 */
mlib_status
mlib_v_conv3x3_8nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon,
    mlib_s32 cmask)
{
	/* pointers to dst row */
	mlib_u8 *da, *d_a;

	/* pointers to src, dst data */
	mlib_u8 *adr_dst, *adr_src, *dend;

	/* pointers to src rows */
	mlib_u8 *sa, *sa1, *sa2;

	/* pointers to rows in interm. src buf */
	mlib_d64 *buff_src, *sbuf1, *sbuf2, *prow;

	/* pointers to rows in interm. src buf */
	mlib_d64 *sbuf3;

	/* pointer to row in interm. dst buf */
	mlib_d64 *dbuf;

	/* mlib_d64 pointers to rows in interm. src buf */
	mlib_d64 *s1, *s2, *s3;

	/* mlib_d64 pointer to row in interm. dst buf */
	mlib_d64 *ddst;

	/* data */
	mlib_d64 d1, d2, d_1, d_2, d21, d22;

	/* data */
	mlib_d64 d3, d_3, d23;

	/* kernel coefficients packed in pairs for the VIS multiplies */
	mlib_f32 k1k2, k3k4, k5k6, k7k8, k9k9;

	/* src, dst and interm. buf. strides */
	mlib_s32 dlb, slb, buf_slb;
	mlib_s32 dh, dw;
	mlib_d64 out0, out1;
	mlib_d64 tmp0, tmp1, rnd;
	mlib_d64 *dsa, *dp;
	mlib_d64 sd0, sd1, sd00;
	mlib_s32 emask, cmask1;
	mlib_s32 rval, gsr_scale, i, j;

	gsr_scale = 31 - scalef_expon;
	vis_write_gsr((gsr_scale << 3));
	/* rounding constant matched to the fpack16 scale */
	rval = mlib_round_8[gsr_scale];
	rnd = vis_freg_pair(vis_to_float(rval), vis_to_float(rval));

	/* replicate the 4-bit channel mask across all 4 pixels of a word */
	cmask = ((cmask & 0xf) << 4) + (cmask & 0xf);
	cmask = (cmask << 8) + (cmask);

	GET_SRC_DST_PARAMETERS();
	LOAD_KERNEL_INTO_FLOAT();

	buf_slb = (4 * dw + 24) >> 3;
	PREPARE_INTERM_BUFFERS();

	/* no-write edge: output is (dw - 2) x (dh - 2); dw in bytes (x4 channels) */
	dw -= 2;
	dw *= 4;
	dh -= 2;

	sa = adr_src;
	sa1 = sa + slb;
	sa2 = sa1 + slb;
	/* destination starts one row down, one pixel (4 bytes) right */
	d_a = adr_dst + dlb + 4;

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf2, sa);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(8);

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf3, sa1);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(8);

#pragma pipeloop(0)
	for (j = 0; j < dh; j++) {
		LOOP_INI();

		/* prefetch the next source row into the rotating buffers */
		PREPARE_TO_LOAD_LINE(sbuf3, sa2);
#pragma pipeloop(0)
		LOAD_LINE_INTO_BUFFER(8);

		/* align to 4-byte (one pixel) offset for faligndata below */
		vis_alignaddr(s1, 4);
		d1 = *s1;
		d2 = *s2;
		d3 = *s3;

#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d_1 = *(s1 + 1);
			d_2 = *(s2 + 1);
			d_3 = *(s3 + 1);
			out0 = out1 = rnd;
			/* column 0 taps of each row */
			CONV_AU(d1, k1k2);
			CONV_AL(d2, k3k4);
			CONV_AU(d3, k7k8);
			/* column 1: data shifted by one pixel */
			d21 = vis_faligndata(d1, d_1);
			d22 = vis_faligndata(d2, d_2);
			d23 = vis_faligndata(d3, d_3);
			CONV_AL(d21, k1k2);
			CONV_AU(d22, k5k6);
			CONV_AL(d23, k7k8);
			/* column 2: the next word */
			CONV_AU(d_1, k3k4);
			CONV_AL(d_2, k5k6);
			CONV_AU(d_3, k9k9);
			(*ddst++) = vis_fpack16_pair(out0, out1);
			d1 = d_1;
			d2 = d_2;
			d3 = d_3;
			s1++;
			s2++;
			s3++;
		}

		ddst = dbuf;
/* prepare the destination addresses */
		dp = (mlib_d64 *)((mlib_addr)da & (~7));
		i = (mlib_addr)dp - (mlib_addr)da;
		/* rotate channel mask to match the destination misalignment */
		cmask1 = cmask >> (-i);
		ddst = vis_alignaddr(ddst, i);
/* generate edge mask for the start point */
		emask = vis_edge8(da, dend);
		sd1 = ddst[0];

		if (emask != 0xff) {
			sd0 = sd1;
			sd1 = ddst[1];
			sd0 = vis_faligndata(sd0, sd1);
			vis_pst_8(sd0, dp++, emask & cmask1);
			ddst++;
			i += 8;
		}

#pragma pipeloop(0)
		for (; i <= (dw - 8); i += 8) {
			sd0 = sd1;
			sd1 = ddst[1];
			sd00 = vis_faligndata(sd0, sd1);
			vis_pst_8(sd00, dp++, cmask1);
			ddst++;
		}

		if (i < dw) {
			sd0 = vis_faligndata(sd1, ddst[1]);
			emask = vis_edge8(dp, dend);
			vis_pst_8(sd0, dp, emask & cmask1);
		}

		sa2 = sa2 + slb;
		d_a += dlb;
	}

	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}
/*
 * 5x5 (non-separable) convolution over a MLIB_BYTE (u8), 4-channel
 * image, no edge processing.  MMX implementation: partial row sums are
 * kept in ten rotating buffers addressed through pbuff_arr; after each
 * output row the pointer table is permuted (rather than copying data)
 * and pbuff_arr flips between the two halves of buff_arr.
 *
 * Fix vs. previous revision: the mlib_malloc result is now checked
 * before use instead of being dereferenced unconditionally.
 */
mlib_status
mlib_m_conv5x5_8nw_4(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
	__m64 *pbuff, *buff_arr[20], **pbuff_arr = buff_arr;
	__m64 *buff0, *buff1, *buff2, *buff3;

	GET_SRC_DST_PARAMETERS(mlib_u8);

	__m64 ker[5][5];
	__m64 s0, d0, d1, d2, d3, d4, prev0, prev1, prev2, prev3, aa, bb, cc;
	__m64 sum0, sum1, sum2, sum3, sum4, res_hi, res_lo;
	__m64 zero = _m_zero;
	mlib_s32 shift, ind;
	mlib_s32 *sp;
	mlib_s32 row, wid4, i, j;

	/* no-write edge: output shrinks by (KSIZE - 1) in each dimension */
	width -= (KSIZE - 1);
	height -= (KSIZE - 1);
	width *= NCHAN;
	dl += ((KSIZE - 1) / 2) * (dll + NCHAN);

	wid4 = (width + 7) / 4;
	pbuff = mlib_malloc(sizeof (__m64) * 10 * wid4);

	if (pbuff == NULL)
		return (MLIB_FAILURE);

	GET_KERN();

	for (i = 0; i < 10; i++) {
		buff_arr[i] = pbuff + i * wid4;
	}

	ind = 0;
	/* pre-fold the first four source rows; each pass fills j buffers */
	for (j = 1; j <= 4; j++) {
		buff0 = buff_arr[ind];
		buff1 = buff_arr[ind + 1];
		buff2 = buff_arr[ind + 2];
		buff3 = buff_arr[ind + 3];

		sp = (mlib_s32 *)sl;

		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d1, lo);
		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d2, lo);
		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d3, lo);
		*(mlib_s32 *)&s0 = (*sp++);
		UNPACK_SRC(d4, lo);

		for (i = 0; i < wid4; i++) {
			*(mlib_s32 *)&s0 = sp[i];
			PREP_5x5();
		}

		sl += sll;
		ind += j;
	}

	for (row = 0; row < height; row++) {
		__m64 *sp = (__m64 *) sl;
		__m64 *dp = (__m64 *) dl;

		/* the four partial-sum rows feeding this output row */
		buff0 = pbuff_arr[0];
		buff1 = pbuff_arr[2];
		buff2 = pbuff_arr[5];
		buff3 = pbuff_arr[9];

		s0 = (*sp++);
		UNPACK_SRC(d1, lo);
		UNPACK_SRC(d2, hi);
		s0 = (*sp++);
		UNPACK_SRC(d3, lo);
		UNPACK_SRC(d4, hi);

		for (i = 0; i < width / 8; i++) {
			s0 = sp[i];
			CONV_5x5(lo, 2 * i);
			CONV_5x5(hi, 2 * i + 1);
			dp[i] = _mm_packs_pu16(res_lo, res_hi);
		}

		/* partial last word: merge under a byte mask */
		if (width & 7) {
			__m64 mask;

			mask = ((__m64 *) mlib_mask64_arr)[width & 7];
			s0 = sp[i];
			CONV_5x5(lo, 2 * i);
			CONV_5x5(hi, 2 * i + 1);
			res_hi = _mm_packs_pu16(res_lo, res_hi);

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, res_hi),
			    _mm_andnot_si64(mask, dp[i]));
		}

		/*
		 * Permute the buffer-pointer table into the other half of
		 * buff_arr, then flip pbuff_arr to that half; this rotates
		 * the partial-sum rows without copying any pixel data.
		 */
		ind = (pbuff_arr == buff_arr) ? 10 : -10;
		pbuff_arr[ind + 0] = pbuff_arr[1];
		pbuff_arr[ind + 1] = pbuff_arr[3];
		pbuff_arr[ind + 2] = pbuff_arr[4];
		pbuff_arr[ind + 3] = pbuff_arr[6];
		pbuff_arr[ind + 4] = pbuff_arr[7];
		pbuff_arr[ind + 5] = pbuff_arr[8];
		pbuff_arr[ind + 6] = pbuff_arr[0];
		pbuff_arr[ind + 7] = pbuff_arr[2];
		pbuff_arr[ind + 8] = pbuff_arr[5];
		pbuff_arr[ind + 9] = pbuff_arr[9];
		pbuff_arr += ind;

		sl += sll;
		dl += dll;
	}

	_mm_empty();
	mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
/*
 * 2x2 convolution over a MLIB_BYTE (u8) image WITH edge extension
 * (dx_l/dx_r/dy_t/dy_b give the left/right/top/bottom extension counts)
 * and a channel mask.  Portable C core: each selected channel is
 * processed independently through three rotating mlib_s32 row buffers;
 * the inner loop does the 4-tap multiply-accumulate in double precision
 * and packs via vis_fpack32.
 */
mlib_status
mlib_c_conv2x2ext_u8(
    mlib_image *dst,
    const mlib_image *src,
    mlib_s32 dx_l,
    mlib_s32 dx_r,
    mlib_s32 dy_t,
    mlib_s32 dy_b,
    const mlib_s32 *kern,
    mlib_s32 scalef_expon,
    mlib_s32 cmask)
{
	/* on-stack buffer; heap is used only when the row is too wide */
	mlib_d64 buff_arr[4 * BUFF_LINE];
	mlib_s32 *pbuff = (mlib_s32 *)buff_arr, *buff0, *buff1, *buff2, *buffT;
	DTYPE *adr_src, *sl, *sp, *sl1;
	DTYPE *adr_dst, *dl, *dp;
	mlib_d64 k0, k1, k2, k3, scalef = 1.0;
	mlib_d64 p00, p01, p02, p10, p11, p12;
	mlib_s32 wid, hgt, sll, dll, wid1;
	mlib_s32 nchannel, chan1, chan2;
	mlib_s32 i, j, c, swid;

	LOAD_KERNEL_INTO_DOUBLE();
	GET_SRC_DST_PARAMETERS(DTYPE);

	/* fixed GSR scale for the vis_fpack32 packing below */
	vis_write_gsr(23 << 3);

	swid = wid + D_KER;

	/* round the buffer row up to an even count for 64-bit loads */
	wid1 = (swid + 1) & ~1;

	if (wid1 > BUFF_LINE) {
		pbuff = __mlib_malloc(4 * sizeof (mlib_s32) * wid1);

		if (pbuff == NULL)
			return (MLIB_FAILURE);
	}

	buff0 = pbuff;
	buff1 = buff0 + wid1;
	buff2 = buff1 + wid1;

	chan1 = nchannel;
	chan2 = chan1 + chan1;

	swid -= dx_r;

	for (c = 0; c < nchannel; c++) {
		/* skip channels deselected by cmask (MSB = first channel) */
		if (!(cmask & (1 << (nchannel - 1 - c))))
			continue;

		sl = adr_src + c;
		dl = adr_dst + c;

		/* second row; clamp to the first when bottom edge is extended */
		if ((hgt - dy_b) > 0)
			sl1 = sl + sll;
		else
			sl1 = sl;

#pragma pipeloop(0)
		for (i = 0; i < swid; i++) {
			buff0[i - 1] = (mlib_s32)sl[i * chan1];
			buff1[i - 1] = (mlib_s32)sl1[i * chan1];
		}

		/* right-edge extension: duplicate the last column */
		if (dx_r != 0) {
			buff0[swid - 1] = buff0[swid - 2];
			buff1[swid - 1] = buff1[swid - 2];
		}

		if ((hgt - dy_b) > 1)
			sl = sl1 + sll;
		else
			sl = sl1;

		for (j = 0; j < hgt; j++) {
			sp = sl;
			dp = dl;

			buff2[-1] = (mlib_s32)sp[0];
			sp += chan1;

			p02 = buff0[-1];
			p12 = buff1[-1];

#pragma pipeloop(0)
			for (i = 0; i <= (wid - 2); i += 2) {
				d64_2x32 sd0, sd1;
				d64_2x32 dd0, dd1;

				p00 = p02;
				p10 = p12;

				sd0.d64 = *(TYPE_64BIT *) (buff0 + i);
				sd1.d64 = *(TYPE_64BIT *) (buff1 + i);
				p01 = (mlib_d64)sd0.i32s.i0;
				p02 = (mlib_d64)sd0.i32s.i1;
				p11 = (mlib_d64)sd1.i32s.i0;
				p12 = (mlib_d64)sd1.i32s.i1;

				LOAD_BUFF(buff2);

				dd0.i32s.i0 =
				    CLAMP_S32(p00 * k0 + p01 * k1 + p10 * k2 +
				    p11 * k3);
				dd0.i32s.i1 =
				    CLAMP_S32(p01 * k0 + p02 * k1 + p11 * k2 +
				    p12 * k3);

				/*
				 * NOTE(review): dd1.d64 is read here before
				 * any assignment; presumably only the bytes
				 * produced by vis_fpack32 from dd0 are
				 * consumed by STORE2 — verify against the
				 * STORE2 macro definition.
				 */
				dd1.d64 = vis_fpack32(dd1.d64, dd0.d64);
				STORE2(dd1.i32s.i0, dd1.i32s.i1);

				sp += chan2;
				dp += chan2;
			}

			/* scalar tail for an odd remaining column */
			for (; i < wid; i++) {
				d64_2x32 dd0, dd1;

				p00 = buff0[i - 1];
				p10 = buff1[i - 1];
				p01 = buff0[i];
				p11 = buff1[i];

				buff2[i] = (mlib_s32)sp[0];

				dd0.i32s.i1 =
				    CLAMP_S32(p00 * k0 + p01 * k1 + p10 * k2 +
				    p11 * k3);
				dd1.d64 = vis_fpack32(dd1.d64, dd0.d64);
				dp[0] = dd1.i32s.i1;

				sp += chan1;
				dp += chan1;
			}

			if (dx_r != 0)
				buff2[swid - 1] = buff2[swid - 2];

			/* stop advancing the source once inside the bottom extension */
			if (j < hgt - dy_b - 2)
				sl += sll;
			dl += dll;

			/* rotate the three row buffers */
			buffT = buff0;
			buff0 = buff1;
			buff1 = buff2;
			buff2 = buffT;
		}
	}

	if (pbuff != (mlib_s32 *)buff_arr)
		__mlib_free(pbuff);

	return (MLIB_SUCCESS);
}
/*
 * 5x5 convolution over a MLIB_BYTE (u8), 4-channel image, no edge
 * processing.  SPARC VIS implementation: five source rows are staged in
 * intermediate buffers; the work is split into two passes — rows 1..3
 * accumulate into the intermediate dst buffer (dbuf), then rows 4..5 add
 * their taps, pack the result, and it is copied out with edge masking.
 */
mlib_status
mlib_v_conv5x5_8nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kernel,
    mlib_s32 scalef_expon)
{
	/* pointers to dst row */
	mlib_u8 *da, *d_a;

	/* pointers to src, dst data */
	mlib_u8 *adr_dst, *adr_src, *dend;

	/* pointers to src rows */
	mlib_u8 *sa, *sa1, *sa2, *sa3, *sa4;

	/* pointers to rows in interm. src buf */
	mlib_d64 *buff_src, *sbuf1, *sbuf2, *prow;

	/* pointers to rows in interm. src buf */
	mlib_d64 *sbuf3, *sbuf4, *sbuf5;

	/* pointer to row in interm. dst buf */
	mlib_d64 *dbuf, *dbuf1;

	/* mlib_d64 pointers to rows in interm. src buf */
	mlib_d64 *s1, *s2, *s3, *s4, *s5;

	/* mlib_d64 pointer to row in interm. dst buf */
	mlib_d64 *ddst;

	/* data */
	mlib_d64 d1, d2, d3, d4, d5;

	/* data */
	mlib_d64 d11, d12, d13, d14, d15;

	/* data */
	mlib_d64 d21, d22, d23, d24, d25;

	/* data */
	mlib_d64 dt_1, dt_2, dt_3, dt_4, dt_5;

	/* the 25 kernel coefficients packed in pairs for VIS multiplies */
	mlib_f32 k1k2, k3k4, k5k6, k7k8;
	mlib_f32 k9k10, k11k12, k13k14, k15k16;
	mlib_f32 k17k18, k19k20, k21k22, k23k24, k25;

	/* src, dst and interm. buf. strides */
	mlib_s32 dlb, slb, buf_slb;
	mlib_s32 dh, dw;
	mlib_d64 out0, out1;
	mlib_d64 tmp0, tmp1, rnd;
	mlib_d64 *dsa, *dp;
	mlib_d64 sd0, sd1;
	mlib_s32 emask;
	mlib_s32 rval, gsr_scale, i, j;

	gsr_scale = 31 - scalef_expon;
	vis_write_gsr((gsr_scale << 3));
	/* rounding constant matched to the fpack16 scale */
	rval = mlib_round_8[gsr_scale];
	rnd = vis_freg_pair(vis_to_float(rval), vis_to_float(rval));

	GET_SRC_DST_PARAMETERS();
	LOAD_KERNEL_INTO_FLOAT();

	buf_slb = (4 * dw + 24) >> 3;
	PREPARE_INTERM_BUFFERS();

	/* no-write edge: output is (dw - 4) x (dh - 4); dw in bytes (x4 channels) */
	dw -= 4;
	dw *= 4;
	dh -= 4;

	sa = adr_src;
	sa1 = sa + slb;
	sa2 = sa1 + slb;
	sa3 = sa2 + slb;
	sa4 = sa3 + slb;
	/* destination starts two rows down, two pixels (8 bytes) right */
	d_a = adr_dst + 2 * dlb + 8;

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf2, sa);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf3, sa1);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf4, sa2);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

/* load interm. src buff */
	PREPARE_TO_LOAD_LINE(sbuf5, sa3);
#pragma pipeloop(0)
	LOAD_LINE_INTO_BUFFER(16);

#pragma pipeloop(0)
	for (j = 0; j < dh; j++) {
		LOOP_INI();

		/* prefetch the next source row into the rotating buffers */
		PREPARE_TO_LOAD_LINE(sbuf5, sa4);
#pragma pipeloop(0)
		LOAD_LINE_INTO_BUFFER_NF(16);

		/* align to 4-byte (one pixel) offset for faligndata below */
		vis_alignaddr(s1, 4);

		/* pass 1: rows 1..3 accumulate into the interm. dst buffer */
		dbuf1 = dbuf;
		d1 = *s1;
		d2 = *s2;
		d3 = *s3;
		d11 = *(s1 + 1);
		d12 = *(s2 + 1);
		d13 = *(s3 + 1);

#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d21 = *(s1 + 2);
			d22 = *(s2 + 2);
			d23 = *(s3 + 2);
			out0 = out1 = rnd;
			CONV_AU(d1, k1k2);
			CONV_AL(d2, k5k6);
			CONV_AU(d3, k11k12);
			dt_1 = vis_faligndata(d1, d11);
			dt_2 = vis_faligndata(d2, d12);
			dt_3 = vis_faligndata(d3, d13);
			CONV_AL(dt_1, k1k2);
			CONV_AU(dt_2, k7k8);
			CONV_AL(dt_3, k11k12);
			CONV_AU(d11, k3k4);
			CONV_AL(d12, k7k8);
			CONV_AU(d13, k13k14);
			dt_1 = vis_faligndata(d11, d21);
			dt_2 = vis_faligndata(d12, d22);
			dt_3 = vis_faligndata(d13, d23);
			CONV_AL(dt_1, k3k4);
			CONV_AU(dt_2, k9k10);
			CONV_AL(dt_3, k13k14);
			CONV_AU(d21, k5k6);
			CONV_AL(d22, k9k10);
			CONV_AU(d23, k15k16);
			/* store the partial sums for pass 2 */
			dbuf1[0] = out0;
			dbuf1[1] = out1;
			dbuf1 += 2;
			d1 = d11;
			d2 = d12;
			d3 = d13;
			d11 = d21;
			d12 = d22;
			d13 = d23;
			s1++;
			s2++;
			s3++;
		}

		/* pass 2: rows 4..5 finish the sums and pack the pixels */
		dbuf1 = dbuf;
		d4 = *s4;
		d5 = *s5;
		d14 = *(s4 + 1);
		d15 = *(s5 + 1);

#pragma pipeloop(0)
		for (i = 0; i < dw; i += 8) {
			d24 = *(s4 + 2);
			d25 = *(s5 + 2);
			out0 = dbuf1[0];
			out1 = dbuf1[1];
			CONV_AL(d4, k15k16);
			CONV_AU(d5, k21k22);
			dt_4 = vis_faligndata(d4, d14);
			dt_5 = vis_faligndata(d5, d15);
			CONV_AU(dt_4, k17k18);
			CONV_AL(dt_5, k21k22);
			CONV_AL(d14, k17k18);
			CONV_AU(d15, k23k24);
			dt_4 = vis_faligndata(d14, d24);
			dt_5 = vis_faligndata(d15, d25);
			CONV_AU(dt_4, k19k20);
			CONV_AL(dt_5, k23k24);
			CONV_AL(d24, k19k20);
			CONV_AU(d25, k25);
			dbuf1 += 2;
			(*ddst++) = vis_fpack16_pair(out0, out1);
			d4 = d14;
			d5 = d15;
			d14 = d24;
			d15 = d25;
			s4++;
			s5++;
		}

		PREPARE_TO_COPY_INTERM_BUF_TO_DST();

#pragma pipeloop(0)
		COPY_INTERM_BUF_TO_DST();
		COPY_TAIL();

		sa4 = sa4 + slb;
		d_a += dlb;
	}

	__mlib_free(buff_src);
	return (MLIB_SUCCESS);
}
/*
 * 3x3 (non-separable) convolution over a MLIB_SHORT (s16), 4-channel
 * image, no edge processing.  MMX implementation: partial row sums are
 * kept in three rotating buffers (two words — hi/lo halves — per 4-pixel
 * group), pre-filled from the first two source rows.
 *
 * Fix vs. previous revision: when the row is wider than the on-stack
 * buffer, the mlib_malloc result is now checked before use instead of
 * being dereferenced unconditionally.
 */
mlib_status
mlib_m_conv3x3_16nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
	/* on-stack buffer; heap is used only when the row is too wide */
	__m64 buff_loc[6 * BUFF_LINE], *pbuff = buff_loc;
	__m64 *buff0, *buff1, *buff2, *buffT;

	GET_SRC_DST_PARAMETERS(mlib_s16);

	__m64 ker1, ker2, ker3, ker4, ker5, ker6, ker7, ker8, ker9;
	__m64 d0, d1, d2, rr, tmpa, tmpb;
	__m64 prev0h, prev1h, sum0h, sum1h, sum2h, tmph;
	__m64 prev0l, prev1l, sum0l, sum1l, sum2l, tmpl;
	__m64 *sp, *dp;
	mlib_s32 shift;
	mlib_s32 row, wid4, i, j;

	/* no-write edge: output shrinks by 2; width in s16 units (x4 channels) */
	width -= 2;
	height -= 2;
	width *= NCHAN;
	dl += dll + NCHAN;

	wid4 = (width + 3) / 4;

	if (wid4 > BUFF_LINE) {
		pbuff = mlib_malloc(sizeof (__m64) * 6 * wid4);

		if (pbuff == NULL)
			return (MLIB_FAILURE);
	}

	GET_KERN();

	buff0 = pbuff;
	buff1 = buff0 + 2 * wid4;
	buff2 = buff1 + 2 * wid4;

	/* pre-fold the first two source rows into the row buffers */
	for (j = 0; j < 2; j++) {
		sp = (__m64 *) sl;

		d1 = (*sp++);
		d2 = (*sp++);
		for (i = 0; i < wid4; i++) {
			PREP_3x3(i);
		}

		sl += sll;

		if (j == 0) {
			buffT = buff1;
			buff1 = buff0;
			buff0 = buffT;
		}
	}

	for (row = 0; row < height; row++) {
		sp = (__m64 *) sl;
		dp = (__m64 *) dl;

		d1 = (*sp++);
		d2 = (*sp++);
		for (i = 0; i < width / 4; i++) {
			CONV_3x3(i);
			dp[i] = rr;
		}

		/* partial last word: merge under a 16-bit-lane mask */
		if (width & 3) {
			__m64 mask =
			    ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

			CONV_3x3(i);

			dp[i] =
			    _mm_or_si64(_mm_and_si64(mask, rr),
			    _mm_andnot_si64(mask, dp[i]));
		}

		/* rotate the row buffers: oldest becomes newest */
		buffT = buff1;
		buff1 = buff0;
		buff0 = buffT;

		sl += sll;
		dl += dll;
	}

	_mm_empty();

	if (pbuff != buff_loc)
		mlib_free(pbuff);

	return (MLIB_SUCCESS);
}