mlib_status
mlib_m_sconv3x3_16nw_1(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
    GET_SRC_DST_PARAMETERS(mlib_s16);
    __m64 hker0, hker1, hker2, vker0, vker1, vker2;
    __m64 s0, s1, s2, v0, v1, aa, bb, rr, rh, rl;
    __m64 *sp0, *sp1, *sp2, *dp;
    __m64 zero, _rnd;
    mlib_s32 shift, kerh_sum;
    mlib_s32 i, j;

    width -= 2;
    height -= 2;
    width *= NCHAN;
    dl += dll + NCHAN;

    GET_KERN();

    zero = _mm_setzero_si64();

    for (j = 0; j < height; j++) {
        sp0 = (__m64 *) sl;
        sp1 = (__m64 *) (sl + sll);
        sp2 = (__m64 *) (sl + 2 * sll);
        dp = (__m64 *) dl;

        PREP_V();

        for (i = 0; i < width / 4; i++) {
            CONV_3x3();
            dp[i] = rr;
        }

        if (width & 3) {
            __m64 mask = ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

            CONV_3x3();
            dp[i] = _mm_or_si64(_mm_and_si64(mask, rr),
                _mm_andnot_si64(mask, dp[i]));
        }

        sl += sll;
        dl += dll;
    }

    _mm_empty();
    return (MLIB_SUCCESS);
}
void pix_compare :: processYUV_MMX(imageStruct &image, imageStruct &right)
{
    long datasize = image.xsize * image.ysize * image.csize;
    datasize = datasize / sizeof(__m64) + (datasize % sizeof(__m64) != 0);

    __m64 *leftPix  = (__m64*)image.data;
    __m64 *rightPix = (__m64*)right.data;

    __m64 l, r, b;
    __m64 mask = _mm_setr_pi8((unsigned char)0x00, (unsigned char)0xFF,
                              (unsigned char)0x00, (unsigned char)0xFF,
                              (unsigned char)0x00, (unsigned char)0xFF,
                              (unsigned char)0x00, (unsigned char)0xFF);
    __m64 zeros = _mm_set1_pi8((unsigned char)0x00);

    // format is U Y V Y
    if (m_direction) {
        while (datasize--) {
            l = leftPix[datasize];
            r = rightPix[datasize];
            b = _mm_subs_pu8(l, r);
            b = _mm_and_si64(b, mask);
            b = _mm_cmpeq_pi32(b, zeros);
            r = _mm_and_si64(r, b);
            l = _mm_andnot_si64(b, l);
            leftPix[datasize] = _mm_or_si64(l, r);
        }
    } else {
        while (datasize--) {
            l = leftPix[datasize];
            r = rightPix[datasize];
            b = _mm_subs_pu8(r, l);
            b = _mm_and_si64(b, mask);
            b = _mm_cmpeq_pi32(b, zeros);
            r = _mm_and_si64(r, b);
            l = _mm_andnot_si64(b, l);
            leftPix[datasize] = _mm_or_si64(l, r);
        }
    }
    _mm_empty();
}
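Nearly every snippet in this section ends with the same pand/pandn/por triple: result = (mask & a) | (~mask & b), a branchless per-lane select driven by a comparison mask. A minimal standalone sketch of that idiom follows; mmx_select() is a name invented here for illustration, not part of Gem or mediaLib. Build with -mmmx on x86.

#include <mmintrin.h>
#include <stdio.h>

/* Branchless select: picks lanes of a where the mask is all-ones,
 * lanes of b where it is all-zeros. */
static __m64 mmx_select(__m64 mask, __m64 a, __m64 b)
{
    return _mm_or_si64(_mm_and_si64(mask, a), _mm_andnot_si64(mask, b));
}

int main(void)
{
    __m64 a = _mm_set_pi16(1, 2, 3, 4);   /* lanes listed high to low */
    __m64 b = _mm_set_pi16(5, 6, 7, 8);
    __m64 m = _mm_cmpgt_pi16(a, b);       /* all-zero here: no lane of a wins */
    __m64 r = mmx_select(m, a, b);        /* so r == b */
    short out[4];

    *(__m64 *)out = r;
    _mm_empty();                          /* leave MMX state before the FPU/printf */
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  /* 8 7 6 5 */
    return 0;
}

Later ISAs added a single instruction for this (pblendvb in SSE4.1); on MMX the three-instruction pattern is the canonical form, which is why it recurs in every tail-masking store below.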
void pix_background :: processRGBAMMX(imageStruct &image)
{
    long i, pixsize;
    pixsize = image.xsize * image.ysize * image.csize;

    if (m_savedImage.xsize != image.xsize ||
        m_savedImage.ysize != image.ysize ||
        m_savedImage.format != image.format)
        m_reset = 1;

    m_savedImage.xsize = image.xsize;
    m_savedImage.ysize = image.ysize;
    m_savedImage.setCsizeByFormat(image.format);
    m_savedImage.reallocate();

    if (m_reset) {
        memcpy(m_savedImage.data, image.data, pixsize);
    }
    m_reset = 0;

    i = pixsize / sizeof(__m64) + (pixsize % sizeof(__m64) != 0);

    __m64 *data  = (__m64*)image.data;
    __m64 *saved = (__m64*)m_savedImage.data;

    const __m64 thresh = _mm_set_pi8(m_Yrange, m_Urange, m_Vrange, m_Arange,
                                     m_Yrange, m_Urange, m_Vrange, m_Arange);
    const __m64 offset = _mm_set_pi8(1, 1, 1, 1, 1, 1, 1, 1);
    __m64 newpix, oldpix, m1;

    while (i--) {
        /* 7 ops, 3 memops */
        /* I have the feeling that this is not faster at all!
         * Even with the 3 memops + ONLY 1 _mm_subs_pu8()
         * it is as slow as the generic code;
         * adding the other instructions does not change much. */
        newpix = *data;
        oldpix = *saved++;
        m1     = newpix;
        m1     = _mm_subs_pu8(m1, oldpix);
        oldpix = _mm_subs_pu8(oldpix, newpix);
        m1     = _mm_or_si64(m1, oldpix);                  // |oldpix-newpix|
        m1     = _mm_adds_pu8(m1, offset);
        m1     = _mm_subs_pu8(m1, thresh);
        m1     = _mm_cmpeq_pi32(m1, _mm_setzero_si64());   // |oldpix-newpix|>thresh
        m1     = _mm_andnot_si64(m1, newpix);
        *data++ = m1;
    }
    _mm_empty();
}
mlib_status
mlib_ImageMinFilter7x7_S16(
    void *dst,
    void *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
#endif /* MAX_FILTER */
{
    mlib_u8 *pbuff, *buff0, *buff1, *buff2, *buff3, *buff4, *buff5, *buffT;
    mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *sp6, *sp7, *dl;
    __m64 *dp0, *dp1;
    __m64 aa, bb, cc, dd, ee, ff, r0, r1;
    __m64 g0, g1, g2, g3, g4, g5, g6, gg;
    __m64 h0, h1, h2, h3, h4, h5, h6, hh;
    __m64 e_mask;
    mlib_s32 i, j, wid8, tail;

    wid = (wid - KSIZE1) * SSIZE;
    wid8 = (wid + 7) & ~7;
    pbuff = mlib_malloc(KSIZE1 * wid8);
    buff0 = pbuff;
    buff1 = buff0 + wid8;
    buff2 = buff1 + wid8;
    buff3 = buff2 + wid8;
    buff4 = buff3 + wid8;
    buff5 = buff4 + wid8;

    sl = (mlib_u8 *)src;
    dl = (mlib_u8 *)dst + (KSIZE1 / 2) * (dlb + SSIZE);

    tail = wid & 7;
    e_mask = ((__m64 *) mlib_mask64_arr)[tail];

    for (j = 0; j < 3; j++) {
        sp0 = buff4;
        sp1 = buff5;
        sp6 = sl;
        sp7 = sl + slb;
        sl += 2 * slb;

        for (i = 0; i < wid; i += 8) {
            g0 = *(__m64 *) sp6;
            g1 = *(__m64 *) (sp6 + SSIZE);
            g2 = *(__m64 *) (sp6 + 2 * SSIZE);
            g3 = *(__m64 *) (sp6 + 3 * SSIZE);
            g4 = *(__m64 *) (sp6 + 4 * SSIZE);
            g5 = *(__m64 *) (sp6 + 5 * SSIZE);
            g6 = *(__m64 *) (sp6 + 6 * SSIZE);
            h0 = *(__m64 *) sp7;
            h1 = *(__m64 *) (sp7 + SSIZE);
            h2 = *(__m64 *) (sp7 + 2 * SSIZE);
            h3 = *(__m64 *) (sp7 + 3 * SSIZE);
            h4 = *(__m64 *) (sp7 + 4 * SSIZE);
            h5 = *(__m64 *) (sp7 + 5 * SSIZE);
            h6 = *(__m64 *) (sp7 + 6 * SSIZE);

            gg = C_COMP(g0, g1);
            hh = C_COMP(h0, h1);
            g2 = C_COMP(g2, g3);
            h2 = C_COMP(h2, h3);
            g4 = C_COMP(g4, g5);
            h4 = C_COMP(h4, h5);
            gg = C_COMP(gg, g2);
            hh = C_COMP(hh, h2);
            gg = C_COMP(gg, g4);
            hh = C_COMP(hh, h4);
            gg = C_COMP(gg, g6);
            hh = C_COMP(hh, h6);

            *(__m64 *) sp0 = gg;
            *(__m64 *) sp1 = hh;

            sp0 += 8; sp1 += 8; sp6 += 8; sp7 += 8;
        }

        if (j < 2) {
            buffT = buff0; buff0 = buff2; buff2 = buff4; buff4 = buffT;
            buffT = buff1; buff1 = buff3; buff3 = buff5; buff5 = buffT;
        }
    }

    for (j = 0; j <= (hgt - KSIZE1 - 2); j += 2) {
        dp0 = (void *)dl;
        dp1 = (void *)(dl + dlb);
        sp0 = buff0;
        sp1 = buff1;
        sp2 = buff2;
        sp3 = buff3;
        sp4 = buff4;
        sp5 = buff5;
        sp6 = sl;
        sp7 = sl + slb;

        /*
         * line0: aa
         * line1: bb
         * line2: cc
         * line3: dd
         * line4: ee
         * line5: ff
         * line6: g0 g1 g2 g3 g4 g5 g6
         * line7: h0 h1 h2 h3 h4 h5 h6
         */
        for (i = 0; i <= wid - 8; i += 8) {
            g0 = *(__m64 *) sp6;
            g1 = *(__m64 *) (sp6 + SSIZE);
            g2 = *(__m64 *) (sp6 + 2 * SSIZE);
            g3 = *(__m64 *) (sp6 + 3 * SSIZE);
            g4 = *(__m64 *) (sp6 + 4 * SSIZE);
            g5 = *(__m64 *) (sp6 + 5 * SSIZE);
            g6 = *(__m64 *) (sp6 + 6 * SSIZE);
            h0 = *(__m64 *) sp7;
            h1 = *(__m64 *) (sp7 + SSIZE);
            h2 = *(__m64 *) (sp7 + 2 * SSIZE);
            h3 = *(__m64 *) (sp7 + 3 * SSIZE);
            h4 = *(__m64 *) (sp7 + 4 * SSIZE);
            h5 = *(__m64 *) (sp7 + 5 * SSIZE);
            h6 = *(__m64 *) (sp7 + 6 * SSIZE);

            gg = C_COMP(g0, g1);
            hh = C_COMP(h0, h1);
            g2 = C_COMP(g2, g3);
            h2 = C_COMP(h2, h3);
            g4 = C_COMP(g4, g5);
            h4 = C_COMP(h4, h5);
            gg = C_COMP(gg, g2);
            hh = C_COMP(hh, h2);
            gg = C_COMP(gg, g4);
            hh = C_COMP(hh, h4);
            gg = C_COMP(gg, g6);
            hh = C_COMP(hh, h6);

            aa = *(__m64 *) sp0;
            bb = *(__m64 *) sp1;
            cc = *(__m64 *) sp2;
            dd = *(__m64 *) sp3;
            ee = *(__m64 *) sp4;
            ff = *(__m64 *) sp5;

            bb = C_COMP(bb, cc);
            dd = C_COMP(dd, ee);
            ff = C_COMP(ff, gg);
            bb = C_COMP(bb, dd);
            bb = C_COMP(bb, ff);

            r0 = C_COMP(aa, bb);
            r1 = C_COMP(bb, hh);

            *(__m64 *) sp0 = gg;
            *(__m64 *) sp1 = hh;

            (*dp0++) = r0;
            (*dp1++) = r1;

            sp0 += 8; sp1 += 8; sp2 += 8; sp3 += 8;
            sp4 += 8; sp5 += 8; sp6 += 8; sp7 += 8;
        }

        if (tail) {
            g0 = *(__m64 *) sp6;
            g1 = *(__m64 *) (sp6 + SSIZE);
            g2 = *(__m64 *) (sp6 + 2 * SSIZE);
            g3 = *(__m64 *) (sp6 + 3 * SSIZE);
            g4 = *(__m64 *) (sp6 + 4 * SSIZE);
            g5 = *(__m64 *) (sp6 + 5 * SSIZE);
            g6 = *(__m64 *) (sp6 + 6 * SSIZE);
            h0 = *(__m64 *) sp7;
            h1 = *(__m64 *) (sp7 + SSIZE);
            h2 = *(__m64 *) (sp7 + 2 * SSIZE);
            h3 = *(__m64 *) (sp7 + 3 * SSIZE);
            h4 = *(__m64 *) (sp7 + 4 * SSIZE);
            h5 = *(__m64 *) (sp7 + 5 * SSIZE);
            h6 = *(__m64 *) (sp7 + 6 * SSIZE);

            gg = C_COMP(g0, g1);
            hh = C_COMP(h0, h1);
            g2 = C_COMP(g2, g3);
            h2 = C_COMP(h2, h3);
            g4 = C_COMP(g4, g5);
            h4 = C_COMP(h4, h5);
            gg = C_COMP(gg, g2);
            hh = C_COMP(hh, h2);
            gg = C_COMP(gg, g4);
            hh = C_COMP(hh, h4);
            gg = C_COMP(gg, g6);
            hh = C_COMP(hh, h6);

            aa = *(__m64 *) sp0;
            bb = *(__m64 *) sp1;
            cc = *(__m64 *) sp2;
            dd = *(__m64 *) sp3;
            ee = *(__m64 *) sp4;
            ff = *(__m64 *) sp5;

            bb = C_COMP(bb, cc);
            dd = C_COMP(dd, ee);
            ff = C_COMP(ff, gg);
            bb = C_COMP(bb, dd);
            bb = C_COMP(bb, ff);

            r0 = C_COMP(aa, bb);
            r1 = C_COMP(bb, hh);

            *(__m64 *) sp0 = gg;
            *(__m64 *) sp1 = hh;

            *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0),
                _mm_andnot_si64(e_mask, *dp0));
            *dp1 = _mm_or_si64(_mm_and_si64(e_mask, r1),
                _mm_andnot_si64(e_mask, *dp1));
        }

        buffT = buff0; buff0 = buff2; buff2 = buff4; buff4 = buffT;
        buffT = buff1; buff1 = buff3; buff3 = buff5; buff5 = buffT;

        sl += 2 * slb;
        dl += 2 * dlb;
    }

    /* last line */
    if (j == (hgt - KSIZE1 - 1)) {
        dp0 = (void *)dl;
        dp1 = (void *)(dl + dlb);
        sp0 = buff0;
        sp1 = buff1;
        sp2 = buff2;
        sp3 = buff3;
        sp4 = buff4;
        sp5 = buff5;
        sp6 = sl;

        for (i = 0; i <= wid - 8; i += 8) {
            g0 = *(__m64 *) sp6;
            g1 = *(__m64 *) (sp6 + SSIZE);
            g2 = *(__m64 *) (sp6 + 2 * SSIZE);
            g3 = *(__m64 *) (sp6 + 3 * SSIZE);
            g4 = *(__m64 *) (sp6 + 4 * SSIZE);
            g5 = *(__m64 *) (sp6 + 5 * SSIZE);
            g6 = *(__m64 *) (sp6 + 6 * SSIZE);

            gg = C_COMP(g0, g1);
            g2 = C_COMP(g2, g3);
            g4 = C_COMP(g4, g5);
            gg = C_COMP(gg, g2);
            gg = C_COMP(gg, g4);
            gg = C_COMP(gg, g6);

            aa = *(__m64 *) sp0;
            bb = *(__m64 *) sp1;
            cc = *(__m64 *) sp2;
            dd = *(__m64 *) sp3;
            ee = *(__m64 *) sp4;
            ff = *(__m64 *) sp5;

            bb = C_COMP(bb, cc);
            dd = C_COMP(dd, ee);
            ff = C_COMP(ff, gg);
            bb = C_COMP(bb, dd);
            bb = C_COMP(bb, ff);

            r0 = C_COMP(aa, bb);

            (*dp0++) = r0;

            sp0 += 8; sp1 += 8; sp2 += 8; sp3 += 8;
            sp4 += 8; sp5 += 8; sp6 += 8;
        }

        if (tail) {
            g0 = *(__m64 *) sp6;
            g1 = *(__m64 *) (sp6 + SSIZE);
            g2 = *(__m64 *) (sp6 + 2 * SSIZE);
            g3 = *(__m64 *) (sp6 + 3 * SSIZE);
            g4 = *(__m64 *) (sp6 + 4 * SSIZE);
            g5 = *(__m64 *) (sp6 + 5 * SSIZE);
            g6 = *(__m64 *) (sp6 + 6 * SSIZE);

            gg = C_COMP(g0, g1);
            g2 = C_COMP(g2, g3);
            g4 = C_COMP(g4, g5);
            gg = C_COMP(gg, g2);
            gg = C_COMP(gg, g4);
            gg = C_COMP(gg, g6);

            aa = *(__m64 *) sp0;
            bb = *(__m64 *) sp1;
            cc = *(__m64 *) sp2;
            dd = *(__m64 *) sp3;
            ee = *(__m64 *) sp4;
            ff = *(__m64 *) sp5;

            bb = C_COMP(bb, cc);
            dd = C_COMP(dd, ee);
            ff = C_COMP(ff, gg);
            bb = C_COMP(bb, dd);
            bb = C_COMP(bb, ff);

            r0 = C_COMP(aa, bb);

            *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0),
                _mm_andnot_si64(e_mask, *dp0));
        }
    }

    _mm_empty();
    mlib_free(pbuff);
    return (MLIB_SUCCESS);
}
mlib_status
mlib_m_conv5x5_u16nw_2(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
    __m64 *pbuff, *buff_arr[20], **pbuff_arr = buff_arr;
    __m64 *buff0, *buff1, *buff2, *buff3;
    GET_SRC_DST_PARAMETERS(mlib_s16);
    __m64 ker[5][5];
    __m64 d0, d1, d2, aa, bb, rr, tmpa, tmpb, ker_off, mask8000;
    __m64 prev0h, prev1h, prev2h, prev3h, sum0h, sum1h, sum2h, sum3h, sum4h, tmph;
    __m64 prev0l, prev1l, prev2l, prev3l, sum0l, sum1l, sum2l, sum3l, sum4l, tmpl;
    __m64 *sp, *dp;
    mlib_s32 shift, ind, ker_sum = 0;
    mlib_s32 row, wid4, i, j;

    width -= 4;
    height -= 4;
    width *= NCHAN;
    dl += 2 * (dll + NCHAN);

    wid4 = (width + 7) / 4;
    pbuff = mlib_malloc(sizeof (__m64) * 20 * wid4);

    GET_KERN();

    for (i = 0; i < 10; i++) {
        buff_arr[i] = pbuff + i * 2 * wid4;
    }

    ind = 0;
    for (j = 1; j <= 4; j++) {
        buff0 = buff_arr[ind];
        buff1 = buff_arr[ind + 1];
        buff2 = buff_arr[ind + 2];
        buff3 = buff_arr[ind + 3];

        sp = (__m64 *) sl;
        d1 = (*sp++);
        d1 = _mm_xor_si64(d1, mask8000);
        d2 = (*sp++);
        d2 = _mm_xor_si64(d2, mask8000);

        for (i = 0; i < wid4; i++) {
            PREP_5x5();
        }

        sl += sll;
        ind += j;
    }

    for (row = 0; row < height; row++) {
        sp = (__m64 *) sl;
        dp = (__m64 *) dl;

        buff0 = pbuff_arr[0];
        buff1 = pbuff_arr[2];
        buff2 = pbuff_arr[5];
        buff3 = pbuff_arr[9];

        d1 = (*sp++);
        d1 = _mm_xor_si64(d1, mask8000);
        d2 = (*sp++);
        d2 = _mm_xor_si64(d2, mask8000);

        for (i = 0; i < width / 4; i++) {
            CONV_5x5(hi, i);
            dp[i] = rr;
        }

        if (width & 3) {
            __m64 mask = ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

            CONV_5x5(hi, i);
            dp[i] = _mm_or_si64(_mm_and_si64(mask, rr),
                _mm_andnot_si64(mask, dp[i]));
        }

        ind = (pbuff_arr == buff_arr) ? 10 : -10;
        pbuff_arr[ind + 0] = pbuff_arr[1];
        pbuff_arr[ind + 1] = pbuff_arr[3];
        pbuff_arr[ind + 2] = pbuff_arr[4];
        pbuff_arr[ind + 3] = pbuff_arr[6];
        pbuff_arr[ind + 4] = pbuff_arr[7];
        pbuff_arr[ind + 5] = pbuff_arr[8];
        pbuff_arr[ind + 6] = pbuff_arr[0];
        pbuff_arr[ind + 7] = pbuff_arr[2];
        pbuff_arr[ind + 8] = pbuff_arr[5];
        pbuff_arr[ind + 9] = pbuff_arr[9];
        pbuff_arr += ind;

        sl += sll;
        dl += dll;
    }

    _mm_empty();
    mlib_free(pbuff);
    return (MLIB_SUCCESS);
}
mlib_status
mlib_m_sconv5x5_8nw_2(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
    __m64 *pbuff, *buff_arr[5];
    __m64 *buff0, *buff1, *buff2, *buff3, *buff4, *buffT;
    GET_SRC_DST_PARAMETERS(mlib_u8);
    __m64 hker0, hker1, hker2, hker3, hker4;
    __m64 vker0, vker1, vker2, vker3, vker4;
    __m64 s0, d0, d1, d2, prev0;
    __m64 sum0, sum1, sum2, sum3, sum4, aa, bb, res_hi, res_lo;
    __m64 zero = _m_zero;
    mlib_s32 shift, ind = 0;    /* was uninitialized; only incremented below */
    mlib_s32 *sp;
    mlib_s32 row, wid4, i, j;

    width -= 4;
    height -= 4;
    width *= NCHAN;
    dl += 2 * (dll + NCHAN);

    wid4 = 2 * ((width + 7) / 8);
    pbuff = mlib_malloc(sizeof (__m64) * 5 * wid4);

    GET_KERN();

    for (i = 0; i < 5; i++) {
        buff_arr[i] = pbuff + i * wid4;
    }

    for (j = 0; j < 4; j++) {
        buff4 = buff_arr[j];

        sp = (mlib_s32 *)sl;

        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d2, lo);

        for (i = 0; i < wid4; i++) {
            *(mlib_s32 *)&s0 = sp[i];
            PREP_5x5(lo, i);
        }

        sl += sll;
        ind++;
    }

    buff0 = buff_arr[0];
    buff1 = buff_arr[1];
    buff2 = buff_arr[2];
    buff3 = buff_arr[3];
    buff4 = buff_arr[4];

    for (row = 0; row < height; row++) {
        __m64 *sp = (__m64 *) sl;
        __m64 *dp = (__m64 *) dl;

        s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        UNPACK_SRC(d2, hi);

        for (i = 0; i < width / 8; i++) {
            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);
            dp[i] = _mm_packs_pu16(res_lo, res_hi);
        }

        if (width & 7) {
            __m64 mask = ((__m64 *) mlib_mask64_arr)[width & 7];

            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);
            res_hi = _mm_packs_pu16(res_lo, res_hi);

            dp[i] = _mm_or_si64(_mm_and_si64(mask, res_hi),
                _mm_andnot_si64(mask, dp[i]));
        }

        buffT = buff0;
        buff0 = buff1;
        buff1 = buff2;
        buff2 = buff3;
        buff3 = buff4;
        buff4 = buffT;

        sl += sll;
        dl += dll;
    }

    _mm_empty();
    mlib_free(pbuff);
    return (MLIB_SUCCESS);
}
__m64 test54(__m64 a, __m64 b) {
  // CHECK: pandn
  return _mm_andnot_si64(a, b);
}
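This FileCheck test pins down the lowering of _mm_andnot_si64 to pandn. The subtlety worth remembering is the operand order: pandn complements its first operand, so _mm_andnot_si64(a, b) computes (~a) & b, not a & ~b. A small self-contained check of that semantics, using only standard MMX intrinsics:

#include <mmintrin.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    uint64_t x = 0xF0F0F0F0F0F0F0F0ull;
    uint64_t y = 0xFF00FF00FF00FF00ull;
    uint64_t v;
    __m64 a, b, r;

    memcpy(&a, &x, 8);
    memcpy(&b, &y, 8);
    r = _mm_andnot_si64(a, b);   /* pandn: complements the FIRST operand */
    memcpy(&v, &r, 8);
    _mm_empty();

    printf("%d\n", v == ((~x) & y));  /* prints 1 */
    return 0;
}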
void lines_scale2(const unsigned char *src, unsigned y,
                  unsigned char *dst1, unsigned char *dst2, unsigned nPix)
{
    const unsigned char *u = src + ((y - 1) & 7) * sc2lines_width,
                        *m = src + ((y + 0) & 7) * sc2lines_width,
                        *l = src + ((y + 1) & 7) * sc2lines_width;

    for (unsigned i = 0; i < nPix; i += 4) {
        if (*(unsigned*)(u + i) ^ *(unsigned*)(l + i)) {
            __m64 mm = *(__m64*)(m + i - 2);
            __m64 uu = *(__m64*)(u + i - 2);
            __m64 ll = *(__m64*)(l + i - 2);
            __m64 md = _mm_slli_si64(mm, 8);
            __m64 mf = _mm_srli_si64(mm, 8);
            __m64 maskall = _mm_or_si64(_mm_cmpeq_pi8(md, mf),
                                        _mm_cmpeq_pi8(uu, ll));
            __m64 e0, e1, v1, v2;

            e0 = _mm_cmpeq_pi8(md, uu);
            e0 = _mm_andnot_si64(maskall, e0);
            e0 = _mm_srli_si64(e0, 16);
            e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());
            e1 = _mm_cmpeq_pi8(mf, uu);
            e1 = _mm_andnot_si64(maskall, e1);
            e1 = _mm_srli_si64(e1, 16);
            e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);
            e0 = _mm_or_si64(e0, e1);

            v1 = _m_from_int(*(unsigned*)(m + i));
            v2 = _m_from_int(*(unsigned*)(u + i));
            v1 = _mm_unpacklo_pi8(v1, v1);
            v2 = _mm_unpacklo_pi8(v2, v2);
            *(__m64*)(dst1 + 2 * i) = _mm_or_si64(_mm_and_si64(e0, v2),
                                                  _mm_andnot_si64(e0, v1));

            e0 = _mm_cmpeq_pi8(md, ll);
            e0 = _mm_andnot_si64(maskall, e0);
            e0 = _mm_srli_si64(e0, 16);
            e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());
            e1 = _mm_cmpeq_pi8(mf, ll);
            e1 = _mm_andnot_si64(maskall, e1);
            e1 = _mm_srli_si64(e1, 16);
            e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);
            e0 = _mm_or_si64(e0, e1);

            v1 = _m_from_int(*(unsigned*)(m + i));
            v2 = _m_from_int(*(unsigned*)(l + i));
            v1 = _mm_unpacklo_pi8(v1, v1);
            v2 = _mm_unpacklo_pi8(v2, v2);
            *(__m64*)(dst2 + 2 * i) = _mm_or_si64(_mm_and_si64(e0, v2),
                                                  _mm_andnot_si64(e0, v1));
        } else {
            __m64 v1 = _m_from_int(*(unsigned*)(m + i));
            v1 = _mm_unpacklo_pi8(v1, v1);
            *(__m64*)(dst1 + 2 * i) = v1;
            *(__m64*)(dst2 + 2 * i) = v1;
        }
    }
}
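The _mm_unpacklo_pi8(v1, v1) calls above are what performs the 2x horizontal stretch: interleaving a register with itself duplicates every byte. A tiny standalone sketch of just that step (standard intrinsics only, no helpers from the scaler):

#include <mmintrin.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    unsigned char px[4] = { 'a', 'b', 'c', 'd' };
    unsigned char out[8];
    int w;
    __m64 v;

    memcpy(&w, px, 4);
    v = _m_from_int(w);          /* low half of the register: a b c d */
    v = _mm_unpacklo_pi8(v, v);  /* interleave with itself: a a b b c c d d */
    memcpy(out, &v, 8);
    _mm_empty();

    printf("%.8s\n", out);       /* prints "aabbccdd" */
    return 0;
}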
/* *********************************************************** */

mlib_status
mlib_m_sconv3x3_8nw_1(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
    __m64 buff_loc[3 * BUFF_LINE], *pbuff = buff_loc;
    __m64 *buff0, *buff1, *buffT;
    GET_SRC_DST_PARAMETERS(mlib_u8);
    __m64 hker0, hker1, hker2, vker0, vker1, vker2;
    __m64 s0, d0, d1, sum0, sum1, sum2, aa, bb, res_hi, res_lo;
    __m64 zero = _m_zero;
    mlib_s32 shift;
    mlib_s32 *sp;
    mlib_s32 row, wid4, i, j;

    width -= 2;
    height -= 2;
    dl += dll + 1;

    wid4 = (width + 7) / 4;

    if (wid4 > BUFF_LINE) {
        pbuff = mlib_malloc(sizeof (__m64) * 3 * wid4);
    }

    GET_KERN();

    buff0 = pbuff;
    buff1 = buff0 + wid4;

    for (j = 0; j < 2; j++) {
        sp = (mlib_s32 *)sl;

        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d1, lo);

        for (i = 0; i < wid4; i++) {
            *(mlib_s32 *)&s0 = sp[i];
            PREP_3x3_1ch(lo, i);
        }

        sl += sll;
        buffT = buff1;
        buff1 = buff0;
        buff0 = buffT;
    }

    for (row = 0; row < height; row++) {
        __m64 *sp = (__m64 *) sl;
        __m64 *dp = (__m64 *) dl;

        s0 = (*sp++);
        UNPACK_SRC(d1, lo);

        for (i = 0; i < width / 8; i++) {
            CONV_3x3_1ch(hi, 2 * i);
            s0 = sp[i];
            CONV_3x3_1ch(lo, 2 * i + 1);
            dp[i] = _mm_packs_pu16(res_hi, res_lo);
        }

        if (width & 7) {
            __m64 mask;

            mask = ((__m64 *) mlib_mask64_arr)[width & 7];
            CONV_3x3_1ch(hi, 2 * i);
            s0 = sp[i];
            CONV_3x3_1ch(lo, 2 * i + 1);
            res_hi = _mm_packs_pu16(res_hi, res_lo);

            dp[i] = _mm_or_si64(_mm_and_si64(mask, res_hi),
                _mm_andnot_si64(mask, dp[i]));
        }

        buffT = buff1;
        buff1 = buff0;
        buff0 = buffT;

        sl += sll;
        dl += dll;
    }

    _mm_empty();

    if (pbuff != buff_loc)
        mlib_free(pbuff);

    return (MLIB_SUCCESS);
}
mlib_status
mlib_ImageErode4_U16(
    void *dst,
    void *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
#endif /* DILATE_FILTER */
{
    mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *dl;
    __m64 *dp0, *dp1;
    __m64 a1, b0, b1, b2, c0, c1, c2, d1, vv, h0, h1, r0, r1;
    __m64 mask, mask80;
    mlib_s32 i, j, tail;

    sl = (mlib_u8 *)src;
    /* dst ptrs skip top j and left col */
    dl = (mlib_u8 *)dst + dlb + SSIZE;

    wid = (wid - 2) * SSIZE;
    tail = wid & 7;
    mask = ((__m64 *) mlib_mask64_arr)[tail];
    mask80 = mmx_from_int_dup(0x80008000);

    for (j = 0; j <= (hgt - 2 - 2); j += 2) {
        dp0 = (void *)dl;
        dp1 = (void *)(dl + dlb);
        sp0 = sl;
        sp1 = sp0 + slb;
        sp2 = sp1 + slb;
        sp3 = sp2 + slb;

        /*
         * line0:     a1
         * line1:  b0 b1 b2
         * line2:  c0 c1 c2
         * line3:     d1
         */
        for (i = 0; i <= wid - 8; i += 8) {
            a1 = *(__m64 *) (sp0 + SSIZE);
            b0 = *(__m64 *) (sp1);
            b1 = *(__m64 *) (sp1 + SSIZE);
            b2 = *(__m64 *) (sp1 + 2 * SSIZE);
            c0 = *(__m64 *) (sp2);
            c1 = *(__m64 *) (sp2 + SSIZE);
            c2 = *(__m64 *) (sp2 + 2 * SSIZE);
            d1 = *(__m64 *) (sp3 + SSIZE);

            vv = C_COMP(b1, c1);
            h0 = C_COMP(b0, b2);
            h1 = C_COMP(c0, c2);

            r0 = C_COMP(vv, a1);
            r1 = C_COMP(vv, d1);
            r0 = C_COMP(r0, h0);
            r1 = C_COMP(r1, h1);

            (*dp0++) = r0;
            (*dp1++) = r1;

            sp0 += 8;
            sp1 += 8;
            sp2 += 8;
            sp3 += 8;
        }

        if (tail) {
            a1 = *(__m64 *) (sp0 + SSIZE);
            b0 = *(__m64 *) (sp1);
            b1 = *(__m64 *) (sp1 + SSIZE);
            b2 = *(__m64 *) (sp1 + 2 * SSIZE);
            c0 = *(__m64 *) (sp2);
            c1 = *(__m64 *) (sp2 + SSIZE);
            c2 = *(__m64 *) (sp2 + 2 * SSIZE);
            d1 = *(__m64 *) (sp3 + SSIZE);

            vv = C_COMP(b1, c1);
            h0 = C_COMP(b0, b2);
            h1 = C_COMP(c0, c2);

            r0 = C_COMP(vv, a1);
            r1 = C_COMP(vv, d1);
            r0 = C_COMP(r0, h0);
            r1 = C_COMP(r1, h1);

            *dp0 = _mm_or_si64(_mm_and_si64(mask, r0),
                _mm_andnot_si64(mask, *dp0));
            *dp1 = _mm_or_si64(_mm_and_si64(mask, r1),
                _mm_andnot_si64(mask, *dp1));
        }

        sl += 2 * slb;
        dl += 2 * dlb;
    }

    /* last line */
    if (j == (hgt - 3)) {
        dp0 = (void *)dl;
        sp0 = sl;
        sp1 = sp0 + slb;
        sp2 = sp1 + slb;

        for (i = 0; i <= wid - 8; i += 8) {
            a1 = *(__m64 *) (sp0 + SSIZE);
            b0 = *(__m64 *) (sp1);
            b1 = *(__m64 *) (sp1 + SSIZE);
            b2 = *(__m64 *) (sp1 + 2 * SSIZE);
            c1 = *(__m64 *) (sp2 + SSIZE);

            vv = C_COMP(b1, c1);
            h0 = C_COMP(b0, b2);
            r0 = C_COMP(vv, a1);
            r0 = C_COMP(r0, h0);

            (*dp0++) = r0;

            sp0 += 8;
            sp1 += 8;
            sp2 += 8;
        }

        if (tail) {
            a1 = *(__m64 *) (sp0 + SSIZE);
            b0 = *(__m64 *) (sp1);
            b1 = *(__m64 *) (sp1 + SSIZE);
            b2 = *(__m64 *) (sp1 + 2 * SSIZE);
            c1 = *(__m64 *) (sp2 + SSIZE);

            vv = C_COMP(b1, c1);
            h0 = C_COMP(b0, b2);
            r0 = C_COMP(vv, a1);
            r0 = C_COMP(r0, h0);

            *dp0 = _mm_or_si64(_mm_and_si64(mask, r0),
                _mm_andnot_si64(mask, *dp0));
        }
    }

    _mm_empty();
    return (MLIB_SUCCESS);
}
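C_COMP here is the comparator macro selected by the build (a minimum for the erode variant, a maximum when DILATE_FILTER is defined), and the sp + SSIZE offsets gather the 4-connected neighbours of each lane. As a rough sketch of the same cross-shaped kernel, here is an 8-bit version using _mm_min_pu8 (the SSE extension to MMX) in place of C_COMP; erode4_span_u8 is a name invented for illustration, not mediaLib API:

#include <xmmintrin.h>  /* _mm_min_pu8 lives here */

/* Eight output pixels of a 4-connected (cross) erosion on u8 data:
 * out[x] = min(up[x], mid[x-1], mid[x], mid[x+1], down[x]).
 * The off-by-one loads mirror the sp + SSIZE addressing above; movq
 * tolerates the unaligned accesses. Caller must run _mm_empty() before
 * touching the FPU again. */
static __m64 erode4_span_u8(const unsigned char *up,
                            const unsigned char *mid,
                            const unsigned char *down)
{
    __m64 vv = _mm_min_pu8(*(const __m64 *)up, *(const __m64 *)down);
    __m64 hh = _mm_min_pu8(*(const __m64 *)(mid - 1),
                           *(const __m64 *)(mid + 1));

    vv = _mm_min_pu8(vv, *(const __m64 *)mid);
    return _mm_min_pu8(vv, hh);
}

For the U16 data handled above no unsigned 16-bit minimum exists in MMX/SSE, which is presumably what mask80 (0x8000 per lane) is for: biasing the values into signed range so a signed 16-bit min can stand in.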
mlib_status
mlib_ImageMinFilter5x5_U8(
    void *dst,
    void *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
#endif /* MAX_FILTER */
{
    mlib_u8 *pbuff, *buff0, *buff1, *buff2, *buff3, *buffT;
    mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *dl;
    __m64 *dp0, *dp1;
    __m64 aa, bb, cc, dd, e0, e1, e2, e3, e4, ee, f0, f1, f2, f3, f4, ff, r0, r1;
    __m64 e_mask, mask8080;
    mlib_s32 i, j, wid8, tail;

    wid = (wid - KSIZE1) * SSIZE;
    wid8 = (wid + 7) & ~7;
    pbuff = mlib_malloc(4 * wid8);
    buff0 = pbuff;
    buff1 = buff0 + wid8;
    buff2 = buff1 + wid8;
    buff3 = buff2 + wid8;

    sl = (mlib_u8 *)src;
    dl = (mlib_u8 *)dst + 2 * (dlb + SSIZE);

    tail = wid & 7;
    e_mask = ((__m64 *) mlib_mask64_arr)[tail];
    mask8080 = mmx_from_int_dup(0x80808080);

    for (j = 0; j < 2; j++) {
        sp0 = buff0;
        sp1 = buff1;
        sp4 = sl;
        sp5 = sl + slb;
        sl += 2 * slb;

        for (i = 0; i < wid; i += 8) {
            e0 = *(__m64 *) sp4;
            e1 = *(__m64 *) (sp4 + SSIZE);
            e2 = *(__m64 *) (sp4 + 2 * SSIZE);
            e3 = *(__m64 *) (sp4 + 3 * SSIZE);
            e4 = *(__m64 *) (sp4 + 4 * SSIZE);
            f0 = *(__m64 *) sp5;
            f1 = *(__m64 *) (sp5 + SSIZE);
            f2 = *(__m64 *) (sp5 + 2 * SSIZE);
            f3 = *(__m64 *) (sp5 + 3 * SSIZE);
            f4 = *(__m64 *) (sp5 + 4 * SSIZE);

            ee = C_COMP(e0, e1);
            ff = C_COMP(f0, f1);
            e2 = C_COMP(e2, e3);
            f2 = C_COMP(f2, f3);
            ee = C_COMP(ee, e4);
            ff = C_COMP(ff, f4);
            ee = C_COMP(ee, e2);
            ff = C_COMP(ff, f2);

            *(__m64 *) sp0 = ee;
            *(__m64 *) sp1 = ff;

            sp0 += 8;
            sp1 += 8;
            sp4 += 8;
            sp5 += 8;
        }

        buffT = buff0;
        buff0 = buff2;
        buff2 = buffT;
        buffT = buff1;
        buff1 = buff3;
        buff3 = buffT;
    }

    for (j = 0; j <= (hgt - KSIZE1 - 2); j += 2) {
        dp0 = (void *)dl;
        dp1 = (void *)(dl + dlb);
        sp0 = buff0;
        sp1 = buff1;
        sp2 = buff2;
        sp3 = buff3;
        sp4 = sl;
        sp5 = sl + slb;

        /*
         * line0: aa
         * line1: bb
         * line2: cc
         * line3: dd
         * line4: e0 e1 e2 e3 e4
         * line5: f0 f1 f2 f3 f4
         */
        for (i = 0; i <= wid - 8; i += 8) {
            aa = *(__m64 *) sp0;
            bb = *(__m64 *) sp1;
            cc = *(__m64 *) sp2;
            dd = *(__m64 *) sp3;
            e0 = *(__m64 *) sp4;
            e1 = *(__m64 *) (sp4 + SSIZE);
            e2 = *(__m64 *) (sp4 + 2 * SSIZE);
            e3 = *(__m64 *) (sp4 + 3 * SSIZE);
            e4 = *(__m64 *) (sp4 + 4 * SSIZE);
            f0 = *(__m64 *) sp5;
            f1 = *(__m64 *) (sp5 + SSIZE);
            f2 = *(__m64 *) (sp5 + 2 * SSIZE);
            f3 = *(__m64 *) (sp5 + 3 * SSIZE);
            f4 = *(__m64 *) (sp5 + 4 * SSIZE);

            ee = C_COMP(e0, e1);
            ff = C_COMP(f0, f1);
            e2 = C_COMP(e2, e3);
            f2 = C_COMP(f2, f3);
            ee = C_COMP(ee, e4);
            ff = C_COMP(ff, f4);
            ee = C_COMP(ee, e2);
            ff = C_COMP(ff, f2);

            bb = C_COMP(bb, cc);
            dd = C_COMP(dd, ee);
            bb = C_COMP(bb, dd);

            r0 = C_COMP(aa, bb);
            r1 = C_COMP(bb, ff);

            *(__m64 *) sp0 = ee;
            *(__m64 *) sp1 = ff;

            (*dp0++) = r0;
            (*dp1++) = r1;

            sp0 += 8;
            sp1 += 8;
            sp2 += 8;
            sp3 += 8;
            sp4 += 8;
            sp5 += 8;
        }

        if (tail) {
            aa = *(__m64 *) sp0;
            bb = *(__m64 *) sp1;
            cc = *(__m64 *) sp2;
            dd = *(__m64 *) sp3;
            e0 = *(__m64 *) sp4;
            e1 = *(__m64 *) (sp4 + SSIZE);
            e2 = *(__m64 *) (sp4 + 2 * SSIZE);
            e3 = *(__m64 *) (sp4 + 3 * SSIZE);
            e4 = *(__m64 *) (sp4 + 4 * SSIZE);
            f0 = *(__m64 *) sp5;
            f1 = *(__m64 *) (sp5 + SSIZE);
            f2 = *(__m64 *) (sp5 + 2 * SSIZE);
            f3 = *(__m64 *) (sp5 + 3 * SSIZE);
            f4 = *(__m64 *) (sp5 + 4 * SSIZE);

            ee = C_COMP(e0, e1);
            ff = C_COMP(f0, f1);
            e2 = C_COMP(e2, e3);
            f2 = C_COMP(f2, f3);
            ee = C_COMP(ee, e4);
            ff = C_COMP(ff, f4);
            ee = C_COMP(ee, e2);
            ff = C_COMP(ff, f2);

            bb = C_COMP(bb, cc);
            dd = C_COMP(dd, ee);
            bb = C_COMP(bb, dd);

            r0 = C_COMP(aa, bb);
            r1 = C_COMP(bb, ff);

            *(__m64 *) sp0 = ee;
            *(__m64 *) sp1 = ff;

            *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0),
                _mm_andnot_si64(e_mask, *dp0));
            *dp1 = _mm_or_si64(_mm_and_si64(e_mask, r1),
                _mm_andnot_si64(e_mask, *dp1));
        }

        buffT = buff0;
        buff0 = buff2;
        buff2 = buffT;
        buffT = buff1;
        buff1 = buff3;
        buff3 = buffT;

        sl += 2 * slb;
        dl += 2 * dlb;
    }

    /* last line */
    if (j == (hgt - KSIZE1 - 1)) {
        dp0 = (void *)dl;
        dp1 = (void *)(dl + dlb);
        sp0 = buff0;
        sp1 = buff1;
        sp2 = buff2;
        sp3 = buff3;
        sp4 = sl;

        for (i = 0; i <= wid - 8; i += 8) {
            aa = *(__m64 *) sp0;
            bb = *(__m64 *) sp1;
            cc = *(__m64 *) sp2;
            dd = *(__m64 *) sp3;
            e0 = *(__m64 *) sp4;
            e1 = *(__m64 *) (sp4 + SSIZE);
            e2 = *(__m64 *) (sp4 + 2 * SSIZE);
            e3 = *(__m64 *) (sp4 + 3 * SSIZE);
            e4 = *(__m64 *) (sp4 + 4 * SSIZE);

            ee = C_COMP(e0, e1);
            e2 = C_COMP(e2, e3);
            ee = C_COMP(ee, e4);
            ee = C_COMP(ee, e2);

            bb = C_COMP(bb, cc);
            dd = C_COMP(dd, ee);
            bb = C_COMP(bb, dd);

            r0 = C_COMP(aa, bb);

            (*dp0++) = r0;

            sp0 += 8;
            sp1 += 8;
            sp2 += 8;
            sp3 += 8;
            sp4 += 8;
        }

        if (tail) {
            aa = *(__m64 *) sp0;
            bb = *(__m64 *) sp1;
            cc = *(__m64 *) sp2;
            dd = *(__m64 *) sp3;
            e0 = *(__m64 *) sp4;
            e1 = *(__m64 *) (sp4 + SSIZE);
            e2 = *(__m64 *) (sp4 + 2 * SSIZE);
            e3 = *(__m64 *) (sp4 + 3 * SSIZE);
            e4 = *(__m64 *) (sp4 + 4 * SSIZE);

            ee = C_COMP(e0, e1);
            e2 = C_COMP(e2, e3);
            ee = C_COMP(ee, e4);
            ee = C_COMP(ee, e2);

            bb = C_COMP(bb, cc);
            dd = C_COMP(dd, ee);
            bb = C_COMP(bb, dd);

            r0 = C_COMP(aa, bb);

            *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0),
                _mm_andnot_si64(e_mask, *dp0));
        }
    }

    _mm_empty();
    mlib_free(pbuff);
    return (MLIB_SUCCESS);
}
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace
   both of them.. it is almost as fast, and gives you a free cosine with
   your sine */
void sincos_ps(v4sfu *xptr, v4sfu *sptr, v4sfu *cptr)
{
    __m128 x = *((__m128 *)xptr), *s = (__m128 *)sptr, *c = (__m128 *)cptr;
    __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
    __m128i emm0, emm2, emm4;
#else
    __m64 mm0, mm1, mm2, mm3, mm4, mm5;
#endif
    sign_bit_sin = x;
    /* take the absolute value */
    x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
    /* extract the sign bit (upper one) */
    sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask);

    /* scale by 4/Pi */
    y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);

#ifdef USE_SSE2
    /* store the integer part of y in emm2 */
    emm2 = _mm_cvttps_epi32(y);

    /* j=(j+1) & (~1) (see the cephes sources) */
    emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
    emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
    y = _mm_cvtepi32_ps(emm2);

    emm4 = emm2;

    /* get the swap sign flag for the sine */
    emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
    emm0 = _mm_slli_epi32(emm0, 29);
    __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);

    /* get the polynomial selection mask for the sine */
    emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    __m128 poly_mask = _mm_castsi128_ps(emm2);
#else
    /* store the integer part of y in mm2:mm3 */
    xmm3 = _mm_movehl_ps(xmm3, y);
    mm2 = _mm_cvttps_pi32(y);
    mm3 = _mm_cvttps_pi32(xmm3);

    /* j=(j+1) & (~1) (see the cephes sources) */
    mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
    mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
    mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
    mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);
    y = _mm_cvtpi32x2_ps(mm2, mm3);

    mm4 = mm2;
    mm5 = mm3;

    /* get the swap sign flag for the sine */
    mm0 = _mm_and_si64(mm2, *(__m64*)_pi32_4);
    mm1 = _mm_and_si64(mm3, *(__m64*)_pi32_4);
    mm0 = _mm_slli_pi32(mm0, 29);
    mm1 = _mm_slli_pi32(mm1, 29);
    __m128 swap_sign_bit_sin;
    COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

    /* get the polynomial selection mask for the sine */
    mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
    mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);
    mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
    mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
    __m128 poly_mask;
    COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

    /* The magic pass: extended precision modular arithmetic
       x = ((x - y * DP1) - y * DP2) - y * DP3; */
    xmm1 = *(__m128*)_ps_minus_cephes_DP1;
    xmm2 = *(__m128*)_ps_minus_cephes_DP2;
    xmm3 = *(__m128*)_ps_minus_cephes_DP3;
    xmm1 = _mm_mul_ps(y, xmm1);
    xmm2 = _mm_mul_ps(y, xmm2);
    xmm3 = _mm_mul_ps(y, xmm3);
    x = _mm_add_ps(x, xmm1);
    x = _mm_add_ps(x, xmm2);
    x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
    emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2);
    emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4);
    emm4 = _mm_slli_epi32(emm4, 29);
    __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
#else
    /* get the sign flag for the cosine */
    mm4 = _mm_sub_pi32(mm4, *(__m64*)_pi32_2);
    mm5 = _mm_sub_pi32(mm5, *(__m64*)_pi32_2);
    mm4 = _mm_andnot_si64(mm4, *(__m64*)_pi32_4);
    mm5 = _mm_andnot_si64(mm5, *(__m64*)_pi32_4);
    mm4 = _mm_slli_pi32(mm4, 29);
    mm5 = _mm_slli_pi32(mm5, 29);
    __m128 sign_bit_cos;
    COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
    _mm_empty(); /* good-bye mmx */
#endif

    sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

    /* Evaluate the first polynomial (0 <= x <= Pi/4) */
    __m128 z = _mm_mul_ps(x, x);
    y = *(__m128*)_ps_coscof_p0;
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
    y = _mm_mul_ps(y, z);
    y = _mm_mul_ps(y, z);
    __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
    y = _mm_sub_ps(y, tmp);
    y = _mm_add_ps(y, *(__m128*)_ps_1);

    /* Evaluate the second polynomial (Pi/4 <= x <= 0) */
    __m128 y2 = *(__m128*)_ps_sincof_p0;
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_mul_ps(y2, x);
    y2 = _mm_add_ps(y2, x);

    /* select the correct result from the two polynomials */
    xmm3 = poly_mask;
    __m128 ysin2 = _mm_and_ps(xmm3, y2);
    __m128 ysin1 = _mm_andnot_ps(xmm3, y);
    y2 = _mm_sub_ps(y2, ysin2);
    y = _mm_sub_ps(y, ysin1);

    xmm1 = _mm_add_ps(ysin1, ysin2);
    xmm2 = _mm_add_ps(y, y2);

    /* update the sign */
    *s = _mm_xor_ps(xmm1, sign_bit_sin);
    *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
/* almost the same as sin_ps */
__m128 cos_ps(v4sfu *xPtr)
{
    /* any x */
    __m128 x = *((__m128 *)xPtr);
    __m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
    __m128i emm0, emm2;
#else
    __m64 mm0, mm1, mm2, mm3;
#endif
    /* take the absolute value */
    x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);

    /* scale by 4/Pi */
    y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);

#ifdef USE_SSE2
    /* store the integer part of y in emm2 */
    emm2 = _mm_cvttps_epi32(y);

    /* j=(j+1) & (~1) (see the cephes sources) */
    emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
    emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
    y = _mm_cvtepi32_ps(emm2);

    emm2 = _mm_sub_epi32(emm2, *(__m128i*)_pi32_2);

    /* get the swap sign flag */
    emm0 = _mm_andnot_si128(emm2, *(__m128i*)_pi32_4);
    emm0 = _mm_slli_epi32(emm0, 29);
    /* get the polynomial selection mask */
    emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

    __m128 sign_bit = _mm_castsi128_ps(emm0);
    __m128 poly_mask = _mm_castsi128_ps(emm2);
#else
    /* store the integer part of y in mm2:mm3 */
    xmm2 = _mm_movehl_ps(xmm2, y);
    mm2 = _mm_cvttps_pi32(y);
    mm3 = _mm_cvttps_pi32(xmm2);

    /* j=(j+1) & (~1) (see the cephes sources) */
    mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
    mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
    mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
    mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);
    y = _mm_cvtpi32x2_ps(mm2, mm3);

    mm2 = _mm_sub_pi32(mm2, *(__m64*)_pi32_2);
    mm3 = _mm_sub_pi32(mm3, *(__m64*)_pi32_2);

    /* get the swap sign flag in mm0:mm1 and the
       polynomial selection mask in mm2:mm3 */
    mm0 = _mm_andnot_si64(mm2, *(__m64*)_pi32_4);
    mm1 = _mm_andnot_si64(mm3, *(__m64*)_pi32_4);
    mm0 = _mm_slli_pi32(mm0, 29);
    mm1 = _mm_slli_pi32(mm1, 29);
    mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
    mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);
    mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
    mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());

    __m128 sign_bit, poly_mask;
    COPY_MM_TO_XMM(mm0, mm1, sign_bit);
    COPY_MM_TO_XMM(mm2, mm3, poly_mask);
    _mm_empty(); /* good-bye mmx */
#endif

    /* The magic pass: extended precision modular arithmetic
       x = ((x - y * DP1) - y * DP2) - y * DP3; */
    xmm1 = *(__m128*)_ps_minus_cephes_DP1;
    xmm2 = *(__m128*)_ps_minus_cephes_DP2;
    xmm3 = *(__m128*)_ps_minus_cephes_DP3;
    xmm1 = _mm_mul_ps(y, xmm1);
    xmm2 = _mm_mul_ps(y, xmm2);
    xmm3 = _mm_mul_ps(y, xmm3);
    x = _mm_add_ps(x, xmm1);
    x = _mm_add_ps(x, xmm2);
    x = _mm_add_ps(x, xmm3);

    /* Evaluate the first polynomial (0 <= x <= Pi/4) */
    y = *(__m128*)_ps_coscof_p0;
    __m128 z = _mm_mul_ps(x, x);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
    y = _mm_mul_ps(y, z);
    y = _mm_mul_ps(y, z);
    __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
    y = _mm_sub_ps(y, tmp);
    y = _mm_add_ps(y, *(__m128*)_ps_1);

    /* Evaluate the second polynomial (Pi/4 <= x <= 0) */
    __m128 y2 = *(__m128*)_ps_sincof_p0;
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_mul_ps(y2, x);
    y2 = _mm_add_ps(y2, x);

    /* select the correct result from the two polynomials */
    xmm3 = poly_mask;
    y2 = _mm_and_ps(xmm3, y2);
    y = _mm_andnot_ps(xmm3, y);
    y = _mm_add_ps(y, y2);

    /* update the sign */
    y = _mm_xor_ps(y, sign_bit);

    return y;
}
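Both trig routines share the same Cephes-style range reduction before the polynomial evaluation. Assuming the usual values of the named constants (_ps_cephes_FOPI = 4/pi, and the three _ps_minus_cephes_DP terms summing to -pi/4, split so that each product with a small integer is exact in single precision), the j = (j+1) & (~1) step and the "magic pass" amount to:

\[
  j \;=\; 2\left\lfloor \tfrac{1}{2}\!\left(\left\lfloor \tfrac{4|x|}{\pi} \right\rfloor + 1\right) \right\rfloor,
  \qquad
  x_r \;=\; \bigl((|x| - j D_1) - j D_2\bigr) - j D_3 \;\approx\; |x| - j\,\tfrac{\pi}{4},
\]

with \(D_1 + D_2 + D_3 = \pi/4\) and \(x_r \in [-\pi/4, \pi/4]\). Bits 1 and 2 of the octant index \(j\) then select between the sine and cosine minimax polynomials on that interval and fix the sign, which is exactly what the _pi32_2 / _pi32_4 masks and the shift-by-29 (moving the flag into the float sign-bit position) extract.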
mlib_status
mlib_m_conv3x3_16nw_4(
    mlib_image *dst,
    const mlib_image *src,
    const mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
    __m64 buff_loc[6 * BUFF_LINE], *pbuff = buff_loc;
    __m64 *buff0, *buff1, *buff2, *buffT;
    GET_SRC_DST_PARAMETERS(mlib_s16);
    __m64 ker1, ker2, ker3, ker4, ker5, ker6, ker7, ker8, ker9;
    __m64 d0, d1, d2, rr, tmpa, tmpb;
    __m64 prev0h, prev1h, sum0h, sum1h, sum2h, tmph;
    __m64 prev0l, prev1l, sum0l, sum1l, sum2l, tmpl;
    __m64 *sp, *dp;
    mlib_s32 shift;
    mlib_s32 row, wid4, i, j;

    width -= 2;
    height -= 2;
    width *= NCHAN;
    dl += dll + NCHAN;

    wid4 = (width + 3) / 4;

    if (wid4 > BUFF_LINE) {
        pbuff = mlib_malloc(sizeof (__m64) * 6 * wid4);
    }

    GET_KERN();

    buff0 = pbuff;
    buff1 = buff0 + 2 * wid4;
    buff2 = buff1 + 2 * wid4;

    for (j = 0; j < 2; j++) {
        sp = (__m64 *) sl;

        d1 = (*sp++);
        d2 = (*sp++);

        for (i = 0; i < wid4; i++) {
            PREP_3x3(i);
        }

        sl += sll;

        if (j == 0) {
            buffT = buff1;
            buff1 = buff0;
            buff0 = buffT;
        }
    }

    for (row = 0; row < height; row++) {
        sp = (__m64 *) sl;
        dp = (__m64 *) dl;

        d1 = (*sp++);
        d2 = (*sp++);

        for (i = 0; i < width / 4; i++) {
            CONV_3x3(i);
            dp[i] = rr;
        }

        if (width & 3) {
            __m64 mask = ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

            CONV_3x3(i);
            dp[i] = _mm_or_si64(_mm_and_si64(mask, rr),
                _mm_andnot_si64(mask, dp[i]));
        }

        buffT = buff1;
        buff1 = buff0;
        buff0 = buffT;

        sl += sll;
        dl += dll;
    }

    _mm_empty();

    if (pbuff != buff_loc)
        mlib_free(pbuff);

    return (MLIB_SUCCESS);
}
__m64 test_mm_andnot_si64(__m64 a, __m64 b) {
  // CHECK-LABEL: test_mm_andnot_si64
  // CHECK: call x86_mmx @llvm.x86.mmx.pandn
  return _mm_andnot_si64(a, b);
}
void pix_background :: processYUVMMX(imageStruct &image)
{
    long pixsize;
    pixsize = image.xsize * image.ysize * image.csize;

    if (m_savedImage.xsize != image.xsize ||
        m_savedImage.ysize != image.ysize ||
        m_savedImage.format != image.format)
        m_reset = 1;

    m_savedImage.xsize = image.xsize;
    m_savedImage.ysize = image.ysize;
    m_savedImage.setCsizeByFormat(image.format);
    m_savedImage.reallocate();

    if (m_reset) {
        memcpy(m_savedImage.data, image.data, pixsize);
        // return;
    }
    m_reset = 0;

    int i = pixsize / sizeof(__m64) + (pixsize % sizeof(__m64) != 0);

    __m64 *data  = (__m64*)image.data;
    __m64 *saved = (__m64*)m_savedImage.data;

    const __m64 thresh = _mm_set_pi8(m_Urange, m_Yrange, m_Vrange, m_Yrange,
                                     m_Urange, m_Yrange, m_Vrange, m_Yrange);
    const __m64 offset = _mm_set_pi8(1, 1, 1, 1, 1, 1, 1, 1);
    const __m64 black  = _mm_set_pi8((unsigned char)0x00, (unsigned char)0x80,
                                     (unsigned char)0x00, (unsigned char)0x80,
                                     (unsigned char)0x00, (unsigned char)0x80,
                                     (unsigned char)0x00, (unsigned char)0x80);

    __m64 newpix, oldpix, m1;

    while (i--) {
        newpix = *data;
        oldpix = *saved++;
        m1     = newpix;
        m1     = _mm_subs_pu8(m1, oldpix);
        oldpix = _mm_subs_pu8(oldpix, newpix);
        m1     = _mm_or_si64(m1, oldpix);                 // |oldpix-newpix|
        m1     = _mm_adds_pu8(m1, offset);                // to make thresh=0 work correctly
        m1     = _mm_subs_pu8(m1, thresh);                // m1>thresh -> saturation -> 0
        m1     = _mm_cmpeq_pi32(m1, _mm_setzero_si64());  // |oldpix-newpix|>thresh
        oldpix = black;
        oldpix = _mm_and_si64(oldpix, m1);
        m1     = _mm_andnot_si64(m1, newpix);
        m1     = _mm_or_si64(m1, oldpix);
        *data++ = m1;
    }
    _mm_empty();
}
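The comment trail above documents a neat trick: _mm_subs_pu8 saturates at zero, so or-ing the two one-sided saturating differences yields |new - old| per byte without widening, and the +1 offset makes thresh = 0 mean "any difference counts". A condensed sketch of just the mask computation; unchanged_mask_u8 is a name invented here for illustration:

#include <mmintrin.h>

/* All-ones in each 32-bit lane whose four bytes are all within thresh of
 * the background, all-zeros otherwise. */
static __m64 unchanged_mask_u8(__m64 newpix, __m64 oldpix, __m64 thresh)
{
    __m64 d = _mm_or_si64(_mm_subs_pu8(newpix, oldpix),
                          _mm_subs_pu8(oldpix, newpix));  /* |new - old| */

    d = _mm_adds_pu8(d, _mm_set1_pi8(1));   /* so thresh == 0 still triggers */
    d = _mm_subs_pu8(d, thresh);            /* 0 iff |new - old| <= thresh */
    return _mm_cmpeq_pi32(d, _mm_setzero_si64());
}

The caller then keeps the background (or black) where the mask is set and the new pixel elsewhere, using the same pand/pandn/por select as everywhere else in this section. Note that _mm_cmpeq_pi32 compares 32-bit lanes, so a group of four bytes, i.e. one packed UYVY or RGBA pixel, passes or fails the threshold together, which is the intended per-pixel behaviour.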
mlib_status
mlib_m_sconv5x5_u16nw_3(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
    GET_SRC_DST_PARAMETERS(mlib_s16);
    __m64 hker0, hker1, hker2, hker3, hker4;
    __m64 vker0, vker1, vker2, vker3, vker4;
    __m64 s0, s1, s2, s3, s4, v0, v1, v2, v3, rr, rh, rl;
    __m64 aa, bb, cc, zero, ker_off, mask8000;
    __m64 *sp0, *sp1, *sp2, *sp3, *sp4, *dp;
    mlib_s32 shift, ker_sum, kerh_sum = 0, kerv_sum = 0;
    mlib_s32 i, j;

    width -= 4;
    height -= 4;
    width *= NCHAN;
    dl += 2 * (dll + NCHAN);

    GET_KERN();

    zero = _mm_setzero_si64();

    for (j = 0; j < height; j++) {
        sp0 = (__m64 *) sl;
        sp1 = (__m64 *) (sl + sll);
        sp2 = (__m64 *) (sl + 2 * sll);
        sp3 = (__m64 *) (sl + 3 * sll);
        sp4 = (__m64 *) (sl + 4 * sll);
        dp = (__m64 *) dl;

        PREP_V();

        for (i = 0; i < width / 4; i++) {
            CONV_5x5();
            dp[i] = rr;
        }

        if (width & 3) {
            __m64 mask = ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

            CONV_5x5();
            dp[i] = _mm_or_si64(_mm_and_si64(mask, rr),
                _mm_andnot_si64(mask, dp[i]));
        }

        sl += sll;
        dl += dll;
    }

    _mm_empty();
    return (MLIB_SUCCESS);
}
mlib_status
mlib_m_conv5x5_8nw_4(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *kern,
    mlib_s32 scalef_expon)
{
    __m64 *pbuff, *buff_arr[20], **pbuff_arr = buff_arr;
    __m64 *buff0, *buff1, *buff2, *buff3;
    GET_SRC_DST_PARAMETERS(mlib_u8);
    __m64 ker[5][5];
    __m64 s0, d0, d1, d2, d3, d4, prev0, prev1, prev2, prev3, aa, bb, cc;
    __m64 sum0, sum1, sum2, sum3, sum4, res_hi, res_lo;
    __m64 zero = _m_zero;
    mlib_s32 shift, ind;
    mlib_s32 *sp;
    mlib_s32 row, wid4, i, j;

    width -= (KSIZE - 1);
    height -= (KSIZE - 1);
    width *= NCHAN;
    dl += ((KSIZE - 1) / 2) * (dll + NCHAN);

    wid4 = (width + 7) / 4;
    pbuff = mlib_malloc(sizeof (__m64) * 10 * wid4);

    GET_KERN();

    for (i = 0; i < 10; i++) {
        buff_arr[i] = pbuff + i * wid4;
    }

    ind = 0;
    for (j = 1; j <= 4; j++) {
        buff0 = buff_arr[ind];
        buff1 = buff_arr[ind + 1];
        buff2 = buff_arr[ind + 2];
        buff3 = buff_arr[ind + 3];

        sp = (mlib_s32 *)sl;

        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d2, lo);
        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d3, lo);
        *(mlib_s32 *)&s0 = (*sp++);
        UNPACK_SRC(d4, lo);

        for (i = 0; i < wid4; i++) {
            *(mlib_s32 *)&s0 = sp[i];
            PREP_5x5();
        }

        sl += sll;
        ind += j;
    }

    for (row = 0; row < height; row++) {
        __m64 *sp = (__m64 *) sl;
        __m64 *dp = (__m64 *) dl;

        buff0 = pbuff_arr[0];
        buff1 = pbuff_arr[2];
        buff2 = pbuff_arr[5];
        buff3 = pbuff_arr[9];

        s0 = (*sp++);
        UNPACK_SRC(d1, lo);
        UNPACK_SRC(d2, hi);
        s0 = (*sp++);
        UNPACK_SRC(d3, lo);
        UNPACK_SRC(d4, hi);

        for (i = 0; i < width / 8; i++) {
            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);
            dp[i] = _mm_packs_pu16(res_lo, res_hi);
        }

        if (width & 7) {
            __m64 mask;

            mask = ((__m64 *) mlib_mask64_arr)[width & 7];
            s0 = sp[i];
            CONV_5x5(lo, 2 * i);
            CONV_5x5(hi, 2 * i + 1);
            res_hi = _mm_packs_pu16(res_lo, res_hi);

            dp[i] = _mm_or_si64(_mm_and_si64(mask, res_hi),
                _mm_andnot_si64(mask, dp[i]));
        }

        ind = (pbuff_arr == buff_arr) ? 10 : -10;
        pbuff_arr[ind + 0] = pbuff_arr[1];
        pbuff_arr[ind + 1] = pbuff_arr[3];
        pbuff_arr[ind + 2] = pbuff_arr[4];
        pbuff_arr[ind + 3] = pbuff_arr[6];
        pbuff_arr[ind + 4] = pbuff_arr[7];
        pbuff_arr[ind + 5] = pbuff_arr[8];
        pbuff_arr[ind + 6] = pbuff_arr[0];
        pbuff_arr[ind + 7] = pbuff_arr[2];
        pbuff_arr[ind + 8] = pbuff_arr[5];
        pbuff_arr[ind + 9] = pbuff_arr[9];
        pbuff_arr += ind;

        sl += sll;
        dl += dll;
    }

    _mm_empty();
    mlib_free(pbuff);
    return (MLIB_SUCCESS);
}
mlib_status
mlib_m_sconv7x7_16nw_4(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
    GET_SRC_DST_PARAMETERS(mlib_s16);
    __m64 hker0, hker1, hker2, hker3, hker4, hker5, hker6;
    __m64 vker0, vker1, vker2, vker3, vker4, vker5, vker6;
    __m64 s0, s1, s2, s3, s4, s5, s6, v0, v1, v2, v3, v4, v5, v6, rr, rh, rl;
    __m64 zero, _rnd;
    __m64 *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *sp6, *dp;
    mlib_s32 shift, kerh_sum;
    mlib_s32 i, j;

    width -= KSIZE1;
    height -= KSIZE1;
    width *= NCHAN;
    dl += (KSIZE / 2) * (dll + NCHAN);

    GET_KERN();

    zero = _mm_setzero_si64();

    for (j = 0; j < height; j++) {
        sp0 = (__m64 *) sl;
        sp1 = (__m64 *) (sl + sll);
        sp2 = (__m64 *) (sl + 2 * sll);
        sp3 = (__m64 *) (sl + 3 * sll);
        sp4 = (__m64 *) (sl + 4 * sll);
        sp5 = (__m64 *) (sl + 5 * sll);
        sp6 = (__m64 *) (sl + 6 * sll);
        dp = (__m64 *) dl;

        PREP_V(v1);
        PREP_V(v2);
        PREP_V(v3);
        PREP_V(v4);
        PREP_V(v5);
        PREP_V(v6);

        for (i = 0; i < width / 4; i++) {
            CONV_7x7();
            dp[i] = rr;
        }

        if (width & 3) {
            __m64 mask = ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

            CONV_7x7();
            dp[i] = _mm_or_si64(_mm_and_si64(mask, rr),
                _mm_andnot_si64(mask, dp[i]));
        }

        sl += sll;
        dl += dll;
    }

    _mm_empty();
    return (MLIB_SUCCESS);
}
mlib_status
mlib_ImageMinFilter3x3_S16(
    void *dst,
    void *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
#endif /* MAX_FILTER */
{
    mlib_u8 *buff, *buff1;
    mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *dl;
    __m64 *dp0, *dp1;
    __m64 aa, bb, c0, c1, c2, cc, d0, d1, d2, dd, r0, r1;
    __m64 e_mask;
    mlib_s32 i, j, wid8, tail;

    wid = (wid - 2) * SSIZE;
    wid8 = (wid + 7) & ~7;
    buff = mlib_malloc(2 * wid8);
    buff1 = buff + wid8;

    sl = (mlib_u8 *)src;
    /* dst ptrs skip top j and left col */
    dl = (mlib_u8 *)dst + dlb + SSIZE;

    tail = wid & 7;
    e_mask = ((__m64 *) mlib_mask64_arr)[tail];

    sp0 = buff;
    sp1 = buff1;
    sp2 = sl;
    sp3 = sp2 + slb;
    sl += 2 * slb;

    for (i = 0; i < wid; i += 8) {
        c0 = *(__m64 *) sp2;
        c1 = *(__m64 *) (sp2 + SSIZE);
        c2 = *(__m64 *) (sp2 + 2 * SSIZE);
        d0 = *(__m64 *) sp3;
        d1 = *(__m64 *) (sp3 + SSIZE);
        d2 = *(__m64 *) (sp3 + 2 * SSIZE);

        cc = C_COMP(c0, c1);
        dd = C_COMP(d0, d1);
        cc = C_COMP(cc, c2);
        dd = C_COMP(dd, d2);

        *(__m64 *) sp0 = cc;
        *(__m64 *) sp1 = dd;

        sp0 += 8;
        sp1 += 8;
        sp2 += 8;
        sp3 += 8;
    }

    for (j = 0; j <= (hgt - 2 - 2); j += 2) {
        dp0 = (void *)dl;
        dp1 = (void *)(dl + dlb);
        sp0 = buff;
        sp1 = buff1;
        sp2 = sl;
        sp3 = sp2 + slb;

        /*
         * line0: aa
         * line1: bb
         * line2: c0 c1 c2
         * line3: d0 d1 d2
         */
        for (i = 0; i <= wid - 8; i += 8) {
            aa = *(__m64 *) sp0;
            bb = *(__m64 *) sp1;
            c0 = *(__m64 *) sp2;
            c1 = *(__m64 *) (sp2 + SSIZE);
            c2 = *(__m64 *) (sp2 + 2 * SSIZE);
            d0 = *(__m64 *) sp3;
            d1 = *(__m64 *) (sp3 + SSIZE);
            d2 = *(__m64 *) (sp3 + 2 * SSIZE);

            cc = C_COMP(c0, c1);
            dd = C_COMP(d0, d1);
            cc = C_COMP(cc, c2);
            dd = C_COMP(dd, d2);

            bb = C_COMP(bb, cc);
            r0 = C_COMP(aa, bb);
            r1 = C_COMP(bb, dd);

            *(__m64 *) sp0 = cc;
            *(__m64 *) sp1 = dd;

            (*dp0++) = r0;
            (*dp1++) = r1;

            sp0 += 8;
            sp1 += 8;
            sp2 += 8;
            sp3 += 8;
        }

        if (tail) {
            aa = *(__m64 *) sp0;
            bb = *(__m64 *) sp1;
            c0 = *(__m64 *) sp2;
            c1 = *(__m64 *) (sp2 + SSIZE);
            c2 = *(__m64 *) (sp2 + 2 * SSIZE);
            d0 = *(__m64 *) sp3;
            d1 = *(__m64 *) (sp3 + SSIZE);
            d2 = *(__m64 *) (sp3 + 2 * SSIZE);

            cc = C_COMP(c0, c1);
            dd = C_COMP(d0, d1);
            cc = C_COMP(cc, c2);
            dd = C_COMP(dd, d2);

            bb = C_COMP(bb, cc);
            r0 = C_COMP(aa, bb);
            r1 = C_COMP(bb, dd);

            *(__m64 *) sp0 = cc;
            *(__m64 *) sp1 = dd;

            *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0),
                _mm_andnot_si64(e_mask, *dp0));
            *dp1 = _mm_or_si64(_mm_and_si64(e_mask, r1),
                _mm_andnot_si64(e_mask, *dp1));
        }

        sl += 2 * slb;
        dl += 2 * dlb;
    }

    /* last line */
    if (j == (hgt - 3)) {
        dp0 = (void *)dl;
        dp1 = (void *)(dl + dlb);
        sp0 = buff;
        sp1 = buff1;
        sp2 = sl;

        for (i = 0; i <= wid - 8; i += 8) {
            aa = *(__m64 *) sp0;
            bb = *(__m64 *) sp1;
            c0 = *(__m64 *) sp2;
            c1 = *(__m64 *) (sp2 + SSIZE);
            c2 = *(__m64 *) (sp2 + 2 * SSIZE);

            cc = C_COMP(c0, c1);
            cc = C_COMP(cc, c2);

            r0 = C_COMP(aa, bb);
            r0 = C_COMP(r0, cc);

            (*dp0++) = r0;

            sp0 += 8;
            sp1 += 8;
            sp2 += 8;
        }

        if (tail) {
            aa = *(__m64 *) sp0;
            bb = *(__m64 *) sp1;
            c0 = *(__m64 *) sp2;
            c1 = *(__m64 *) (sp2 + SSIZE);
            c2 = *(__m64 *) (sp2 + 2 * SSIZE);

            c1 = C_COMP(c0, c1);
            cc = C_COMP(c1, c2);

            r0 = C_COMP(aa, bb);
            r0 = C_COMP(r0, cc);

            *dp0 = _mm_or_si64(_mm_and_si64(e_mask, r0),
                _mm_andnot_si64(e_mask, *dp0));
        }
    }

    _mm_empty();
    mlib_free(buff);
    return (MLIB_SUCCESS);
}