void r_dimpatchD_MMX(const DCanvas *const cvs, argb_t color, int alpha, int x1, int y1, int w, int h)
{
    int x, y, i;
    argb_t *line;
    int invAlpha = 256 - alpha;

    int dpitch = cvs->pitch / sizeof(DWORD);
    line = (argb_t *)cvs->buffer + y1 * dpitch;

    int batches = w / 2;
    int remainder = w & 1;

    // MMX temporaries:
    const __m64 upper8mask = _mm_set_pi16(0, 0xff, 0xff, 0xff);
    const __m64 blendAlpha = _mm_set_pi16(0, alpha, alpha, alpha);
    const __m64 blendInvAlpha = _mm_set_pi16(0, invAlpha, invAlpha, invAlpha);
    const __m64 blendColor = _mm_set_pi16(0, RPART(color), GPART(color), BPART(color));
    const __m64 blendMult = _mm_mullo_pi16(blendColor, blendAlpha);

    for (y = y1; y < y1 + h; y++)
    {
        // MMX optimize the bulk in batches of 2 colors:
        for (i = 0, x = x1; i < batches; ++i, x += 2)
        {
#if 1
            const __m64 input = _mm_setr_pi32(line[x + 0], line[x + 1]);
#else
            // NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
            const __m64 input = *((__m64 *)&line[x]);
#endif
            const __m64 output = blend2vs1_mmx(input, blendMult, blendInvAlpha, upper8mask);
#if 1
            line[x + 0] = _mm_cvtsi64_si32(_mm_srli_si64(output, 32 * 0));
            line[x + 1] = _mm_cvtsi64_si32(_mm_srli_si64(output, 32 * 1));
#else
            // NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
            *((__m64 *)&line[x]) = output;
#endif
        }
        if (remainder)
        {
            // Pick up the remainder:
            for (; x < x1 + w; x++)
                line[x] = alphablend1a(line[x], color, alpha);
        }
        line += dpitch;
    }

    // Required to reset FP:
    _mm_empty();
}
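/*
 * NOTE: blend2vs1_mmx() and alphablend1a() are not defined in this listing.
 * As a rough scalar model of the math the MMX path above computes, assuming
 * RPART/GPART/BPART extract the 8-bit channels of an argb_t and a
 * MAKERGB(r,g,b)-style constructor (hypothetical here) rebuilds one, each
 * channel becomes (fg * alpha + bg * (256 - alpha)) >> 8:
 */
static argb_t alphablend1a_model(argb_t bg, argb_t fg, int alpha)
{
    const int invAlpha = 256 - alpha;
    const int r = (RPART(fg) * alpha + RPART(bg) * invAlpha) >> 8;
    const int g = (GPART(fg) * alpha + GPART(bg) * invAlpha) >> 8;
    const int b = (BPART(fg) * alpha + BPART(bg) * invAlpha) >> 8;
    return MAKERGB(r, g, b);  /* hypothetical constructor */
}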
__m64
unsigned_add3 (const __m64 *a, const __m64 *b, __m64 *result, unsigned int count)
{
    __m64 _a, _b, one, sum, carry, onesCarry;
    unsigned int i;

    carry = _mm_setzero_si64 ();

    one = _mm_cmpeq_pi8 (carry, carry);
    one = _mm_sub_si64 (carry, one);

    for (i = 0; i < count; i++)
    {
        _a = a[i];
        _b = b[i];

        sum = _mm_add_si64 (_a, _b);
        sum = _mm_add_si64 (sum, carry);

        result[i] = sum;

        onesCarry = _mm_and_si64 (_mm_xor_si64 (_a, _b), carry);
        onesCarry = _mm_or_si64 (_mm_and_si64 (_a, _b), onesCarry);
        onesCarry = _mm_and_si64 (onesCarry, one);

        _a = _mm_srli_si64 (_a, 1);
        _b = _mm_srli_si64 (_b, 1);

        carry = _mm_add_si64 (_mm_add_si64 (_a, _b), onesCarry);
        carry = _mm_srli_si64 (carry, 63);
    }

    return carry;
}
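/*
 * Sketch of the carry-recovery idea above, as plain C for one 64-bit limb.
 * MMX has no add-with-carry and no unsigned 64-bit compare, so the carry out
 * of a + b + cin is rebuilt indirectly: the carry out of bit 0 is
 * majority(a0, b0, cin), and adding it to the half-weight sum
 * (a >> 1) + (b >> 1) makes the carry out of bit 63 appear as the top bit,
 * exactly as the srli-by-1 / srli-by-63 sequence does per lane.
 */
#include <stdint.h>

static uint64_t carry_out_model(uint64_t a, uint64_t b, uint64_t cin)
{
    uint64_t ones = ((a & b) | ((a ^ b) & cin)) & 1;  /* carry out of bit 0 */
    return ((a >> 1) + (b >> 1) + ones) >> 63;        /* carry out of bit 63 */
}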
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YV12 to RGB24.
VOID Yv12ToRgb24_mmx(PBYTE pbDstX, INT iDstXStride, PBYTE pbSrcY, PBYTE pbSrcU, PBYTE pbSrcV,
    INT iSrcYStride, INT iSrcUvStride, UINT uWidth, INT iHeight)
{
    UINT x;
    INT y;
    INT iDstXDif;
    INT iSrcYDif;
    INT iSrcUvDif;
    INT yy, bu, guv, rv;

    M64 y0, y1, u0, v0, mz;
    M64 r0, g0, b0, r1, g1, b1;
    M64 rgb0, rgb1, rgb2, rgb3;
    M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1;

    if (iHeight < 0)
    {
        iHeight = -iHeight;
        pbDstX += (iHeight - 1) * iDstXStride;
        iDstXStride = -iDstXStride;
    }

    iDstXDif = iDstXStride - (uWidth * 3);
    iSrcYDif = iSrcYStride - uWidth;
    iSrcUvDif = iSrcUvStride - (uWidth / 2);

    mz = _mm_setzero_si64();
    for (y = iHeight / 2; y; y--)
    {
        for (x = uWidth / 8; x; x--)
        {
            // Calculate coefficient.
            u0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcU));  // [ | u3 u2 u1 u0]
            v0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcV));  // [ | v3 v2 v1 v0]

            u0 = _mm_unpacklo_pi8(u0, mz);              // u3 u2 u1 u0
            v0 = _mm_unpacklo_pi8(v0, mz);              // v3 v2 v1 v0

            u0 = _mm_subs_pi16(u0, g_mSub80);  v0 = _mm_subs_pi16(v0, g_mSub80);

            gu0 = _mm_mullo_pi16(u0, g_mUGMul);
            gv0 = _mm_mullo_pi16(v0, g_mVGMul);
            bu0 = _mm_mullo_pi16(u0, g_mUBMul);
            rv0 = _mm_mullo_pi16(v0, g_mVRMul);

            guv0 = _mm_adds_pi16(gu0, gv0);

            guv1 = _mm_unpackhi_pi16(guv0, guv0);       // guv3 guv3 guv2 guv2
            guv0 = _mm_unpacklo_pi16(guv0, guv0);       // guv1 guv1 guv0 guv0

            bu1 = _mm_unpackhi_pi16(bu0, bu0);          // bu3 bu3 bu2 bu2
            bu0 = _mm_unpacklo_pi16(bu0, bu0);          // bu1 bu1 bu0 bu0
            rv1 = _mm_unpackhi_pi16(rv0, rv0);          // rv3 rv3 rv2 rv2
            rv0 = _mm_unpacklo_pi16(rv0, rv0);          // rv1 rv1 rv0 rv0

            // Process for row 0.
            y0 = *((PM64) pbSrcY);                      // [YYYY | YYYY]; row 0

            y1 = _mm_unpackhi_pi8(y0, mz);              // y7 y6 y5 y4
            y0 = _mm_unpacklo_pi8(y0, mz);              // y3 y2 y1 y0

            y1 = _mm_subs_pi16(y1, g_mSub10);   y0 = _mm_subs_pi16(y0, g_mSub10);
            y1 = _mm_mullo_pi16(y1, g_mYYMul);  y0 = _mm_mullo_pi16(y0, g_mYYMul);

            g1 = _mm_subs_pi16(y1, guv1);               // g7 g6 g5 g4
            g0 = _mm_subs_pi16(y0, guv0);               // g3 g2 g1 g0
            g1 = _mm_srai_pi16(g1, SCALEBITS);  g0 = _mm_srai_pi16(g0, SCALEBITS);
            g0 = _mm_packs_pu16(g0, g1);                // g7 g6 ...g1 g0

            b1 = _mm_adds_pi16(y1, bu1);        b0 = _mm_adds_pi16(y0, bu0);
            b1 = _mm_srai_pi16(b1, SCALEBITS);  b0 = _mm_srai_pi16(b0, SCALEBITS);
            b0 = _mm_packs_pu16(b0, b1);

            r1 = _mm_adds_pi16(y1, rv1);        r0 = _mm_adds_pi16(y0, rv0);
            r1 = _mm_srai_pi16(r1, SCALEBITS);  r0 = _mm_srai_pi16(r0, SCALEBITS);
            r0 = _mm_packs_pu16(r0, r1);

            r1 = _mm_unpackhi_pi8(b0, r0);              // r7 b7 r6 b6 r5 b5 r4 b4
            r0 = _mm_unpacklo_pi8(b0, r0);              // r3 b3 r2 b2 r1 b1 r0 b0

            g1 = _mm_unpackhi_pi8(g0, mz);              // 0 g7 0 g6 0 g5 0 g4
            g0 = _mm_unpacklo_pi8(g0, mz);              // 0 g3 0 g2 0 g1 0 g0

            rgb0 = _mm_unpacklo_pi8(r0, g0);            // 0 r1 g1 b1 0 r0 g0 b0
            rgb1 = _mm_unpackhi_pi8(r0, g0);            // 0 r3 g3 b3 0 r2 g2 b2
            rgb2 = _mm_unpacklo_pi8(r1, g1);            // 0 r5 g5 b5 0 r4 g4 b4
            rgb3 = _mm_unpackhi_pi8(r1, g1);            // 0 r7 g7 b7 0 r6 g6 b6

            // Write out row 0.
            *((PDWORD) (pbDstX + 0)) = _mm_cvtsi64_si32(rgb0);
            rgb0 = _mm_srli_si64(rgb0, 32);
            *((PDWORD) (pbDstX + 3)) = _mm_cvtsi64_si32(rgb0);
            *((PDWORD) (pbDstX + 6)) = _mm_cvtsi64_si32(rgb1);
            rgb1 = _mm_srli_si64(rgb1, 32);
            *((PDWORD) (pbDstX + 9)) = _mm_cvtsi64_si32(rgb1);
            *((PDWORD) (pbDstX + 12)) = _mm_cvtsi64_si32(rgb2);
            rgb2 = _mm_srli_si64(rgb2, 32);
            *((PDWORD) (pbDstX + 15)) = _mm_cvtsi64_si32(rgb2);
            *((PDWORD) (pbDstX + 18)) = _mm_cvtsi64_si32(rgb3);
            rgb3 = _mm_srli_si64(rgb3, 32);
            *((PDWORD) (pbDstX + 21)) = _mm_cvtsi64_si32(rgb3);

            // Process for row 1.
            y0 = *((PM64) (pbSrcY + iSrcYStride));      // [YYYY | YYYY]; row 1

            y1 = _mm_unpackhi_pi8(y0, mz);              // y7 y6 y5 y4
            y0 = _mm_unpacklo_pi8(y0, mz);              // y3 y2 y1 y0

            y1 = _mm_subs_pi16(y1, g_mSub10);   y0 = _mm_subs_pi16(y0, g_mSub10);
            y1 = _mm_mullo_pi16(y1, g_mYYMul);  y0 = _mm_mullo_pi16(y0, g_mYYMul);

            g1 = _mm_subs_pi16(y1, guv1);               // g7 g6 g5 g4
            g0 = _mm_subs_pi16(y0, guv0);               // g3 g2 g1 g0
            g1 = _mm_srai_pi16(g1, SCALEBITS);  g0 = _mm_srai_pi16(g0, SCALEBITS);
            g0 = _mm_packs_pu16(g0, g1);                // g7 g6 ...g1 g0

            b1 = _mm_adds_pi16(y1, bu1);        b0 = _mm_adds_pi16(y0, bu0);
            b1 = _mm_srai_pi16(b1, SCALEBITS);  b0 = _mm_srai_pi16(b0, SCALEBITS);
            b0 = _mm_packs_pu16(b0, b1);

            r1 = _mm_adds_pi16(y1, rv1);        r0 = _mm_adds_pi16(y0, rv0);
            r1 = _mm_srai_pi16(r1, SCALEBITS);  r0 = _mm_srai_pi16(r0, SCALEBITS);
            r0 = _mm_packs_pu16(r0, r1);

            r1 = _mm_unpackhi_pi8(b0, r0);              // r7 b7 r6 b6 r5 b5 r4 b4
            r0 = _mm_unpacklo_pi8(b0, r0);              // r3 b3 r2 b2 r1 b1 r0 b0

            g1 = _mm_unpackhi_pi8(g0, mz);              // 0 g7 0 g6 0 g5 0 g4
            g0 = _mm_unpacklo_pi8(g0, mz);              // 0 g3 0 g2 0 g1 0 g0

            rgb0 = _mm_unpacklo_pi8(r0, g0);            // 0 r1 g1 b1 0 r0 g0 b0
            rgb1 = _mm_unpackhi_pi8(r0, g0);            // 0 r3 g3 b3 0 r2 g2 b2
            rgb2 = _mm_unpacklo_pi8(r1, g1);            // 0 r5 g5 b5 0 r4 g4 b4
            rgb3 = _mm_unpackhi_pi8(r1, g1);            // 0 r7 g7 b7 0 r6 g6 b6

            // Write out row 1.
            *((PDWORD) (pbDstX + iDstXStride + 0)) = _mm_cvtsi64_si32(rgb0);
            rgb0 = _mm_srli_si64(rgb0, 32);
            *((PDWORD) (pbDstX + iDstXStride + 3)) = _mm_cvtsi64_si32(rgb0);
            *((PDWORD) (pbDstX + iDstXStride + 6)) = _mm_cvtsi64_si32(rgb1);
            rgb1 = _mm_srli_si64(rgb1, 32);
            *((PDWORD) (pbDstX + iDstXStride + 9)) = _mm_cvtsi64_si32(rgb1);
            *((PDWORD) (pbDstX + iDstXStride + 12)) = _mm_cvtsi64_si32(rgb2);
            rgb2 = _mm_srli_si64(rgb2, 32);
            *((PDWORD) (pbDstX + iDstXStride + 15)) = _mm_cvtsi64_si32(rgb2);
            *((PDWORD) (pbDstX + iDstXStride + 18)) = _mm_cvtsi64_si32(rgb3);
            rgb3 = _mm_srli_si64(rgb3, 32);
            *((PDWORD) (pbDstX + iDstXStride + 21)) = _mm_cvtsi64_si32(rgb3);

            pbDstX += 24;
            pbSrcY += 8;
            pbSrcU += 4;
            pbSrcV += 4;
        }

        for (x = (uWidth & 7) / 2; x; x--)
        {
            bu = g_iBUTab[pbSrcU[0]];
            guv = g_iGUTab[pbSrcU[0]] + g_iGVTab[pbSrcV[0]];
            rv = g_iRVTab[pbSrcV[0]];

            yy = g_iYYTab[pbSrcY[0]];
            pbDstX[0] = _Clip((yy + bu) >> SCALEBITS_OUT);
            pbDstX[1] = _Clip((yy - guv) >> SCALEBITS_OUT);
            pbDstX[2] = _Clip((yy + rv) >> SCALEBITS_OUT);

            yy = g_iYYTab[pbSrcY[1]];
            pbDstX[3] = _Clip((yy + bu) >> SCALEBITS_OUT);
            pbDstX[4] = _Clip((yy - guv) >> SCALEBITS_OUT);
            pbDstX[5] = _Clip((yy + rv) >> SCALEBITS_OUT);

            yy = g_iYYTab[pbSrcY[iSrcYStride]];
            pbDstX[iDstXStride + 0] = _Clip((yy + bu) >> SCALEBITS_OUT);
            pbDstX[iDstXStride + 1] = _Clip((yy - guv) >> SCALEBITS_OUT);
            pbDstX[iDstXStride + 2] = _Clip((yy + rv) >> SCALEBITS_OUT);

            yy = g_iYYTab[pbSrcY[iSrcYStride + 1]];
            pbDstX[iDstXStride + 3] = _Clip((yy + bu) >> SCALEBITS_OUT);
            pbDstX[iDstXStride + 4] = _Clip((yy - guv) >> SCALEBITS_OUT);
            pbDstX[iDstXStride + 5] = _Clip((yy + rv) >> SCALEBITS_OUT);

            pbDstX += 6;
            pbSrcY += 2;
            pbSrcU++;
            pbSrcV++;
        }

        pbDstX += iDstXDif + iDstXStride;
        pbSrcY += iSrcYDif + iSrcYStride;
        pbSrcU += iSrcUvDif;
        pbSrcV += iSrcUvDif;
    }
    _mm_empty();
}
void rtv_lucent4cols_MMX(byte *source, argb_t *dest, int bga, int fga)
{
    // MMX temporaries:
    const __m64 upper8mask = _mm_set_pi16(0, 0xff, 0xff, 0xff);
    const __m64 fgAlpha = _mm_set_pi16(0, fga, fga, fga);
    const __m64 bgAlpha = _mm_set_pi16(0, bga, bga, bga);

#if 1
    const __m64 bgColors01 = _mm_setr_pi32(dest[0], dest[1]);
#else
    const __m64 bgColors01 = *((__m64 *)&dest[0]);
#endif
    const __m64 fgColors01 = _mm_setr_pi32(
        rt_mapcolor<argb_t>(dcol.colormap, source[0]),
        rt_mapcolor<argb_t>(dcol.colormap, source[1]));

    const __m64 finalColors01 = _mm_packs_pu16(
        _mm_srli_pi16(
            _mm_adds_pi16(
                _mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(bgColors01, bgColors01), upper8mask), bgAlpha),
                _mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(fgColors01, fgColors01), upper8mask), fgAlpha)),
            8),
        _mm_srli_pi16(
            _mm_adds_pi16(
                _mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(bgColors01, bgColors01), upper8mask), bgAlpha),
                _mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(fgColors01, fgColors01), upper8mask), fgAlpha)),
            8));

#if 1
    const __m64 bgColors23 = _mm_setr_pi32(dest[2], dest[3]);
#else
    // NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
    const __m64 bgColors23 = *((__m64 *)&dest[2]);
#endif
    const __m64 fgColors23 = _mm_setr_pi32(
        rt_mapcolor<argb_t>(dcol.colormap, source[2]),
        rt_mapcolor<argb_t>(dcol.colormap, source[3]));

    const __m64 finalColors23 = _mm_packs_pu16(
        _mm_srli_pi16(
            _mm_adds_pi16(
                _mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(bgColors23, bgColors23), upper8mask), bgAlpha),
                _mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(fgColors23, fgColors23), upper8mask), fgAlpha)),
            8),
        _mm_srli_pi16(
            _mm_adds_pi16(
                _mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(bgColors23, bgColors23), upper8mask), bgAlpha),
                _mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(fgColors23, fgColors23), upper8mask), fgAlpha)),
            8));

#if 1
    dest[0] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors01, 32 * 0));
    dest[1] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors01, 32 * 1));
    dest[2] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors23, 32 * 0));
    dest[3] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors23, 32 * 1));
#else
    // NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
    *((__m64 *)&dest[0]) = finalColors01;
    *((__m64 *)&dest[2]) = finalColors23;
#endif

    // Required to reset FP:
    _mm_empty();
}
__m64 test72(__m64 a) {
  // CHECK: psrlq
  return _mm_srli_si64(a, 3);
}
void
mlib_m_ImageFlipY_U8_1(
    mlib_image *dst,
    const mlib_image *src)
{
    PREPAREVARS(mlib_u8, 1);
    if (sw < 16) {
        STRIP_1(ddata, sdata, sw, sh, mlib_u8);
        return;
    }

    for (i = 0; i < sh; i++) {
        mlib_u8 *psrc_row = sdata + i * sstride + sw - 1;
        mlib_u8 *pdst_row = ddata + i * dstride;
        __m64 s0, s1, d0, d1;

        s0 = *(__m64 *) (psrc_row - 7);
        s1 = *(__m64 *) (psrc_row - 15);
        for (j = 0; j <= (sw - 32); j += 16) {
            d0 = _mm_unpacklo_pi8(_mm_srli_si64(s0, 32), s0);
            d1 = _mm_unpacklo_pi8(_mm_srli_si64(s1, 32), s1);
            d0 = _mm_unpacklo_pi8(_mm_srli_si64(d0, 32), d0);
            d1 = _mm_unpacklo_pi8(_mm_srli_si64(d1, 32), d1);
            d0 = _mm_unpacklo_pi8(_mm_srli_si64(d0, 32), d0);
            d1 = _mm_unpacklo_pi8(_mm_srli_si64(d1, 32), d1);
            s0 = *(__m64 *) (psrc_row - 23);
            s1 = *(__m64 *) (psrc_row - 31);
            *(__m64 *) (pdst_row) = d0;
            *(__m64 *) (pdst_row + 8) = d1;
            psrc_row -= 16;
            pdst_row += 16;
        }

        if (j <= (sw - 16)) {
            d0 = _mm_unpacklo_pi8(_mm_srli_si64(s0, 32), s0);
            d1 = _mm_unpacklo_pi8(_mm_srli_si64(s1, 32), s1);
            d0 = _mm_unpacklo_pi8(_mm_srli_si64(d0, 32), d0);
            d1 = _mm_unpacklo_pi8(_mm_srli_si64(d1, 32), d1);
            d0 = _mm_unpacklo_pi8(_mm_srli_si64(d0, 32), d0);
            d1 = _mm_unpacklo_pi8(_mm_srli_si64(d1, 32), d1);
            *(__m64 *) (pdst_row) = d0;
            *(__m64 *) (pdst_row + 8) = d1;
            psrc_row -= 16;
            pdst_row += 16;
            j += 16;
        }

        for (; j < sw; j++) {
            (*pdst_row++) = *psrc_row--;
        }
    }
    _mm_empty();
}
void
mlib_m_ImageFlipY_S16_1(
    mlib_image *dst,
    const mlib_image *src)
{
    PREPAREVARS(mlib_u16, 1);
    if (sw < 8) {
        STRIP_1(ddata, sdata, sw, sh, mlib_u16);
        return;
    }

    sdata += sw - 1;
    for (i = 0; i < sh; i++) {
        mlib_u16 *psrc_row = sdata;
        mlib_u16 *pdst_row = ddata;
        __m64 s0, s1, d0, d1;

        s0 = *(__m64 *) (psrc_row - 3);
        s1 = *(__m64 *) (psrc_row - 7);
        for (j = 0; j <= (sw - 16); j += 8) {
            d0 = _mm_unpacklo_pi16(_mm_srli_si64(s0, 32), s0);
            d1 = _mm_unpacklo_pi16(_mm_srli_si64(s1, 32), s1);
            d0 = _mm_unpacklo_pi16(_mm_srli_si64(d0, 32), d0);
            d1 = _mm_unpacklo_pi16(_mm_srli_si64(d1, 32), d1);
            s0 = *(__m64 *) (psrc_row - 11);
            s1 = *(__m64 *) (psrc_row - 15);
            *(__m64 *) (pdst_row) = d0;
            *(__m64 *) (pdst_row + 4) = d1;
            psrc_row -= 8;
            pdst_row += 8;
        }

        if (j <= (sw - 8)) {
            d0 = _mm_unpacklo_pi16(_mm_srli_si64(s0, 32), s0);
            d1 = _mm_unpacklo_pi16(_mm_srli_si64(s1, 32), s1);
            d0 = _mm_unpacklo_pi16(_mm_srli_si64(d0, 32), d0);
            d1 = _mm_unpacklo_pi16(_mm_srli_si64(d1, 32), d1);
            *(__m64 *) (pdst_row) = d0;
            *(__m64 *) (pdst_row + 4) = d1;
            psrc_row -= 8;
            pdst_row += 8;
            j += 8;
        }

        for (; j < sw; j++) {
            (*pdst_row++) = *psrc_row--;
        }

        sdata = (void *)((mlib_u8 *)sdata + sstride);
        ddata = (void *)((mlib_u8 *)ddata + dstride);
    }
    _mm_empty();
}
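/*
 * Both flip loops above reverse element order with the same trick: repeatedly
 * interleave the register with its own upper half. Each unpacklo of (x >> 32)
 * with x is a perfect shuffle, so three passes reverse 8 bytes and two passes
 * reverse four 16-bit words. A standalone sketch of the byte case, with
 * lanes written most-significant first as in the comments above:
 */
#include <mmintrin.h>

static __m64 reverse_bytes_model(__m64 s)            /* s7 s6 s5 s4 s3 s2 s1 s0 */
{
    s = _mm_unpacklo_pi8(_mm_srli_si64(s, 32), s);   /* s3 s7 s2 s6 s1 s5 s0 s4 */
    s = _mm_unpacklo_pi8(_mm_srli_si64(s, 32), s);   /* s1 s3 s5 s7 s0 s2 s4 s6 */
    s = _mm_unpacklo_pi8(_mm_srli_si64(s, 32), s);   /* s0 s1 s2 s3 s4 s5 s6 s7 */
    return s;
}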
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YUY2 to RGB24.
VOID Yuy2ToRgb24_mmx(PBYTE pbDstX, INT iDstXStride, PBYTE pbSrcX, INT iSrcXStride,
    UINT uWidth, INT iHeight)
{
    UINT x;
    INT y;
    INT iDstXDif;
    INT iSrcXDif;
    INT yy, bu, guv, rv;

    M64 y0, y1, u0, v0, uv_temp1, uv_temp2, mz;
    M64 r0, g0, b0, r1, g1, b1;
    M64 rgb0, rgb1, rgb2, rgb3;
    M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1;

    if (iHeight < 0)
    {
        iHeight = -iHeight;
        pbSrcX += (iHeight - 1) * iSrcXStride;
        iSrcXStride = -iSrcXStride;
    }

    iDstXDif = iDstXStride - (uWidth * 3);
    iSrcXDif = iSrcXStride - (uWidth * 2);

    mz = _mm_setzero_si64();
    for (y = iHeight; y; y--)
    {
        for (x = uWidth / 8; x; x--)
        {
            y0 = ((PM64) pbSrcX)[0];
            y1 = ((PM64) pbSrcX)[1];

            u0 = y0;
            v0 = y1;

            y0 = _mm_and_si64(y0, g_mWord00FF);
            y1 = _mm_and_si64(y1, g_mWord00FF);

            u0 = _mm_srli_pi16(u0, 8);
            v0 = _mm_srli_pi16(v0, 8);

            uv_temp1 = _mm_srli_pi32(u0, 16);
            u0 = _mm_slli_pi32(u0, 16);
            u0 = _mm_srli_pi32(u0, 16);

            uv_temp2 = _mm_srli_pi32(v0, 16);
            v0 = _mm_slli_pi32(v0, 16);
            v0 = _mm_srli_pi32(v0, 16);

            u0 = _mm_packs_pi32(u0, v0);
            v0 = _mm_packs_pi32(uv_temp1, uv_temp2);

            // Calculate coefficient.
            u0 = _mm_subs_pi16(u0, g_mSub80);  v0 = _mm_subs_pi16(v0, g_mSub80);

            gu0 = _mm_mullo_pi16(u0, g_mUGMul);
            gv0 = _mm_mullo_pi16(v0, g_mVGMul);
            bu0 = _mm_mullo_pi16(u0, g_mUBMul);
            rv0 = _mm_mullo_pi16(v0, g_mVRMul);

            guv0 = _mm_adds_pi16(gu0, gv0);

            guv1 = _mm_unpackhi_pi16(guv0, guv0);       // guv3 guv3 guv2 guv2
            guv0 = _mm_unpacklo_pi16(guv0, guv0);       // guv1 guv1 guv0 guv0

            bu1 = _mm_unpackhi_pi16(bu0, bu0);          // bu3 bu3 bu2 bu2
            bu0 = _mm_unpacklo_pi16(bu0, bu0);          // bu1 bu1 bu0 bu0
            rv1 = _mm_unpackhi_pi16(rv0, rv0);          // rv3 rv3 rv2 rv2
            rv0 = _mm_unpacklo_pi16(rv0, rv0);          // rv1 rv1 rv0 rv0

            // Process the row (YUY2 has no second row per iteration).
            y1 = _mm_subs_pi16(y1, g_mSub10);   y0 = _mm_subs_pi16(y0, g_mSub10);
            y1 = _mm_mullo_pi16(y1, g_mYYMul);  y0 = _mm_mullo_pi16(y0, g_mYYMul);

            g1 = _mm_subs_pi16(y1, guv1);               // g7 g6 g5 g4
            g0 = _mm_subs_pi16(y0, guv0);               // g3 g2 g1 g0
            g1 = _mm_srai_pi16(g1, SCALEBITS);  g0 = _mm_srai_pi16(g0, SCALEBITS);
            g0 = _mm_packs_pu16(g0, g1);                // g7 g6 ...g1 g0

            b1 = _mm_adds_pi16(y1, bu1);        b0 = _mm_adds_pi16(y0, bu0);
            b1 = _mm_srai_pi16(b1, SCALEBITS);  b0 = _mm_srai_pi16(b0, SCALEBITS);
            b0 = _mm_packs_pu16(b0, b1);

            r1 = _mm_adds_pi16(y1, rv1);        r0 = _mm_adds_pi16(y0, rv0);
            r1 = _mm_srai_pi16(r1, SCALEBITS);  r0 = _mm_srai_pi16(r0, SCALEBITS);
            r0 = _mm_packs_pu16(r0, r1);

            r1 = _mm_unpackhi_pi8(b0, r0);              // r7 b7 r6 b6 r5 b5 r4 b4
            r0 = _mm_unpacklo_pi8(b0, r0);              // r3 b3 r2 b2 r1 b1 r0 b0

            g1 = _mm_unpackhi_pi8(g0, mz);              // 0 g7 0 g6 0 g5 0 g4
            g0 = _mm_unpacklo_pi8(g0, mz);              // 0 g3 0 g2 0 g1 0 g0

            rgb0 = _mm_unpacklo_pi8(r0, g0);            // 0 r1 g1 b1 0 r0 g0 b0
            rgb1 = _mm_unpackhi_pi8(r0, g0);            // 0 r3 g3 b3 0 r2 g2 b2
            rgb2 = _mm_unpacklo_pi8(r1, g1);            // 0 r5 g5 b5 0 r4 g4 b4
            rgb3 = _mm_unpackhi_pi8(r1, g1);            // 0 r7 g7 b7 0 r6 g6 b6

            // Write out the row.
            *((PDWORD) (pbDstX + 0)) = _mm_cvtsi64_si32(rgb0);
            rgb0 = _mm_srli_si64(rgb0, 32);
            *((PDWORD) (pbDstX + 3)) = _mm_cvtsi64_si32(rgb0);
            *((PDWORD) (pbDstX + 6)) = _mm_cvtsi64_si32(rgb1);
            rgb1 = _mm_srli_si64(rgb1, 32);
            *((PDWORD) (pbDstX + 9)) = _mm_cvtsi64_si32(rgb1);
            *((PDWORD) (pbDstX + 12)) = _mm_cvtsi64_si32(rgb2);
            rgb2 = _mm_srli_si64(rgb2, 32);
            *((PDWORD) (pbDstX + 15)) = _mm_cvtsi64_si32(rgb2);
            *((PDWORD) (pbDstX + 18)) = _mm_cvtsi64_si32(rgb3);
            rgb3 = _mm_srli_si64(rgb3, 32);
            *((PDWORD) (pbDstX + 21)) = _mm_cvtsi64_si32(rgb3);

            pbDstX += 24;
            pbSrcX += 16;
        }

        for (x = (uWidth & 7) / 2; x; x--)
        {
            bu = g_iBUTab[pbSrcX[1]];
            guv = g_iGUTab[pbSrcX[1]] + g_iGVTab[pbSrcX[3]];
            rv = g_iRVTab[pbSrcX[3]];

            yy = g_iYYTab[pbSrcX[0]];
            pbDstX[0] = _Clip((yy + bu) >> SCALEBITS_OUT);
            pbDstX[1] = _Clip((yy - guv) >> SCALEBITS_OUT);
            pbDstX[2] = _Clip((yy + rv) >> SCALEBITS_OUT);

            yy = g_iYYTab[pbSrcX[2]];
            pbDstX[3] = _Clip((yy + bu) >> SCALEBITS_OUT);
            pbDstX[4] = _Clip((yy - guv) >> SCALEBITS_OUT);
            pbDstX[5] = _Clip((yy + rv) >> SCALEBITS_OUT);

            pbDstX += 6;
            pbSrcX += 4;
        }

        pbDstX += iDstXDif;
        pbSrcX += iSrcXDif;
    }
    _mm_empty();
}
void uyvy_to_yuv422(int width, int height, int shift_picture_down, const uint8_t *input, uint8_t *output)
{
    __m64 chroma_mask = _mm_set_pi8(255, 0, 255, 0, 255, 0, 255, 0);
    __m64 luma_mask = _mm_set_pi8(0, 255, 0, 255, 0, 255, 0, 255);
    const uint8_t *orig_input = input;
    uint8_t *y_comp = output;
    uint8_t *u_comp = output + width * height;
    uint8_t *v_comp = u_comp + (int)((width * height) / 2);    // 4:2:2
    int i, j;

    // When preparing video for PAL DV50 encoding, the video must be shifted
    // down by one line to change the field order to be bottom-field-first
    int start_line = 0;
    if (shift_picture_down) {
        memset(y_comp, 0x10, width);        // write one line of black Y
        y_comp += width;
        memset(u_comp, 0x80, width/2);      // write one line of black U,V
        u_comp += width/2;
        memset(v_comp, 0x80, width/2);      // write one line of black U,V
        v_comp += width/2;
        start_line = 1;
    }

    /* Do the y component */
    for (j = start_line; j < height; j++)
    {
        // Consume 16 bytes of UYVY data per iteration (8 pixels worth)
        for (i = 0; i < width*2; i += 16)
        {
            //__m64 m1 = _mm_and_si64 (*(__m64 *)input, luma_mask);
            //__m64 m2 = _mm_and_si64 (*(__m64 *)(input+8), luma_mask);
            //__m64 m2 = _mm_set_pi8 (0, 0, 0, 0, 0, 0, 0, 0);
            //*(__m64 *)y_comp = _mm_packs_pu16 (m2, m1);

            __m64 m0 = *(__m64 *)input;
            __m64 m2 = _mm_srli_si64(m0, 8);
            __m64 m3 = _mm_slli_si64(m0, 8);
            m3 = _mm_and_si64 (m3, chroma_mask);
            m2 = _mm_and_si64 (m2, luma_mask);
            m2 = _mm_or_si64 (m2, m3);
            m2 = _mm_and_si64 (m2, luma_mask);
            m0 = m2;

            __m64 m1 = *(__m64 *)(input+8);
            m2 = _mm_srli_si64(m1, 8);
            m3 = _mm_slli_si64(m1, 8);
            m3 = _mm_and_si64 (m3, chroma_mask);
            m2 = _mm_and_si64 (m2, luma_mask);
            m2 = _mm_or_si64 (m2, m3);
            m2 = _mm_and_si64 (m2, luma_mask);
            m1 = m2;

            *(__m64 *)y_comp = _mm_packs_pu16 (m0, m1);

            y_comp += 8;
            input += 16;
        }
    }

    /* Do the chroma components */
    input = orig_input;
    for (j = start_line; j < height; j++)
    {
        /* Process every line for yuv 4:2:2 */
        for (i = 0; i < width*2; i += 16)
        {
            __m64 m1 = _mm_unpacklo_pi8 (*(__m64 *)input, *(__m64 *)(input+8));
            __m64 m2 = _mm_unpackhi_pi8 (*(__m64 *)input, *(__m64 *)(input+8));

            __m64 m3 = _mm_unpacklo_pi8 (m1, m2);
            __m64 m4 = _mm_unpackhi_pi8 (m1, m2);

            //*(__m64 *)u_comp = _mm_unpacklo_pi8 (m1, m2);
            //*(__m64 *)v_comp = _mm_unpackhi_pi8 (m1, m2);
            memcpy (u_comp, &m3, 4);
            memcpy (v_comp, &m4, 4);

            u_comp += 4;
            v_comp += 4;
            input += 16;
        }
    }
    _mm_empty();        // Clear aliased fp register state
}
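/*
 * For reference, the packed layout being split above: UYVY stores each pixel
 * pair as U0 Y0 V0 Y1, i.e. luma at odd byte offsets and chroma at even
 * offsets, which is why the shift/mask dance above isolates alternating
 * bytes. A scalar model of one 4-byte group (hypothetical helper):
 */
#include <stdint.h>

static void uyvy_unpack_pair(const uint8_t in[4], uint8_t y[2], uint8_t *u, uint8_t *v)
{
    *u = in[0];     /* U shared by both pixels */
    y[0] = in[1];
    *v = in[2];     /* V shared by both pixels */
    y[1] = in[3];
}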
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YUY2 to YV12.
VOID Yuy2ToYv12_mmx(PBYTE pbDstY, PBYTE pbDstU, PBYTE pbDstV, INT iDstYStride, INT iDstUvStride,
    PBYTE pbSrcX, INT iSrcXStride, UINT uWidth, INT iHeight)
{
    UINT x;
    INT y;
    INT iSrcXDif;
    INT iDstYDif;
    INT iDstUvDif;

    M64 m0, m1, m2, m3, m4, m5, m6, m7;

    if (iHeight < 0)
    {
        iHeight = -iHeight;
        pbSrcX += (iHeight - 1) * iSrcXStride;
        iSrcXStride = -iSrcXStride;
    }

    iSrcXDif = iSrcXStride - (uWidth * 2);
    iDstYDif = iDstYStride - uWidth;
    iDstUvDif = iDstUvStride - (uWidth / 2);

    m7 = g_mWord00FF;
    for (y = iHeight / 2; y; y--)
    {
        for (x = uWidth / 8; x; x--)
        {
            m0 = ((PM64) pbSrcX)[0];
            m1 = ((PM64) pbSrcX)[1];
            m2 = ((PM64) (pbSrcX + iSrcXStride))[0];
            m3 = ((PM64) (pbSrcX + iSrcXStride))[1];

            m4 = m0;
            m5 = m2;
            m4 = _mm_srli_pi16(m4, 8);
            m5 = _mm_srli_pi16(m5, 8);
            m4 = _mm_and_si64(m4, m7);
            m5 = _mm_and_si64(m5, m7);
            m4 = _mm_add_pi16(m4, m5);

            m5 = m1;
            m6 = m3;
            m5 = _mm_srli_pi16(m5, 8);
            m6 = _mm_srli_pi16(m6, 8);
            m5 = _mm_and_si64(m5, m7);
            m6 = _mm_and_si64(m6, m7);
            m5 = _mm_add_pi16(m5, m6);

            m4 = _mm_add_pi16(m4, g_mWord0001);
            m5 = _mm_add_pi16(m5, g_mWord0001);
            m4 = _mm_srli_pi16(m4, 1);
            m5 = _mm_srli_pi16(m5, 1);

            m0 = _mm_and_si64(m0, m7);
            m1 = _mm_and_si64(m1, m7);
            m2 = _mm_and_si64(m2, m7);
            m3 = _mm_and_si64(m3, m7);
            m0 = _mm_packs_pu16(m0, m1);
            m2 = _mm_packs_pu16(m2, m3);

            ((PM64) pbDstY)[0] = m0;
            ((PM64) (pbDstY + iDstYStride))[0] = m2;

            m4 = _mm_packs_pu16(m4, m5);
            m5 = m4;
            m4 = _mm_srli_si64(m4, 8);
            m5 = _mm_and_si64(m5, m7);
            m4 = _mm_and_si64(m4, m7);
            m5 = _mm_packs_pu16(m5, m5);
            m4 = _mm_packs_pu16(m4, m4);

            ((PDWORD) pbDstU)[0] = _mm_cvtsi64_si32(m5);
            ((PDWORD) pbDstV)[0] = _mm_cvtsi64_si32(m4);

            pbSrcX += 16;
            pbDstY += 8;
            pbDstU += 4;
            pbDstV += 4;
        }

        for (x = (uWidth & 7) / 2; x; x--)
        {
            pbDstY[0] = pbSrcX[0];
            pbDstU[0] = (pbSrcX[1] + pbSrcX[iSrcXStride + 1] + 1) / 2;
            pbDstY[1] = pbSrcX[2];
            pbDstV[0] = (pbSrcX[3] + pbSrcX[iSrcXStride + 3] + 1) / 2;

            pbDstY[iDstYStride + 0] = pbSrcX[iSrcXStride + 0];
            pbDstY[iDstYStride + 1] = pbSrcX[iSrcXStride + 2];

            pbSrcX += 4;
            pbDstY += 2;
            pbDstU++;
            pbDstV++;
        }

        pbSrcX += iSrcXDif + iSrcXStride;
        pbDstY += iDstYDif + iDstYStride;
        pbDstU += iDstUvDif;
        pbDstV += iDstUvDif;
    }
    _mm_empty();
}
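/*
 * The chroma path above folds two source rows into one when reducing 4:2:2
 * to 4:2:0: both the MMX loop (add g_mWord0001, then shift right by 1) and
 * the scalar tail compute a rounded average of vertically adjacent U (and V)
 * samples. Scalar model:
 */
static uint8_t chroma_avg_model(uint8_t a, uint8_t b)
{
    return (uint8_t)((a + b + 1) >> 1);
}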
void sha384Process(register sha384Param* sp)
{
#ifdef OPTIMIZE_SSE2
# if defined(_MSC_VER) || defined(__INTEL_COMPILER)
    static const __m64 MASK = { 0x00FF00FF00FF00FF };
# elif defined(__GNUC__)
    static const __m64 MASK = { 0x00FF00FF, 0x00FF00FF };
# else
#  error
# endif
    __m64 a, b, c, d, e, f, g, h, temp;
    register __m64 *w;
    register const __m64 *k;
    register byte t;

    w = (__m64*) sp->data;
    t = 16;
    while (t--)
    {
        temp = *w;
        *(w++) = _m_pxor(
            _mm_slli_si64(_m_pshufw(_m_pand(temp, MASK), 27), 8),
            _m_pshufw(_m_pand(_mm_srli_si64(temp, 8), MASK), 27));
    }
    t = 64;
    while (t--)
    {
        temp = _mm_add_si64(_mm_add_si64(sig1(w[-2]), w[-7]), _mm_add_si64(sig0(w[-15]), w[-16]));
        *(w++) = temp;
    }
    w = (__m64*) sp->h;
    a = w[0]; b = w[1]; c = w[2]; d = w[3];
    e = w[4]; f = w[5]; g = w[6]; h = w[7];
    w = (__m64*) sp->data;
    k = (__m64*) SHA2_64BIT_K;
#else
    register uint64_t a, b, c, d, e, f, g, h, temp;
    register uint64_t *w;
    register const uint64_t *k;
    register byte t;

# if WORDS_BIGENDIAN
    w = sp->data + 16;
# else
    w = sp->data;
    t = 16;
    while (t--)
    {
        temp = swapu64(*w);
        *(w++) = temp;
    }
# endif
    t = 64;
    while (t--)
    {
        temp = sig1(w[-2]) + w[-7] + sig0(w[-15]) + w[-16];
        *(w++) = temp;
    }
    w = sp->data;
    a = sp->h[0]; b = sp->h[1]; c = sp->h[2]; d = sp->h[3];
    e = sp->h[4]; f = sp->h[5]; g = sp->h[6]; h = sp->h[7];
    k = SHA2_64BIT_K;
#endif

    ROUND(a,b,c,d,e,f,g,h,w[ 0],k[ 0]); ROUND(h,a,b,c,d,e,f,g,w[ 1],k[ 1]); ROUND(g,h,a,b,c,d,e,f,w[ 2],k[ 2]); ROUND(f,g,h,a,b,c,d,e,w[ 3],k[ 3]);
    ROUND(e,f,g,h,a,b,c,d,w[ 4],k[ 4]); ROUND(d,e,f,g,h,a,b,c,w[ 5],k[ 5]); ROUND(c,d,e,f,g,h,a,b,w[ 6],k[ 6]); ROUND(b,c,d,e,f,g,h,a,w[ 7],k[ 7]);
    ROUND(a,b,c,d,e,f,g,h,w[ 8],k[ 8]); ROUND(h,a,b,c,d,e,f,g,w[ 9],k[ 9]); ROUND(g,h,a,b,c,d,e,f,w[10],k[10]); ROUND(f,g,h,a,b,c,d,e,w[11],k[11]);
    ROUND(e,f,g,h,a,b,c,d,w[12],k[12]); ROUND(d,e,f,g,h,a,b,c,w[13],k[13]); ROUND(c,d,e,f,g,h,a,b,w[14],k[14]); ROUND(b,c,d,e,f,g,h,a,w[15],k[15]);
    ROUND(a,b,c,d,e,f,g,h,w[16],k[16]); ROUND(h,a,b,c,d,e,f,g,w[17],k[17]); ROUND(g,h,a,b,c,d,e,f,w[18],k[18]); ROUND(f,g,h,a,b,c,d,e,w[19],k[19]);
    ROUND(e,f,g,h,a,b,c,d,w[20],k[20]); ROUND(d,e,f,g,h,a,b,c,w[21],k[21]); ROUND(c,d,e,f,g,h,a,b,w[22],k[22]); ROUND(b,c,d,e,f,g,h,a,w[23],k[23]);
    ROUND(a,b,c,d,e,f,g,h,w[24],k[24]); ROUND(h,a,b,c,d,e,f,g,w[25],k[25]); ROUND(g,h,a,b,c,d,e,f,w[26],k[26]); ROUND(f,g,h,a,b,c,d,e,w[27],k[27]);
    ROUND(e,f,g,h,a,b,c,d,w[28],k[28]); ROUND(d,e,f,g,h,a,b,c,w[29],k[29]); ROUND(c,d,e,f,g,h,a,b,w[30],k[30]); ROUND(b,c,d,e,f,g,h,a,w[31],k[31]);
    ROUND(a,b,c,d,e,f,g,h,w[32],k[32]); ROUND(h,a,b,c,d,e,f,g,w[33],k[33]); ROUND(g,h,a,b,c,d,e,f,w[34],k[34]); ROUND(f,g,h,a,b,c,d,e,w[35],k[35]);
    ROUND(e,f,g,h,a,b,c,d,w[36],k[36]); ROUND(d,e,f,g,h,a,b,c,w[37],k[37]); ROUND(c,d,e,f,g,h,a,b,w[38],k[38]); ROUND(b,c,d,e,f,g,h,a,w[39],k[39]);
    ROUND(a,b,c,d,e,f,g,h,w[40],k[40]); ROUND(h,a,b,c,d,e,f,g,w[41],k[41]); ROUND(g,h,a,b,c,d,e,f,w[42],k[42]); ROUND(f,g,h,a,b,c,d,e,w[43],k[43]);
    ROUND(e,f,g,h,a,b,c,d,w[44],k[44]); ROUND(d,e,f,g,h,a,b,c,w[45],k[45]); ROUND(c,d,e,f,g,h,a,b,w[46],k[46]); ROUND(b,c,d,e,f,g,h,a,w[47],k[47]);
    ROUND(a,b,c,d,e,f,g,h,w[48],k[48]); ROUND(h,a,b,c,d,e,f,g,w[49],k[49]); ROUND(g,h,a,b,c,d,e,f,w[50],k[50]); ROUND(f,g,h,a,b,c,d,e,w[51],k[51]);
    ROUND(e,f,g,h,a,b,c,d,w[52],k[52]); ROUND(d,e,f,g,h,a,b,c,w[53],k[53]); ROUND(c,d,e,f,g,h,a,b,w[54],k[54]); ROUND(b,c,d,e,f,g,h,a,w[55],k[55]);
    ROUND(a,b,c,d,e,f,g,h,w[56],k[56]); ROUND(h,a,b,c,d,e,f,g,w[57],k[57]); ROUND(g,h,a,b,c,d,e,f,w[58],k[58]); ROUND(f,g,h,a,b,c,d,e,w[59],k[59]);
    ROUND(e,f,g,h,a,b,c,d,w[60],k[60]); ROUND(d,e,f,g,h,a,b,c,w[61],k[61]); ROUND(c,d,e,f,g,h,a,b,w[62],k[62]); ROUND(b,c,d,e,f,g,h,a,w[63],k[63]);
    ROUND(a,b,c,d,e,f,g,h,w[64],k[64]); ROUND(h,a,b,c,d,e,f,g,w[65],k[65]); ROUND(g,h,a,b,c,d,e,f,w[66],k[66]); ROUND(f,g,h,a,b,c,d,e,w[67],k[67]);
    ROUND(e,f,g,h,a,b,c,d,w[68],k[68]); ROUND(d,e,f,g,h,a,b,c,w[69],k[69]); ROUND(c,d,e,f,g,h,a,b,w[70],k[70]); ROUND(b,c,d,e,f,g,h,a,w[71],k[71]);
    ROUND(a,b,c,d,e,f,g,h,w[72],k[72]); ROUND(h,a,b,c,d,e,f,g,w[73],k[73]); ROUND(g,h,a,b,c,d,e,f,w[74],k[74]); ROUND(f,g,h,a,b,c,d,e,w[75],k[75]);
    ROUND(e,f,g,h,a,b,c,d,w[76],k[76]); ROUND(d,e,f,g,h,a,b,c,w[77],k[77]); ROUND(c,d,e,f,g,h,a,b,w[78],k[78]); ROUND(b,c,d,e,f,g,h,a,w[79],k[79]);

#ifdef OPTIMIZE_SSE2
    w = (__m64*) sp->h;
    w[0] = _mm_add_si64(w[0], a);
    w[1] = _mm_add_si64(w[1], b);
    w[2] = _mm_add_si64(w[2], c);
    w[3] = _mm_add_si64(w[3], d);
    w[4] = _mm_add_si64(w[4], e);
    w[5] = _mm_add_si64(w[5], f);
    w[6] = _mm_add_si64(w[6], g);
    w[7] = _mm_add_si64(w[7], h);
    _mm_empty();
#else
    sp->h[0] += a;
    sp->h[1] += b;
    sp->h[2] += c;
    sp->h[3] += d;
    sp->h[4] += e;
    sp->h[5] += f;
    sp->h[6] += g;
    sp->h[7] += h;
#endif
}
void
mlib_m_ImageMaximum_U8_3(
    mlib_s32 *res32,
    const mlib_image *img)
{
    /* src address */
    __m64 *sp, *sl;

    /* src data */
    __m64 sd;

    /* max values */
    __m64 max0, max1, max2, max3;

    /* edge mask */
    mlib_s32 emask;

    /* loop variables */
    mlib_s32 n1;

    /* height of image */
    mlib_s32 height = mlib_ImageGetHeight(img);

    /* elements to next row */
    mlib_s32 slb = mlib_ImageGetStride(img);

    mlib_s32 width = mlib_ImageGetWidth(img) * 3;
    mlib_u8 *dend;

    if (slb == width) {
        width *= height;
        height = 1;
    }

    sp = sl = (__m64 *) mlib_ImageGetData(img);

    max1 = _mm_set1_pi8(MLIB_U8_MIN);
    max2 = _mm_set1_pi8(MLIB_U8_MIN);
    max3 = _mm_set1_pi8(MLIB_U8_MIN);

    for (; height > 0; height--) {
        n1 = width;
        dend = (mlib_u8 *)sp + width;

        for (; n1 > 23; n1 -= 24) {
            sd = (*sp++);
            MLIB_M_IMAGE_MAXIMUM_U8(max1, max1, sd);
            sd = (*sp++);
            MLIB_M_IMAGE_MAXIMUM_U8(max2, max2, sd);
            sd = (*sp++);
            MLIB_M_IMAGE_MAXIMUM_U8(max3, max3, sd);
        }

        if (n1 > 0) {
            emask = (n1 > 7) ? 0xFF : (0xFF << (8 - n1));
            sd = (*sp++);
            MLIB_M_IMAGE_MAXIMUM_U8_M32(max1, max1, sd, emask);

            n1 = ((mlib_u8 *)dend - (mlib_u8 *)sp);
            if (n1 > 0) {
                emask = (n1 > 7) ? 0xFF : (0xFF << (8 - n1));
                sd = (*sp++);
                MLIB_M_IMAGE_MAXIMUM_U8_M32(max2, max2, sd, emask);

                n1 = ((mlib_u8 *)dend - (mlib_u8 *)sp);
                if (n1 > 0) {
                    emask = (0xFF << (8 - n1));
                    sd = *sp;
                    MLIB_M_IMAGE_MAXIMUM_U8_M32(max3, max3, sd, emask);
                }
            }
        }

        sp = sl = (__m64 *) ((mlib_u8 *)sl + slb);
    }

    MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max1, _mm_srli_si64(max2, 8),
        mmx_write_64(0x00ffffffffffffffll));
    MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_slli_si64(max2, 16),
        mmx_write_64(0x0000000000ff0000ll));
    MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_srli_si64(max3, 16),
        mmx_write_64(0x0000ffffffffffffll));
    MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_slli_si64(max3, 8),
        mmx_write_64(0x0000000000ffff00ll));
    MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_srli_si64(max0, 24),
        mmx_write_64(0x000000ffff000000ll));
    MLIB_M_IMAGE_MAXIMUM_U8_M64(max0, max0, _mm_srli_si64(max0, 24),
        mmx_write_64(0x0000000000ffffffll));

    res32[0] = _mm_cvtsi64_si32(_mm_and_si64(max0, mmx_write_64(0x00000000000000ffll)));
    res32[1] = _mm_cvtsi64_si32(_mm_and_si64(_mm_srli_si64(max0, 8), mmx_write_64(0x00000000000000ffll)));
    res32[2] = _mm_cvtsi64_si32(_mm_and_si64(_mm_srli_si64(max0, 16), mmx_write_64(0x00000000000000ffll)));

    _mm_empty();
}
__m64 test_mm_srli_si64(__m64 a) {
  // CHECK-LABEL: test_mm_srli_si64
  // CHECK: call x86_mmx @llvm.x86.mmx.psrli.q
  return _mm_srli_si64(a, 3);
}
void lines_scale2(const unsigned char *src, unsigned y, unsigned char *dst1, unsigned char *dst2, unsigned nPix)
{
    const unsigned char *u = src + ((y-1) & 7)*sc2lines_width,
                        *m = src + ((y+0) & 7)*sc2lines_width,
                        *l = src + ((y+1) & 7)*sc2lines_width;

    for (unsigned i = 0; i < nPix; i += 4)
    {
        if (*(unsigned*)(u+i) ^ *(unsigned*)(l+i))
        {
            __m64 mm = *(__m64*)(m+i-2);
            __m64 uu = *(__m64*)(u+i-2);
            __m64 ll = *(__m64*)(l+i-2);
            __m64 md = _mm_slli_si64(mm, 8);
            __m64 mf = _mm_srli_si64(mm, 8);
            __m64 maskall = _mm_or_si64(_mm_cmpeq_pi8(md, mf), _mm_cmpeq_pi8(uu, ll));
            __m64 e0, e1, v1, v2;

            e0 = _mm_cmpeq_pi8(md, uu);
            e0 = _mm_andnot_si64(maskall, e0);
            e0 = _mm_srli_si64(e0, 16);
            e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());

            e1 = _mm_cmpeq_pi8(mf, uu);
            e1 = _mm_andnot_si64(maskall, e1);
            e1 = _mm_srli_si64(e1, 16);
            e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);

            e0 = _mm_or_si64(e0, e1);

            v1 = _m_from_int(*(unsigned*)(m+i));
            v2 = _m_from_int(*(unsigned*)(u+i));
            v1 = _mm_unpacklo_pi8(v1, v1);
            v2 = _mm_unpacklo_pi8(v2, v2);
            *(__m64*)(dst1 + 2*i) = _mm_or_si64(_mm_and_si64(e0, v2), _mm_andnot_si64(e0, v1));

            e0 = _mm_cmpeq_pi8(md, ll);
            e0 = _mm_andnot_si64(maskall, e0);
            e0 = _mm_srli_si64(e0, 16);
            e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());

            e1 = _mm_cmpeq_pi8(mf, ll);
            e1 = _mm_andnot_si64(maskall, e1);
            e1 = _mm_srli_si64(e1, 16);
            e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);

            e0 = _mm_or_si64(e0, e1);

            v1 = _m_from_int(*(unsigned*)(m+i));
            v2 = _m_from_int(*(unsigned*)(l+i));
            v1 = _mm_unpacklo_pi8(v1, v1);
            v2 = _mm_unpacklo_pi8(v2, v2);
            *(__m64*)(dst2 + 2*i) = _mm_or_si64(_mm_and_si64(e0, v2), _mm_andnot_si64(e0, v1));
        }
        else
        {
            __m64 v1 = _m_from_int(*(unsigned*)(m+i));
            v1 = _mm_unpacklo_pi8(v1, v1);
            *(__m64*)(dst1 + 2*i) = v1;
            *(__m64*)(dst2 + 2*i) = v1;
        }
    }
}