int32_t od_mc_compute_satd8_4x4_sse2(const unsigned char *src, int systride, const unsigned char *ref, int rystride) { int32_t satd; __m64 sums; __m64 a; __m64 b; __m64 c; __m64 d; a = od_load_convert_subtract_x4(src + 0*systride, ref + 0*rystride); b = od_load_convert_subtract_x4(src + 1*systride, ref + 1*rystride); c = od_load_convert_subtract_x4(src + 2*systride, ref + 2*rystride); d = od_load_convert_subtract_x4(src + 3*systride, ref + 3*rystride); /*Vertical 1D transform.*/ od_mc_butterfly_2x2_16x4(&a, &b, &c, &d); od_mc_butterfly_2x2_16x4(&a, &b, &c, &d); od_transpose16x4(&a, &b, &c, &d); /*Horizontal 1D transform.*/ od_mc_butterfly_2x2_16x4(&a, &b, &c, &d); /*Use the fact that (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) to merge the final butterfly stage with the calculating the absolute values and the first stage of accumulation. Calculates (abs(a+b)+abs(a-b))/2-0x7FFF. An offset must be added to the final sum before rounding to account for subtracting 0x7FFF.*/ a = _mm_sub_pi16(_mm_max_pi16(a, b), _mm_adds_pi16(_mm_add_pi16(a, b), _mm_set1_pi16(0x7FFF))); c = _mm_sub_pi16(_mm_max_pi16(c, d), _mm_adds_pi16(_mm_add_pi16(c, d), _mm_set1_pi16(0x7FFF))); /*Take the sum of all the absolute values.*/ sums = _mm_add_pi16(a, c); /*Sum the elements of the vector.*/ sums = _mm_add_pi16(sums, _mm_shuffle_pi16(sums, _MM_SHUFFLE(0, 1, 2, 3))); sums = _mm_add_pi16(sums, _mm_shuffle_pi16(sums, _MM_SHUFFLE(2, 3, 0, 1))); sums = _mm_unpacklo_pi16(sums, _mm_setzero_si64()); satd = _mm_cvtsi64_si32(sums); /*Subtract the offset (8) and round.*/ satd = (satd + 1 - 8) >> 1; #if defined(OD_CHECKASM) { int32_t c_satd; c_satd = od_mc_compute_satd8_4x4_c(src, systride, ref, rystride); if (satd != c_satd) { fprintf(stderr, "od_mc_compute_satd %ix%i check failed: %i!=%i\n", 4, 4, satd, c_satd); } } #endif return satd; }
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Convert YV12 to RGB16. VOID Yv12ToRgb16_mmx(PBYTE pbDstX, INT iDstXStride, PBYTE pbSrcY, PBYTE pbSrcU, PBYTE pbSrcV, INT iSrcYStride, INT iSrcUvStride, UINT uWidth, INT iHeight) { UINT x; INT y; INT iDstXDif; INT iSrcYDif; INT iSrcUvDif; INT yy, bu, guv, rv, r, g, b; M64 y0, y1, u0, v0, mz; M64 r0, g0, b0, r1, g1, b1; M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1; if (iHeight < 0) { iHeight = -iHeight; pbDstX += (iHeight - 1) * iDstXStride; iDstXStride = -iDstXStride; } iDstXDif = iDstXStride - (uWidth * 2); iSrcYDif = iSrcYStride - uWidth; iSrcUvDif = iSrcUvStride - (uWidth / 2); mz = _mm_setzero_si64(); for (y = iHeight / 2; y; y--) { for (x = uWidth / 8; x; x--) { // Calculate coefficient. u0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcU)); // [ | u3 u2 u1 u0] v0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcV)); // [ | v3 v2 v1 v0] u0 = _mm_unpacklo_pi8(u0, mz); // u3 u2 u1 u0 v0 = _mm_unpacklo_pi8(v0, mz); // v3 v2 v1 v0 u0 = _mm_subs_pi16(u0, g_mSub80); v0 = _mm_subs_pi16(v0, g_mSub80); gu0 = _mm_mullo_pi16(u0, g_mUGMul); gv0 = _mm_mullo_pi16(v0, g_mVGMul); bu0 = _mm_mullo_pi16(u0, g_mUBMul); rv0 = _mm_mullo_pi16(v0, g_mVRMul); guv0 = _mm_adds_pi16(gu0, gv0); guv1 = _mm_unpackhi_pi16(guv0, guv0); // guv3 guv3 guv2 guv2 guv0 = _mm_unpacklo_pi16(guv0, guv0); // guv1 guv1 guv0 guv0 bu1 = _mm_unpackhi_pi16(bu0, bu0); // bu3 bu3 bu2 bu2 bu0 = _mm_unpacklo_pi16(bu0, bu0); // bu1 bu1 bu0 bu0 rv1 = _mm_unpackhi_pi16(rv0, rv0); // rv3 rv3 rv2 rv2 rv0 = _mm_unpacklo_pi16(rv0, rv0); // rv1 rv1 rv0 rv0 // Process for row 0. y0 = *((PM64) pbSrcY); // [YYYY | YYYY]; row 0 y1 = _mm_unpackhi_pi8(y0, mz); // y7 y6 y5 y4 y0 = _mm_unpacklo_pi8(y0, mz); // y3 y2 y1 y0 y1 = _mm_subs_pi16(y1, g_mSub10); y0 = _mm_subs_pi16(y0, g_mSub10); y1 = _mm_mullo_pi16(y1, g_mYYMul); y0 = _mm_mullo_pi16(y0, g_mYYMul); b1 = _mm_adds_pi16(y1, bu1); b0 = _mm_adds_pi16(y0, bu0); b1 = _mm_srai_pi16(b1, SCALEBITS); // 8 bits (0 - 7) b0 = _mm_srai_pi16(b0, SCALEBITS); b1 = _mm_packs_pu16(b1, mz); // 0 0 0 0 b7 b6 b5 b4 b0 = _mm_packs_pu16(b0, mz); // 0 0 0 0 b3 b2 b1 b0 b1 = _mm_unpacklo_pi8(b1, mz); // 0 b7 0b 6 0 b5 0 b4 b0 = _mm_unpacklo_pi8(b0, mz); b1 = _mm_srli_pi16(b1, 3); b0 = _mm_srli_pi16(b0, 3); // 5 bits (0 - 4) g1 = _mm_subs_pi16(y1, guv1); // g7 g6 g5 g4 g0 = _mm_subs_pi16(y0, guv0); // g3 g2 g1 g0 g1 = _mm_srai_pi16(g1, SCALEBITS); // 8 bits (0 - 7) g0 = _mm_srai_pi16(g0, SCALEBITS); g1 = _mm_packs_pu16(g1, mz); // 0 0 0 0 g7 g6 g5 g4 g0 = _mm_packs_pu16(g0, mz); // 0 0 0 0 g3 g2 g1 g0 g1 = _mm_unpacklo_pi8(g1, mz); // 0 g7 0 g6 0 g5 0 g4 g0 = _mm_unpacklo_pi8(g0, mz); g1 = _mm_srli_pi16(g1, 2); // 6 bits (0 - 5) g0 = _mm_srli_pi16(g0, 2); g1 = _mm_slli_pi16(g1, 5); // 6 bits (5 - 10) g0 = _mm_slli_pi16(g0, 5); // 6 bits (5 - 10) r1 = _mm_adds_pi16(y1, rv1); r0 = _mm_adds_pi16(y0, rv0); r1 = _mm_srai_pi16(r1, SCALEBITS); r0 = _mm_srai_pi16(r0, SCALEBITS); r1 = _mm_packs_pu16(r1, mz); // 0 0 0 0 r7 r6 r5 r4 r0 = _mm_packs_pu16(r0, mz); // 0 0 0 0 r3 r2 r1 r0 r1 = _mm_unpacklo_pi8(r1, mz); // 0 r7 0 r6 0 r5 0 r4 r0 = _mm_unpacklo_pi8(r0, mz); r1 = _mm_srli_pi16(r1, 3); // 5 bits (0 - 4) r0 = _mm_srli_pi16(r0, 3); r1 = _mm_slli_pi16(r1, 11); // 5 bits (11 - 15) r0 = _mm_slli_pi16(r0, 11); // 5 bits (11 - 15) // Combine RGB. b0 = _mm_or_si64(g0, b0); b0 = _mm_or_si64(r0, b0); // 16 bits rgb b1 = _mm_or_si64(g1, b1); b1 = _mm_or_si64(r1, b1); // 16 bits rgb // Write out row 0. ((PM64) pbDstX)[0] = b0; ((PM64) pbDstX)[1] = b1; // Process for row 1. y0 = *((PM64) (pbSrcY + iSrcYStride)); // [YYYY | YYYY]; row 0 y1 = _mm_unpackhi_pi8(y0, mz); // y7 y6 y5 y4 y0 = _mm_unpacklo_pi8(y0, mz); // y3 y2 y1 y0 y1 = _mm_subs_pi16(y1, g_mSub10); y0 = _mm_subs_pi16(y0, g_mSub10); y1 = _mm_mullo_pi16(y1, g_mYYMul); y0 = _mm_mullo_pi16(y0, g_mYYMul); b1 = _mm_adds_pi16(y1, bu1); b0 = _mm_adds_pi16(y0, bu0); b1 = _mm_srai_pi16(b1, SCALEBITS); // 8 bits (0 - 7) b0 = _mm_srai_pi16(b0, SCALEBITS); b1 = _mm_packs_pu16(b1, mz); // 0 0 0 0 b7 b6 b5 b4 b0 = _mm_packs_pu16(b0, mz); // 0 0 0 0 b3 b2 b1 b0 b1 = _mm_unpacklo_pi8(b1, mz); // 0 b7 0b 6 0 b5 0 b4 b0 = _mm_unpacklo_pi8(b0, mz); b1 = _mm_srli_pi16(b1, 3); b0 = _mm_srli_pi16(b0, 3); // 5 bits (0 - 4) g1 = _mm_subs_pi16(y1, guv1); // g7 g6 g5 g4 g0 = _mm_subs_pi16(y0, guv0); // g3 g2 g1 g0 g1 = _mm_srai_pi16(g1, SCALEBITS); // 8 bits (0 - 7) g0 = _mm_srai_pi16(g0, SCALEBITS); g1 = _mm_packs_pu16(g1, mz); // 0 0 0 0 g7 g6 g5 g4 g0 = _mm_packs_pu16(g0, mz); // 0 0 0 0 g3 g2 g1 g0 g1 = _mm_unpacklo_pi8(g1, mz); // 0 g7 0 g6 0 g5 0 g4 g0 = _mm_unpacklo_pi8(g0, mz); g1 = _mm_srli_pi16(g1, 2); // 6 bits (0 - 5) g0 = _mm_srli_pi16(g0, 2); g1 = _mm_slli_pi16(g1, 5); // 6 bits (5 - 10) g0 = _mm_slli_pi16(g0, 5); // 6 bits (5 - 10) r1 = _mm_adds_pi16(y1, rv1); r0 = _mm_adds_pi16(y0, rv0); r1 = _mm_srai_pi16(r1, SCALEBITS); r0 = _mm_srai_pi16(r0, SCALEBITS); r1 = _mm_packs_pu16(r1, mz); // 0 0 0 0 r7 r6 r5 r4 r0 = _mm_packs_pu16(r0, mz); // 0 0 0 0 r3 r2 r1 r0 r1 = _mm_unpacklo_pi8(r1, mz); // 0 r7 0 r6 0 r5 0 r4 r0 = _mm_unpacklo_pi8(r0, mz); r1 = _mm_srli_pi16(r1, 3); // 5 bits (0 - 4) r0 = _mm_srli_pi16(r0, 3); r1 = _mm_slli_pi16(r1, 11); // 5 bits (11 - 15) r0 = _mm_slli_pi16(r0, 11); // 5 bits (11 - 15) // Combine RGB. b0 = _mm_or_si64(g0, b0); b0 = _mm_or_si64(r0, b0); // 16 bits rgb b1 = _mm_or_si64(g1, b1); b1 = _mm_or_si64(r1, b1); // 16 bits rgb // Write out row 1. ((PM64) (pbDstX + iDstXStride))[0] = b0; ((PM64) (pbDstX + iDstXStride))[1] = b1; pbDstX += 16; pbSrcY += 8; pbSrcU += 4; pbSrcV += 4; } for (x = (uWidth & 7) / 2; x; x--) { bu = g_iBUTab[pbSrcU[0]]; guv = g_iGUTab[pbSrcU[0]] + g_iGVTab[pbSrcV[0]]; rv = g_iRVTab[pbSrcV[0]]; yy = g_iYYTab[pbSrcY[0]]; b = _Clip(((yy + bu) >> SCALEBITS_OUT)); g = _Clip(((yy - guv) >> SCALEBITS_OUT)); r = _Clip(((yy + rv) >> SCALEBITS_OUT)); ((PWORD) pbDstX)[0] = _MakeRgb16(r, g, b); yy = g_iYYTab[pbSrcY[1]]; b = _Clip(((yy + bu) >> SCALEBITS_OUT)); g = _Clip(((yy - guv) >> SCALEBITS_OUT)); r = _Clip(((yy + rv) >> SCALEBITS_OUT)); ((PWORD) pbDstX)[1] = _MakeRgb16(r, g, b); yy = g_iYYTab[pbSrcY[iSrcYStride]]; b = _Clip(((yy + bu) >> SCALEBITS_OUT)); g = _Clip(((yy - guv) >> SCALEBITS_OUT)); r = _Clip(((yy + rv) >> SCALEBITS_OUT)); ((PWORD) (pbDstX + iDstXStride))[0] = _MakeRgb16(r, g, b); yy = g_iYYTab[pbSrcY[iSrcYStride + 1]]; b = _Clip(((yy + bu) >> SCALEBITS_OUT)); g = _Clip(((yy - guv) >> SCALEBITS_OUT)); r = _Clip(((yy + rv) >> SCALEBITS_OUT)); ((PWORD) (pbDstX + iDstXStride))[1] = _MakeRgb16(r, g, b); pbDstX += 4; pbSrcY += 2; pbSrcU++; pbSrcV++; } pbDstX += iDstXDif + iDstXStride; pbSrcY += iSrcYDif + iSrcYStride; pbSrcU += iSrcUvDif; pbSrcV += iSrcUvDif; } _mm_empty(); }
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Convert YV12 to RGB24. VOID Yv12ToRgb24_mmx(PBYTE pbDstX, INT iDstXStride, PBYTE pbSrcY, PBYTE pbSrcU, PBYTE pbSrcV, INT iSrcYStride, INT iSrcUvStride, UINT uWidth, INT iHeight) { UINT x; INT y; INT iDstXDif; INT iSrcYDif; INT iSrcUvDif; INT yy, bu, guv, rv; M64 y0, y1, u0, v0, mz; M64 r0, g0, b0, r1, g1, b1; M64 rgb0, rgb1, rgb2, rgb3; M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1; if (iHeight < 0) { iHeight = -iHeight; pbDstX += (iHeight - 1) * iDstXStride; iDstXStride = -iDstXStride; } iDstXDif = iDstXStride - (uWidth * 3); iSrcYDif = iSrcYStride - uWidth; iSrcUvDif = iSrcUvStride - (uWidth / 2); mz = _mm_setzero_si64(); for (y = iHeight / 2; y; y--) { for (x = uWidth / 8; x; x--) { // Calculate coefficient. u0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcU)); // [ | u3 u2 u1 u0] v0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcV)); // [ | v3 v2 v1 v0] u0 = _mm_unpacklo_pi8(u0, mz); // u3 u2 u1 u0 v0 = _mm_unpacklo_pi8(v0, mz); // v3 v2 v1 v0 u0 = _mm_subs_pi16(u0, g_mSub80); v0 = _mm_subs_pi16(v0, g_mSub80); gu0 = _mm_mullo_pi16(u0, g_mUGMul); gv0 = _mm_mullo_pi16(v0, g_mVGMul); bu0 = _mm_mullo_pi16(u0, g_mUBMul); rv0 = _mm_mullo_pi16(v0, g_mVRMul); guv0 = _mm_adds_pi16(gu0, gv0); guv1 = _mm_unpackhi_pi16(guv0, guv0); // guv3 guv3 guv2 guv2 guv0 = _mm_unpacklo_pi16(guv0, guv0); // guv1 guv1 guv0 guv0 bu1 = _mm_unpackhi_pi16(bu0, bu0); // bu3 bu3 bu2 bu2 bu0 = _mm_unpacklo_pi16(bu0, bu0); // bu1 bu1 bu0 bu0 rv1 = _mm_unpackhi_pi16(rv0, rv0); // rv3 rv3 rv2 rv2 rv0 = _mm_unpacklo_pi16(rv0, rv0); // rv1 rv1 rv0 rv0 // Process for row 0. y0 = *((PM64) pbSrcY); // [YYYY | YYYY]; row 0 y1 = _mm_unpackhi_pi8(y0, mz); // y7 y6 y5 y4 y0 = _mm_unpacklo_pi8(y0, mz); // y3 y2 y1 y0 y1 = _mm_subs_pi16(y1, g_mSub10); y0 = _mm_subs_pi16(y0, g_mSub10); y1 = _mm_mullo_pi16(y1, g_mYYMul); y0 = _mm_mullo_pi16(y0, g_mYYMul); g1 = _mm_subs_pi16(y1, guv1); // g7 g6 g5 g4 g0 = _mm_subs_pi16(y0, guv0); // g3 g2 g1 g0 g1 = _mm_srai_pi16(g1, SCALEBITS); g0 = _mm_srai_pi16(g0, SCALEBITS); g0 = _mm_packs_pu16(g0, g1); // g7 g6 ...g1 g0 b1 = _mm_adds_pi16(y1, bu1); b0 = _mm_adds_pi16(y0, bu0); b1 = _mm_srai_pi16(b1, SCALEBITS); b0 = _mm_srai_pi16(b0, SCALEBITS); b0 = _mm_packs_pu16(b0, b1); r1 = _mm_adds_pi16(y1, rv1); r0 = _mm_adds_pi16(y0, rv0); r1 = _mm_srai_pi16(r1, SCALEBITS); r0 = _mm_srai_pi16(r0, SCALEBITS); r0 = _mm_packs_pu16(r0, r1); r1 = _mm_unpackhi_pi8(b0, r0); // r7 b7 r6 b6 r5 b5 r4 b4 r0 = _mm_unpacklo_pi8(b0, r0); // r3 b3 r2 b2 r1 b1 r0 b0 g1 = _mm_unpackhi_pi8(g0, mz); // 0 g7 0 g6 0 g5 0 g4 g0 = _mm_unpacklo_pi8(g0, mz); // 0 g3 0 g2 0 g1 0 g0 rgb0 = _mm_unpacklo_pi8(r0, g0); // 0 r1 g1 b1 0 r0 g0 b0 rgb1 = _mm_unpackhi_pi8(r0, g0); // 0 r3 g3 b3 0 r2 g2 b2 rgb2 = _mm_unpacklo_pi8(r1, g1); // 0 r5 g5 b5 0 r4 g4 b4 rgb3 = _mm_unpackhi_pi8(r1, g1); // 0 r7 g7 b7 0 r6 g6 b6 // Write out row 0. *((PDWORD) (pbDstX + 0)) = _mm_cvtsi64_si32(rgb0); rgb0 = _mm_srli_si64(rgb0, 32); *((PDWORD) (pbDstX + 3)) = _mm_cvtsi64_si32(rgb0); *((PDWORD) (pbDstX + 6)) = _mm_cvtsi64_si32(rgb1); rgb1 = _mm_srli_si64(rgb1, 32); *((PDWORD) (pbDstX + 9)) = _mm_cvtsi64_si32(rgb1); *((PDWORD) (pbDstX + 12)) = _mm_cvtsi64_si32(rgb2); rgb2 = _mm_srli_si64(rgb2, 32); *((PDWORD) (pbDstX + 15)) = _mm_cvtsi64_si32(rgb2); *((PDWORD) (pbDstX + 18)) = _mm_cvtsi64_si32(rgb3); rgb3 = _mm_srli_si64(rgb3, 32); *((PDWORD) (pbDstX + 21)) = _mm_cvtsi64_si32(rgb3); // Process for row 1. y0 = *((PM64) (pbSrcY + iSrcYStride)); // [YYYY | YYYY]; row 1 y1 = _mm_unpackhi_pi8(y0, mz); // y7 y6 y5 y4 y0 = _mm_unpacklo_pi8(y0, mz); // y3 y2 y1 y0 y1 = _mm_subs_pi16(y1, g_mSub10); y0 = _mm_subs_pi16(y0, g_mSub10); y1 = _mm_mullo_pi16(y1, g_mYYMul); y0 = _mm_mullo_pi16(y0, g_mYYMul); g1 = _mm_subs_pi16(y1, guv1); // g7 g6 g5 g4 g0 = _mm_subs_pi16(y0, guv0); // g3 g2 g1 g0 g1 = _mm_srai_pi16(g1, SCALEBITS); g0 = _mm_srai_pi16(g0, SCALEBITS); g0 = _mm_packs_pu16(g0, g1); // g7 g6 ...g1 g0 b1 = _mm_adds_pi16(y1, bu1); b0 = _mm_adds_pi16(y0, bu0); b1 = _mm_srai_pi16(b1, SCALEBITS); b0 = _mm_srai_pi16(b0, SCALEBITS); b0 = _mm_packs_pu16(b0, b1); r1 = _mm_adds_pi16(y1, rv1); r0 = _mm_adds_pi16(y0, rv0); r1 = _mm_srai_pi16(r1, SCALEBITS); r0 = _mm_srai_pi16(r0, SCALEBITS); r0 = _mm_packs_pu16(r0, r1); r1 = _mm_unpackhi_pi8(b0, r0); // r7 b7 r6 b6 r5 b5 r4 b4 r0 = _mm_unpacklo_pi8(b0, r0); // r3 b3 r2 b2 r1 b1 r0 b0 g1 = _mm_unpackhi_pi8(g0, mz); // 0 g7 0 g6 0 g5 0 g4 g0 = _mm_unpacklo_pi8(g0, mz); // 0 g3 0 g2 0 g1 0 g0 rgb0 = _mm_unpacklo_pi8(r0, g0); // 0 r1 g1 b1 0 r0 g0 b0 rgb1 = _mm_unpackhi_pi8(r0, g0); // 0 r3 g3 b3 0 r2 g2 b2 rgb2 = _mm_unpacklo_pi8(r1, g1); // 0 r5 g5 b5 0 r4 g4 b4 rgb3 = _mm_unpackhi_pi8(r1, g1); // 0 r7 g7 b7 0 r6 g6 b6 // Write out row 1. *((PDWORD) (pbDstX + iDstXStride + 0)) = _mm_cvtsi64_si32(rgb0); rgb0 = _mm_srli_si64(rgb0, 32); *((PDWORD) (pbDstX + iDstXStride + 3)) = _mm_cvtsi64_si32(rgb0); *((PDWORD) (pbDstX + iDstXStride + 6)) = _mm_cvtsi64_si32(rgb1); rgb1 = _mm_srli_si64(rgb1, 32); *((PDWORD) (pbDstX + iDstXStride + 9)) = _mm_cvtsi64_si32(rgb1); *((PDWORD) (pbDstX + iDstXStride + 12)) = _mm_cvtsi64_si32(rgb2); rgb2 = _mm_srli_si64(rgb2, 32); *((PDWORD) (pbDstX + iDstXStride + 15)) = _mm_cvtsi64_si32(rgb2); *((PDWORD) (pbDstX + iDstXStride + 18)) = _mm_cvtsi64_si32(rgb3); rgb3 = _mm_srli_si64(rgb3, 32); *((PDWORD) (pbDstX + iDstXStride + 21)) = _mm_cvtsi64_si32(rgb3); pbDstX += 24; pbSrcY += 8; pbSrcU += 4; pbSrcV += 4; } for (x = (uWidth & 7) / 2; x; x--) { bu = g_iBUTab[pbSrcU[0]]; guv = g_iGUTab[pbSrcU[0]] + g_iGVTab[pbSrcV[0]]; rv = g_iRVTab[pbSrcV[0]]; yy = g_iYYTab[pbSrcY[0]]; pbDstX[0] = _Clip((yy + bu) >> SCALEBITS_OUT); pbDstX[1] = _Clip((yy - guv) >> SCALEBITS_OUT); pbDstX[2] = _Clip((yy + rv) >> SCALEBITS_OUT); yy = g_iYYTab[pbSrcY[1]]; pbDstX[3] = _Clip((yy + bu) >> SCALEBITS_OUT); pbDstX[4] = _Clip((yy - guv) >> SCALEBITS_OUT); pbDstX[5] = _Clip((yy + rv) >> SCALEBITS_OUT); yy = g_iYYTab[pbSrcY[iSrcYStride]]; pbDstX[iDstXStride + 0] = _Clip((yy + bu) >> SCALEBITS_OUT); pbDstX[iDstXStride + 1] = _Clip((yy - guv) >> SCALEBITS_OUT); pbDstX[iDstXStride + 2] = _Clip((yy + rv) >> SCALEBITS_OUT); yy = g_iYYTab[pbSrcY[iSrcYStride + 1]]; pbDstX[iDstXStride + 3] = _Clip((yy + bu) >> SCALEBITS_OUT); pbDstX[iDstXStride + 4] = _Clip((yy - guv) >> SCALEBITS_OUT); pbDstX[iDstXStride + 5] = _Clip((yy + rv) >> SCALEBITS_OUT); pbDstX += 6; pbSrcY += 2; pbSrcU++; pbSrcV++; } pbDstX += iDstXDif + iDstXStride; pbSrcY += iSrcYDif + iSrcYStride; pbSrcU += iSrcUvDif; pbSrcV += iSrcUvDif; } _mm_empty(); }
void rtv_lucent4cols_MMX(byte *source, argb_t *dest, int bga, int fga) { // SSE2 temporaries: const __m64 upper8mask = _mm_set_pi16(0, 0xff, 0xff, 0xff); const __m64 fgAlpha = _mm_set_pi16(0, fga, fga, fga); const __m64 bgAlpha = _mm_set_pi16(0, bga, bga, bga); #if 1 const __m64 bgColors01 = _mm_setr_pi32(dest[0], dest[1]); #else const __m64 bgColors01 = *((__m64 *)&dest[0]); #endif const __m64 fgColors01 = _mm_setr_pi32( rt_mapcolor<argb_t>(dcol.colormap, source[0]), rt_mapcolor<argb_t>(dcol.colormap, source[1]) ); const __m64 finalColors01 = _mm_packs_pu16( _mm_srli_pi16( _mm_adds_pi16( _mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(bgColors01, bgColors01), upper8mask), bgAlpha), _mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(fgColors01, fgColors01), upper8mask), fgAlpha) ), 8 ), _mm_srli_pi16( _mm_adds_pi16( _mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(bgColors01, bgColors01), upper8mask), bgAlpha), _mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(fgColors01, fgColors01), upper8mask), fgAlpha) ), 8 ) ); #if 1 const __m64 bgColors23 = _mm_setr_pi32(dest[2], dest[3]); #else // NOTE(jsd): No guarantee of 64-bit alignment; cannot use. const __m64 bgColors23 = *((__m64 *)&dest[2]); #endif const __m64 fgColors23 = _mm_setr_pi32( rt_mapcolor<argb_t>(dcol.colormap, source[2]), rt_mapcolor<argb_t>(dcol.colormap, source[3]) ); const __m64 finalColors23 = _mm_packs_pu16( _mm_srli_pi16( _mm_adds_pi16( _mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(bgColors23, bgColors23), upper8mask), bgAlpha), _mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(fgColors23, fgColors23), upper8mask), fgAlpha) ), 8 ), _mm_srli_pi16( _mm_adds_pi16( _mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(bgColors23, bgColors23), upper8mask), bgAlpha), _mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(fgColors23, fgColors23), upper8mask), fgAlpha) ), 8 ) ); #if 1 dest[0] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors01, 32*0)); dest[1] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors01, 32*1)); dest[2] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors23, 32*0)); dest[3] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors23, 32*1)); #else // NOTE(jsd): No guarantee of 64-bit alignment; cannot use. *((__m64 *)&dest[0]) = finalColors01; *((__m64 *)&dest[2]) = finalColors23; #endif // Required to reset FP: _mm_empty(); }
__m64 test38(__m64 a, __m64 b) { // CHECK: paddsw return _mm_adds_pi16(a, b); }
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Convert YUY2 to RGB24. VOID Yuy2ToRgb24_mmx(PBYTE pbDstX, INT iDstXStride, PBYTE pbSrcX, INT iSrcXStride, UINT uWidth, INT iHeight) { UINT x; INT y; INT iDstXDif; INT iSrcXDif; INT yy, bu, guv, rv; M64 y0, y1, u0, v0, uv_temp1, uv_temp2, mz; M64 r0, g0, b0, r1, g1, b1; M64 rgb0, rgb1, rgb2, rgb3; M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1; if (iHeight < 0) { iHeight = -iHeight; pbSrcX += (iHeight - 1) * iSrcXStride; iSrcXStride = -iSrcXStride; } iDstXDif = iDstXStride - (uWidth * 3); iSrcXDif = iSrcXStride - (uWidth * 2); mz = _mm_setzero_si64(); for (y = iHeight; y; y--) { for (x = uWidth / 8; x; x--) { y0 = ((PM64) pbSrcX)[0]; y1 = ((PM64) pbSrcX)[1]; u0 = y0; v0 = y1; y0 = _mm_and_si64(y0, g_mWord00FF); y1 = _mm_and_si64(y1, g_mWord00FF); u0 = _mm_srli_pi16(u0, 8); v0 = _mm_srli_pi16(v0, 8); uv_temp1 = _mm_srli_pi32(u0, 16); u0 = _mm_slli_pi32(u0, 16); u0 = _mm_srli_pi32(u0, 16); uv_temp2 = _mm_srli_pi32(v0, 16); v0 = _mm_slli_pi32(v0, 16); v0 = _mm_srli_pi32(v0, 16); u0 = _mm_packs_pi32(u0, v0); v0 = _mm_packs_pi32(uv_temp1, uv_temp2); // Calculate coefficient. u0 = _mm_subs_pi16(u0, g_mSub80); v0 = _mm_subs_pi16(v0, g_mSub80); gu0 = _mm_mullo_pi16(u0, g_mUGMul); gv0 = _mm_mullo_pi16(v0, g_mVGMul); bu0 = _mm_mullo_pi16(u0, g_mUBMul); rv0 = _mm_mullo_pi16(v0, g_mVRMul); guv0 = _mm_adds_pi16(gu0, gv0); guv1 = _mm_unpackhi_pi16(guv0, guv0); // guv3 guv3 guv2 guv2 guv0 = _mm_unpacklo_pi16(guv0, guv0); // guv1 guv1 guv0 guv0 bu1 = _mm_unpackhi_pi16(bu0, bu0); // bu3 bu3 bu2 bu2 bu0 = _mm_unpacklo_pi16(bu0, bu0); // bu1 bu1 bu0 bu0 rv1 = _mm_unpackhi_pi16(rv0, rv0); // rv3 rv3 rv2 rv2 rv0 = _mm_unpacklo_pi16(rv0, rv0); // rv1 rv1 rv0 rv0 // Process for row 0. y1 = _mm_subs_pi16(y1, g_mSub10); y0 = _mm_subs_pi16(y0, g_mSub10); y1 = _mm_mullo_pi16(y1, g_mYYMul); y0 = _mm_mullo_pi16(y0, g_mYYMul); g1 = _mm_subs_pi16(y1, guv1); // g7 g6 g5 g4 g0 = _mm_subs_pi16(y0, guv0); // g3 g2 g1 g0 g1 = _mm_srai_pi16(g1, SCALEBITS); g0 = _mm_srai_pi16(g0, SCALEBITS); g0 = _mm_packs_pu16(g0, g1); // g7 g6 ...g1 g0 b1 = _mm_adds_pi16(y1, bu1); b0 = _mm_adds_pi16(y0, bu0); b1 = _mm_srai_pi16(b1, SCALEBITS); b0 = _mm_srai_pi16(b0, SCALEBITS); b0 = _mm_packs_pu16(b0, b1); r1 = _mm_adds_pi16(y1, rv1); r0 = _mm_adds_pi16(y0, rv0); r1 = _mm_srai_pi16(r1, SCALEBITS); r0 = _mm_srai_pi16(r0, SCALEBITS); r0 = _mm_packs_pu16(r0, r1); r1 = _mm_unpackhi_pi8(b0, r0); // r7 b7 r6 b6 r5 b5 r4 b4 r0 = _mm_unpacklo_pi8(b0, r0); // r3 b3 r2 b2 r1 b1 r0 b0 g1 = _mm_unpackhi_pi8(g0, mz); // 0 g7 0 g6 0 g5 0 g4 g0 = _mm_unpacklo_pi8(g0, mz); // 0 g3 0 g2 0 g1 0 g0 rgb0 = _mm_unpacklo_pi8(r0, g0); // 0 r1 g1 b1 0 r0 g0 b0 rgb1 = _mm_unpackhi_pi8(r0, g0); // 0 r3 g3 b3 0 r2 g2 b2 rgb2 = _mm_unpacklo_pi8(r1, g1); // 0 r5 g5 b5 0 r4 g4 b4 rgb3 = _mm_unpackhi_pi8(r1, g1); // 0 r7 g7 b7 0 r6 g6 b6 // Write out row 0. *((PDWORD) (pbDstX + 0)) = _mm_cvtsi64_si32(rgb0); rgb0 = _mm_srli_si64(rgb0, 32); *((PDWORD) (pbDstX + 3)) = _mm_cvtsi64_si32(rgb0); *((PDWORD) (pbDstX + 6)) = _mm_cvtsi64_si32(rgb1); rgb1 = _mm_srli_si64(rgb1, 32); *((PDWORD) (pbDstX + 9)) = _mm_cvtsi64_si32(rgb1); *((PDWORD) (pbDstX + 12)) = _mm_cvtsi64_si32(rgb2); rgb2 = _mm_srli_si64(rgb2, 32); *((PDWORD) (pbDstX + 15)) = _mm_cvtsi64_si32(rgb2); *((PDWORD) (pbDstX + 18)) = _mm_cvtsi64_si32(rgb3); rgb3 = _mm_srli_si64(rgb3, 32); *((PDWORD) (pbDstX + 21)) = _mm_cvtsi64_si32(rgb3); pbDstX += 24; pbSrcX += 16; } for (x = (uWidth & 7) / 2; x; x--) { bu = g_iBUTab[pbSrcX[1]]; guv = g_iGUTab[pbSrcX[1]] + g_iGVTab[pbSrcX[3]]; rv = g_iRVTab[pbSrcX[3]]; yy = g_iYYTab[pbSrcX[0]]; pbDstX[0] = _Clip((yy + bu) >> SCALEBITS_OUT); pbDstX[1] = _Clip((yy - guv) >> SCALEBITS_OUT); pbDstX[2] = _Clip((yy + rv) >> SCALEBITS_OUT); yy = g_iYYTab[pbSrcX[2]]; pbDstX[3] = _Clip((yy + bu) >> SCALEBITS_OUT); pbDstX[4] = _Clip((yy - guv) >> SCALEBITS_OUT); pbDstX[5] = _Clip((yy + rv) >> SCALEBITS_OUT); pbDstX += 6; pbSrcX += 4; } pbDstX += iDstXDif; pbSrcX += iSrcXDif; } _mm_empty(); }
void AudioMixer::addBufferToMixForListeningNodeWithBuffer(PositionalAudioRingBuffer* bufferToAdd, AvatarAudioRingBuffer* listeningNodeBuffer) { float bearingRelativeAngleToSource = 0.0f; float attenuationCoefficient = 1.0f; int numSamplesDelay = 0; float weakChannelAmplitudeRatio = 1.0f; if (bufferToAdd != listeningNodeBuffer) { // if the two buffer pointers do not match then these are different buffers glm::vec3 relativePosition = bufferToAdd->getPosition() - listeningNodeBuffer->getPosition(); glm::quat inverseOrientation = glm::inverse(listeningNodeBuffer->getOrientation()); float distanceSquareToSource = glm::dot(relativePosition, relativePosition); float radius = 0.0f; if (bufferToAdd->getType() == PositionalAudioRingBuffer::Injector) { InjectedAudioRingBuffer* injectedBuffer = (InjectedAudioRingBuffer*) bufferToAdd; radius = injectedBuffer->getRadius(); attenuationCoefficient *= injectedBuffer->getAttenuationRatio(); } if (radius == 0 || (distanceSquareToSource > radius * radius)) { // this is either not a spherical source, or the listener is outside the sphere if (radius > 0) { // this is a spherical source - the distance used for the coefficient // needs to be the closest point on the boundary to the source // ovveride the distance to the node with the distance to the point on the // boundary of the sphere distanceSquareToSource -= (radius * radius); } else { // calculate the angle delivery for off-axis attenuation glm::vec3 rotatedListenerPosition = glm::inverse(bufferToAdd->getOrientation()) * relativePosition; float angleOfDelivery = glm::angle(glm::vec3(0.0f, 0.0f, -1.0f), glm::normalize(rotatedListenerPosition)); const float MAX_OFF_AXIS_ATTENUATION = 0.2f; const float OFF_AXIS_ATTENUATION_FORMULA_STEP = (1 - MAX_OFF_AXIS_ATTENUATION) / 2.0f; float offAxisCoefficient = MAX_OFF_AXIS_ATTENUATION + (OFF_AXIS_ATTENUATION_FORMULA_STEP * (angleOfDelivery / PI_OVER_TWO)); // multiply the current attenuation coefficient by the calculated off axis coefficient attenuationCoefficient *= offAxisCoefficient; } glm::vec3 rotatedSourcePosition = inverseOrientation * relativePosition; const float DISTANCE_SCALE = 2.5f; const float GEOMETRIC_AMPLITUDE_SCALAR = 0.3f; const float DISTANCE_LOG_BASE = 2.5f; const float DISTANCE_SCALE_LOG = logf(DISTANCE_SCALE) / logf(DISTANCE_LOG_BASE); // calculate the distance coefficient using the distance to this node float distanceCoefficient = powf(GEOMETRIC_AMPLITUDE_SCALAR, DISTANCE_SCALE_LOG + (0.5f * logf(distanceSquareToSource) / logf(DISTANCE_LOG_BASE)) - 1); distanceCoefficient = std::min(1.0f, distanceCoefficient); // multiply the current attenuation coefficient by the distance coefficient attenuationCoefficient *= distanceCoefficient; // project the rotated source position vector onto the XZ plane rotatedSourcePosition.y = 0.0f; // produce an oriented angle about the y-axis bearingRelativeAngleToSource = glm::orientedAngle(glm::vec3(0.0f, 0.0f, -1.0f), glm::normalize(rotatedSourcePosition), glm::vec3(0.0f, 1.0f, 0.0f)); const float PHASE_AMPLITUDE_RATIO_AT_90 = 0.5; // figure out the number of samples of delay and the ratio of the amplitude // in the weak channel for audio spatialization float sinRatio = fabsf(sinf(bearingRelativeAngleToSource)); numSamplesDelay = SAMPLE_PHASE_DELAY_AT_90 * sinRatio; weakChannelAmplitudeRatio = 1 - (PHASE_AMPLITUDE_RATIO_AT_90 * sinRatio); } } // if the bearing relative angle to source is > 0 then the delayed channel is the right one int delayedChannelOffset = (bearingRelativeAngleToSource > 0.0f) ? 1 : 0; int goodChannelOffset = delayedChannelOffset == 0 ? 1 : 0; const int16_t* nextOutputStart = bufferToAdd->getNextOutput(); const int16_t* bufferStart = bufferToAdd->getBuffer(); int ringBufferSampleCapacity = bufferToAdd->getSampleCapacity(); int16_t correctBufferSample[2], delayBufferSample[2]; int delayedChannelIndex = 0; const int SINGLE_STEREO_OFFSET = 2; for (int s = 0; s < NETWORK_BUFFER_LENGTH_SAMPLES_STEREO; s += 4) { // setup the int16_t variables for the two sample sets correctBufferSample[0] = nextOutputStart[s / 2] * attenuationCoefficient; correctBufferSample[1] = nextOutputStart[(s / 2) + 1] * attenuationCoefficient; delayedChannelIndex = s + (numSamplesDelay * 2) + delayedChannelOffset; delayBufferSample[0] = correctBufferSample[0] * weakChannelAmplitudeRatio; delayBufferSample[1] = correctBufferSample[1] * weakChannelAmplitudeRatio; __m64 bufferSamples = _mm_set_pi16(_clientSamples[s + goodChannelOffset], _clientSamples[s + goodChannelOffset + SINGLE_STEREO_OFFSET], _clientSamples[delayedChannelIndex], _clientSamples[delayedChannelIndex + SINGLE_STEREO_OFFSET]); __m64 addedSamples = _mm_set_pi16(correctBufferSample[0], correctBufferSample[1], delayBufferSample[0], delayBufferSample[1]); // perform the MMX add (with saturation) of two correct and delayed samples __m64 mmxResult = _mm_adds_pi16(bufferSamples, addedSamples); int16_t* shortResults = reinterpret_cast<int16_t*>(&mmxResult); // assign the results from the result of the mmx arithmetic _clientSamples[s + goodChannelOffset] = shortResults[3]; _clientSamples[s + goodChannelOffset + SINGLE_STEREO_OFFSET] = shortResults[2]; _clientSamples[delayedChannelIndex] = shortResults[1]; _clientSamples[delayedChannelIndex + SINGLE_STEREO_OFFSET] = shortResults[0]; } // The following code is pretty gross and redundant, but AFAIK it's the best way to avoid // too many conditionals in handling the delay samples at the beginning of _clientSamples. // Basically we try to take the samples in batches of four, and then handle the remainder // conditionally to get rid of the rest. const int DOUBLE_STEREO_OFFSET = 4; const int TRIPLE_STEREO_OFFSET = 6; if (numSamplesDelay > 0) { // if there was a sample delay for this buffer, we need to pull samples prior to the nextOutput // to stick at the beginning float attenuationAndWeakChannelRatio = attenuationCoefficient * weakChannelAmplitudeRatio; const int16_t* delayNextOutputStart = nextOutputStart - numSamplesDelay; if (delayNextOutputStart < bufferStart) { delayNextOutputStart = bufferStart + ringBufferSampleCapacity - numSamplesDelay; } int i = 0; while (i + 3 < numSamplesDelay) { // handle the first cases where we can MMX add four samples at once int parentIndex = i * 2; __m64 bufferSamples = _mm_set_pi16(_clientSamples[parentIndex + delayedChannelOffset], _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset], _clientSamples[parentIndex + DOUBLE_STEREO_OFFSET + delayedChannelOffset], _clientSamples[parentIndex + TRIPLE_STEREO_OFFSET + delayedChannelOffset]); __m64 addSamples = _mm_set_pi16(delayNextOutputStart[i] * attenuationAndWeakChannelRatio, delayNextOutputStart[i + 1] * attenuationAndWeakChannelRatio, delayNextOutputStart[i + 2] * attenuationAndWeakChannelRatio, delayNextOutputStart[i + 3] * attenuationAndWeakChannelRatio); __m64 mmxResult = _mm_adds_pi16(bufferSamples, addSamples); int16_t* shortResults = reinterpret_cast<int16_t*>(&mmxResult); _clientSamples[parentIndex + delayedChannelOffset] = shortResults[3]; _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[2]; _clientSamples[parentIndex + DOUBLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[1]; _clientSamples[parentIndex + TRIPLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[0]; // push the index i += 4; } int parentIndex = i * 2; if (i + 2 < numSamplesDelay) { // MMX add only three delayed samples __m64 bufferSamples = _mm_set_pi16(_clientSamples[parentIndex + delayedChannelOffset], _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset], _clientSamples[parentIndex + DOUBLE_STEREO_OFFSET + delayedChannelOffset], 0); __m64 addSamples = _mm_set_pi16(delayNextOutputStart[i] * attenuationAndWeakChannelRatio, delayNextOutputStart[i + 1] * attenuationAndWeakChannelRatio, delayNextOutputStart[i + 2] * attenuationAndWeakChannelRatio, 0); __m64 mmxResult = _mm_adds_pi16(bufferSamples, addSamples); int16_t* shortResults = reinterpret_cast<int16_t*>(&mmxResult); _clientSamples[parentIndex + delayedChannelOffset] = shortResults[3]; _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[2]; _clientSamples[parentIndex + DOUBLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[1]; } else if (i + 1 < numSamplesDelay) { // MMX add two delayed samples __m64 bufferSamples = _mm_set_pi16(_clientSamples[parentIndex + delayedChannelOffset], _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset], 0, 0); __m64 addSamples = _mm_set_pi16(delayNextOutputStart[i] * attenuationAndWeakChannelRatio, delayNextOutputStart[i + 1] * attenuationAndWeakChannelRatio, 0, 0); __m64 mmxResult = _mm_adds_pi16(bufferSamples, addSamples); int16_t* shortResults = reinterpret_cast<int16_t*>(&mmxResult); _clientSamples[parentIndex + delayedChannelOffset] = shortResults[3]; _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[2]; } else if (i < numSamplesDelay) { // MMX add a single delayed sample __m64 bufferSamples = _mm_set_pi16(_clientSamples[parentIndex + delayedChannelOffset], 0, 0, 0); __m64 addSamples = _mm_set_pi16(delayNextOutputStart[i] * attenuationAndWeakChannelRatio, 0, 0, 0); __m64 mmxResult = _mm_adds_pi16(bufferSamples, addSamples); int16_t* shortResults = reinterpret_cast<int16_t*>(&mmxResult); _clientSamples[parentIndex + delayedChannelOffset] = shortResults[3]; } } }
void reverb::comb_allpass4(signed short *sp, signed short *dp, const comb_param &comb_delay, const int comb_gain, const int allpass_delay, const int allpass_gain, const int *rvol, const unsigned int sz) { #ifdef use_intrinsics __m64 cg=_mm_set1_pi16(comb_gain), ag=_mm_set1_pi16(allpass_gain), rv[2]; rv[0]=_mm_set1_pi16(rvol[0]); rv[1]=_mm_set1_pi16(rvol[1]); for (unsigned int i=0; i<(sz>>4); i++, sp+=2<<2, dp+=2<<2) { __m64 dv[2]; for (int c=0; c<2; c++) { // Comb __m64 v=_mm_setzero_si64(); for (int f=0; f<4; f++) { int yck=(yp-comb_delay[c][f])&(max_delay-1); __m64 xv=*(__m64 *)(&x[c][yck]), yv=*(__m64 *)(&y[c][f][yck]); yv=_mm_mulhi_pi16(yv,cg); yv=_mm_adds_pi16(yv,yv); yv=_mm_adds_pi16(xv,yv); *((__m64 *)&y[c][f][yp])=yv; yv=_mm_srai_pi16(yv,2); v=_mm_adds_pi16(v,yv); } // Allpass if (allpass_delay) { *((__m64 *)&ax[c][yp])=v; int ypa=(yp-allpass_delay)&(max_delay-1); __m64 ayv=*(__m64 *)&ay[c][ypa], xv=*(__m64 *)&x[c][yp], axv=*(__m64 *)&ax[c][ypa]; ayv=_mm_subs_pi16(ayv,xv); ayv=_mm_mulhi_pi16(ayv,ag); ayv=_mm_adds_pi16(ayv,ayv); v=_mm_adds_pi16(ayv,axv); *((__m64 *)&ay[c][yp])=v; } // Output dv[c]=_mm_mulhi_pi16(v,rv[c]); dv[c]=_mm_adds_pi16(dv[c],dv[c]); } __m64 dv1=_mm_unpacklo_pi16(dv[0],dv[1]), dv2=_mm_unpackhi_pi16(dv[0],dv[1]), d1=*(__m64 *)&dp[0], d2=*(__m64 *)&dp[4], s1=*(__m64 *)&sp[0], s2=*(__m64 *)&sp[4]; d1=_mm_adds_pi16(d1,s1); d2=_mm_adds_pi16(d2,s2); d1=_mm_adds_pi16(d1,dv1); d2=_mm_adds_pi16(d2,dv2); *(__m64 *)&dp[0]=d1; *(__m64 *)&dp[4]=d2; yp=(yp+4)&(max_delay-1); } _mm_empty(); #endif }
__m64 test_mm_adds_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_adds_pi16 // CHECK: call x86_mmx @llvm.x86.mmx.padds.w return _mm_adds_pi16(a, b); }