/**
 * Complex dot product sum(conj(x[k]) * y[k]) over interleaved 16-bit samples.
 *
 * x and y each hold N complex values stored as (re, im) int16_t pairs.  Each
 * per-sample product is arithmetically shifted right by output_shift and then
 * accumulated in 32 bits (wrapping):
 *
 *   Re = sum((Re(x)*Re(y) + Im(x)*Im(y)) >> output_shift)
 *   Im = sum((Re(x)*Im(y) - Im(x)*Re(y)) >> output_shift)
 *
 * @param x            first operand; must be 16-byte aligned on x86
 * @param y            second operand; must be 16-byte aligned on x86
 * @param N            number of complex samples, must be a multiple of 8
 *                     (samples are consumed four at a time)
 * @param output_shift right shift applied to every product before summation
 * @return x86 and generic builds: Re saturated to int16 in bits 0..15 and Im
 *         saturated to int16 in bits 16..31.
 *         NOTE(review): the ARM branch returns only the 32-bit real sum and
 *         drops the imaginary accumulator; that pre-existing difference is
 *         left untouched here -- confirm what ARM callers expect.
 */
int32_t dot_product(int16_t *x,
                    int16_t *y,
                    uint32_t N, /* must be a multiple of 8 */
                    uint8_t output_shift)
{
  uint32_t n;

#if defined(__x86_64__) || defined(__i386__)
  /* SSE2 only: no MMX state to clear and no SSSE3 requirement. */
  const __m128i *x128 = (const __m128i *)x;
  const __m128i *y128 = (const __m128i *)y;
  /* +1 in the even (real) lanes, -1 in the odd (imaginary) lanes */
  const __m128i minus_i = _mm_set_epi16(-1, 1, -1, 1, -1, 1, -1, 1);
  __m128i mmtmp1, mmtmp2, mmtmp3, mmcumul;
  __m128i mmcumul_re = _mm_setzero_si128();
  __m128i mmcumul_im = _mm_setzero_si128();

  for (n = 0; n < (N >> 2); n++) {
    const __m128i xv = _mm_load_si128(&x128[n]);
    const __m128i yv = _mm_load_si128(&y128[n]);

    /* Re(z) = Re(x)*Re(y) + Im(x)*Im(y): real part of 4 consecutive
       outputs (32-bit), shifted and accumulated */
    mmtmp1 = _mm_madd_epi16(xv, yv);
    mmtmp1 = _mm_srai_epi32(mmtmp1, output_shift);
    mmcumul_re = _mm_add_epi32(mmcumul_re, mmtmp1);

    /* Im(z) = Re(x)*Im(y) - Re(y)*Im(x): swap re/im of y, then negate the
       odd lanes.  A 16-bit multiply by +/-1 wraps exactly like psignw. */
    mmtmp2 = _mm_shufflelo_epi16(yv, _MM_SHUFFLE(2, 3, 0, 1));
    mmtmp2 = _mm_shufflehi_epi16(mmtmp2, _MM_SHUFFLE(2, 3, 0, 1));
    mmtmp2 = _mm_mullo_epi16(mmtmp2, minus_i);
    mmtmp3 = _mm_madd_epi16(xv, mmtmp2);
    mmtmp3 = _mm_srai_epi32(mmtmp3, output_shift);
    mmcumul_im = _mm_add_epi32(mmcumul_im, mmtmp3);
  }

  /* Horizontal sums: fold the 64-bit halves, then the adjacent 32-bit lanes.
     Afterwards every lane of each accumulator holds the full total. */
  mmcumul_re = _mm_add_epi32(mmcumul_re,
                             _mm_shuffle_epi32(mmcumul_re, _MM_SHUFFLE(1, 0, 3, 2)));
  mmcumul_re = _mm_add_epi32(mmcumul_re,
                             _mm_shuffle_epi32(mmcumul_re, _MM_SHUFFLE(2, 3, 0, 1)));
  mmcumul_im = _mm_add_epi32(mmcumul_im,
                             _mm_shuffle_epi32(mmcumul_im, _MM_SHUFFLE(1, 0, 3, 2)));
  mmcumul_im = _mm_add_epi32(mmcumul_im,
                             _mm_shuffle_epi32(mmcumul_im, _MM_SHUFFLE(2, 3, 0, 1)));

  /* Interleave to Re Im Re Im, saturate to 16 bits, return the low 32 bits. */
  mmcumul = _mm_unpacklo_epi32(mmcumul_re, mmcumul_im);
  mmcumul = _mm_packs_epi32(mmcumul, mmcumul);
  return _mm_cvtsi128_si32(mmcumul);

#elif defined(__arm__)
  int16x4_t *x_128 = (int16x4_t *)x;
  int16x4_t *y_128 = (int16x4_t *)y;
  int32x4_t tmp_re, tmp_im;
  int32x4_t tmp_re1, tmp_im1;
  int32x4_t re_cumul, im_cumul;
  int32x2_t re_cumul2, im_cumul2;
  int32x4_t shift = vdupq_n_s32(-output_shift);
  int32x2x2_t result2;
  int16_t conjug[4] __attribute__((aligned(16))) = {-1, 1, -1, 1};
  int16x4_t conj_v = *(int16x4_t *)conjug;

  re_cumul = vdupq_n_s32(0);
  im_cumul = vdupq_n_s32(0);

  for (n = 0; n < (N >> 2); n++) {
    /* Load the four complex samples of this iteration exactly once.  The
       previous code post-incremented the pointers inside all four multiplies,
       so the imaginary part used different samples than the real part and
       the loop read twice as far as N allows. */
    const int16x4_t x_lo = x_128[0];
    const int16x4_t x_hi = x_128[1];
    const int16x4_t y_lo = y_128[0];
    const int16x4_t y_hi = y_128[1];

    /* [Re(x0)Re(y0) Im(x0)Im(y0) Re(x1)Re(y1) Im(x1)Im(y1)], same for 2..3 */
    tmp_re = vmull_s16(x_lo, y_lo);
    tmp_re1 = vmull_s16(x_hi, y_hi);
    /* pairwise add -> one real value per sample */
    tmp_re = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re), vget_high_s32(tmp_re)),
                          vpadd_s32(vget_low_s32(tmp_re1), vget_high_s32(tmp_re1)));

    /* cross terms: negate Re(x), swap re/im of x, multiply by y */
    tmp_im = vmull_s16(vrev32_s16(vmul_s16(x_lo, conj_v)), y_lo);
    tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(x_hi, conj_v)), y_hi);
    /* pairwise add -> one imaginary value per sample */
    tmp_im = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im), vget_high_s32(tmp_im)),
                          vpadd_s32(vget_low_s32(tmp_im1), vget_high_s32(tmp_im1)));

    /* negative shift count == right shift; saturating accumulate */
    re_cumul = vqaddq_s32(re_cumul, vqshlq_s32(tmp_re, shift));
    im_cumul = vqaddq_s32(im_cumul, vqshlq_s32(tmp_im, shift));

    x_128 += 2;
    y_128 += 2;
  }

  re_cumul2 = vpadd_s32(vget_low_s32(re_cumul), vget_high_s32(re_cumul));
  im_cumul2 = vpadd_s32(vget_low_s32(im_cumul), vget_high_s32(im_cumul));
  re_cumul2 = vpadd_s32(re_cumul2, re_cumul2);
  im_cumul2 = vpadd_s32(im_cumul2, im_cumul2);
  result2 = vzip_s32(re_cumul2, im_cumul2);
  return (vget_lane_s32(result2.val[0], 0));

#else
  /* Portable scalar path mirroring the SSE2 branch bit for bit, so that other
     targets no longer fall off the end of a non-void function. */
  const unsigned int shift = (output_shift < 31) ? output_shift : 31;
  uint32_t re_acc = 0;
  uint32_t im_acc = 0;
  int32_t re_sat;
  int32_t im_sat;

  for (n = 0; n < (N & ~(uint32_t)3); n++, x += 2, y += 2) {
    const int32_t xr = x[0];
    const int32_t xi = x[1];
    const int32_t yr = y[0];
    const int32_t yi = y[1];
    /* the SIMD path negates Re(y) in 16 bits, so -32768 stays -32768 */
    const int32_t neg_yr = (yr == INT16_MIN) ? yr : -yr;
    /* add in unsigned to reproduce pmaddwd's wraparound without signed UB */
    const int32_t re = (int32_t)((uint32_t)(xr * yr) + (uint32_t)(xi * yi));
    const int32_t im = (int32_t)((uint32_t)(xr * yi) + (uint32_t)(xi * neg_yr));

    /* >> on a negative value is assumed to be an arithmetic shift */
    re_acc += (uint32_t)(re >> shift);
    im_acc += (uint32_t)(im >> shift);
  }

  re_sat = (int32_t)re_acc;
  im_sat = (int32_t)im_acc;
  if (re_sat > INT16_MAX) re_sat = INT16_MAX;
  else if (re_sat < INT16_MIN) re_sat = INT16_MIN;
  if (im_sat > INT16_MAX) im_sat = INT16_MAX;
  else if (im_sat < INT16_MIN) im_sat = INT16_MIN;

  return (int32_t)(((uint32_t)im_sat << 16) | ((uint32_t)re_sat & 0xFFFFu));
#endif
}
// Blends plane p2 into plane p1 in place, one byte per sample:
//
//   p1[x] = (p1[x] * invweight + p2[x] * weight + 16384) >> 15
//
// p1, p1_pitch     destination/first source plane and its row stride in bytes
// p2, p2_pitch     second source plane and its row stride in bytes
// width, height    row length in bytes and number of rows
// weight, invweight multipliers for p2 and p1; both are narrowed to 16-bit
//                  lanes by _mm_set_pi16 (presumably 15-bit fixed point with
//                  weight + invweight == 32768 -- confirm against callers)
//
// Rows are processed 8 bytes at a time with MMX; the remaining width % 8
// bytes go through the scalar loop. The MMX state is cleared before returning.
void weighted_merge_planar_mmx(BYTE *p1, const BYTE *p2, int p1_pitch, int p2_pitch, int width, int height, int weight, int invweight) {
  __m64 round_mask = _mm_set1_pi32(0x4000);
  __m64 zero = _mm_setzero_si64();
  // 16-bit lanes, high to low: weight, invweight, weight, invweight
  __m64 mask = _mm_set_pi16(weight, invweight, weight, invweight);

  int wMod8 = (width/8) * 8;

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod8; x += 8) {
      __m64 px1 = *(reinterpret_cast<const __m64*>(p1+x)); // y7y6 y5y4 y3y2 y1y0
      __m64 px2 = *(reinterpret_cast<const __m64*>(p2+x)); // Y7Y6 Y5Y4 Y3Y2 Y1Y0

      // interleave the two sources so each 32-bit lane holds one (p1, p2) pair
      __m64 p0123 = _mm_unpacklo_pi8(px1, px2); // Y3y3 Y2y2 Y1y1 Y0y0
      __m64 p4567 = _mm_unpackhi_pi8(px1, px2); // Y7y7 Y6y6 Y5y5 Y4y4

      // widen bytes to 16-bit words
      __m64 p01 = _mm_unpacklo_pi8(p0123, zero); // 00Y1 00y1 00Y0 00y0
      __m64 p23 = _mm_unpackhi_pi8(p0123, zero); // 00Y3 00y3 00Y2 00y2
      __m64 p45 = _mm_unpacklo_pi8(p4567, zero); // 00Y5 00y5 00Y4 00y4
      __m64 p67 = _mm_unpackhi_pi8(p4567, zero); // 00Y7 00y7 00Y6 00y6

      // per pixel: p1 * invweight + p2 * weight as a 32-bit sum
      p01 = _mm_madd_pi16(p01, mask);
      p23 = _mm_madd_pi16(p23, mask);
      p45 = _mm_madd_pi16(p45, mask);
      p67 = _mm_madd_pi16(p67, mask);

      // add the rounding constant
      p01 = _mm_add_pi32(p01, round_mask);
      p23 = _mm_add_pi32(p23, round_mask);
      p45 = _mm_add_pi32(p45, round_mask);
      p67 = _mm_add_pi32(p67, round_mask);

      // drop the 15 fractional bits
      p01 = _mm_srli_pi32(p01, 15);
      p23 = _mm_srli_pi32(p23, 15);
      p45 = _mm_srli_pi32(p45, 15);
      p67 = _mm_srli_pi32(p67, 15);

      // narrow 32 -> 16 -> 8 bits and store back over p1
      p0123 = _mm_packs_pi32(p01, p23);
      p4567 = _mm_packs_pi32(p45, p67);

      __m64 result = _mm_packs_pu16(p0123, p4567);

      *reinterpret_cast<__m64*>(p1+x) = result;
    }

    // scalar tail for the last width % 8 bytes of the row
    for (int x = wMod8; x < width; x++) {
      p1[x] = (p1[x]*invweight + p2[x]*weight + 16384) >> 15;
    }

    p1 += p1_pitch;
    p2 += p2_pitch;
  }
  _mm_empty();
}
static void weighted_merge_luma_yuy2_mmx(BYTE *src, const BYTE *luma, int pitch, int luma_pitch,int width, int height, int weight, int invweight) { __m64 round_mask = _mm_set1_pi32(0x4000); __m64 mask = _mm_set_pi16(weight, invweight, weight, invweight); __m64 luma_mask = _mm_set1_pi16(0x00FF); #pragma warning(push) #pragma warning(disable: 4309) __m64 chroma_mask = _mm_set1_pi16(0xFF00); #pragma warning(pop) int wMod8 = (width/8) * 8; for (int y = 0; y < height; y++) { for (int x = 0; x < wMod8; x += 8) { __m64 px1 = *reinterpret_cast<const __m64*>(src+x); //V1 Y3 U1 Y2 V0 Y1 U0 Y0 __m64 px2 = *reinterpret_cast<const __m64*>(luma+x); //v1 y3 u1 y2 v0 y1 u0 y0 __m64 src_lo = _mm_unpacklo_pi16(px1, px2); //v0 y1 V0 Y1 u0 y0 U0 Y0 __m64 src_hi = _mm_unpackhi_pi16(px1, px2); src_lo = _mm_and_si64(src_lo, luma_mask); //00 v0 00 V0 00 u0 00 U0 src_hi = _mm_and_si64(src_hi, luma_mask); src_lo = _mm_madd_pi16(src_lo, mask); src_hi = _mm_madd_pi16(src_hi, mask); src_lo = _mm_add_pi32(src_lo, round_mask); src_hi = _mm_add_pi32(src_hi, round_mask); src_lo = _mm_srli_pi32(src_lo, 15); src_hi = _mm_srli_pi32(src_hi, 15); __m64 result_luma = _mm_packs_pi32(src_lo, src_hi); __m64 result_chroma = _mm_and_si64(px1, chroma_mask); __m64 result = _mm_or_si64(result_chroma, result_luma); *reinterpret_cast<__m64*>(src+x) = result; } for (int x = wMod8; x < width; x+=2) { src[x] = (luma[x] * weight + src[x] * invweight + 16384) >> 15; } src += pitch; luma += luma_pitch; } _mm_empty(); }
// Codegen test wrapper: packs the two 32-bit lanes of a and the two 32-bit
// lanes of b into four 16-bit lanes with signed saturation. The comment
// inside the body is a FileCheck directive and must stay verbatim.
__m64 test76(__m64 a, __m64 b) {
  // CHECK: packssdw
  return _mm_packs_pi32(a, b);
}
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Convert YUY2 to RGB24. VOID Yuy2ToRgb24_mmx(PBYTE pbDstX, INT iDstXStride, PBYTE pbSrcX, INT iSrcXStride, UINT uWidth, INT iHeight) { UINT x; INT y; INT iDstXDif; INT iSrcXDif; INT yy, bu, guv, rv; M64 y0, y1, u0, v0, uv_temp1, uv_temp2, mz; M64 r0, g0, b0, r1, g1, b1; M64 rgb0, rgb1, rgb2, rgb3; M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1; if (iHeight < 0) { iHeight = -iHeight; pbSrcX += (iHeight - 1) * iSrcXStride; iSrcXStride = -iSrcXStride; } iDstXDif = iDstXStride - (uWidth * 3); iSrcXDif = iSrcXStride - (uWidth * 2); mz = _mm_setzero_si64(); for (y = iHeight; y; y--) { for (x = uWidth / 8; x; x--) { y0 = ((PM64) pbSrcX)[0]; y1 = ((PM64) pbSrcX)[1]; u0 = y0; v0 = y1; y0 = _mm_and_si64(y0, g_mWord00FF); y1 = _mm_and_si64(y1, g_mWord00FF); u0 = _mm_srli_pi16(u0, 8); v0 = _mm_srli_pi16(v0, 8); uv_temp1 = _mm_srli_pi32(u0, 16); u0 = _mm_slli_pi32(u0, 16); u0 = _mm_srli_pi32(u0, 16); uv_temp2 = _mm_srli_pi32(v0, 16); v0 = _mm_slli_pi32(v0, 16); v0 = _mm_srli_pi32(v0, 16); u0 = _mm_packs_pi32(u0, v0); v0 = _mm_packs_pi32(uv_temp1, uv_temp2); // Calculate coefficient. u0 = _mm_subs_pi16(u0, g_mSub80); v0 = _mm_subs_pi16(v0, g_mSub80); gu0 = _mm_mullo_pi16(u0, g_mUGMul); gv0 = _mm_mullo_pi16(v0, g_mVGMul); bu0 = _mm_mullo_pi16(u0, g_mUBMul); rv0 = _mm_mullo_pi16(v0, g_mVRMul); guv0 = _mm_adds_pi16(gu0, gv0); guv1 = _mm_unpackhi_pi16(guv0, guv0); // guv3 guv3 guv2 guv2 guv0 = _mm_unpacklo_pi16(guv0, guv0); // guv1 guv1 guv0 guv0 bu1 = _mm_unpackhi_pi16(bu0, bu0); // bu3 bu3 bu2 bu2 bu0 = _mm_unpacklo_pi16(bu0, bu0); // bu1 bu1 bu0 bu0 rv1 = _mm_unpackhi_pi16(rv0, rv0); // rv3 rv3 rv2 rv2 rv0 = _mm_unpacklo_pi16(rv0, rv0); // rv1 rv1 rv0 rv0 // Process for row 0. 
y1 = _mm_subs_pi16(y1, g_mSub10); y0 = _mm_subs_pi16(y0, g_mSub10); y1 = _mm_mullo_pi16(y1, g_mYYMul); y0 = _mm_mullo_pi16(y0, g_mYYMul); g1 = _mm_subs_pi16(y1, guv1); // g7 g6 g5 g4 g0 = _mm_subs_pi16(y0, guv0); // g3 g2 g1 g0 g1 = _mm_srai_pi16(g1, SCALEBITS); g0 = _mm_srai_pi16(g0, SCALEBITS); g0 = _mm_packs_pu16(g0, g1); // g7 g6 ...g1 g0 b1 = _mm_adds_pi16(y1, bu1); b0 = _mm_adds_pi16(y0, bu0); b1 = _mm_srai_pi16(b1, SCALEBITS); b0 = _mm_srai_pi16(b0, SCALEBITS); b0 = _mm_packs_pu16(b0, b1); r1 = _mm_adds_pi16(y1, rv1); r0 = _mm_adds_pi16(y0, rv0); r1 = _mm_srai_pi16(r1, SCALEBITS); r0 = _mm_srai_pi16(r0, SCALEBITS); r0 = _mm_packs_pu16(r0, r1); r1 = _mm_unpackhi_pi8(b0, r0); // r7 b7 r6 b6 r5 b5 r4 b4 r0 = _mm_unpacklo_pi8(b0, r0); // r3 b3 r2 b2 r1 b1 r0 b0 g1 = _mm_unpackhi_pi8(g0, mz); // 0 g7 0 g6 0 g5 0 g4 g0 = _mm_unpacklo_pi8(g0, mz); // 0 g3 0 g2 0 g1 0 g0 rgb0 = _mm_unpacklo_pi8(r0, g0); // 0 r1 g1 b1 0 r0 g0 b0 rgb1 = _mm_unpackhi_pi8(r0, g0); // 0 r3 g3 b3 0 r2 g2 b2 rgb2 = _mm_unpacklo_pi8(r1, g1); // 0 r5 g5 b5 0 r4 g4 b4 rgb3 = _mm_unpackhi_pi8(r1, g1); // 0 r7 g7 b7 0 r6 g6 b6 // Write out row 0. 
*((PDWORD) (pbDstX + 0)) = _mm_cvtsi64_si32(rgb0); rgb0 = _mm_srli_si64(rgb0, 32); *((PDWORD) (pbDstX + 3)) = _mm_cvtsi64_si32(rgb0); *((PDWORD) (pbDstX + 6)) = _mm_cvtsi64_si32(rgb1); rgb1 = _mm_srli_si64(rgb1, 32); *((PDWORD) (pbDstX + 9)) = _mm_cvtsi64_si32(rgb1); *((PDWORD) (pbDstX + 12)) = _mm_cvtsi64_si32(rgb2); rgb2 = _mm_srli_si64(rgb2, 32); *((PDWORD) (pbDstX + 15)) = _mm_cvtsi64_si32(rgb2); *((PDWORD) (pbDstX + 18)) = _mm_cvtsi64_si32(rgb3); rgb3 = _mm_srli_si64(rgb3, 32); *((PDWORD) (pbDstX + 21)) = _mm_cvtsi64_si32(rgb3); pbDstX += 24; pbSrcX += 16; } for (x = (uWidth & 7) / 2; x; x--) { bu = g_iBUTab[pbSrcX[1]]; guv = g_iGUTab[pbSrcX[1]] + g_iGVTab[pbSrcX[3]]; rv = g_iRVTab[pbSrcX[3]]; yy = g_iYYTab[pbSrcX[0]]; pbDstX[0] = _Clip((yy + bu) >> SCALEBITS_OUT); pbDstX[1] = _Clip((yy - guv) >> SCALEBITS_OUT); pbDstX[2] = _Clip((yy + rv) >> SCALEBITS_OUT); yy = g_iYYTab[pbSrcX[2]]; pbDstX[3] = _Clip((yy + bu) >> SCALEBITS_OUT); pbDstX[4] = _Clip((yy - guv) >> SCALEBITS_OUT); pbDstX[5] = _Clip((yy + rv) >> SCALEBITS_OUT); pbDstX += 6; pbSrcX += 4; } pbDstX += iDstXDif; pbSrcX += iSrcXDif; } _mm_empty(); }
// Codegen test wrapper: packs the two 32-bit lanes of a and the two 32-bit
// lanes of b into four 16-bit lanes with signed saturation. The two comments
// inside the body are FileCheck directives and must stay verbatim.
__m64 test_mm_packs_pi32(__m64 a, __m64 b) {
  // CHECK-LABEL: test_mm_packs_pi32
  // CHECK: call x86_mmx @llvm.x86.mmx.packssdw
  return _mm_packs_pi32(a, b);
}