/*
 * Dot product of two 16-bit sample vectors using MMX.
 *
 * Walks a[] and b[] in strides of 4 words (NLMS_LEN iterations total;
 * NLMS_LEN is defined elsewhere and is presumably a multiple of 4 — TODO
 * confirm).  Only the HIGH 16 bits of each 16x16 product are accumulated
 * (the low-word path is commented out below), so the result is scaled by
 * 65536 before being summed — NOTE(review): this looks like a deliberate
 * precision/speed trade-off for the NLMS filter, not an oversight; confirm
 * against callers.
 *
 * _m_from_WORDs is a project macro (defined elsewhere) that reinterprets a
 * 4 x short array as an __m64 lvalue — TODO confirm its definition.
 */
long dotp(short a[], short b[]) {
    int i;
    __m64 mm0, mm1, mm2, mm3, mm4;
    short suml[4]; // don't init sum from C - this confuses the GCC!
    short sumh[4];

    /* mmx - Intel Pentium-MMX and above */
    /* Zero the accumulators by subtracting each register from itself
     * (x - x == 0 regardless of the register's previous contents). */
    mm2 = _m_psubw(mm2, mm2); // set mm2 to 0
    mm4 = _m_psubw(mm4, mm4);
    for (i = 0; i < NLMS_LEN; i += 4, a += 4, b += 4) {
        mm0 = _m_from_WORDs(a);   // load 4 words of a
        mm3 = mm0;                // keep a copy for the high-word multiply
        mm1 = _m_from_WORDs(b);   // load 4 words of b
        /* Intel notation: first operand is destination */
        /* GNU as notation: first operand is source */
        // mm0 = _mm_mullo_pi16 (mm0, mm1);
        mm3 = _mm_mulhi_pi16 (mm3, mm1);   // high 16 bits of each product
        // mm2 = _mm_add_pi16(mm2, mm0);
        mm4 = _mm_add_pi16(mm4, mm3);      // accumulate high words (wrapping add)
    }
    /* Spill the accumulators to memory for scalar summation.
     * mm2 is still zero here because the low-word path above is disabled,
     * so the suml[] terms contribute nothing to the return value. */
    _m_from_WORDs(suml) = mm2;
    _m_from_WORDs(sumh) = mm4;
    _mm_empty();   // leave MMX state so subsequent x87 FP code works
    /* Recombine: low words + high words scaled back up by 2^16. */
    return suml[0] + suml[1] + suml[2] + suml[3]
        + 65536 * (sumh[0] + sumh[1] + sumh[2] + sumh[3]);
}
int f(unsigned short n) { __m64 vec = (__m64)(v4hi) { 0, 0, 1, n }; __m64 hw = _mm_mulhi_pi16 (vec, vec); return _mm_extract_pi16 (hw, 0); }
/* Per-lane signed 16x16 multiply keeping the high 16 bits of each product. */
__m64 test50(__m64 a, __m64 b) {
  // CHECK: pmulhw
  __m64 high_products = _mm_mulhi_pi16(a, b);
  return high_products;
}
/*
 * Four-tap parallel comb filter followed by an optional all-pass stage,
 * processing both stereo channels 4 samples at a time with MMX.
 *
 * sp            - input samples (interleaved by the sp+=2<<2 stride — TODO
 *                 confirm exact layout against callers)
 * dp            - destination buffer; output is mixed (saturating add) on
 *                 top of existing contents plus the dry sp signal
 * comb_delay    - per-channel, per-tap delay lengths (class-defined type)
 * comb_gain     - Q15-style feedback gain for the comb taps
 * allpass_delay - all-pass delay in samples; 0 disables the all-pass stage
 * allpass_gain  - Q15-style all-pass coefficient
 * rvol          - per-channel reverb output volume (2 entries)
 * sz            - amount of input to process; loop runs sz>>4 iterations
 *
 * Uses class state: delay lines x, y, ax, ay, write cursor yp, and power-of-two
 * length max_delay (all declared elsewhere — NOTE(review): masking with
 * max_delay-1 assumes max_delay is a power of two).
 *
 * Gains are applied as _mm_mulhi_pi16 followed by a saturating doubling,
 * i.e. an effective Q15 multiply by 2*gain/65536.
 */
void reverb::comb_allpass4(signed short *sp, signed short *dp,
                           const comb_param &comb_delay, const int comb_gain,
                           const int allpass_delay, const int allpass_gain,
                           const int *rvol, const unsigned int sz)
{
#ifdef use_intrinsics
    /* Broadcast the scalar gains/volumes into all four 16-bit lanes. */
    __m64 cg=_mm_set1_pi16(comb_gain), ag=_mm_set1_pi16(allpass_gain), rv[2];
    rv[0]=_mm_set1_pi16(rvol[0]);
    rv[1]=_mm_set1_pi16(rvol[1]);
    /* 4 samples x 2 channels per iteration. */
    for (unsigned int i=0; i<(sz>>4); i++, sp+=2<<2, dp+=2<<2) {
        __m64 dv[2];   // per-channel wet output for this group of 4 samples
        for (int c=0; c<2; c++) {
            // Comb: sum of four feedback comb filters, each attenuated by 1/4
            __m64 v=_mm_setzero_si64();
            for (int f=0; f<4; f++) {
                // read position for this tap, wrapped to the delay-line length
                int yck=(yp-comb_delay[c][f])&(max_delay-1);
                __m64 xv=*(__m64 *)(&x[c][yck]), yv=*(__m64 *)(&y[c][f][yck]);
                yv=_mm_mulhi_pi16(yv,cg);      // feedback * gain (high half)
                yv=_mm_adds_pi16(yv,yv);       // x2 to complete the Q15 scale
                yv=_mm_adds_pi16(xv,yv);       // y[n] = x[n-d] + g*y[n-d]
                *((__m64 *)&y[c][f][yp])=yv;   // store back into the tap's line
                yv=_mm_srai_pi16(yv,2);        // /4 before summing the 4 taps
                v=_mm_adds_pi16(v,yv);
            }
            // Allpass (skipped entirely when allpass_delay == 0)
            if (allpass_delay) {
                *((__m64 *)&ax[c][yp])=v;      // remember all-pass input
                int ypa=(yp-allpass_delay)&(max_delay-1);
                __m64 ayv=*(__m64 *)&ay[c][ypa], xv=*(__m64 *)&x[c][yp], axv=*(__m64 *)&ax[c][ypa];
                ayv=_mm_subs_pi16(ayv,xv);     // NOTE(review): subtracts the dry
                                               // input x, not the comb output —
                                               // confirm intended topology
                ayv=_mm_mulhi_pi16(ayv,ag);
                ayv=_mm_adds_pi16(ayv,ayv);    // Q15 multiply by allpass_gain
                v=_mm_adds_pi16(ayv,axv);      // delayed input + scaled feedback
                *((__m64 *)&ay[c][yp])=v;      // store all-pass output
            }
            // Output: scale the wet signal by the channel reverb volume
            dv[c]=_mm_mulhi_pi16(v,rv[c]);
            dv[c]=_mm_adds_pi16(dv[c],dv[c]);
        }
        /* Interleave left/right lanes back into sample order, then mix
         * dry (sp) + existing dp contents + wet with saturating adds. */
        __m64 dv1=_mm_unpacklo_pi16(dv[0],dv[1]), dv2=_mm_unpackhi_pi16(dv[0],dv[1]),
              d1=*(__m64 *)&dp[0], d2=*(__m64 *)&dp[4],
              s1=*(__m64 *)&sp[0], s2=*(__m64 *)&sp[4];
        d1=_mm_adds_pi16(d1,s1);
        d2=_mm_adds_pi16(d2,s2);
        d1=_mm_adds_pi16(d1,dv1);
        d2=_mm_adds_pi16(d2,dv2);
        *(__m64 *)&dp[0]=d1;
        *(__m64 *)&dp[4]=d2;
        yp=(yp+4)&(max_delay-1);   // advance the shared write cursor
    }
    _mm_empty();   // clear MMX state before any following FP code
#endif
}
/* Wrapper exercising the pmulhw intrinsic: high 16 bits of each signed
 * 16x16 lane product. */
__m64 test_mm_mulhi_pi16(__m64 a, __m64 b) {
  // CHECK-LABEL: test_mm_mulhi_pi16
  // CHECK: call x86_mmx @llvm.x86.mmx.pmulh.w
  __m64 result = _mm_mulhi_pi16(a, b);
  return result;
}