Example #1
#include <mmintrin.h>   /* MMX intrinsics */

/* Assumed to be defined elsewhere in the original project:
 *   NLMS_LEN          - filter length, a multiple of 4
 *   _m_from_WORDs(p)  - treats 4 packed shorts as an __m64,
 *                       e.g. #define _m_from_WORDs(p) (*(__m64 *)(p))
 */
long dotp(short a[], short b[])
{
  int i;
  __m64 mm0, mm1, mm2, mm3, mm4;
  short suml[4];   // don't initialize the sums from C code - that confuses GCC
  short sumh[4];
  
  /* mmx - Intel Pentium-MMX and above */

  mm2 = _m_psubw(mm2, mm2);   // set mm2 to 0 (low-word accumulator)
  mm4 = _m_psubw(mm4, mm4);   // set mm4 to 0 (high-word accumulator)
  for (i = 0; i < NLMS_LEN; i += 4, a += 4, b += 4) {
    mm0 = _m_from_WORDs(a);   // load 4 shorts from a
    mm3 = mm0;
    mm1 = _m_from_WORDs(b);   // load 4 shorts from b

    /* Intel notation: first operand is destination */
    /* GNU as notation: first operand is source */
    mm0 = _mm_mullo_pi16(mm0, mm1);   // low 16 bits of each 32-bit product
    mm3 = _mm_mulhi_pi16(mm3, mm1);   // high 16 bits of each 32-bit product
    mm2 = _mm_add_pi16(mm2, mm0);     // accumulate low words
    mm4 = _mm_add_pi16(mm4, mm3);     // accumulate high words
  }
  _m_from_WORDs(suml) = mm2;   // store the four low-word partial sums
  _m_from_WORDs(sumh) = mm4;   // store the four high-word partial sums
  _mm_empty();
  return suml[0] + suml[1] + suml[2] + suml[3] 
   + 65536 * (sumh[0] + sumh[1] + sumh[2] + sumh[3]);
}
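For reference, here is a minimal scalar sketch of the quantity the MMX loop above targets: each a[i]*b[i] is a 32-bit product whose low and high 16-bit halves the loop accumulates separately and recombines at the end as low + 65536*high. The function name dotp_scalar is illustrative, not from the original source; note that the MMX version accumulates in 16-bit lanes and can wrap for long inputs where this reference does not.

/* Scalar reference dot product (illustrative only). */
long dotp_scalar(const short a[], const short b[], int len)
{
    long sum = 0;
    for (int i = 0; i < len; i++)
        sum += (long)a[i] * b[i];   /* full 32-bit product per tap */
    return sum;
}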
/* Assumes <xmmintrin.h> and a GCC-style vector type such as
 *   typedef short v4hi __attribute__ ((vector_size (8)));  */
int f(unsigned short n)
{
    __m64 vec = (__m64)(v4hi) {
        0, 0, 1, n
    };
    __m64 hw = _mm_mulhi_pi16 (vec, vec);
    return _mm_extract_pi16 (hw, 0);
}
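Per lane, _mm_mulhi_pi16 keeps only the upper half of the signed 32-bit product, so the extract above reads one such high word. A scalar model of a single lane (the helper name mulhi16 is illustrative):

#include <stdint.h>

/* One _mm_mulhi_pi16 lane: full signed 32-bit product, upper 16 bits kept. */
static int16_t mulhi16(int16_t a, int16_t b)
{
    return (int16_t)(((int32_t)a * (int32_t)b) >> 16);
}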
Example #3
__m64 test50(__m64 a, __m64 b) {
  // Clang codegen test: the CHECK line asserts that a pmulhw instruction is emitted.
  // CHECK: pmulhw
  return _mm_mulhi_pi16(a, b);
}
Example #4
// State presumably provided by the surrounding reverb class (not shown here):
//   x, y, ax, ay - 16-bit delay-line buffers
//   yp           - current write position into the delay lines
//   max_delay    - delay-line length, a power of two
//   comb_param   - array type holding the per-channel comb delays
void reverb::comb_allpass4(signed short *sp,
                           signed short *dp,
                           const comb_param &comb_delay,
                           const int comb_gain,
                           const int allpass_delay,
                           const int allpass_gain,
                           const int *rvol,
                           const unsigned int sz)
{
#ifdef use_intrinsics
	__m64 cg = _mm_set1_pi16(comb_gain),    // comb gain replicated to 4 lanes
	      ag = _mm_set1_pi16(allpass_gain), // allpass gain replicated to 4 lanes
	      rv[2];                            // per-channel output volumes
	rv[0]=_mm_set1_pi16(rvol[0]);
	rv[1]=_mm_set1_pi16(rvol[1]);

	for (unsigned int i=0; i<(sz>>4); i++, sp+=2<<2, dp+=2<<2)   // 8 interleaved shorts per pass
	{
		__m64 dv[2];

		for (int c=0; c<2; c++)
		{
			// Comb

			__m64 v=_mm_setzero_si64();

			for (int f=0; f<4; f++)
			{
				// Read 4 delayed input and comb-feedback samples for this tap
				int yck=(yp-comb_delay[c][f])&(max_delay-1);
				__m64 xv=*(__m64 *)(&x[c][yck]),
				      yv=*(__m64 *)(&y[c][f][yck]);
				// y = x + sat(2*((y*cg)>>16)): mulhi halves the gain,
				// the saturating add doubles it back
				yv=_mm_mulhi_pi16(yv,cg);
				yv=_mm_adds_pi16(yv,yv);
				yv=_mm_adds_pi16(xv,yv);
				*((__m64 *)&y[c][f][yp])=yv;
				// Mix this comb into the channel sum at quarter amplitude
				yv=_mm_srai_pi16(yv,2);
				v=_mm_adds_pi16(v,yv);
			}

			// Allpass

			if (allpass_delay)
			{
				*((__m64 *)&ax[c][yp])=v;

				int ypa=(yp-allpass_delay)&(max_delay-1);
				__m64 ayv=*(__m64 *)&ay[c][ypa],   // delayed allpass output
				      xv=*(__m64 *)&x[c][yp],      // current input
				      axv=*(__m64 *)&ax[c][ypa];   // delayed allpass input (comb sum)

				// v = ax[delayed] + sat(2*(((ay[delayed]-x[now])*ag)>>16))
				ayv=_mm_subs_pi16(ayv,xv);
				ayv=_mm_mulhi_pi16(ayv,ag);
				ayv=_mm_adds_pi16(ayv,ayv);
				v=_mm_adds_pi16(ayv,axv);
				*((__m64 *)&ay[c][yp])=v;
			}

			// Output

			dv[c]=_mm_mulhi_pi16(v,rv[c]);      // scale by the channel volume,
			dv[c]=_mm_adds_pi16(dv[c],dv[c]);   // i.e. sat(2*((v*rvol[c])>>16))
		}

		// Interleave the two wet channels to stereo, then accumulate dry (sp) and wet into dp
		__m64 dv1=_mm_unpacklo_pi16(dv[0],dv[1]),
		      dv2=_mm_unpackhi_pi16(dv[0],dv[1]),
		      d1=*(__m64 *)&dp[0],
		      d2=*(__m64 *)&dp[4],
		      s1=*(__m64 *)&sp[0],
		      s2=*(__m64 *)&sp[4];
		d1=_mm_adds_pi16(d1,s1);
		d2=_mm_adds_pi16(d2,s2);
		d1=_mm_adds_pi16(d1,dv1);
		d2=_mm_adds_pi16(d2,dv2);
		*(__m64 *)&dp[0]=d1;
		*(__m64 *)&dp[4]=d2;

		yp=(yp+4)&(max_delay-1);   // advance the delay-line write position
	}

	_mm_empty();
#endif
}
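The fixed-point pattern used throughout this routine is _mm_mulhi_pi16 followed by a saturating add of the result to itself, which applies an effective gain of 2*g/65536 to each 16-bit sample. Below is a scalar sketch of one comb tap under that convention; the names comb_step, sat16, x_d, y_d and g are illustrative, not from the original class.

#include <stdint.h>

static int16_t sat16(int32_t v)   /* saturate to 16-bit range, like _mm_adds_pi16 */
{
    if (v >  32767) return  32767;
    if (v < -32768) return -32768;
    return (int16_t)v;
}

/* One comb-filter lane: x_d = delayed input, y_d = delayed comb output,
 * g = gain operand as passed to _mm_mulhi_pi16. */
static int16_t comb_step(int16_t x_d, int16_t y_d, int16_t g)
{
    int16_t fb = (int16_t)(((int32_t)y_d * g) >> 16);   /* mulhi              */
    fb = sat16((int32_t)fb + fb);                       /* adds: doubles gain */
    return sat16((int32_t)x_d + fb);                    /* new comb output    */
}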
__m64 test_mm_mulhi_pi16(__m64 a, __m64 b) {
  // Clang IR test: FileCheck expects a call to the llvm.x86.mmx.pmulh.w intrinsic.
  // CHECK-LABEL: test_mm_mulhi_pi16
  // CHECK: call x86_mmx @llvm.x86.mmx.pmulh.w
  return _mm_mulhi_pi16(a, b);
}