Example #1
static void
TEST (void)
{
  __m64_union u, s1, s2;
  __m64_union e;
  int i, tmp;

  s1.as_m64 = _mm_set_pi16 (10, 20, 30, 40);
  s2.as_m64 = _mm_set_pi16 (11, 98, 76, 100);
  u.as_m64 = test (s1.as_m64, s2.as_m64);

  for (i = 0; i < 4; i++)
    {
      tmp = (unsigned short)s1.as_short[i] - (unsigned short)s2.as_short[i];

      if (tmp > 65535)
        tmp = -1;

      if (tmp < 0)
        tmp = 0;

      e.as_short[i] = tmp;
    }

  if (u.as_m64 != e.as_m64)
    abort ();
}
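These GCC-testsuite-style snippets rely on a __m64_union helper and a test() wrapper defined elsewhere in the harness. A minimal sketch of what they presumably look like, assuming the layout implied by the field accesses above (as_m64, as_short) and that Example #1 exercises the saturating unsigned subtract:

#include <mmintrin.h>
#include <stdlib.h>

/* Assumed helper union: lets the tests view one MMX value as shorts, ints, etc. */
typedef union
{
  __m64 as_m64;
  char as_char[8];
  short as_short[4];
  int as_int[2];
  long long as_long_long;
} __m64_union;

/* Assumed wrapper: the harness points this at the intrinsic under test;
   the reference loop in Example #1 matches _mm_subs_pu16 (saturating
   unsigned 16-bit subtract). */
static __m64
test (__m64 a, __m64 b)
{
  return _mm_subs_pu16 (a, b);
}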
//For 0_n
unsigned int interpolvline_mmx_1(unsigned char* image, unsigned char * ref,	int PicWidthInPix){

	__m64 mm_ref = _mm_set_pi16(ref[3],ref[2],ref[1],ref[0]);
	__m64 mm_A = _mm_set_pi16(image[-2 * PicWidthInPix + 3],image[-2 * PicWidthInPix + 2],image[-2 * PicWidthInPix + 1],image[-2 * PicWidthInPix]); 
	__m64 mm_B = _mm_set_pi16(image[-1 * PicWidthInPix + 3],image[-1 * PicWidthInPix + 2],image[-1 * PicWidthInPix + 1],image[-1 * PicWidthInPix]); 
	__m64 mm_C = _mm_set_pi16(image[3],image[2],image[1],image[0]); 
	__m64 mm_D = _mm_set_pi16(image[1 * PicWidthInPix + 3],image[1 * PicWidthInPix + 2],image[1 * PicWidthInPix + 1],image[1 * PicWidthInPix]);
	__m64 mm_E = _mm_set_pi16(image[2 * PicWidthInPix + 3],image[2 * PicWidthInPix + 2],image[2 * PicWidthInPix + 1],image[2 * PicWidthInPix]);
	__m64 mm_F = _mm_set_pi16(image[3 * PicWidthInPix + 3],image[3 * PicWidthInPix + 2],image[3 * PicWidthInPix + 1],image[3 * PicWidthInPix]);

	__m64 mm_AF = _mm_add_pi16(mm_A,mm_F);//A + F
	__m64 mm_inter0 = _mm_add_pi16(mm_AF,_mm_set_pi16(16,16,16,16));//A + F + 16
	__m64 mm_BE = _mm_add_pi16(mm_B,mm_E);//B + E
	__m64 mm_CD = _mm_add_pi16(mm_C,mm_D);//C + D
	__m64 mm_CDS = _mm_slli_pi16(mm_CD,2);//(C + D) << 2
	__m64 mm_inter1 = _mm_sub_pi16(mm_CDS,mm_BE);//((C + D) << 2)-(B + E)
	__m64 mm_5 = _mm_set_pi16(5,5,5,5);
	__m64 mm_inter_3 = _mm_mullo_pi16(mm_inter1, mm_5);//(((C + D) << 2)-(B + E))*5

	__m64 mm_result = _mm_add_pi16(mm_inter_3,mm_inter0);//A + F + 16 + (((C + D) << 2)-(B + E))*5
	__m64 mm_zero = _mm_setzero_si64();
	__m64 mm_clip = _mm_max_pi16(mm_result,mm_zero);//Clip with 0
	__m64 mm_ret = _mm_srai_pi16(mm_clip,5);
	__m64 mm_clip1 = _mm_min_pi16(mm_ret,_mm_set_pi16(255,255,255,255)); //Clip with 255
	
	
	__m64 test =  _mm_avg_pu8(mm_clip1,mm_ref);//(ptr_img[0] + ptr_rf[0] + 1) >> 1
	__m64 test1 =_mm_packs_pu16(test,mm_zero);
	unsigned int ret = _mm_cvtsi64_si32(test1);

	 empty(); 
	return ret;

}
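empty() is not a standard intrinsic; it is presumably a thin wrapper (or macro) around _mm_empty(), which must run after MMX code and before any x87 floating-point use because the MMX registers alias the FPU stack. A minimal sketch under that assumption:

/* Assumed helper: clear the MMX state so later x87 floating-point code is safe. */
static inline void empty(void)
{
	_mm_empty();
}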
__m64 interpolhline64_3_mmx(__m64* temp){


	__m64 res,res1;
	__m64 ptr = _mm_setzero_si64();
	__m64 mm_16 = _mm_set_pi16(16,16,16,16);
	
	short A = _mm_extract_pi16(temp[0],0);
	short B = _mm_extract_pi16(temp[1],0);
	short C = _mm_extract_pi16(temp[2],0);
	short D = _mm_extract_pi16(temp[3],0);
	short E = _mm_extract_pi16(temp[4],0);
	short F = _mm_extract_pi16(temp[5],0);

	unsigned int result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
	ptr = _mm_insert_pi16(ptr,CLIP255_16(result >> 10),0);

	 A = _mm_extract_pi16(temp[0],1);
	 B = _mm_extract_pi16(temp[1],1);
	 C = _mm_extract_pi16(temp[2],1);
	 D = _mm_extract_pi16(temp[3],1);
	 E = _mm_extract_pi16(temp[4],1);
	 F = _mm_extract_pi16(temp[5],1);

	result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
	ptr = _mm_insert_pi16(ptr,CLIP255_16(result >> 10),1);

	 A = _mm_extract_pi16(temp[0],2);
	 B = _mm_extract_pi16(temp[1],2);
	 C = _mm_extract_pi16(temp[2],2);
	 D = _mm_extract_pi16(temp[3],2);
	 E = _mm_extract_pi16(temp[4],2);
	 F = _mm_extract_pi16(temp[5],2);

	result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
	ptr = _mm_insert_pi16(ptr,CLIP255_16(result >> 10),2);

	 A = _mm_extract_pi16(temp[0],3);
	 B = _mm_extract_pi16(temp[1],3);
	 C = _mm_extract_pi16(temp[2],3);
	 D = _mm_extract_pi16(temp[3],3);
	 E = _mm_extract_pi16(temp[4],3);
	 F = _mm_extract_pi16(temp[5],3);

	result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
	ptr = _mm_insert_pi16(ptr,CLIP255_16(result >> 10),3);
	

	res = _mm_add_pi16(temp[3],mm_16);
	res1 = _mm_srai_pi16(res,5);
	res1 = _mm_max_pi16(res1,_mm_set_pi16(0,0,0,0));
	res1 = _mm_min_pi16(res1,_mm_set_pi16(255,255,255,255)); //Clip

	res =  _mm_avg_pu16(ptr,res1);//(ptr_img[0] + ptr_rf[0] + 1) >> 1
	
	return res;

}
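CLIP255_16 is not defined in these excerpts; from its use on the 10-bit intermediate (result >> 10) it presumably clamps the value to the 0..255 pixel range. A plausible definition, assumed rather than taken from the original source:

/* Assumed macro: clamp an intermediate filter result to the unsigned 8-bit pixel range. */
#define CLIP255_16(x) ((x) > 255 ? 255 : ((x) < 0 ? 0 : (x)))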
Example #4
void r_dimpatchD_MMX(const DCanvas *const cvs, argb_t color, int alpha, int x1, int y1, int w, int h)
{
	int x, y, i;
	argb_t *line;
	int invAlpha = 256 - alpha;

	int dpitch = cvs->pitch / sizeof(DWORD);
	line = (argb_t *)cvs->buffer + y1 * dpitch;

	int batches = w / 2;
	int remainder = w & 1;

	// MMX temporaries:
	const __m64 upper8mask = _mm_set_pi16(0, 0xff, 0xff, 0xff);
	const __m64 blendAlpha = _mm_set_pi16(0, alpha, alpha, alpha);
	const __m64 blendInvAlpha = _mm_set_pi16(0, invAlpha, invAlpha, invAlpha);
	const __m64 blendColor = _mm_set_pi16(0, RPART(color), GPART(color), BPART(color));
	const __m64 blendMult = _mm_mullo_pi16(blendColor, blendAlpha);

	for (y = y1; y < y1 + h; y++)
	{
		// MMX optimize the bulk in batches of 2 colors:
		for (i = 0, x = x1; i < batches; ++i, x += 2)
		{
#if 1
			const __m64 input = _mm_setr_pi32(line[x + 0], line[x + 1]);
#else
			// NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
			const __m64 input = *((__m64 *)line[x]);
#endif
			const __m64 output = blend2vs1_mmx(input, blendMult, blendInvAlpha, upper8mask);
#if 1
			line[x+0] = _mm_cvtsi64_si32(_mm_srli_si64(output, 32*0));
			line[x+1] = _mm_cvtsi64_si32(_mm_srli_si64(output, 32*1));
#else
			// NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
			*((__m64 *)line[x]) = output;
#endif
		}

		if (remainder)
		{
			// Pick up the remainder:
			for (; x < x1 + w; x++)
			{
				line[x] = alphablend1a(line[x], color, alpha);
			}
		}

		line += dpitch;
	}

	// Required to reset FP:
	_mm_empty();
}
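blend2vs1_mmx and alphablend1a are defined elsewhere in this codebase and are not shown here. Judging from the inline arithmetic in the rtv_lucent4cols_MMX example further down, the MMX helper plausibly blends two packed ARGB pixels per channel as (pixel * invAlpha + color * alpha) >> 8; the following is a sketch under that assumption, not the project's actual implementation:

/* Assumed helper: blend two packed ARGB pixels against a premultiplied color
   (blendMult = color * alpha) using 16-bit per-channel arithmetic, mirroring
   the pattern used inline by rtv_lucent4cols_MMX below. */
static inline __m64 blend2vs1_mmx(const __m64 input, const __m64 blendMult,
                                  const __m64 blendInvAlpha, const __m64 upper8mask)
{
	return _mm_packs_pu16(
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(input, input), upper8mask), blendInvAlpha),
				blendMult),
			8),
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(input, input), upper8mask), blendInvAlpha),
				blendMult),
			8));
}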
unsigned int interpolhline_mmx_2(unsigned char* image){

	__m64 mm_A = _mm_set_pi16(image[1],image[0],image[-1],image[-2]); 
	__m64 mm_B = _mm_set_pi16(image[2],image[1],image[0],image[-1]); 
	__m64 mm_C = _mm_set_pi16(image[3],image[2],image[1],image[0]); 
	__m64 mm_D = _mm_set_pi16(image[4],image[3],image[2],image[1]);
	__m64 mm_E = _mm_set_pi16(image[5],image[4],image[3],image[2]);
	__m64 mm_F = _mm_set_pi16(image[6],image[5],image[4],image[3]);

	__m64 mm_AF = _mm_add_pi16(mm_A,mm_F);//A + F
	__m64 mm_inter0 = _mm_add_pi16(mm_AF,_mm_set_pi16(16,16,16,16));//A + F + 16
	__m64 mm_BE = _mm_add_pi16(mm_B,mm_E);//B + E
	__m64 mm_CD = _mm_add_pi16(mm_C,mm_D);//C + D
	__m64 mm_CDS = _mm_slli_pi16(mm_CD,2);//(C + D) << 2
	__m64 mm_inter1 = _mm_sub_pi16(mm_CDS,mm_BE);//((C + D) << 2)-(B + E)
	__m64 mm_5 = _mm_set_pi16(5,5,5,5);
	__m64 mm_inter_3 = _mm_mullo_pi16(mm_inter1, mm_5);//(((C + D) << 2)-(B + E))*5

	__m64 mm_result = _mm_add_pi16(mm_inter_3,mm_inter0);//A + F + 16 + (((C + D) << 2)-(B + E))*5
	__m64 mm_zero = _mm_setzero_si64();
	__m64 mm_clip = _mm_max_pi16(mm_result,mm_zero);//Clip with 0
	__m64 mm_ret = _mm_srai_pi16(mm_clip,5);
	__m64 mm_clip1 = _mm_min_pi16(mm_ret,_mm_set_pi16(255,255,255,255)); //Clip with 255
	
	__m64 result =_mm_packs_pu16(mm_clip1,mm_zero);
	unsigned int ret = _mm_cvtsi64_si32(result);

	 empty(); 
	return ret;

}
Example #6
static void
TEST (void)
{
  __m64_union u, e, s1, s2;
  int i;

  s1.as_m64 = _mm_set_pi16 (1, 2, 3, 4);
  s2.as_m64 = _mm_set_pi16 (4, 3, 2, 1);
  u.as_m64 = test (s1.as_m64, s2.as_m64);

  for (i = 0; i < 4; i++)
    e.as_short[i] =
	s1.as_short[i] < s2.as_short[i] ? s1.as_short[i] : s2.as_short[i];

  if (u.as_m64 != e.as_m64)
    abort ();
}
__m64 test_mm_set_pi16(short a, short b, short c, short d) {
  // CHECK-LABEL: test_mm_set_pi16
  // CHECK: insertelement <4 x i16>
  // CHECK: insertelement <4 x i16>
  // CHECK: insertelement <4 x i16>
  // CHECK: insertelement <4 x i16>
  return _mm_set_pi16(a, b, c, d);
}
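One detail worth keeping in mind when reading all of these examples: _mm_set_pi16 takes its arguments from the highest lane down to the lowest, so the last argument lands in bits 0..15 (use _mm_setr_pi16 for the reversed, low-to-high order). A quick self-contained illustration:

#include <mmintrin.h>

/* _mm_set_pi16(e3, e2, e1, e0): the last argument is the lowest 16-bit lane. */
static unsigned int set_pi16_order_demo(void)
{
    __m64 v = _mm_set_pi16(3, 2, 1, 0);       /* lanes, high to low: 3 2 1 0 */
    unsigned int low32 = _mm_cvtsi64_si32(v); /* 0x00010000: lane1 = 1, lane0 = 0 */
    _mm_empty();
    return low32;
}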
 __m64 interpolhline64_mmx(unsigned char* image){

 
	__m64 mm_A = _mm_set_pi16(image[3],image[2],image[1],image[0]); 
	__m64 mm_B = _mm_set_pi16(image[4],image[3],image[2],image[1]);
	__m64 mm_C = _mm_set_pi16(image[5],image[4],image[3],image[2]);
	__m64 mm_D = _mm_set_pi16(image[6],image[5],image[4],image[3]);
	__m64 mm_E = _mm_set_pi16(image[7],image[6],image[5],image[4]);
	__m64 mm_F = _mm_set_pi16(image[8],image[7],image[6],image[5]);


	__m64 mm_AF = _mm_add_pi16(mm_A,mm_F);//A + F
	__m64 mm_BE = _mm_add_pi16(mm_B,mm_E);//B + E
	__m64 mm_CD = _mm_add_pi16(mm_C,mm_D);//C + D
	__m64 mm_CDS = _mm_slli_pi16(mm_CD,2);//(C + D) << 2
	__m64 mm_inter1 = _mm_sub_pi16(mm_CDS,mm_BE);//((C + D) << 2)-(B + E)
	__m64 mm_5 = _mm_set_pi16(5,5,5,5);
	__m64 mm_inter_3 = _mm_mullo_pi16(mm_inter1, mm_5);//(((C + D) << 2)-(B + E))*5

	__m64 mm_result = _mm_add_pi16(mm_inter_3,mm_AF);//A + F + (((C + D) << 2)-(B + E))*5
	 

	return(mm_result);


}
Example #9
void weighted_merge_planar_mmx(BYTE *p1, const BYTE *p2, int p1_pitch, int p2_pitch, int width, int height, int weight, int invweight) {
  __m64 round_mask = _mm_set1_pi32(0x4000);
  __m64 zero = _mm_setzero_si64();
  __m64 mask = _mm_set_pi16(weight, invweight, weight, invweight);

  int wMod8 = (width/8) * 8;

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod8; x += 8) {
      __m64 px1 = *(reinterpret_cast<const __m64*>(p1+x)); //y7y6 y5y4 y3y2 y1y0
      __m64 px2 = *(reinterpret_cast<const __m64*>(p2+x)); //Y7Y6 Y5Y4 Y3Y2 Y1Y0

      __m64 p0123 = _mm_unpacklo_pi8(px1, px2); //Y3y3 Y2y2 Y1y1 Y0y0
      __m64 p4567 = _mm_unpackhi_pi8(px1, px2); //Y7y7 Y6y6 Y5y5 Y4y4

      __m64 p01 = _mm_unpacklo_pi8(p0123, zero); //00Y1 00y1 00Y0 00y0
      __m64 p23 = _mm_unpackhi_pi8(p0123, zero); //00Y3 00y3 00Y2 00y2
      __m64 p45 = _mm_unpacklo_pi8(p4567, zero); //00Y5 00y5 00Y4 00y4
      __m64 p67 = _mm_unpackhi_pi8(p4567, zero); //00Y7 00y7 00Y6 00y6

      p01 = _mm_madd_pi16(p01, mask);
      p23 = _mm_madd_pi16(p23, mask);
      p45 = _mm_madd_pi16(p45, mask);
      p67 = _mm_madd_pi16(p67, mask);

      p01 = _mm_add_pi32(p01, round_mask);
      p23 = _mm_add_pi32(p23, round_mask);
      p45 = _mm_add_pi32(p45, round_mask);
      p67 = _mm_add_pi32(p67, round_mask);

      p01 = _mm_srli_pi32(p01, 15);
      p23 = _mm_srli_pi32(p23, 15);
      p45 = _mm_srli_pi32(p45, 15);
      p67 = _mm_srli_pi32(p67, 15);

      p0123 = _mm_packs_pi32(p01, p23);
      p4567 = _mm_packs_pi32(p45, p67);

      __m64 result = _mm_packs_pu16(p0123, p4567);

      *reinterpret_cast<__m64*>(p1+x) = result;
    }

    for (int x = wMod8; x < width; x++) {
      p1[x] = (p1[x]*invweight + p2[x]*weight + 16384) >> 15;
    }

    p1 += p1_pitch;
    p2 += p2_pitch;
  }
  _mm_empty();
}
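The SIMD path above computes exactly what the scalar tail loop does: each unpacked (p2, p1) byte pair is multiply-accumulated against (weight, invweight) by _mm_madd_pi16 into a 32-bit lane, then rounded by adding 0x4000 and shifting right by 15. A scalar reference of that blend, assuming the caller passes 0.15 fixed-point weights with weight + invweight == 32768:

/* Scalar equivalent of one blended pixel; weights are assumed to be
   0.15 fixed point so that weight + invweight == 32768. */
static inline BYTE blend_pixel(BYTE p1, BYTE p2, int weight, int invweight)
{
  return (BYTE)((p1 * invweight + p2 * weight + 16384) >> 15);
}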
Example #10
static void weighted_merge_luma_yuy2_mmx(BYTE *src, const BYTE *luma, int pitch, int luma_pitch,int width, int height, int weight, int invweight)
{
  __m64 round_mask = _mm_set1_pi32(0x4000);
  __m64 mask = _mm_set_pi16(weight, invweight, weight, invweight);
  __m64 luma_mask = _mm_set1_pi16(0x00FF);
#pragma warning(push)
#pragma warning(disable: 4309)
  __m64 chroma_mask = _mm_set1_pi16(0xFF00);
#pragma warning(pop)

  int wMod8 = (width/8) * 8;

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod8; x += 8) {
      __m64 px1 = *reinterpret_cast<const __m64*>(src+x); //V1 Y3 U1 Y2 V0 Y1 U0 Y0
      __m64 px2 = *reinterpret_cast<const __m64*>(luma+x); //v1 y3 u1 y2 v0 y1 u0 y0

      __m64 src_lo = _mm_unpacklo_pi16(px1, px2); //v0 y1 V0 Y1 u0 y0 U0 Y0
      __m64 src_hi = _mm_unpackhi_pi16(px1, px2); 

      src_lo = _mm_and_si64(src_lo, luma_mask); //00 y1 00 Y1 00 y0 00 Y0
      src_hi = _mm_and_si64(src_hi, luma_mask); 

      src_lo = _mm_madd_pi16(src_lo, mask);
      src_hi = _mm_madd_pi16(src_hi, mask);

      src_lo = _mm_add_pi32(src_lo, round_mask);
      src_hi = _mm_add_pi32(src_hi, round_mask);

      src_lo = _mm_srli_pi32(src_lo, 15);
      src_hi = _mm_srli_pi32(src_hi, 15);

      __m64 result_luma = _mm_packs_pi32(src_lo, src_hi);

      __m64 result_chroma = _mm_and_si64(px1, chroma_mask);
      __m64 result = _mm_or_si64(result_chroma, result_luma);

      *reinterpret_cast<__m64*>(src+x) = result;
    }

    for (int x = wMod8; x < width; x+=2) {
      src[x] = (luma[x] * weight + src[x] * invweight + 16384) >> 15;
    }

    src += pitch;
    luma += luma_pitch;
  }
  _mm_empty();
}
//n_2
 __m64 interpolvline64_mmx(unsigned char* image, const unsigned short PicWidthInPix){

 
	__m64 mm_A = _mm_set_pi16(image[1 * PicWidthInPix],image[0],image[-1 * PicWidthInPix],image[-2 * PicWidthInPix]); 
	__m64 mm_B = _mm_set_pi16(image[2 * PicWidthInPix],image[1 * PicWidthInPix],image[0],image[-1 * PicWidthInPix]); 
	__m64 mm_C = _mm_set_pi16(image[3 * PicWidthInPix],image[2 * PicWidthInPix],image[1 * PicWidthInPix],image[0]); 
	__m64 mm_D = _mm_set_pi16(image[4 * PicWidthInPix],image[3 * PicWidthInPix],image[2 * PicWidthInPix],image[1 * PicWidthInPix]);
	__m64 mm_E = _mm_set_pi16(image[5 * PicWidthInPix],image[4 * PicWidthInPix],image[3 * PicWidthInPix],image[2 * PicWidthInPix]);
	__m64 mm_F = _mm_set_pi16(image[6 * PicWidthInPix],image[5 * PicWidthInPix],image[4 * PicWidthInPix],image[3 * PicWidthInPix]);


	__m64 mm_AF = _mm_add_pi16(mm_A,mm_F);//A + F
	__m64 mm_BE = _mm_add_pi16(mm_B,mm_E);//B + E
	__m64 mm_CD = _mm_add_pi16(mm_C,mm_D);//C + D
	__m64 mm_CDS = _mm_slli_pi16(mm_CD,2);//(C + D) << 2
	__m64 mm_inter1 = _mm_sub_pi16(mm_CDS,mm_BE);//((C + D) << 2)-(B + E)
	__m64 mm_5 = _mm_set_pi16(5,5,5,5);
	__m64 mm_inter_3 = _mm_mullo_pi16(mm_inter1, mm_5);//(((C + D) << 2)-(B + E))*5

	__m64 mm_result = _mm_add_pi16(mm_inter_3,mm_AF);//A + F + (((C + D) << 2)-(B + E))*5

	 empty();  
	return(mm_result);
}
Example #12
void rtv_lucent4cols_MMX(byte *source, argb_t *dest, int bga, int fga)
{
	// MMX temporaries:
	const __m64 upper8mask = _mm_set_pi16(0, 0xff, 0xff, 0xff);
	const __m64 fgAlpha = _mm_set_pi16(0, fga, fga, fga);
	const __m64 bgAlpha = _mm_set_pi16(0, bga, bga, bga);

#if 1
	const __m64 bgColors01 = _mm_setr_pi32(dest[0], dest[1]);
#else
	const __m64 bgColors01 = *((__m64 *)&dest[0]);
#endif
	const __m64 fgColors01 = _mm_setr_pi32(
		rt_mapcolor<argb_t>(dcol.colormap, source[0]),
		rt_mapcolor<argb_t>(dcol.colormap, source[1])
	);

	const __m64 finalColors01 = _mm_packs_pu16(
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(bgColors01, bgColors01), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(fgColors01, fgColors01), upper8mask), fgAlpha)
			),
			8
		),
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(bgColors01, bgColors01), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(fgColors01, fgColors01), upper8mask), fgAlpha)
			),
			8
		)
	);

#if 1
	const __m64 bgColors23 = _mm_setr_pi32(dest[2], dest[3]);
#else
	// NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
	const __m64 bgColors23 = *((__m64 *)&dest[2]);
#endif
	const __m64 fgColors23 = _mm_setr_pi32(
		rt_mapcolor<argb_t>(dcol.colormap, source[2]),
		rt_mapcolor<argb_t>(dcol.colormap, source[3])
	);

	const __m64 finalColors23 = _mm_packs_pu16(
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(bgColors23, bgColors23), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(fgColors23, fgColors23), upper8mask), fgAlpha)
			),
			8
		),
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(bgColors23, bgColors23), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(fgColors23, fgColors23), upper8mask), fgAlpha)
			),
			8
		)
	);
	
#if 1
	dest[0] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors01, 32*0));
	dest[1] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors01, 32*1));
	dest[2] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors23, 32*0));
	dest[3] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors23, 32*1));
#else
	// NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
	*((__m64 *)&dest[0]) = finalColors01;
	*((__m64 *)&dest[2]) = finalColors23;
#endif

	// Required to reset FP:
	_mm_empty();
}
//mbl test with Pocket_PC		
void weak_horizontal_luma_MMX(unsigned char pix[], const int xstride, const unsigned char alpha, const unsigned char beta, const unsigned char tc0){

	__m64 mp2 = _mm_set_pi16(pix[-3*xstride + 3], pix[-3*xstride + 2], pix[-3*xstride + 1], pix[-3*xstride]);
	__m64 mp1 = _mm_set_pi16(pix[-2*xstride + 3], pix[-2*xstride + 2], pix[-2*xstride + 1], pix[-2*xstride]); 
	__m64 mp0 = _mm_set_pi16(pix[-1*xstride + 3], pix[-1*xstride + 2], pix[-1*xstride + 1], pix[-1*xstride]); 
	__m64 mq0 = _mm_set_pi16(pix[3], pix[2], pix[1], pix[0]);
	__m64 mq1 = _mm_set_pi16(pix[xstride + 3], pix[xstride + 2], pix[xstride + 1], pix[xstride]);
	__m64 mq2 = _mm_set_pi16(pix[2*xstride + 3], pix[2*xstride + 2], pix[2*xstride + 1], pix[2*xstride]);
	
	__m64 mrshift = _mm_set_pi16(0,0,0,1);
	__m64 maddp0_q0 =  _mm_avg_pu8(mp0,mq0); //addp0_q0 = (p0 + q0 + 1) >> 1;
	__m64 maddp2 = _mm_add_pi16(maddp0_q0,mp2); //addp2 = (p2 + addp0_q0);
	__m64 maddq2 = _mm_add_pi16(maddp0_q0,mq2); //addq2 = (q2 + addp0_q0);
	__m64 maddp2_s = _mm_srl_pi16(maddp2,mrshift); //addp2 = (p2 + addp0_q0) >> 1;
	__m64 maddq2_s = _mm_srl_pi16(maddq2,mrshift); //addq2 = (q2 + addp0_q0) >> 1;
	__m64 mp1_c = _mm_sub_pi16(maddp2_s, mp1); //addp2 - p1
	__m64 mq1_c = _mm_sub_pi16(maddq2_s, mq1); // addq2 - q1



	//To calculate the mask
 	__m64 malpha = _mm_set_pi16(alpha,alpha,alpha,alpha);
	__m64 malphab = _mm_set_pi16(-alpha,-alpha,-alpha,-alpha);
	__m64 mbeta = _mm_set_pi16(beta,beta,beta,beta);
	__m64 mbetab = _mm_set_pi16(-beta,-beta,-beta,-beta);
	

	__m64 mdiff_p0_q0 = _mm_sub_pi16(mq0,mp0); //abs(q0 - p0)
	__m64 mdiff_p1_p0 = _mm_sub_pi16(mp0,mp1); //abs(p1 - p0)
	__m64 mdiff_q1_q0 = _mm_sub_pi16(mq0, mq1); //abs(q1 - q0)
	__m64 mdiff_p2_p0 = _mm_sub_pi16(mp2,mp0); //abs(p2 - p0 ))
	__m64 mdiff_q2_q0 = _mm_sub_pi16(mq2,mq0); //abs(q2 - q0)

 	__m64 mask0 = _mm_and_si64( _mm_cmpgt_pi16(malpha, mdiff_p0_q0), _mm_cmpgt_pi16(mdiff_p0_q0,malphab));
	__m64 mask1 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_p1_p0), _mm_cmpgt_pi16(mdiff_p1_p0,mbetab));
	__m64 mask2 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_q1_q0), _mm_cmpgt_pi16(mdiff_q1_q0,mbetab));
	__m64 mask3 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_p2_p0), _mm_cmpgt_pi16(mdiff_p2_p0,mbetab));
	__m64 mask4 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_q2_q0), _mm_cmpgt_pi16(mdiff_q2_q0,mbetab));
	
	__m64 first_mask = _mm_and_si64 (_mm_and_si64 (mask0,mask1),mask2); //(abs(q0 - p0) < alpha) && (abs(p1 - p0) < beta) && (abs(q1 - q0) < beta)
	__m64 second_mask = _mm_and_si64 (first_mask,mask3);
	__m64 third_mask = _mm_and_si64 (first_mask,mask4);


	__m64 mdiff_q0_p0 = _mm_sub_pi16(mq0,mp0); //(q0 - p0) 
	__m64 mlshift = _mm_set_pi16(0,0,0,2);
	__m64 minter_1 = _mm_sll_pi16(mdiff_q0_p0, mlshift);//inter_1 = (q0 - p0 ) << 2;
	__m64 minter_2 = _mm_sub_pi16(mp1, mq1);//(p1 - q1)
	__m64 madd4 = _mm_set_pi16(4,4,4,4);
	__m64 minter_3 = _mm_add_pi16(minter_2, madd4);//inter_2 = (p1 - q1) + 4;
	__m64 minter_4 = _mm_add_pi16(minter_3,minter_1); //(inter_1 +  inter_2)
	__m64 mrshift3 = _mm_set_pi16(0,0,0,3);
	__m64 minter5 = _mm_sra_pi16(minter_4, mrshift3);



	//Clip3
	__m64 m_tc0 = _mm_set_pi16(tc0,tc0,tc0,tc0);
	__m64 m_tcb0 = _mm_set_pi16(-tc0,-tc0,-tc0,-tc0);
	__m64 mres_c1 = _mm_min_pi16(_mm_max_pi16(mp1_c,m_tcb0),m_tc0); //CLIP3(-tc0, tc0, addp2 - p1 );
	__m64 mres_c2 = _mm_min_pi16(_mm_max_pi16(mq1_c,m_tcb0),m_tc0); //CLIP3(-tc0, tc0, addq2 - q1 );
	__m64 merror0 = _mm_and_si64 (mres_c1,second_mask);
	__m64 merror1 = _mm_and_si64 (mres_c2,third_mask);


	__m64 m_1 = _mm_set_pi16(1,1,1,1);
	__m64 m_and1 = _mm_and_si64 (mask3, m_1); //tc++; if abs( p2 - p0 ) < beta 
	__m64 m_and2 = _mm_and_si64 (mask4, m_1); //tc++; if abs( q2 - q0  ) < beta 
	__m64 m_tc = _mm_add_pi16(m_and2,_mm_add_pi16(m_tc0,m_and1));
	__m64 m_tcn =_mm_sub_pi16(_mm_sub_pi16(m_tcb0,m_and1),m_and2);
	__m64 mres_c3 = _mm_min_pi16(_mm_max_pi16(minter5,m_tcn),m_tc); //CLIP3(-tc, tc, inter5);
	__m64 merror2 = _mm_and_si64 (mres_c3,first_mask);


	__m64 result_p1 = _mm_add_pi16(merror0,mp1); //_mm_shuffle_pi16(_mm_add_pi16(merror0,mp1), 0x1B);
	__m64 result_q1 = _mm_add_pi16(merror1,mq1); //_mm_shuffle_pi16(_mm_add_pi16(merror1,mq1), 0x1B);
	__m64 result_p0 = _mm_add_pi16(merror2,mp0); //_mm_shuffle_pi16(_mm_add_pi16(merror2,mq1), 0x1B);
	__m64 result_q0 = _mm_sub_pi16(mq0, merror2);//_mm_shuffle_pi16(_mm_sub_pi16(mq1, merror2), 0x1B);

	*((unsigned int* )(&pix[-2*xstride])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_p1,mrshift));
	*((unsigned int* )(&pix[-xstride])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_p0,mrshift));
	*((unsigned int* )(&pix[0])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_q0,mrshift));
	*((unsigned int* )(&pix[xstride])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_q1,mrshift));

	empty();  
}
void weak_horizontal_chroma_MMX(unsigned char pix[], const int xstride, const unsigned char alpha	, const unsigned char beta, const unsigned char tc0)
{
	
	__m64 mp1 = _mm_set_pi16( 0,0,pix[-2*xstride + 1], pix[-2*xstride]); 
	__m64 mp0 = _mm_set_pi16( 0,0,pix[-1*xstride + 1], pix[-1*xstride]); 
	__m64 mq0 = _mm_set_pi16( 0,0,pix[1], pix[0]);
	__m64 mq1 = _mm_set_pi16( 0,0,pix[xstride + 1], pix[xstride]);
	

	__m64 mdiff_p0_q0 = _mm_sub_pi16(mq0,mp0); //abs(q0 - p0)
	__m64 mdiff_p1_p0 = _mm_sub_pi16(mp0,mp1); //abs(p1 - p0)
	__m64 mdiff_q1_q0 = _mm_sub_pi16(mq0, mq1); //abs(q1 - q0)

	//To calculate the mask
 	__m64 malpha = _mm_set_pi16(0,0,alpha,alpha);
	__m64 malphab = _mm_set_pi16(0,0,-alpha,-alpha);
	__m64 mbeta = _mm_set_pi16(0,0,beta,beta);
	__m64 mbetab = _mm_set_pi16(0,0,-beta,-beta);

	__m64 mask0 = _mm_and_si64( _mm_cmpgt_pi16(malpha, mdiff_p0_q0), _mm_cmpgt_pi16(mdiff_p0_q0,malphab));
	__m64 mask1 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_p1_p0), _mm_cmpgt_pi16(mdiff_p1_p0,mbetab));
	__m64 mask2 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_q1_q0), _mm_cmpgt_pi16(mdiff_q1_q0,mbetab));
	__m64 first_mask = _mm_and_si64 (_mm_and_si64 (mask0,mask1),mask2);


	__m64 mdiff_q0_p0 = _mm_sub_pi16(mq0,mp0); //(q0 - p0) 
	__m64 mlshift = _mm_set_pi16(0,0,0,2);
	__m64 minter_1 = _mm_sll_pi16(mdiff_q0_p0, mlshift);//inter_1 = (q0 - p0 ) << 2;
	__m64 minter_2 = _mm_sub_pi16(mp1, mq1);//(p1 - q1)
	__m64 madd4 = _mm_set_pi16(4,4,4,4);
	__m64 minter_3 = _mm_add_pi16(minter_2, madd4);//inter_2 = (p1 - q1) + 4;
	__m64 minter_4 = _mm_add_pi16(minter_3,minter_1); //(inter_1 +  inter_2)
	__m64 mrshift3 = _mm_set_pi16(0,0,0,3);
	__m64 minter5 = _mm_sra_pi16(minter_4, mrshift3);



	//Clip3
	__m64 m_tc0 = _mm_set_pi16(0,0,tc0,tc0);
	__m64 m_tcb0 = _mm_set_pi16(0,0,-tc0,-tc0);
	__m64 mres_c3 = _mm_min_pi16(_mm_max_pi16(minter5,m_tcb0),m_tc0); //CLIP3(-tc0, tc0, inter5);
	__m64 merror2 = _mm_and_si64 (mres_c3,first_mask);

	__m64 result_p0 = _mm_add_pi16(merror2,mp0); //_mm_shuffle_pi16(_mm_add_pi16(merror2,mq1), 0x1B);
	__m64 result_q0 = _mm_sub_pi16(mq0, merror2);//_mm_shuffle_pi16(_mm_sub_pi16(mq1, merror2), 0x1B);
	__m64 mrshift = _mm_set_pi16(0,0,0,1);

	*((unsigned short* )(&pix[-xstride])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_p0,mrshift));
	*((unsigned short* )(&pix[0])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_q0,mrshift));


	empty();
}
Example #15
void AudioMixer::addBufferToMixForListeningNodeWithBuffer(PositionalAudioRingBuffer* bufferToAdd,
                                                          AvatarAudioRingBuffer* listeningNodeBuffer) {
    float bearingRelativeAngleToSource = 0.0f;
    float attenuationCoefficient = 1.0f;
    int numSamplesDelay = 0;
    float weakChannelAmplitudeRatio = 1.0f;
    
    if (bufferToAdd != listeningNodeBuffer) {
        // if the two buffer pointers do not match then these are different buffers

        glm::vec3 relativePosition = bufferToAdd->getPosition() - listeningNodeBuffer->getPosition();
        glm::quat inverseOrientation = glm::inverse(listeningNodeBuffer->getOrientation());

        float distanceSquareToSource = glm::dot(relativePosition, relativePosition);
        float radius = 0.0f;

        if (bufferToAdd->getType() == PositionalAudioRingBuffer::Injector) {
            InjectedAudioRingBuffer* injectedBuffer = (InjectedAudioRingBuffer*) bufferToAdd;
            radius = injectedBuffer->getRadius();
            attenuationCoefficient *= injectedBuffer->getAttenuationRatio();
        }

        if (radius == 0 || (distanceSquareToSource > radius * radius)) {
            // this is either not a spherical source, or the listener is outside the sphere

            if (radius > 0) {
                // this is a spherical source - the distance used for the coefficient
                // needs to be the closest point on the boundary to the source

                // override the distance to the node with the distance to the point on the
                // boundary of the sphere
                distanceSquareToSource -= (radius * radius);

            } else {
                // calculate the angle delivery for off-axis attenuation
                glm::vec3 rotatedListenerPosition = glm::inverse(bufferToAdd->getOrientation()) * relativePosition;

                float angleOfDelivery = glm::angle(glm::vec3(0.0f, 0.0f, -1.0f),
                                                   glm::normalize(rotatedListenerPosition));

                const float MAX_OFF_AXIS_ATTENUATION = 0.2f;
                const float OFF_AXIS_ATTENUATION_FORMULA_STEP = (1 - MAX_OFF_AXIS_ATTENUATION) / 2.0f;

                float offAxisCoefficient = MAX_OFF_AXIS_ATTENUATION +
                    (OFF_AXIS_ATTENUATION_FORMULA_STEP * (angleOfDelivery / PI_OVER_TWO));

                // multiply the current attenuation coefficient by the calculated off axis coefficient
                attenuationCoefficient *= offAxisCoefficient;
            }

            glm::vec3 rotatedSourcePosition = inverseOrientation * relativePosition;

            const float DISTANCE_SCALE = 2.5f;
            const float GEOMETRIC_AMPLITUDE_SCALAR = 0.3f;
            const float DISTANCE_LOG_BASE = 2.5f;
            const float DISTANCE_SCALE_LOG = logf(DISTANCE_SCALE) / logf(DISTANCE_LOG_BASE);

            // calculate the distance coefficient using the distance to this node
            float distanceCoefficient = powf(GEOMETRIC_AMPLITUDE_SCALAR,
                                             DISTANCE_SCALE_LOG +
                                             (0.5f * logf(distanceSquareToSource) / logf(DISTANCE_LOG_BASE)) - 1);
            distanceCoefficient = std::min(1.0f, distanceCoefficient);

            // multiply the current attenuation coefficient by the distance coefficient
            attenuationCoefficient *= distanceCoefficient;

            // project the rotated source position vector onto the XZ plane
            rotatedSourcePosition.y = 0.0f;

            // produce an oriented angle about the y-axis
            bearingRelativeAngleToSource = glm::orientedAngle(glm::vec3(0.0f, 0.0f, -1.0f),
                                                              glm::normalize(rotatedSourcePosition),
                                                              glm::vec3(0.0f, 1.0f, 0.0f));

            const float PHASE_AMPLITUDE_RATIO_AT_90 = 0.5;

            // figure out the number of samples of delay and the ratio of the amplitude
            // in the weak channel for audio spatialization
            float sinRatio = fabsf(sinf(bearingRelativeAngleToSource));
            numSamplesDelay = SAMPLE_PHASE_DELAY_AT_90 * sinRatio;
            weakChannelAmplitudeRatio = 1 - (PHASE_AMPLITUDE_RATIO_AT_90 * sinRatio);
        }
    }

    // if the bearing relative angle to source is > 0 then the delayed channel is the right one
    int delayedChannelOffset = (bearingRelativeAngleToSource > 0.0f) ? 1 : 0;
    int goodChannelOffset = delayedChannelOffset == 0 ? 1 : 0;
    
    const int16_t* nextOutputStart = bufferToAdd->getNextOutput();
   
    const int16_t* bufferStart = bufferToAdd->getBuffer();
    int ringBufferSampleCapacity = bufferToAdd->getSampleCapacity();

    int16_t correctBufferSample[2], delayBufferSample[2];
    int delayedChannelIndex = 0;
    
    const int SINGLE_STEREO_OFFSET = 2;
    
    for (int s = 0; s < NETWORK_BUFFER_LENGTH_SAMPLES_STEREO; s += 4) {
        
        // setup the int16_t variables for the two sample sets
        correctBufferSample[0] = nextOutputStart[s / 2] * attenuationCoefficient;
        correctBufferSample[1] = nextOutputStart[(s / 2) + 1] * attenuationCoefficient;
        
        delayedChannelIndex = s + (numSamplesDelay * 2) + delayedChannelOffset;
        
        delayBufferSample[0] = correctBufferSample[0] * weakChannelAmplitudeRatio;
        delayBufferSample[1] = correctBufferSample[1] * weakChannelAmplitudeRatio;
        
        __m64 bufferSamples = _mm_set_pi16(_clientSamples[s + goodChannelOffset],
                                           _clientSamples[s + goodChannelOffset + SINGLE_STEREO_OFFSET],
                                           _clientSamples[delayedChannelIndex],
                                           _clientSamples[delayedChannelIndex + SINGLE_STEREO_OFFSET]);
        __m64 addedSamples = _mm_set_pi16(correctBufferSample[0], correctBufferSample[1],
                                         delayBufferSample[0], delayBufferSample[1]);
        
        // perform the MMX add (with saturation) of two correct and delayed samples
        __m64 mmxResult = _mm_adds_pi16(bufferSamples, addedSamples);
        int16_t* shortResults = reinterpret_cast<int16_t*>(&mmxResult);
        
        // assign the results from the result of the mmx arithmetic
        _clientSamples[s + goodChannelOffset] = shortResults[3];
        _clientSamples[s + goodChannelOffset + SINGLE_STEREO_OFFSET] = shortResults[2];
        _clientSamples[delayedChannelIndex] = shortResults[1];
        _clientSamples[delayedChannelIndex + SINGLE_STEREO_OFFSET] = shortResults[0];
    }
    
    // The following code is pretty gross and redundant, but AFAIK it's the best way to avoid
    // too many conditionals in handling the delay samples at the beginning of _clientSamples.
    // Basically we try to take the samples in batches of four, and then handle the remainder
    // conditionally to get rid of the rest.
    
    const int DOUBLE_STEREO_OFFSET = 4;
    const int TRIPLE_STEREO_OFFSET = 6;
    
    if (numSamplesDelay > 0) {
        // if there was a sample delay for this buffer, we need to pull samples prior to the nextOutput
        // to stick at the beginning
        float attenuationAndWeakChannelRatio = attenuationCoefficient * weakChannelAmplitudeRatio;
        const int16_t* delayNextOutputStart = nextOutputStart - numSamplesDelay;
        if (delayNextOutputStart < bufferStart) {
            delayNextOutputStart = bufferStart + ringBufferSampleCapacity - numSamplesDelay;
        }
        
        int i = 0;
        
        while (i + 3 < numSamplesDelay) {
            // handle the first cases where we can MMX add four samples at once
            int parentIndex = i * 2;
            __m64 bufferSamples = _mm_set_pi16(_clientSamples[parentIndex + delayedChannelOffset],
                                               _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset],
                                               _clientSamples[parentIndex + DOUBLE_STEREO_OFFSET + delayedChannelOffset],
                                               _clientSamples[parentIndex + TRIPLE_STEREO_OFFSET + delayedChannelOffset]);
            __m64 addSamples = _mm_set_pi16(delayNextOutputStart[i] * attenuationAndWeakChannelRatio,
                                            delayNextOutputStart[i + 1] * attenuationAndWeakChannelRatio,
                                            delayNextOutputStart[i + 2] * attenuationAndWeakChannelRatio,
                                            delayNextOutputStart[i + 3] * attenuationAndWeakChannelRatio);
            __m64 mmxResult = _mm_adds_pi16(bufferSamples, addSamples);
            int16_t* shortResults = reinterpret_cast<int16_t*>(&mmxResult);
            
            _clientSamples[parentIndex + delayedChannelOffset] = shortResults[3];
            _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[2];
            _clientSamples[parentIndex + DOUBLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[1];
            _clientSamples[parentIndex + TRIPLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[0];
            
            // push the index
            i += 4;
        }
        
        int parentIndex = i * 2;
        
        if (i + 2 < numSamplesDelay) {
            // MMX add only three delayed samples
            
            __m64 bufferSamples = _mm_set_pi16(_clientSamples[parentIndex + delayedChannelOffset],
                                               _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset],
                                               _clientSamples[parentIndex + DOUBLE_STEREO_OFFSET + delayedChannelOffset],
                                               0);
            __m64 addSamples = _mm_set_pi16(delayNextOutputStart[i] * attenuationAndWeakChannelRatio,
                                            delayNextOutputStart[i + 1] * attenuationAndWeakChannelRatio,
                                            delayNextOutputStart[i + 2] * attenuationAndWeakChannelRatio,
                                            0);
            __m64 mmxResult = _mm_adds_pi16(bufferSamples, addSamples);
            int16_t* shortResults = reinterpret_cast<int16_t*>(&mmxResult);
            
            _clientSamples[parentIndex + delayedChannelOffset] = shortResults[3];
            _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[2];
            _clientSamples[parentIndex + DOUBLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[1];
            
        } else if (i + 1 < numSamplesDelay) {
            // MMX add two delayed samples
            __m64 bufferSamples = _mm_set_pi16(_clientSamples[parentIndex + delayedChannelOffset],
                                               _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset], 0, 0);
            __m64 addSamples = _mm_set_pi16(delayNextOutputStart[i] * attenuationAndWeakChannelRatio,
                                            delayNextOutputStart[i + 1] * attenuationAndWeakChannelRatio, 0, 0);
            
            __m64 mmxResult = _mm_adds_pi16(bufferSamples, addSamples);
            int16_t* shortResults = reinterpret_cast<int16_t*>(&mmxResult);
            
            _clientSamples[parentIndex + delayedChannelOffset] = shortResults[3];
            _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[2];
            
        } else if (i < numSamplesDelay) {
            // MMX add a single delayed sample
            __m64 bufferSamples = _mm_set_pi16(_clientSamples[parentIndex + delayedChannelOffset], 0, 0, 0);
            __m64 addSamples = _mm_set_pi16(delayNextOutputStart[i] * attenuationAndWeakChannelRatio, 0, 0, 0);
            
            __m64 mmxResult = _mm_adds_pi16(bufferSamples, addSamples);
            int16_t* shortResults = reinterpret_cast<int16_t*>(&mmxResult);
            
            _clientSamples[parentIndex + delayedChannelOffset] = shortResults[3];
        }
    }
}