/* Reference check for the MMX unsigned-saturating 16-bit subtract
   (_mm_subs_pu16 path): compute the expected result lane-by-lane in
   scalar code and abort if the vector routine under test disagrees.

   Fix: removed the dead `if (tmp > 65535) tmp = -1;` branch — the
   difference of two unsigned shorts evaluated in int lies in
   [-65535, 65535], so the upper clamp can never trigger for a
   subtraction; only saturation at 0 is needed.  */
static void
TEST (void)
{
  __m64_union u, s1, s2;   /* u: vector result; s1/s2: operands.  */
  __m64_union e;           /* e: scalar-computed expected result.  */
  int i, tmp;

  s1.as_m64 = _mm_set_pi16 (10, 20, 30, 40);
  s2.as_m64 = _mm_set_pi16 (11, 98, 76, 100);
  u.as_m64 = test (s1.as_m64, s2.as_m64);

  for (i = 0; i < 4; i++)
    {
      /* Unsigned 16-bit difference; saturate at 0 on underflow.  */
      tmp = (unsigned short)s1.as_short[i] - (unsigned short)s2.as_short[i];
      if (tmp < 0)
	tmp = 0;
      e.as_short[i] = tmp;
    }

  if (u.as_m64 != e.as_m64)
    abort ();
}
//For 0_n unsigned int interpolvline_mmx_1(unsigned char* image, unsigned char * ref, int PicWidthInPix){ __m64 mm_ref = _mm_set_pi16(ref[3],ref[2],ref[1],ref[0]); __m64 mm_A = _mm_set_pi16(image[-2 * PicWidthInPix + 3],image[-2 * PicWidthInPix + 2],image[-2 * PicWidthInPix + 1],image[-2 * PicWidthInPix]); __m64 mm_B = _mm_set_pi16(image[-1 * PicWidthInPix + 3],image[-1 * PicWidthInPix + 2],image[-1 * PicWidthInPix + 1],image[-1 * PicWidthInPix]); __m64 mm_C = _mm_set_pi16(image[3],image[2],image[1],image[0]); __m64 mm_D = _mm_set_pi16(image[1 * PicWidthInPix + 3],image[1 * PicWidthInPix + 2],image[1 * PicWidthInPix + 1],image[1 * PicWidthInPix]); __m64 mm_E = _mm_set_pi16(image[2 * PicWidthInPix + 3],image[2 * PicWidthInPix + 2],image[2 * PicWidthInPix + 1],image[2 * PicWidthInPix]); __m64 mm_F = _mm_set_pi16(image[3 * PicWidthInPix + 3],image[3 * PicWidthInPix + 2],image[3 * PicWidthInPix + 1],image[3 * PicWidthInPix]); __m64 mm_AF = _mm_add_pi16(mm_A,mm_F);//A + F __m64 mm_inter0 = _mm_add_pi16(mm_AF,_mm_set_pi16(16,16,16,16));//A + F + 16 __m64 mm_BE = _mm_add_pi16(mm_B,mm_E);//B + E __m64 mm_CD = _mm_add_pi16(mm_C,mm_D);//C + D __m64 mm_CDS = _mm_slli_pi16(mm_CD,2);//(C + D) << 2 __m64 mm_inter1 = _mm_sub_pi16(mm_CDS,mm_BE);//((C + D) << 2)-(B + E) __m64 mm_5 = _mm_set_pi16(5,5,5,5); __m64 mm_inter_3 = _mm_mullo_pi16(mm_inter1, mm_5);//(((C + D) << 2)-(B + E))*5 __m64 mm_result = _mm_add_pi16(mm_inter_3,mm_inter0);//A + F + 16 + (((C + D) << 2)-(B + E))*5 __m64 mm_zero = _mm_setzero_si64(); __m64 mm_clip = _mm_max_pi16(mm_result,mm_zero);//Clip with 0 __m64 mm_ret = _mm_srai_pi16(mm_clip,5); __m64 mm_clip1 = _mm_min_pi16(mm_ret,_mm_set_pi16(255,255,255,255)); //Clip with 255 __m64 test = _mm_avg_pu8(mm_clip1,mm_ref);//(ptr_img[0] + ptr_rf[0] + 1) >> 1 __m64 test1 =_mm_packs_pu16(test,mm_zero); unsigned int ret = _mm_cvtsi64_si32(test1); empty(); return ret; }
// Second pass of the two-stage half-pel filter: temp[0..5] hold six
// already-horizontally-filtered rows as 4 x 16-bit lanes.  Each lane is
// filtered vertically with (A + F - 5*(B+E) + 20*(C+D) + 512) >> 10 in
// 32-bit scalar math (avoids 16-bit overflow), clipped to [0,255], and
// finally averaged with the clipped (temp[3] + 16) >> 5 value for the
// quarter-pel result.
// NOTE(review): `result` is unsigned, so a negative filter sum wraps to
// a huge value before `>> 10` — presumably CLIP255_16 clamps that case
// to 0/255; confirm against the macro definition.
__m64 interpolhline64_3_mmx(__m64* temp){
	__m64 res,res1;
	__m64 ptr = _mm_setzero_si64();
	__m64 mm_16 = _mm_set_pi16(16,16,16,16);

	// Lane 0.
	short A = _mm_extract_pi16(temp[0],0);
	short B = _mm_extract_pi16(temp[1],0);
	short C = _mm_extract_pi16(temp[2],0);
	short D = _mm_extract_pi16(temp[3],0);
	short E = _mm_extract_pi16(temp[4],0);
	short F = _mm_extract_pi16(temp[5],0);
	unsigned int result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
	ptr = _mm_insert_pi16(ptr,CLIP255_16(result >> 10),0);

	// Lane 1.
	A = _mm_extract_pi16(temp[0],1);
	B = _mm_extract_pi16(temp[1],1);
	C = _mm_extract_pi16(temp[2],1);
	D = _mm_extract_pi16(temp[3],1);
	E = _mm_extract_pi16(temp[4],1);
	F = _mm_extract_pi16(temp[5],1);
	result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
	ptr = _mm_insert_pi16(ptr,CLIP255_16(result >> 10),1);

	// Lane 2.
	A = _mm_extract_pi16(temp[0],2);
	B = _mm_extract_pi16(temp[1],2);
	C = _mm_extract_pi16(temp[2],2);
	D = _mm_extract_pi16(temp[3],2);
	E = _mm_extract_pi16(temp[4],2);
	F = _mm_extract_pi16(temp[5],2);
	result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
	ptr = _mm_insert_pi16(ptr,CLIP255_16(result >> 10),2);

	// Lane 3.
	A = _mm_extract_pi16(temp[0],3);
	B = _mm_extract_pi16(temp[1],3);
	C = _mm_extract_pi16(temp[2],3);
	D = _mm_extract_pi16(temp[3],3);
	E = _mm_extract_pi16(temp[4],3);
	F = _mm_extract_pi16(temp[5],3);
	result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
	ptr = _mm_insert_pi16(ptr,CLIP255_16(result >> 10),3);

	// Half-pel value from the center row: (temp[3] + 16) >> 5, clipped.
	res = _mm_add_pi16(temp[3],mm_16);
	res1 = _mm_srai_pi16(res,5);
	res1 = _mm_max_pi16(res1,_mm_set_pi16(0,0,0,0));
	res1 = _mm_min_pi16(res1,_mm_set_pi16(255,255,255,255)); //Clip
	// Quarter-pel: rounded average of the two half-pel results.
	res = _mm_avg_pu16(ptr,res1);//(ptr_img[0] + ptr_rf[0] + 1) >> 1
	return res;
}
// Alpha-blends a w x h rectangle at (x1,y1) of the canvas with `color`
// at opacity `alpha` (invAlpha = 256 - alpha).  The bulk is processed
// two pixels per iteration via blend2vs1_mmx; any odd trailing pixel in
// each row is handled by the scalar alphablend1a.
void r_dimpatchD_MMX(const DCanvas *const cvs, argb_t color, int alpha, int x1, int y1, int w, int h)
{
	int x, y, i;
	argb_t *line;
	int invAlpha = 256 - alpha;

	int dpitch = cvs->pitch / sizeof(DWORD); // canvas pitch in 32-bit pixels
	line = (argb_t *)cvs->buffer + y1 * dpitch;

	int batches = w / 2;     // number of 2-pixel MMX iterations per row
	int remainder = w & 1;   // 1 if a scalar tail pixel remains

	// MMX temporaries:
	// upper8mask keeps only the low 8 bits of each widened channel word.
	const __m64 upper8mask = _mm_set_pi16(0, 0xff, 0xff, 0xff);
	const __m64 blendAlpha = _mm_set_pi16(0, alpha, alpha, alpha);
	const __m64 blendInvAlpha = _mm_set_pi16(0, invAlpha, invAlpha, invAlpha);
	const __m64 blendColor = _mm_set_pi16(0, RPART(color), GPART(color), BPART(color));
	// Pre-multiplied blend color: color * alpha, constant for the patch.
	const __m64 blendMult = _mm_mullo_pi16(blendColor, blendAlpha);

	for (y = y1; y < y1 + h; y++)
	{
		// MMX optimize the bulk in batches of 2 colors:
		for (i = 0, x = x1; i < batches; ++i, x += 2)
		{
#if 1
			const __m64 input = _mm_setr_pi32(line[x + 0], line[x + 1]);
#else
			// NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
			const __m64 input = *((__m64 *)line[x]);
#endif
			const __m64 output = blend2vs1_mmx(input, blendMult, blendInvAlpha, upper8mask);
#if 1
			line[x+0] = _mm_cvtsi64_si32(_mm_srli_si64(output, 32*0));
			line[x+1] = _mm_cvtsi64_si32(_mm_srli_si64(output, 32*1));
#else
			// NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
			*((__m64 *)line[x]) = output;
#endif
		}
		if (remainder)
		{
			// Pick up the remainder (odd width):
			for (; x < x1 + w; x++)
			{
				line[x] = alphablend1a(line[x], color, alpha);
			}
		}

		line += dpitch;
	}

	// Required to reset FP:
	_mm_empty();
}
// Horizontal 6-tap (1,-5,20,20,-5,1) half-pel interpolation of the four
// pixels image[0..3], using neighbours image[-2..6].  The filtered sums
// are rounded (+16), shifted by 5, clipped to [0,255] and packed into
// the returned unsigned int (one byte per pixel).
unsigned int interpolhline_mmx_2(unsigned char* image){
	const __m64 zero = _mm_setzero_si64();
	const __m64 rounding = _mm_set_pi16(16,16,16,16);
	const __m64 five = _mm_set_pi16(5,5,5,5);
	const __m64 cap255 = _mm_set_pi16(255,255,255,255);

	// Six taps per output pixel; lane k filters pixel image[k].
	__m64 tapA = _mm_set_pi16(image[1],image[0],image[-1],image[-2]);
	__m64 tapB = _mm_set_pi16(image[2],image[1],image[0],image[-1]);
	__m64 tapC = _mm_set_pi16(image[3],image[2],image[1],image[0]);
	__m64 tapD = _mm_set_pi16(image[4],image[3],image[2],image[1]);
	__m64 tapE = _mm_set_pi16(image[5],image[4],image[3],image[2]);
	__m64 tapF = _mm_set_pi16(image[6],image[5],image[4],image[3]);

	__m64 sumAF = _mm_add_pi16(tapA, tapF);                             // A + F
	__m64 biased = _mm_add_pi16(sumAF, rounding);                       // A + F + 16
	__m64 sumBE = _mm_add_pi16(tapB, tapE);                             // B + E
	__m64 sumCD = _mm_add_pi16(tapC, tapD);                             // C + D
	__m64 core = _mm_sub_pi16(_mm_slli_pi16(sumCD, 2), sumBE);          // 4*(C+D) - (B+E)
	__m64 filtered = _mm_add_pi16(_mm_mullo_pi16(core, five), biased);  // 20*(C+D) - 5*(B+E) + A + F + 16

	__m64 clipped = _mm_srai_pi16(_mm_max_pi16(filtered, zero), 5);     // clamp low, then >> 5
	clipped = _mm_min_pi16(clipped, cap255);                            // clamp high

	unsigned int ret = _mm_cvtsi64_si32(_mm_packs_pu16(clipped, zero)); // pack 4 words to 4 bytes
	empty(); // leave MMX state clean for FP code
	return ret;
}
/* Reference check for the MMX 16-bit signed minimum (_mm_min_pi16
   path): build the expected vector lane-by-lane with a scalar compare
   and abort if the vector routine under test disagrees.  */
static void
TEST (void)
{
  __m64_union u, e, s1, s2;
  int i;

  s1.as_m64 = _mm_set_pi16 (1, 2, 3, 4);
  s2.as_m64 = _mm_set_pi16 (4, 3, 2, 1);
  u.as_m64 = test (s1.as_m64, s2.as_m64);

  /* Scalar reference: per-lane signed minimum.  */
  for (i = 0; i < 4; i++)
    {
      short a = s1.as_short[i];
      short b = s2.as_short[i];
      e.as_short[i] = (b < a) ? b : a;
    }

  if (u.as_m64 != e.as_m64)
    abort ();
}
// Clang codegen test: verifies _mm_set_pi16 lowers to four scalar
// insertelement operations into a <4 x i16> vector.  The CHECK lines
// below are FileCheck directives and must not be edited.
__m64 test_mm_set_pi16(short a, short b, short c, short d) {
  // CHECK-LABEL: test_mm_set_pi16
  // CHECK: insertelement <4 x i16>
  // CHECK: insertelement <4 x i16>
  // CHECK: insertelement <4 x i16>
  // CHECK: insertelement <4 x i16>
  return _mm_set_pi16(a, b, c, d);
}
// First pass of the two-stage half-pel filter: returns the raw 16-bit
// 6-tap sums A + F + 20*(C+D) - 5*(B+E) for 4 pixels, WITHOUT the +16
// rounding bias, shift or clipping — those are applied later by the
// caller (cf. interpolhline64_3_mmx).
__m64 interpolhline64_mmx(unsigned char* image){
	__m64 mm_A = _mm_set_pi16(image[3],image[2],image[1],image[0]);
	__m64 mm_B = _mm_set_pi16(image[4],image[3],image[2],image[1]);
	__m64 mm_C = _mm_set_pi16(image[5],image[4],image[3],image[2]);
	__m64 mm_D = _mm_set_pi16(image[6],image[5],image[4],image[3]);
	__m64 mm_E = _mm_set_pi16(image[7],image[6],image[5],image[4]);
	__m64 mm_F = _mm_set_pi16(image[8],image[7],image[6],image[5]);

	__m64 mm_AF = _mm_add_pi16(mm_A,mm_F);//A + F
	__m64 mm_BE = _mm_add_pi16(mm_B,mm_E);//B + E
	__m64 mm_CD = _mm_add_pi16(mm_C,mm_D);//C + D
	__m64 mm_CDS = _mm_slli_pi16(mm_CD,2);//(C + D) << 2
	__m64 mm_inter1 = _mm_sub_pi16(mm_CDS,mm_BE);//((C + D) << 2)-(B + E)
	__m64 mm_5 = _mm_set_pi16(5,5,5,5);
	__m64 mm_inter_3 = _mm_mullo_pi16(mm_inter1, mm_5);//(((C + D) << 2)-(B + E))*5
	// NOTE: no +16 here — this is the unrounded intermediate result.
	__m64 mm_result = _mm_add_pi16(mm_inter_3,mm_AF);//A + F + (((C + D) << 2)-(B + E))*5
	return(mm_result);
}
// Weighted merge of two byte planes into p1:
//   p1[x] = (p1[x]*invweight + p2[x]*weight + 16384) >> 15
// weight/invweight are 15-bit fixed-point factors.  The bulk runs 8
// pixels per iteration with MMX pmaddwd; the tail (width % 8) is scalar.
// NOTE(review): the MMX loads/stores assume p1/p2 rows are at least
// 8-byte readable for wMod8 pixels — guaranteed by the caller's pitch.
void weighted_merge_planar_mmx(BYTE *p1, const BYTE *p2, int p1_pitch, int p2_pitch, int width, int height, int weight, int invweight) {
  __m64 round_mask = _mm_set1_pi32(0x4000); // 16384 rounding bias before >> 15
  __m64 zero = _mm_setzero_si64();
  // Word pairs (weight, invweight) so pmaddwd produces p1*inv + p2*w per pixel.
  __m64 mask = _mm_set_pi16(weight, invweight, weight, invweight);

  int wMod8 = (width/8) * 8; // widest multiple of 8 handled by the MMX loop

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod8; x += 8) {
      __m64 px1 = *(reinterpret_cast<const __m64*>(p1+x)); //y7y6 y5y4 y3y2 y1y0
      __m64 px2 = *(reinterpret_cast<const __m64*>(p2+x)); //Y7Y6 Y5Y4 Y3Y2 Y1Y0

      // Interleave sources so each 32-bit madd lane sees one (p1,p2) pair.
      __m64 p0123 = _mm_unpacklo_pi8(px1, px2); //Y3y3 Y2y2 Y1y1 Y0y0
      __m64 p4567 = _mm_unpackhi_pi8(px1, px2); //Y7y7 Y6y6 Y5y5 Y4y4

      __m64 p01 = _mm_unpacklo_pi8(p0123, zero); //00Y1 00y1 00Y0 00y0
      __m64 p23 = _mm_unpackhi_pi8(p0123, zero); //00Y3 00y3 00Y2 00y2
      __m64 p45 = _mm_unpacklo_pi8(p4567, zero); //00Y5 00y5 00Y4 00y4
      __m64 p67 = _mm_unpackhi_pi8(p4567, zero); //00Y7 00y7 00Y6 00y6

      // p1*invweight + p2*weight, two pixels per register, 32-bit sums.
      p01 = _mm_madd_pi16(p01, mask);
      p23 = _mm_madd_pi16(p23, mask);
      p45 = _mm_madd_pi16(p45, mask);
      p67 = _mm_madd_pi16(p67, mask);

      // Round and scale back to 8-bit range.
      p01 = _mm_add_pi32(p01, round_mask);
      p23 = _mm_add_pi32(p23, round_mask);
      p45 = _mm_add_pi32(p45, round_mask);
      p67 = _mm_add_pi32(p67, round_mask);

      p01 = _mm_srli_pi32(p01, 15);
      p23 = _mm_srli_pi32(p23, 15);
      p45 = _mm_srli_pi32(p45, 15);
      p67 = _mm_srli_pi32(p67, 15);

      // Narrow 32 -> 16 -> 8 bits with saturation and store.
      p0123 = _mm_packs_pi32(p01, p23);
      p4567 = _mm_packs_pi32(p45, p67);

      __m64 result = _mm_packs_pu16(p0123, p4567);

      *reinterpret_cast<__m64*>(p1+x) = result;
    }

    // Scalar tail for the last width % 8 pixels.
    for (int x = wMod8; x < width; x++) {
      p1[x] = (p1[x]*invweight + p2[x]*weight + 16384) >> 15;
    }

    p1 += p1_pitch;
    p2 += p2_pitch;
  }
  _mm_empty(); // leave MMX state clean for FP code
}
// Weighted merge of the LUMA samples of a YUY2 (Y U Y V) surface:
//   src.Y = (luma.Y * weight + src.Y * invweight + 16384) >> 15
// Chroma bytes of `src` are passed through untouched.  The bulk runs
// 8 bytes (4 luma samples) per iteration; the tail is scalar, stepping
// by 2 so only luma bytes are touched.
static void weighted_merge_luma_yuy2_mmx(BYTE *src, const BYTE *luma, int pitch, int luma_pitch,int width, int height, int weight, int invweight) {
  __m64 round_mask = _mm_set1_pi32(0x4000); // 16384 rounding bias before >> 15
  __m64 mask = _mm_set_pi16(weight, invweight, weight, invweight);
  __m64 luma_mask = _mm_set1_pi16(0x00FF);   // selects Y bytes after unpack
#pragma warning(push)
#pragma warning(disable: 4309)
  __m64 chroma_mask = _mm_set1_pi16(0xFF00); // selects U/V bytes of src
#pragma warning(pop)

  int wMod8 = (width/8) * 8; // widest multiple of 8 handled by the MMX loop

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod8; x += 8) {
      __m64 px1 = *reinterpret_cast<const __m64*>(src+x);  //V1 Y3 U1 Y2 V0 Y1 U0 Y0
      __m64 px2 = *reinterpret_cast<const __m64*>(luma+x); //v1 y3 u1 y2 v0 y1 u0 y0

      // Interleave src/luma words so each madd lane pairs one Y sample
      // from each surface.
      __m64 src_lo = _mm_unpacklo_pi16(px1, px2); //v0 y1 V0 Y1 u0 y0 U0 Y0
      __m64 src_hi = _mm_unpackhi_pi16(px1, px2);

      // Strip chroma, leaving zero-extended luma words.
      src_lo = _mm_and_si64(src_lo, luma_mask); //00 v0 00 V0 00 u0 00 U0
      src_hi = _mm_and_si64(src_hi, luma_mask);

      // luma*weight + src*invweight, then round and shift back.
      src_lo = _mm_madd_pi16(src_lo, mask);
      src_hi = _mm_madd_pi16(src_hi, mask);

      src_lo = _mm_add_pi32(src_lo, round_mask);
      src_hi = _mm_add_pi32(src_hi, round_mask);

      src_lo = _mm_srli_pi32(src_lo, 15);
      src_hi = _mm_srli_pi32(src_hi, 15);

      // Recombine merged luma with the original chroma of src.
      __m64 result_luma = _mm_packs_pi32(src_lo, src_hi);
      __m64 result_chroma = _mm_and_si64(px1, chroma_mask);
      __m64 result = _mm_or_si64(result_chroma, result_luma);

      *reinterpret_cast<__m64*>(src+x) = result;
    }

    // Scalar tail: every second byte is a luma sample in YUY2.
    for (int x = wMod8; x < width; x+=2) {
      src[x] = (luma[x] * weight + src[x] * invweight + 16384) >> 15;
    }

    src += pitch;
    luma += luma_pitch;
  }
  _mm_empty(); // leave MMX state clean for FP code
}
//n_2 __m64 interpolvline64_mmx(unsigned char* image, const unsigned short PicWidthInPix){ __m64 mm_A = _mm_set_pi16(image[1 * PicWidthInPix],image[0],image[-1 * PicWidthInPix],image[-2 * PicWidthInPix]); __m64 mm_B = _mm_set_pi16(image[2 * PicWidthInPix],image[1 * PicWidthInPix],image[0],image[-1 * PicWidthInPix]); __m64 mm_C = _mm_set_pi16(image[3 * PicWidthInPix],image[2 * PicWidthInPix],image[1 * PicWidthInPix],image[0]); __m64 mm_D = _mm_set_pi16(image[4 * PicWidthInPix],image[3 * PicWidthInPix],image[2 * PicWidthInPix],image[1 * PicWidthInPix]); __m64 mm_E = _mm_set_pi16(image[5 * PicWidthInPix],image[4 * PicWidthInPix],image[3 * PicWidthInPix],image[2 * PicWidthInPix]); __m64 mm_F = _mm_set_pi16(image[6 * PicWidthInPix],image[5 * PicWidthInPix],image[4 * PicWidthInPix],image[3 * PicWidthInPix]); __m64 mm_AF = _mm_add_pi16(mm_A,mm_F);//A + F __m64 mm_BE = _mm_add_pi16(mm_B,mm_E);//B + E __m64 mm_CD = _mm_add_pi16(mm_C,mm_D);//C + D __m64 mm_CDS = _mm_slli_pi16(mm_CD,2);//(C + D) << 2 __m64 mm_inter1 = _mm_sub_pi16(mm_CDS,mm_BE);//((C + D) << 2)-(B + E) __m64 mm_5 = _mm_set_pi16(5,5,5,5); __m64 mm_inter_3 = _mm_mullo_pi16(mm_inter1, mm_5);//(((C + D) << 2)-(B + E))*5 __m64 mm_result = _mm_add_pi16(mm_inter_3,mm_AF);//A + F + 16 + (((C + D) << 2)-(B + E))*5 empty(); return(mm_result); }
// Blends four translucent span pixels into dest[0..3]: per channel,
//   dest = (dest*bga + fg*fga) >> 8
// where the foreground colors come from the colormap lookup of the four
// source bytes, and bga/fga are the background/foreground blend weights.
void rtv_lucent4cols_MMX(byte *source, argb_t *dest, int bga, int fga)
{
	// MMX temporaries:
	const __m64 upper8mask = _mm_set_pi16(0, 0xff, 0xff, 0xff);
	const __m64 fgAlpha = _mm_set_pi16(0, fga, fga, fga);
	const __m64 bgAlpha = _mm_set_pi16(0, bga, bga, bga);

#if 1
	const __m64 bgColors01 = _mm_setr_pi32(dest[0], dest[1]);
#else
	const __m64 bgColors01 = *((__m64 *)&dest[0]);
#endif
	const __m64 fgColors01 = _mm_setr_pi32(
		rt_mapcolor<argb_t>(dcol.colormap, source[0]),
		rt_mapcolor<argb_t>(dcol.colormap, source[1])
	);

	// For each pixel pair: widen bytes to words (unpack with itself),
	// mask to 8 bits per channel, scale bg by bgAlpha and fg by fgAlpha,
	// add with saturation, shift down by 8, and re-pack to bytes.
	const __m64 finalColors01 = _mm_packs_pu16(
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(bgColors01, bgColors01), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(fgColors01, fgColors01), upper8mask), fgAlpha)
			),
			8
		),
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(bgColors01, bgColors01), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(fgColors01, fgColors01), upper8mask), fgAlpha)
			),
			8
		)
	);

#if 1
	const __m64 bgColors23 = _mm_setr_pi32(dest[2], dest[3]);
#else
	// NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
	const __m64 bgColors23 = *((__m64 *)&dest[2]);
#endif
	const __m64 fgColors23 = _mm_setr_pi32(
		rt_mapcolor<argb_t>(dcol.colormap, source[2]),
		rt_mapcolor<argb_t>(dcol.colormap, source[3])
	);

	// Same blend for the second pixel pair.
	const __m64 finalColors23 = _mm_packs_pu16(
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(bgColors23, bgColors23), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(fgColors23, fgColors23), upper8mask), fgAlpha)
			),
			8
		),
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(bgColors23, bgColors23), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(fgColors23, fgColors23), upper8mask), fgAlpha)
			),
			8
		)
	);

#if 1
	dest[0] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors01, 32*0));
	dest[1] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors01, 32*1));
	dest[2] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors23, 32*0));
	dest[3] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors23, 32*1));
#else
	// NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
	*((__m64 *)&dest[0]) = finalColors01;
	*((__m64 *)&dest[2]) = finalColors23;
#endif

	// Required to reset FP:
	_mm_empty();
}
//mbl test with Pocket_PC
// In-place weak (normal, bS < 4) deblocking of a horizontal luma edge,
// 4 pixels wide: rows p2..p0 above the edge and q0..q2 below it, per
// the H.264 loop filter.  xstride is the row stride; alpha/beta are the
// edge-detection thresholds and tc0 the base clipping bound.
void weak_horizontal_luma_MMX(unsigned char pix[], const int xstride, const unsigned char alpha, const unsigned char beta, const unsigned char tc0){

	// Load the six rows as 4 x 16-bit lanes (values 0..255).
	__m64 mp2 = _mm_set_pi16(pix[-3*xstride + 3], pix[-3*xstride + 2], pix[-3*xstride + 1], pix[-3*xstride]);
	__m64 mp1 = _mm_set_pi16(pix[-2*xstride + 3], pix[-2*xstride + 2], pix[-2*xstride + 1], pix[-2*xstride]);
	__m64 mp0 = _mm_set_pi16(pix[-1*xstride + 3], pix[-1*xstride + 2], pix[-1*xstride + 1], pix[-1*xstride]);
	__m64 mq0 = _mm_set_pi16(pix[3], pix[2], pix[1], pix[0]);
	__m64 mq1 = _mm_set_pi16(pix[xstride + 3], pix[xstride + 2], pix[xstride + 1], pix[xstride]);
	__m64 mq2 = _mm_set_pi16(pix[2*xstride + 3], pix[2*xstride + 2], pix[2*xstride + 1], pix[2*xstride]);

	__m64 mrshift = _mm_set_pi16(0,0,0,1); // shift count 1; also reused as the zero-ish pack operand below
	// Byte-wise average is safe: every 16-bit lane holds a value <= 255,
	// so no per-byte carry can occur.
	__m64 maddp0_q0 = _mm_avg_pu8(mp0,mq0); //addp0_q0 = (p0 + q0 + 1) >> 1;
	__m64 maddp2 = _mm_add_pi16(maddp0_q0,mp2); //addp2 = (p2 + addp0_q0);
	__m64 maddq2 = _mm_add_pi16(maddp0_q0,mq2); //addq2 = (q2 + addp0_q0);
	__m64 maddp2_s = _mm_srl_pi16(maddp2,mrshift); //addp2 = (p2 + addp0_q0) >> 1;
	__m64 maddq2_s = _mm_srl_pi16(maddq2,mrshift); //addq2 = (q2 + addp0_q0) >> 1;
	__m64 mp1_c = _mm_sub_pi16(maddp2_s, mp1); //addp2 - p1
	__m64 mq1_c = _mm_sub_pi16(maddq2_s, mq1); //addq2 - q1

	//To calculate the filtering masks: each |x| < t test is done as
	// (t > x) && (x > -t) on the signed differences.
	__m64 malpha = _mm_set_pi16(alpha,alpha,alpha,alpha);
	__m64 malphab = _mm_set_pi16(-alpha,-alpha,-alpha,-alpha);
	__m64 mbeta = _mm_set_pi16(beta,beta,beta,beta);
	__m64 mbetab = _mm_set_pi16(-beta,-beta,-beta,-beta);

	__m64 mdiff_p0_q0 = _mm_sub_pi16(mq0,mp0); //q0 - p0
	__m64 mdiff_p1_p0 = _mm_sub_pi16(mp0,mp1); //p0 - p1
	__m64 mdiff_q1_q0 = _mm_sub_pi16(mq0, mq1); //q0 - q1
	__m64 mdiff_p2_p0 = _mm_sub_pi16(mp2,mp0); //p2 - p0
	__m64 mdiff_q2_q0 = _mm_sub_pi16(mq2,mq0); //q2 - q0

	__m64 mask0 = _mm_and_si64( _mm_cmpgt_pi16(malpha, mdiff_p0_q0), _mm_cmpgt_pi16(mdiff_p0_q0,malphab)); // |q0-p0| < alpha
	__m64 mask1 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_p1_p0), _mm_cmpgt_pi16(mdiff_p1_p0,mbetab));   // |p1-p0| < beta
	__m64 mask2 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_q1_q0), _mm_cmpgt_pi16(mdiff_q1_q0,mbetab));   // |q1-q0| < beta
	__m64 mask3 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_p2_p0), _mm_cmpgt_pi16(mdiff_p2_p0,mbetab));   // |p2-p0| < beta
	__m64 mask4 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_q2_q0), _mm_cmpgt_pi16(mdiff_q2_q0,mbetab));   // |q2-q0| < beta

	__m64 first_mask = _mm_and_si64 (_mm_and_si64 (mask0,mask1),mask2); //(abs(q0 - p0) < alpha) && (abs(p1 - p0) < beta) && (abs(q1 - q0) < beta)
	__m64 second_mask = _mm_and_si64 (first_mask,mask3); // additionally |p2-p0| < beta: p1 gets corrected
	__m64 third_mask = _mm_and_si64 (first_mask,mask4);  // additionally |q2-q0| < beta: q1 gets corrected

	// Core delta: ((q0 - p0) << 2 + (p1 - q1) + 4) >> 3
	__m64 mdiff_q0_p0 = _mm_sub_pi16(mq0,mp0); //(q0 - p0)
	__m64 mlshift = _mm_set_pi16(0,0,0,2);
	__m64 minter_1 = _mm_sll_pi16(mdiff_q0_p0, mlshift);//inter_1 = (q0 - p0 ) << 2;
	__m64 minter_2 = _mm_sub_pi16(mp1, mq1);//(p1 - q1)
	__m64 madd4 = _mm_set_pi16(4,4,4,4);
	__m64 minter_3 = _mm_add_pi16(minter_2, madd4);//inter_2 = (p1 - q1) + 4;
	__m64 minter_4 = _mm_add_pi16(minter_3,minter_1); //(inter_1 + inter_2)
	__m64 mrshift3 = _mm_set_pi16(0,0,0,3);
	__m64 minter5 = _mm_sra_pi16(minter_4, mrshift3);

	//Clip3
	__m64 m_tc0 = _mm_set_pi16(tc0,tc0,tc0,tc0);
	__m64 m_tcb0 = _mm_set_pi16(-tc0,-tc0,-tc0,-tc0);
	__m64 mres_c1 = _mm_min_pi16(_mm_max_pi16(mp1_c,m_tcb0),m_tc0); //CLIP3(-tc0, tc0, addp2 - p1 );
	__m64 mres_c2 = _mm_min_pi16(_mm_max_pi16(mq1_c,m_tcb0),m_tc0); //CLIP3(-tc0, tc0, addq2 - q1 );
	__m64 merror0 = _mm_and_si64 (mres_c1,second_mask);
	__m64 merror1 = _mm_and_si64 (mres_c2,third_mask);

	// tc is widened by 1 for each of the p2/q2 conditions that held.
	__m64 m_1 = _mm_set_pi16(1,1,1,1);
	__m64 m_and1 = _mm_and_si64 (mask3, m_1); //tc++; if abs( p2 - p0 ) < beta
	__m64 m_and2 = _mm_and_si64 (mask4, m_1); //tc++; if abs( q2 - q0 ) < beta
	__m64 m_tc = _mm_add_pi16(m_and2,_mm_add_pi16(m_tc0,m_and1));
	__m64 m_tcn =_mm_sub_pi16(_mm_sub_pi16(m_tcb0,m_and1),m_and2);
	__m64 mres_c3 = _mm_min_pi16(_mm_max_pi16(minter5,m_tcn),m_tc); //CLIP3(-tc, tc, delta);
	__m64 merror2 = _mm_and_si64 (mres_c3,first_mask);

	// Apply the clipped corrections and write back the four rows
	// (pack with unsigned saturation narrows 4 words to 4 bytes).
	__m64 result_p1 = _mm_add_pi16(merror0,mp1);
	__m64 result_q1 = _mm_add_pi16(merror1,mq1);
	__m64 result_p0 = _mm_add_pi16(merror2,mp0);
	__m64 result_q0 = _mm_sub_pi16(mq0, merror2);

	*((unsigned int* )(&pix[-2*xstride])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_p1,mrshift));
	*((unsigned int* )(&pix[-xstride])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_p0,mrshift));
	*((unsigned int* )(&pix[0])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_q0,mrshift));
	*((unsigned int* )(&pix[xstride])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_q1,mrshift));

	empty();// leave MMX state clean for FP code
}
// In-place weak deblocking of a horizontal chroma edge, 2 pixels wide:
// only p0/q0 are corrected (chroma never adjusts p1/q1), per the H.264
// loop filter.  xstride is the row stride; alpha/beta are thresholds
// and tc0 the clipping bound.  Only the two low lanes are used.
void weak_horizontal_chroma_MMX(unsigned char pix[], const int xstride, const unsigned char alpha , const unsigned char beta, const unsigned char tc0)
{
	// Load the four rows as 16-bit lanes (upper two lanes unused).
	__m64 mp1 = _mm_set_pi16( 0,0,pix[-2*xstride + 1], pix[-2*xstride]);
	__m64 mp0 = _mm_set_pi16( 0,0,pix[-1*xstride + 1], pix[-1*xstride]);
	__m64 mq0 = _mm_set_pi16( 0,0,pix[1], pix[0]);
	__m64 mq1 = _mm_set_pi16( 0,0,pix[xstride + 1], pix[xstride]);

	__m64 mdiff_p0_q0 = _mm_sub_pi16(mq0,mp0); //q0 - p0
	__m64 mdiff_p1_p0 = _mm_sub_pi16(mp0,mp1); //p0 - p1
	__m64 mdiff_q1_q0 = _mm_sub_pi16(mq0, mq1); //q0 - q1

	//To calculate the filtering mask: |x| < t done as (t > x) && (x > -t).
	__m64 malpha = _mm_set_pi16(0,0,alpha,alpha);
	__m64 malphab = _mm_set_pi16(0,0,-alpha,-alpha);
	__m64 mbeta = _mm_set_pi16(0,0,beta,beta);
	__m64 mbetab = _mm_set_pi16(0,0,-beta,-beta);

	__m64 mask0 = _mm_and_si64( _mm_cmpgt_pi16(malpha, mdiff_p0_q0), _mm_cmpgt_pi16(mdiff_p0_q0,malphab)); // |q0-p0| < alpha
	__m64 mask1 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_p1_p0), _mm_cmpgt_pi16(mdiff_p1_p0,mbetab));   // |p1-p0| < beta
	__m64 mask2 = _mm_and_si64( _mm_cmpgt_pi16(mbeta, mdiff_q1_q0), _mm_cmpgt_pi16(mdiff_q1_q0,mbetab));   // |q1-q0| < beta

	__m64 first_mask = _mm_and_si64 (_mm_and_si64 (mask0,mask1),mask2);

	// Core delta: ((q0 - p0) << 2 + (p1 - q1) + 4) >> 3
	__m64 mdiff_q0_p0 = _mm_sub_pi16(mq0,mp0); //(q0 - p0)
	__m64 mlshift = _mm_set_pi16(0,0,0,2);
	__m64 minter_1 = _mm_sll_pi16(mdiff_q0_p0, mlshift);//inter_1 = (q0 - p0 ) << 2;
	__m64 minter_2 = _mm_sub_pi16(mp1, mq1);//(p1 - q1)
	__m64 madd4 = _mm_set_pi16(4,4,4,4);
	__m64 minter_3 = _mm_add_pi16(minter_2, madd4);//inter_2 = (p1 - q1) + 4;
	__m64 minter_4 = _mm_add_pi16(minter_3,minter_1); //(inter_1 + inter_2)
	__m64 mrshift3 = _mm_set_pi16(0,0,0,3);
	__m64 minter5 = _mm_sra_pi16(minter_4, mrshift3);

	//Clip3 the delta to [-tc0, tc0], then zero it where the edge test failed.
	__m64 m_tc0 = _mm_set_pi16(0,0,tc0,tc0);
	__m64 m_tcb0 = _mm_set_pi16(0,0,-tc0,-tc0);
	__m64 mres_c3 = _mm_min_pi16(_mm_max_pi16(minter5,m_tcb0),m_tc0); //CLIP3(-tc0, tc0, delta);
	__m64 merror2 = _mm_and_si64 (mres_c3,first_mask);

	// Apply the correction to p0/q0 and store the two filtered bytes of
	// each row (pack narrows to bytes; only 2 bytes are written back).
	__m64 result_p0 = _mm_add_pi16(merror2,mp0);
	__m64 result_q0 = _mm_sub_pi16(mq0, merror2);
	__m64 mrshift = _mm_set_pi16(0,0,0,1);
	*((unsigned short* )(&pix[-xstride])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_p0,mrshift));
	*((unsigned short* )(&pix[0])) = _mm_cvtsi64_si32(_mm_packs_pu16(result_q0,mrshift));
	empty();// leave MMX state clean for FP code
}
// Mixes one positional audio ring buffer into _clientSamples for the
// given listening node, applying distance/off-axis attenuation and a
// simple stereo spatialization (inter-channel delay plus weak-channel
// level drop).  The MMX adds are saturating so the mix does not wrap.
void AudioMixer::addBufferToMixForListeningNodeWithBuffer(PositionalAudioRingBuffer* bufferToAdd,
                                                          AvatarAudioRingBuffer* listeningNodeBuffer) {
    float bearingRelativeAngleToSource = 0.0f;
    float attenuationCoefficient = 1.0f;
    int numSamplesDelay = 0;
    float weakChannelAmplitudeRatio = 1.0f;

    if (bufferToAdd != listeningNodeBuffer) {
        // if the two buffer pointers do not match then these are different buffers

        glm::vec3 relativePosition = bufferToAdd->getPosition() - listeningNodeBuffer->getPosition();
        glm::quat inverseOrientation = glm::inverse(listeningNodeBuffer->getOrientation());

        float distanceSquareToSource = glm::dot(relativePosition, relativePosition);
        float radius = 0.0f;

        if (bufferToAdd->getType() == PositionalAudioRingBuffer::Injector) {
            // injected sources may be spherical and carry their own attenuation
            InjectedAudioRingBuffer* injectedBuffer = (InjectedAudioRingBuffer*) bufferToAdd;
            radius = injectedBuffer->getRadius();
            attenuationCoefficient *= injectedBuffer->getAttenuationRatio();
        }

        if (radius == 0 || (distanceSquareToSource > radius * radius)) {
            // this is either not a spherical source, or the listener is outside the sphere

            if (radius > 0) {
                // this is a spherical source - the distance used for the coefficient
                // needs to be the closest point on the boundary to the source

                // ovveride the distance to the node with the distance to the point on the
                // boundary of the sphere
                distanceSquareToSource -= (radius * radius);

            } else {
                // calculate the angle delivery for off-axis attenuation
                glm::vec3 rotatedListenerPosition = glm::inverse(bufferToAdd->getOrientation()) * relativePosition;

                float angleOfDelivery = glm::angle(glm::vec3(0.0f, 0.0f, -1.0f),
                                                   glm::normalize(rotatedListenerPosition));

                const float MAX_OFF_AXIS_ATTENUATION = 0.2f;
                const float OFF_AXIS_ATTENUATION_FORMULA_STEP = (1 - MAX_OFF_AXIS_ATTENUATION) / 2.0f;

                float offAxisCoefficient = MAX_OFF_AXIS_ATTENUATION +
                    (OFF_AXIS_ATTENUATION_FORMULA_STEP * (angleOfDelivery / PI_OVER_TWO));

                // multiply the current attenuation coefficient by the calculated off axis coefficient
                attenuationCoefficient *= offAxisCoefficient;
            }

            glm::vec3 rotatedSourcePosition = inverseOrientation * relativePosition;

            const float DISTANCE_SCALE = 2.5f;
            const float GEOMETRIC_AMPLITUDE_SCALAR = 0.3f;
            const float DISTANCE_LOG_BASE = 2.5f;
            const float DISTANCE_SCALE_LOG = logf(DISTANCE_SCALE) / logf(DISTANCE_LOG_BASE);

            // calculate the distance coefficient using the distance to this node
            float distanceCoefficient = powf(GEOMETRIC_AMPLITUDE_SCALAR,
                                             DISTANCE_SCALE_LOG +
                                             (0.5f * logf(distanceSquareToSource) / logf(DISTANCE_LOG_BASE)) - 1);
            distanceCoefficient = std::min(1.0f, distanceCoefficient);

            // multiply the current attenuation coefficient by the distance coefficient
            attenuationCoefficient *= distanceCoefficient;

            // project the rotated source position vector onto the XZ plane
            rotatedSourcePosition.y = 0.0f;

            // produce an oriented angle about the y-axis
            bearingRelativeAngleToSource = glm::orientedAngle(glm::vec3(0.0f, 0.0f, -1.0f),
                                                              glm::normalize(rotatedSourcePosition),
                                                              glm::vec3(0.0f, 1.0f, 0.0f));

            const float PHASE_AMPLITUDE_RATIO_AT_90 = 0.5;

            // figure out the number of samples of delay and the ratio of the amplitude
            // in the weak channel for audio spatialization
            float sinRatio = fabsf(sinf(bearingRelativeAngleToSource));
            numSamplesDelay = SAMPLE_PHASE_DELAY_AT_90 * sinRatio;
            weakChannelAmplitudeRatio = 1 - (PHASE_AMPLITUDE_RATIO_AT_90 * sinRatio);
        }
    }

    // if the bearing relative angle to source is > 0 then the delayed channel is the right one
    int delayedChannelOffset = (bearingRelativeAngleToSource > 0.0f) ? 1 : 0;
    int goodChannelOffset = delayedChannelOffset == 0 ? 1 : 0;

    const int16_t* nextOutputStart = bufferToAdd->getNextOutput();
    const int16_t* bufferStart = bufferToAdd->getBuffer();
    int ringBufferSampleCapacity = bufferToAdd->getSampleCapacity();

    int16_t correctBufferSample[2], delayBufferSample[2];
    int delayedChannelIndex = 0;

    const int SINGLE_STEREO_OFFSET = 2;

    for (int s = 0; s < NETWORK_BUFFER_LENGTH_SAMPLES_STEREO; s += 4) {

        // setup the int16_t variables for the two sample sets
        correctBufferSample[0] = nextOutputStart[s / 2] * attenuationCoefficient;
        correctBufferSample[1] = nextOutputStart[(s / 2) + 1] * attenuationCoefficient;

        // weak channel lands numSamplesDelay later and is further attenuated
        delayedChannelIndex = s + (numSamplesDelay * 2) + delayedChannelOffset;

        delayBufferSample[0] = correctBufferSample[0] * weakChannelAmplitudeRatio;
        delayBufferSample[1] = correctBufferSample[1] * weakChannelAmplitudeRatio;

        __m64 bufferSamples = _mm_set_pi16(_clientSamples[s + goodChannelOffset],
                                           _clientSamples[s + goodChannelOffset + SINGLE_STEREO_OFFSET],
                                           _clientSamples[delayedChannelIndex],
                                           _clientSamples[delayedChannelIndex + SINGLE_STEREO_OFFSET]);
        __m64 addedSamples = _mm_set_pi16(correctBufferSample[0], correctBufferSample[1],
                                          delayBufferSample[0], delayBufferSample[1]);

        // perform the MMX add (with saturation) of two correct and delayed samples
        __m64 mmxResult = _mm_adds_pi16(bufferSamples, addedSamples);
        int16_t* shortResults = reinterpret_cast<int16_t*>(&mmxResult);

        // assign the results from the result of the mmx arithmetic
        // (_mm_set_pi16 fills high-to-low, hence the reversed indices)
        _clientSamples[s + goodChannelOffset] = shortResults[3];
        _clientSamples[s + goodChannelOffset + SINGLE_STEREO_OFFSET] = shortResults[2];
        _clientSamples[delayedChannelIndex] = shortResults[1];
        _clientSamples[delayedChannelIndex + SINGLE_STEREO_OFFSET] = shortResults[0];
    }

    // The following code is pretty gross and redundant, but AFAIK it's the best way to avoid
    // too many conditionals in handling the delay samples at the beginning of _clientSamples.
    // Basically we try to take the samples in batches of four, and then handle the remainder
    // conditionally to get rid of the rest.

    const int DOUBLE_STEREO_OFFSET = 4;
    const int TRIPLE_STEREO_OFFSET = 6;

    if (numSamplesDelay > 0) {
        // if there was a sample delay for this buffer, we need to pull samples prior to the nextOutput
        // to stick at the beginning
        float attenuationAndWeakChannelRatio = attenuationCoefficient * weakChannelAmplitudeRatio;
        const int16_t* delayNextOutputStart = nextOutputStart - numSamplesDelay;
        if (delayNextOutputStart < bufferStart) {
            // wrap around the ring buffer
            delayNextOutputStart = bufferStart + ringBufferSampleCapacity - numSamplesDelay;
        }

        int i = 0;

        while (i + 3 < numSamplesDelay) {
            // handle the first cases where we can MMX add four samples at once
            int parentIndex = i * 2;
            __m64 bufferSamples = _mm_set_pi16(_clientSamples[parentIndex + delayedChannelOffset],
                                               _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset],
                                               _clientSamples[parentIndex + DOUBLE_STEREO_OFFSET + delayedChannelOffset],
                                               _clientSamples[parentIndex + TRIPLE_STEREO_OFFSET + delayedChannelOffset]);
            __m64 addSamples = _mm_set_pi16(delayNextOutputStart[i] * attenuationAndWeakChannelRatio,
                                            delayNextOutputStart[i + 1] * attenuationAndWeakChannelRatio,
                                            delayNextOutputStart[i + 2] * attenuationAndWeakChannelRatio,
                                            delayNextOutputStart[i + 3] * attenuationAndWeakChannelRatio);
            __m64 mmxResult = _mm_adds_pi16(bufferSamples, addSamples);
            int16_t* shortResults = reinterpret_cast<int16_t*>(&mmxResult);

            _clientSamples[parentIndex + delayedChannelOffset] = shortResults[3];
            _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[2];
            _clientSamples[parentIndex + DOUBLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[1];
            _clientSamples[parentIndex + TRIPLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[0];

            // push the index
            i += 4;
        }

        int parentIndex = i * 2;

        if (i + 2 < numSamplesDelay) {
            // MMX add only three delayed samples
            __m64 bufferSamples = _mm_set_pi16(_clientSamples[parentIndex + delayedChannelOffset],
                                               _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset],
                                               _clientSamples[parentIndex + DOUBLE_STEREO_OFFSET + delayedChannelOffset],
                                               0);
            __m64 addSamples = _mm_set_pi16(delayNextOutputStart[i] * attenuationAndWeakChannelRatio,
                                            delayNextOutputStart[i + 1] * attenuationAndWeakChannelRatio,
                                            delayNextOutputStart[i + 2] * attenuationAndWeakChannelRatio,
                                            0);
            __m64 mmxResult = _mm_adds_pi16(bufferSamples, addSamples);
            int16_t* shortResults = reinterpret_cast<int16_t*>(&mmxResult);

            _clientSamples[parentIndex + delayedChannelOffset] = shortResults[3];
            _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[2];
            _clientSamples[parentIndex + DOUBLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[1];

        } else if (i + 1 < numSamplesDelay) {
            // MMX add two delayed samples
            __m64 bufferSamples = _mm_set_pi16(_clientSamples[parentIndex + delayedChannelOffset],
                                               _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset], 0, 0);
            __m64 addSamples = _mm_set_pi16(delayNextOutputStart[i] * attenuationAndWeakChannelRatio,
                                            delayNextOutputStart[i + 1] * attenuationAndWeakChannelRatio, 0, 0);

            __m64 mmxResult = _mm_adds_pi16(bufferSamples, addSamples);
            int16_t* shortResults = reinterpret_cast<int16_t*>(&mmxResult);

            _clientSamples[parentIndex + delayedChannelOffset] = shortResults[3];
            _clientSamples[parentIndex + SINGLE_STEREO_OFFSET + delayedChannelOffset] = shortResults[2];

        } else if (i < numSamplesDelay) {
            // MMX add a single delayed sample
            __m64 bufferSamples = _mm_set_pi16(_clientSamples[parentIndex + delayedChannelOffset], 0, 0, 0);
            __m64 addSamples = _mm_set_pi16(delayNextOutputStart[i] * attenuationAndWeakChannelRatio, 0, 0, 0);

            __m64 mmxResult = _mm_adds_pi16(bufferSamples, addSamples);
            int16_t* shortResults = reinterpret_cast<int16_t*>(&mmxResult);

            _clientSamples[parentIndex + delayedChannelOffset] = shortResults[3];
        }
    }
}