// 6-tap (1,-5,20,20,-5,1) horizontal half-pel interpolation filter, computed
// for 4 adjacent output pixels at once with MMX.  Reads image[-2..6];
// evaluates A + F + 20*(C+D) - 5*(B+E) + 16, arithmetic-shifts right by 5 and
// clamps to [0,255].  Returns the 4 result bytes packed in the low 32 bits.
// NOTE(review): empty() is presumably a project wrapper for _mm_empty()/EMMS
// — confirm.
unsigned int interpolhline_mmx_2(unsigned char* image){
	// Each vector holds the same tap for 4 consecutive output positions,
	// widened to 16-bit lanes.
	__m64 mm_A = _mm_set_pi16(image[1],image[0],image[-1],image[-2]);
	__m64 mm_B = _mm_set_pi16(image[2],image[1],image[0],image[-1]);
	__m64 mm_C = _mm_set_pi16(image[3],image[2],image[1],image[0]);
	__m64 mm_D = _mm_set_pi16(image[4],image[3],image[2],image[1]);
	__m64 mm_E = _mm_set_pi16(image[5],image[4],image[3],image[2]);
	__m64 mm_F = _mm_set_pi16(image[6],image[5],image[4],image[3]);
	__m64 mm_AF = _mm_add_pi16(mm_A,mm_F);//A + F
	__m64 mm_inter0 = _mm_add_pi16(mm_AF,_mm_set_pi16(16,16,16,16));//A + F + 16 (rounding term)
	__m64 mm_BE = _mm_add_pi16(mm_B,mm_E);//B + E
	__m64 mm_CD = _mm_add_pi16(mm_C,mm_D);//C + D
	__m64 mm_CDS = _mm_slli_pi16(mm_CD,2);//(C + D) << 2
	__m64 mm_inter1 = _mm_sub_pi16(mm_CDS,mm_BE);//((C + D) << 2)-(B + E)
	__m64 mm_5 = _mm_set_pi16(5,5,5,5);
	// *5 turns the <<2 term into the 20/-5 tap weights: 5*(4*(C+D)-(B+E)).
	__m64 mm_inter_3 = _mm_mullo_pi16(mm_inter1, mm_5);//(((C + D) << 2)-(B + E))*5
	__m64 mm_result = _mm_add_pi16(mm_inter_3,mm_inter0);//A + F + 16 + (((C + D) << 2)-(B + E))*5
	__m64 mm_zero = _mm_setzero_si64();
	__m64 mm_clip = _mm_max_pi16(mm_result,mm_zero);//Clip with 0 (before the arithmetic shift)
	__m64 mm_ret = _mm_srai_pi16(mm_clip,5);
	__m64 mm_clip1 = _mm_min_pi16(mm_ret,_mm_set_pi16(255,255,255,255)); //Clip with 255
	__m64 result =_mm_packs_pu16(mm_clip1,mm_zero);
	unsigned int ret = _mm_cvtsi64_si32(result);
	empty();
	return ret;
}
// pix_subtract::processYUV_MMX
// Per-pixel subtraction of `right` from `image` for packed YUV data,
// 8 bytes per iteration, result written back into `image`.
// After widening bytes to 16-bit lanes, 0x80 is added (saturating) to lanes
// 0 and 2 of each half — i.e. the chroma byte of each chroma/luma byte pair —
// so that the unsigned saturating subtract keeps chroma differences centred
// on 0x80 instead of clamping at zero.
// datasize is rounded up so a partial trailing __m64 is still processed.
void pix_subtract :: processYUV_MMX (imageStruct &image, imageStruct &right){
	int datasize = image.xsize * image.ysize * image.csize;
	__m64*leftPix = (__m64*)image.data;
	__m64*rightPix = (__m64*)right.data;

	datasize=datasize/sizeof(__m64)+(datasize%sizeof(__m64)!=0);
	__m64 null64 = _mm_setzero_si64();
	__m64 offset = _mm_setr_pi16(0x80, 0x00, 0x80, 0x00);
	__m64 l0, l1, r0, r1;
	while (datasize--) {
		l1=leftPix[datasize];
		r1=rightPix[datasize];
		// Widen 8 bytes to 2x4 unsigned 16-bit lanes.
		l0=_mm_unpacklo_pi8 (l1, null64);
		r0=_mm_unpacklo_pi8 (r1, null64);
		l1=_mm_unpackhi_pi8 (l1, null64);
		r1=_mm_unpackhi_pi8 (r1, null64);
		// Re-bias chroma, then saturating subtract and repack.
		l0=_mm_adds_pu16(l0, offset);
		l1=_mm_adds_pu16(l1, offset);
		l0=_mm_subs_pu16(l0, r0);
		l1=_mm_subs_pu16(l1, r1);
		leftPix[datasize]=_mm_packs_pu16(l0, l1);
	}
	_mm_empty();
}
// Vertical 6-tap (1,-5,20,20,-5,1) half-pel filter for 4 horizontally
// adjacent pixels, followed by averaging with the pixel one row below (D) —
// i.e. a quarter-pel position.  Taps read rows -2..+3 at stride
// PicWidthInPix.  The _mm_avg_pu8 on 16-bit lanes works because every lane is
// already clamped to [0,255], so the high byte of each lane is zero.
// NOTE(review): empty() is presumably a wrapper for _mm_empty() — confirm.
unsigned int interpolvline_mmx_3(unsigned char* image, int PicWidthInPix){
	// One vector per tap row, 4 pixels widened to 16-bit lanes.
	__m64 mm_A = _mm_set_pi16(image[-2 * PicWidthInPix + 3],image[-2 * PicWidthInPix + 2],image[-2 * PicWidthInPix + 1],image[-2 * PicWidthInPix]);
	__m64 mm_B = _mm_set_pi16(image[-1 * PicWidthInPix + 3],image[-1 * PicWidthInPix + 2],image[-1 * PicWidthInPix + 1],image[-1 * PicWidthInPix]);
	__m64 mm_C = _mm_set_pi16(image[3],image[2],image[1],image[0]);
	__m64 mm_D = _mm_set_pi16(image[1 * PicWidthInPix + 3],image[1 * PicWidthInPix + 2],image[1 * PicWidthInPix + 1],image[1 * PicWidthInPix]);
	__m64 mm_E = _mm_set_pi16(image[2 * PicWidthInPix + 3],image[2 * PicWidthInPix + 2],image[2 * PicWidthInPix + 1],image[2 * PicWidthInPix]);
	__m64 mm_F = _mm_set_pi16(image[3 * PicWidthInPix + 3],image[3 * PicWidthInPix + 2],image[3 * PicWidthInPix + 1],image[3 * PicWidthInPix]);

	__m64 mm_AF = _mm_add_pi16(mm_A,mm_F);//A + F
	__m64 mm_inter0 = _mm_add_pi16(mm_AF,_mm_set_pi16(16,16,16,16));//A + F + 16
	__m64 mm_BE = _mm_add_pi16(mm_B,mm_E);//B + E
	__m64 mm_CD = _mm_add_pi16(mm_C,mm_D);//C + D
	__m64 mm_CDS = _mm_slli_pi16(mm_CD,2);//(C + D) << 2
	__m64 mm_inter1 = _mm_sub_pi16(mm_CDS,mm_BE);//((C + D) << 2)-(B + E)
	__m64 mm_5 = _mm_set_pi16(5,5,5,5);
	__m64 mm_inter_3 = _mm_mullo_pi16(mm_inter1, mm_5);//(((C + D) << 2)-(B + E))*5
	__m64 mm_result = _mm_add_pi16(mm_inter_3,mm_inter0);//A + F + 16 + (((C + D) << 2)-(B + E))*5
	__m64 mm_zero = _mm_setzero_si64();
	__m64 mm_clip = _mm_max_pi16(mm_result,mm_zero);//Clip with 0
	__m64 mm_ret = _mm_srai_pi16(mm_clip,5);
	__m64 mm_clip1 = _mm_min_pi16(mm_ret,_mm_set_pi16(255,255,255,255)); //Clip with 255
	// Average the filtered value with row D for the quarter-pel sample.
	__m64 test = _mm_avg_pu8(mm_clip1,mm_D);//(ptr_img[0] + ptr_rf[0] + 1) >> 1
	__m64 test1 =_mm_packs_pu16(test,mm_zero);
	unsigned int ret = _mm_cvtsi64_si32(test1);
	empty();
	return ret;
}
// pix_multiply::processRGBA_MMX
// Per-channel multiply of two RGBA images: out = (left * right) >> 8,
// computed on 8 bytes (2 RGBA pixels) per iteration, written back into
// `image`.  Bytes are widened to 16-bit lanes so the product fits, then the
// high byte of each product is taken as the result.
// datasize is rounded up so a partial trailing __m64 is still processed.
void pix_multiply :: processRGBA_MMX(imageStruct &image, imageStruct &right)
{
	int datasize = image.xsize * image.ysize * image.csize;
	__m64*leftPix = (__m64*)image.data;
	__m64*rightPix = (__m64*)right.data;

	datasize=datasize/sizeof(__m64)+(datasize%sizeof(__m64)!=0);
	__m64 l0, r0, l1, r1;
	__m64 null64 = _mm_setzero_si64();
	while(datasize--) {
		l1=leftPix [datasize];
		r1=rightPix[datasize];
		// Widen to 16-bit lanes, multiply, and rescale by >>8.
		l0=_mm_unpacklo_pi8(l1, null64);
		r0=_mm_unpacklo_pi8(r1, null64);
		l1=_mm_unpackhi_pi8(l1, null64);
		r1=_mm_unpackhi_pi8(r1, null64);
		l0=_mm_mullo_pi16 (l0, r0);
		l1=_mm_mullo_pi16 (l1, r1);
		l0=_mm_srli_pi16(l0, 8);
		l1=_mm_srli_pi16(l1, 8);
		leftPix[datasize]=_mm_packs_pu16(l0, l1);
	}
	_mm_empty();
}
// Builds the global MMX mask 0x00FF00FF00FF00FF (all-ones bytes interleaved
// with zero bytes) and stores it in global_mask.
// Fix: the original never executed EMMS after touching MMX registers, which
// leaves the x87 FP state unusable for any subsequent floating-point code;
// every other MMX routine in this file ends with _mm_empty().
int main()
{
	__m64 zero = _mm_setzero_si64();
	__m64 mask = _mm_cmpeq_pi8( zero, zero );   // all bits set
	mask = _mm_unpacklo_pi8( mask, zero );      // 0x00FF in each 16-bit lane
	global_mask = mask;
	_mm_empty();   // EMMS: leave the FP/MMX state clean before returning
	return 0;
}
// Second-stage 6-tap (1,-5,20,20,-5,1) filter over six vectors of 16-bit
// intermediate values (temp[0..5]), producing one 4x16-bit result vector.
// Because the products overflow 16 bits, each of the 4 lanes is computed in
// scalar form via extract/insert, with +512 rounding and >>10 scaling
// (the intermediates already carry a factor of 32).  The result is then
// averaged with the clipped centre value (temp[3]+16)>>5 for a quarter-pel
// sample.  CLIP255_16 is a project macro clamping to [0,255].
__m64 interpolhline64_3_mmx(__m64* temp){
	__m64 res,res1;
	__m64 ptr = _mm_setzero_si64();
	__m64 mm_16 = _mm_set_pi16(16,16,16,16);

	// Lane 0.
	short A = _mm_extract_pi16(temp[0],0);
	short B = _mm_extract_pi16(temp[1],0);
	short C = _mm_extract_pi16(temp[2],0);
	short D = _mm_extract_pi16(temp[3],0);
	short E = _mm_extract_pi16(temp[4],0);
	short F = _mm_extract_pi16(temp[5],0);
	unsigned int result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
	ptr = _mm_insert_pi16(ptr,CLIP255_16(result >> 10),0);

	// Lane 1.
	A = _mm_extract_pi16(temp[0],1);
	B = _mm_extract_pi16(temp[1],1);
	C = _mm_extract_pi16(temp[2],1);
	D = _mm_extract_pi16(temp[3],1);
	E = _mm_extract_pi16(temp[4],1);
	F = _mm_extract_pi16(temp[5],1);
	result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
	ptr = _mm_insert_pi16(ptr,CLIP255_16(result >> 10),1);

	// Lane 2.
	A = _mm_extract_pi16(temp[0],2);
	B = _mm_extract_pi16(temp[1],2);
	C = _mm_extract_pi16(temp[2],2);
	D = _mm_extract_pi16(temp[3],2);
	E = _mm_extract_pi16(temp[4],2);
	F = _mm_extract_pi16(temp[5],2);
	result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
	ptr = _mm_insert_pi16(ptr,CLIP255_16(result >> 10),2);

	// Lane 3.
	A = _mm_extract_pi16(temp[0],3);
	B = _mm_extract_pi16(temp[1],3);
	C = _mm_extract_pi16(temp[2],3);
	D = _mm_extract_pi16(temp[3],3);
	E = _mm_extract_pi16(temp[4],3);
	F = _mm_extract_pi16(temp[5],3);
	result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
	ptr = _mm_insert_pi16(ptr,CLIP255_16(result >> 10),3);

	// Clipped centre value: (temp[3] + 16) >> 5, clamped to [0,255].
	res = _mm_add_pi16(temp[3],mm_16);
	res1 = _mm_srai_pi16(res,5);
	res1 = _mm_max_pi16(res1,_mm_set_pi16(0,0,0,0));
	res1 = _mm_min_pi16(res1,_mm_set_pi16(255,255,255,255)); //Clip
	// Safe on 16-bit lanes: both operands are in [0,255] so high bytes are 0.
	res = _mm_avg_pu16(ptr,res1);//(ptr_img[0] + ptr_rf[0] + 1) >> 1*/
	return res;
}
// mediaLib separable 3x3 convolution for signed 16-bit images (no-write-edge
// variant).  The heavy lifting is in file-local macros:
// GET_SRC_DST_PARAMETERS declares width/height/strides/sl/dl,
// GET_KERN loads hker*/vker* and shift from hkernel/vkernel/scalef_expon,
// PREP_V precomputes the vertical pass, CONV_3x3 produces 4 output pixels
// into rr.  The last partial group of <4 pixels is blended into dst through
// a mask so pixels beyond `width` are preserved.
mlib_status
mlib_m_sconv3x3_16nw_1(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
	GET_SRC_DST_PARAMETERS(mlib_s16);
	__m64 hker0, hker1, hker2, vker0, vker1, vker2;
	__m64 s0, s1, s2, v0, v1, aa, bb, rr, rh, rl;
	__m64 *sp0, *sp1, *sp2, *dp;
	__m64 zero, _rnd;
	mlib_s32 shift, kerh_sum;
	mlib_s32 i, j;

	// 3x3 kernel: output is 2 rows/columns smaller; skip the first
	// destination row and channel group.
	width -= 2;
	height -= 2;
	width *= NCHAN;
	dl += dll + NCHAN;

	GET_KERN();

	zero = _mm_setzero_si64();

	for (j = 0; j < height; j++) {
		sp0 = (__m64 *) sl;
		sp1 = (__m64 *) (sl + sll);
		sp2 = (__m64 *) (sl + 2 * sll);
		dp = (__m64 *) dl;

		PREP_V();

		// 4 s16 results per iteration.
		for (i = 0; i < width / 4; i++) {
			CONV_3x3();
			dp[i] = rr;
		}

		// Tail: merge the last 1-3 results under a mask.
		if (width & 3) {
			__m64 mask = ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

			CONV_3x3();
			dp[i] = _mm_or_si64(_mm_and_si64(mask, rr),
			    _mm_andnot_si64(mask, dp[i]));
		}

		sl += sll;
		dl += dll;
	}

	_mm_empty();

	return (MLIB_SUCCESS);
}
// pix_movement::processGrayMMX
// Frame-differencing motion detector for grayscale images.
// Keeps the previous frame in `buffer`, computes |current - previous| with
// the saturated-subtract trick, thresholds it, and writes the resulting
// binary mask into `buffer2`, which then replaces image.data.
void pix_movement :: processGrayMMX(imageStruct &image)
{
	// assume that the pix_size does not change !
	bool doclear=(image.xsize*image.ysize != buffer.xsize*buffer.ysize);
	buffer.xsize = image.xsize;
	buffer.ysize = image.ysize;
	buffer.reallocate();
	if(doclear) {
		buffer.setWhite();
	}
	buffer2.xsize = image.xsize;
	buffer2.ysize = image.ysize;
	buffer2.reallocate();

	int pixsize = image.ysize * image.xsize / sizeof(__m64);

	unsigned char thresh=threshold;

	__m64*rp = (__m64*)image.data;	// read pointer
	__m64*wp = (__m64*)buffer.data;	// write pointer to the copy
	__m64*wp2= (__m64*)buffer2.data;	// write pointer to the diff-image
	__m64 m1, m2, grey;
	__m64 thresh8=_mm_set_pi8(thresh,thresh,thresh,thresh,
	                          thresh,thresh,thresh,thresh);

	// there is still one problem with the threshold: is the cmpgt only for signed ?
	while(pixsize--) {
		grey = rp[pixsize]; // image.data
		m2   = wp[pixsize]; // buffer.data

		// Historical abs-diff via compare/and/or, kept for reference:
		//m0 =_mm_cmpgt_pi8(grey, m2); // (grey>m2)
		//m1 =_mm_subs_pu8 (grey, m2); // (grey-m2)
		//m2 =_mm_subs_pu8 (m2, grey); // (m2-grey)
		//m1 =_mm_and_si64 (m1, m0); // (m2-grey)&(grey>m2)  ((??))
		//m0 =_mm_andnot_si64(m0, m2); // !(grey>m2)&(grey-m2) ((??))
		//m2 =_mm_or_si64 (m2, m0); // [(a-b)&(a>b)]|[(b-a)&!(a>b)]=abs(a-b)

		// this is better: use saturated arithmetic!
		// One of the two saturating subtracts is zero, so OR-ing them
		// yields |grey - m2| per byte.
		m1 =_mm_subs_pu8 (grey, m2); // (grey-m2)
		m2 =_mm_subs_pu8 (m2, grey); // (m2-grey)
		wp[pixsize]=grey;            // buffer.data: store current frame

		m2 = _mm_or_si64 (m2, m1); // |grey-m2|

		// Subtract the threshold (saturating) and test for a remainder.
		// NOTE(review): _mm_cmpgt_pi8 is a signed compare; diff bytes
		// >= 0x80 after the subtract would read as negative — confirm
		// intended behaviour for large differences.
		m2 =_mm_subs_pu8 (m2, thresh8);
		m2 =_mm_cmpgt_pi8(m2, _mm_setzero_si64());

		wp2[pixsize]=m2;	// output.data
	}
	_mm_empty();
	image.data = buffer2.data;
}
// Second-stage 6-tap (1,-5,20,20,-5,1) filter over six vectors of 16-bit
// intermediate values (temp[0..5]), with +512 rounding and >>10 scaling.
// Each of the 4 lanes is computed in scalar form via extract/insert because
// the products exceed the 16-bit lane width.  The four clipped results
// (each already in [0,255], so the pi16->pi8 narrowing below is lossless)
// are packed into the low 32 bits of the return value.
// CLIP255_16 is a project macro clamping to [0,255]; empty() presumably
// wraps _mm_empty().
unsigned int interpolvline64_2_mmx(__m64* temp){
	__m64 res;
	__m64 ptr = _mm_setzero_si64();

	// Lane 0.
	short A = _mm_extract_pi16(temp[0],0);
	short B = _mm_extract_pi16(temp[1],0);
	short C = _mm_extract_pi16(temp[2],0);
	short D = _mm_extract_pi16(temp[3],0);
	short E = _mm_extract_pi16(temp[4],0);
	short F = _mm_extract_pi16(temp[5],0);
	unsigned int result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
	ptr = _mm_insert_pi16(ptr,CLIP255_16(result >> 10),0);

	// Lane 1.
	A = _mm_extract_pi16(temp[0],1);
	B = _mm_extract_pi16(temp[1],1);
	C = _mm_extract_pi16(temp[2],1);
	D = _mm_extract_pi16(temp[3],1);
	E = _mm_extract_pi16(temp[4],1);
	F = _mm_extract_pi16(temp[5],1);
	result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
	ptr = _mm_insert_pi16(ptr,CLIP255_16(result >> 10),1);

	// Lane 2.
	A = _mm_extract_pi16(temp[0],2);
	B = _mm_extract_pi16(temp[1],2);
	C = _mm_extract_pi16(temp[2],2);
	D = _mm_extract_pi16(temp[3],2);
	E = _mm_extract_pi16(temp[4],2);
	F = _mm_extract_pi16(temp[5],2);
	result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
	ptr = _mm_insert_pi16(ptr,CLIP255_16(result >> 10),2);

	// Lane 3.
	A = _mm_extract_pi16(temp[0],3);
	B = _mm_extract_pi16(temp[1],3);
	C = _mm_extract_pi16(temp[2],3);
	D = _mm_extract_pi16(temp[3],3);
	E = _mm_extract_pi16(temp[4],3);
	F = _mm_extract_pi16(temp[5],3);
	result = A + F - 5 * (short)(B + E) + 20 * (short)(C + D) + 512;
	ptr = _mm_insert_pi16(ptr,CLIP255_16(result >> 10),3);

	// Pack the 4 byte-sized results into the low 32 bits.
	res = _mm_set_pi8(0,0,0,0,_mm_extract_pi16(ptr,3),_mm_extract_pi16(ptr,2),_mm_extract_pi16(ptr,1),_mm_extract_pi16(ptr,0));
	result = _mm_cvtsi64_si32(res);
	empty();
	return result;
}
// Weighted blend of two planes into p1:
//   p1[x] = (p1[x]*invweight + p2[x]*weight + 16384) >> 15
// MMX path handles 8 pixels per iteration via pmaddwd (each 32-bit lane is
// y*invweight + Y*weight thanks to the interleaved unpack); the scalar loop
// finishes the <8 remaining pixels of each row with the identical formula.
// Assumes weight/invweight are scaled so that weight + invweight == 32768.
void weighted_merge_planar_mmx(BYTE *p1, const BYTE *p2, int p1_pitch, int p2_pitch, int width, int height, int weight, int invweight) {
	__m64 round_mask = _mm_set1_pi32(0x4000);
	__m64 zero = _mm_setzero_si64();
	// Lane order pairs each p1 sample with invweight and p2 with weight.
	__m64 mask = _mm_set_pi16(weight, invweight, weight, invweight);

	int wMod8 = (width/8) * 8;

	for (int y = 0; y < height; y++) {
		for (int x = 0; x < wMod8; x += 8) {
			__m64 px1 = *(reinterpret_cast<const __m64*>(p1+x)); //y7y6 y5y4 y3y2 y1y0
			__m64 px2 = *(reinterpret_cast<const __m64*>(p2+x)); //Y7Y6 Y5Y4 Y3Y2 Y1Y0

			__m64 p0123 = _mm_unpacklo_pi8(px1, px2); //Y3y3 Y2y2 Y1y1 Y0y0
			__m64 p4567 = _mm_unpackhi_pi8(px1, px2); //Y7y7 Y6y6 Y5y5 Y4y4

			__m64 p01 = _mm_unpacklo_pi8(p0123, zero); //00Y1 00y1 00Y0 00y0
			__m64 p23 = _mm_unpackhi_pi8(p0123, zero); //00Y3 00y3 00Y2 00y2
			__m64 p45 = _mm_unpacklo_pi8(p4567, zero); //00Y5 00y5 00Y4 00y4
			__m64 p67 = _mm_unpackhi_pi8(p4567, zero); //00Y7 00y7 00Y6 00y6

			// Per 32-bit lane: y*invweight + Y*weight.
			p01 = _mm_madd_pi16(p01, mask);
			p23 = _mm_madd_pi16(p23, mask);
			p45 = _mm_madd_pi16(p45, mask);
			p67 = _mm_madd_pi16(p67, mask);

			// Round and rescale.
			p01 = _mm_add_pi32(p01, round_mask);
			p23 = _mm_add_pi32(p23, round_mask);
			p45 = _mm_add_pi32(p45, round_mask);
			p67 = _mm_add_pi32(p67, round_mask);

			p01 = _mm_srli_pi32(p01, 15);
			p23 = _mm_srli_pi32(p23, 15);
			p45 = _mm_srli_pi32(p45, 15);
			p67 = _mm_srli_pi32(p67, 15);

			// Narrow back to 8 unsigned bytes.
			p0123 = _mm_packs_pi32(p01, p23);
			p4567 = _mm_packs_pi32(p45, p67);

			__m64 result = _mm_packs_pu16(p0123, p4567);

			*reinterpret_cast<__m64*>(p1+x) = result;
		}

		// Scalar tail for width not a multiple of 8.
		for (int x = wMod8; x < width; x++) {
			p1[x] = (p1[x]*invweight + p2[x]*weight + 16384) >> 15;
		}

		p1 += p1_pitch;
		p2 += p2_pitch;
	}
	_mm_empty();
}
// pix_background::processRGBAMMX
// Background removal for RGBA: compares each pixel against a stored
// reference frame (captured on reset) and blacks out pixels whose per-channel
// difference stays within the m_*range thresholds.  The cmpeq_pi32 makes the
// keep/clear decision per 32-bit group (i.e. all 4 channels of a pixel must
// be within threshold for the pixel to be cleared).
void pix_background :: processRGBAMMX(imageStruct &image)
{
	long i,pixsize;
	pixsize = image.xsize * image.ysize * image.csize;

	// (Re)capture the reference frame when geometry/format changed.
	if(m_savedImage.xsize!=image.xsize ||
	   m_savedImage.ysize!=image.ysize ||
	   m_savedImage.format!=image.format)m_reset=1;

	m_savedImage.xsize=image.xsize;
	m_savedImage.ysize=image.ysize;
	m_savedImage.setCsizeByFormat(image.format);
	m_savedImage.reallocate();

	if (m_reset){
		memcpy(m_savedImage.data,image.data,pixsize);
	}
	m_reset=0;

	i=pixsize/sizeof(__m64)+(pixsize%sizeof(__m64)!=0);

	__m64*data =(__m64*)image.data;
	__m64*saved=(__m64*)m_savedImage.data;

	const __m64 thresh=_mm_set_pi8(m_Yrange, m_Urange, m_Vrange, m_Arange,
	                               m_Yrange, m_Urange, m_Vrange, m_Arange);
	// +1 so that a threshold of 0 still requires an exact match.
	const __m64 offset=_mm_set_pi8(1, 1, 1, 1, 1, 1, 1, 1);
	__m64 newpix, oldpix, m1;

	while(i--){
		/* 7ops, 3memops */
		/* i have the feeling that this is not faster at all!
		 * even if i have the 3memops + ONLY 1 _mm_subs_pu8()
		 * i am equally slow as the generic code;
		 * adding the other instruction does not change much
		 */
		newpix=*data;
		oldpix=*saved++;
		m1    = newpix;
		// |oldpix-newpix| via the two saturating subtracts.
		m1    = _mm_subs_pu8 (m1, oldpix);
		oldpix= _mm_subs_pu8 (oldpix, newpix);
		m1    = _mm_or_si64  (m1, oldpix); // |oldpix-newpix|
		m1    = _mm_adds_pu8 (m1, offset);
		m1    = _mm_subs_pu8 (m1, thresh); // zero where within threshold
		m1    = _mm_cmpeq_pi32 (m1, _mm_setzero_si64()); // |oldpix-newpix|>thresh
		// Mask is all-ones for "background" pixels: clear them, keep the rest.
		m1    = _mm_andnot_si64(m1, newpix);
		*data++ = m1;
	}
	_mm_empty();
}
// Sum of absolute transformed differences (SATD) for a 4x4 block of 8-bit
// pixels: subtract ref from src, run a 4x4 Hadamard transform (vertical
// butterflies, transpose, horizontal butterflies), sum |coefficients| and
// round.  Relies on the od_load_convert_subtract_x4 /
// od_mc_butterfly_2x2_16x4 / od_transpose16x4 helpers defined elsewhere.
// When OD_CHECKASM is defined the result is cross-checked against the C
// reference implementation.
int32_t od_mc_compute_satd8_4x4_sse2(const unsigned char *src, int systride,
 const unsigned char *ref, int rystride) {
	int32_t satd;
	__m64 sums;
	__m64 a;
	__m64 b;
	__m64 c;
	__m64 d;
	a = od_load_convert_subtract_x4(src + 0*systride, ref + 0*rystride);
	b = od_load_convert_subtract_x4(src + 1*systride, ref + 1*rystride);
	c = od_load_convert_subtract_x4(src + 2*systride, ref + 2*rystride);
	d = od_load_convert_subtract_x4(src + 3*systride, ref + 3*rystride);
	/*Vertical 1D transform.*/
	od_mc_butterfly_2x2_16x4(&a, &b, &c, &d);
	od_mc_butterfly_2x2_16x4(&a, &b, &c, &d);
	od_transpose16x4(&a, &b, &c, &d);
	/*Horizontal 1D transform.*/
	od_mc_butterfly_2x2_16x4(&a, &b, &c, &d);
	/*Use the fact that (abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) to merge the
	   final butterfly stage with the calculating the absolute values and the
	   first stage of accumulation.
	  Calculates (abs(a+b)+abs(a-b))/2-0x7FFF.
	  An offset must be added to the final sum before rounding to account for
	   subtracting 0x7FFF.*/
	a = _mm_sub_pi16(_mm_max_pi16(a, b),
	 _mm_adds_pi16(_mm_add_pi16(a, b), _mm_set1_pi16(0x7FFF)));
	c = _mm_sub_pi16(_mm_max_pi16(c, d),
	 _mm_adds_pi16(_mm_add_pi16(c, d), _mm_set1_pi16(0x7FFF)));
	/*Take the sum of all the absolute values.*/
	sums = _mm_add_pi16(a, c);
	/*Sum the elements of the vector.*/
	sums = _mm_add_pi16(sums, _mm_shuffle_pi16(sums, _MM_SHUFFLE(0, 1, 2, 3)));
	sums = _mm_add_pi16(sums, _mm_shuffle_pi16(sums, _MM_SHUFFLE(2, 3, 0, 1)));
	sums = _mm_unpacklo_pi16(sums, _mm_setzero_si64());
	satd = _mm_cvtsi64_si32(sums);
	/*Subtract the offset (8) and round.*/
	satd = (satd + 1 - 8) >> 1;
#if defined(OD_CHECKASM)
	{
		int32_t c_satd;
		c_satd = od_mc_compute_satd8_4x4_c(src, systride, ref, rystride);
		if (satd != c_satd) {
			fprintf(stderr, "od_mc_compute_satd %ix%i check failed: %i!=%i\n",
			 4, 4, satd, c_satd);
		}
	}
#endif
	return satd;
}
// mediaLib separable 5x5 convolution for unsigned 16-bit, 4-channel images
// (no-write-edge variant).  With 4 channels one pixel is exactly one __m64
// (4 x u16), which is presumably why the inner loop runs once per pixel with
// no tail-mask handling (unlike the 1-channel 3x3 variant) — confirm against
// the CONV_5x5 macro.  GET_SRC_DST_PARAMETERS / GET_KERN / PREP_V / CONV_5x5
// are file-local macros supplying geometry, kernels and the actual math.
mlib_status
mlib_m_sconv5x5_u16nw_4(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
	GET_SRC_DST_PARAMETERS(mlib_s16);
	__m64 hker0, hker1, hker2, hker3, hker4;
	__m64 vker0, vker1, vker2, vker3, vker4;
	__m64 s0, s1, s2, s3, s4, v0, v1, v2, v3, v4, rr, rh, rl;
	__m64 zero, ker_off, mask8000;
	__m64 *sp0, *sp1, *sp2, *sp3, *sp4, *dp;
	mlib_s32 shift, ker_sum, kerh_sum = 0, kerv_sum = 0;
	mlib_s32 i, j;

	// 5x5 kernel: output shrinks by 4 rows/columns; skip 2 edge rows and
	// 2 edge pixels in the destination.
	width -= 4;
	height -= 4;
	dl += 2 * (dll + NCHAN);

	GET_KERN();

	zero = _mm_setzero_si64();

	for (j = 0; j < height; j++) {
		sp0 = (__m64 *) sl;
		sp1 = (__m64 *) (sl + sll);
		sp2 = (__m64 *) (sl + 2 * sll);
		sp3 = (__m64 *) (sl + 3 * sll);
		sp4 = (__m64 *) (sl + 4 * sll);
		dp = (__m64 *) dl;

		PREP_V();

		// One __m64 (one 4-channel pixel) per iteration.
		for (i = 0; i < width; i++) {
			CONV_5x5();
			dp[i] = rr;
		}

		sl += sll;
		dl += dll;
	}

	_mm_empty();

	return (MLIB_SUCCESS);
}
// Draws a 6x11 anti-aliased glyph/shape at (x, y) in colour c, fully clipped
// (bails out unless the whole 6x11 rect fits in the block).  Coverage data
// comes from the run-length-encoded table aa_packed: each control byte packs
// a skip count ((c >> 3) & 15), a run length (c & 7, dispatched through the
// intentionally falling-through Duff's-device switch over the AAPMMX_ macro,
// which blends one pixel using alpha/h/m/mc/mask/zero), and an end-of-row
// flag (c & 0x80).  A zero control byte terminates.
// NOTE(review): the loop-local `dword c` shadows the `Color c` parameter;
// safe here because the colour was already expanded into mc, but fragile if
// AAPMMX_ were ever changed to reference `c`.
void DrawAAPMMX(PixelBlock& w, int x, int y, Color c)
{
	if(!Rect(w.GetSize()).Contains(Rect(x, y, x + 6, y + 11)))
		return;
	dword *a = w.PointAdr(x, y);
	int d = w.LineDelta();
	__m64 zero = _mm_setzero_si64();
	// Colour expanded to 16-bit lanes once, up front.
	__m64 mc = _mm_unpacklo_pi8(_mm_cvtsi32_si64(c.GetRaw()), zero);
	__m64 mask = _mm_set1_pi16(0xff);
	const byte *s = aa_packed;
	dword *t = a;
	__m64 alpha;
	__m64 h;
	__m64 m;
	for(;;) {
		dword c = *s++;       // RLE control byte (shadows parameter c)
		if(c == 0)
			break;
		t += (c >> 3) & 15;   // skip fully transparent pixels
		switch(c & 7) {       // blend `run` pixels; fallthrough intended
		case 7: AAPMMX_(6);
		case 6: AAPMMX_(5);
		case 5: AAPMMX_(4);
		case 4: AAPMMX_(3);
		case 3: AAPMMX_(2);
		case 2: AAPMMX_(1);
		case 1: AAPMMX_(0);
		}
		t += c & 7;
		s += c & 7;           // per-pixel alpha bytes follow the control byte
		if(c & 0x80) {        // end of row: move down one scanline
			a += d;
			t = a;
		}
	}
	_mm_empty();
}
/* do the processing for all colourspaces */
// pix_motionblur::processMMX
// Temporal blur, colourspace-agnostic (operates on raw bytes):
//   out = (current*m_blur0 + previous*m_blur1) >> 8  (saturating add)
// The result is written both to the image and to m_savedImage, which serves
// as the "previous frame" for the next call.  Gains are 8.8-style fixed
// point; datasize is rounded up so a partial trailing __m64 is processed.
void pix_motionblur :: processMMX(imageStruct &image)
{
	m_savedImage.xsize=image.xsize;
	m_savedImage.ysize=image.ysize;
	m_savedImage.setCsizeByFormat(image.format);
	m_savedImage.reallocate();
	int pixsize=image.ysize*image.xsize*image.csize;
	pixsize=pixsize/sizeof(__m64)+(pixsize%sizeof(__m64)!=0);

	__m64*pixels=(__m64*)image.data;
	__m64*old=(__m64*)m_savedImage.data;

	__m64 newGain = _mm_set1_pi16(static_cast<short>(m_blur0));
	__m64 oldGain = _mm_set1_pi16(static_cast<short>(m_blur1));
	__m64 null64 = _mm_setzero_si64();

	__m64 newpix1, newpix2, oldpix1, oldpix2;

	while(pixsize--) {
		newpix1=pixels[pixsize];
		oldpix1=old[pixsize];
		// Widen 8 bytes to 2x4 16-bit lanes.
		newpix2 = _mm_unpackhi_pi8(newpix1, null64);
		newpix1 = _mm_unpacklo_pi8(newpix1, null64);
		oldpix2 = _mm_unpackhi_pi8(oldpix1, null64);
		oldpix1 = _mm_unpacklo_pi8(oldpix1, null64);

		// Weighted sum in 16-bit, saturating, then >>8 back to bytes.
		newpix1 = _mm_mullo_pi16(newpix1, newGain);
		newpix2 = _mm_mullo_pi16(newpix2, newGain);
		oldpix1 = _mm_mullo_pi16(oldpix1, oldGain);
		oldpix2 = _mm_mullo_pi16(oldpix2, oldGain);

		newpix1 = _mm_adds_pu16 (newpix1, oldpix1);
		newpix2 = _mm_adds_pu16 (newpix2, oldpix2);

		newpix1 = _mm_srli_pi16(newpix1, 8);
		newpix2 = _mm_srli_pi16(newpix2, 8);

		newpix1 = _mm_packs_pu16(newpix1, newpix2);

		// Feed the blurred frame back as the next "previous" frame.
		pixels[pixsize]=newpix1;
		old   [pixsize]=newpix1;
	}
	_mm_empty();
}
// pix_background::processGrayMMX
// Background removal for grayscale: keeps only pixels whose absolute
// difference from the stored reference frame exceeds the threshold
// (m_Yrange-1); all other pixels become 0.  With m_Yrange==0 the image is
// left untouched.  Unlike the RGBA/YUV variants the thresholding here is
// per byte (no per-pixel grouping needed for a 1-channel image).
void pix_background :: processGrayMMX(imageStruct &image){
	int i;
	long pixsize;
	pixsize = image.xsize * image.ysize * image.csize;

	// (Re)capture the reference frame when geometry/format changed.
	if(m_savedImage.xsize!=image.xsize ||
	   m_savedImage.ysize!=image.ysize ||
	   m_savedImage.format!=image.format)m_reset=1;

	m_savedImage.xsize=image.xsize;
	m_savedImage.ysize=image.ysize;
	m_savedImage.setCsizeByFormat(image.format);
	m_savedImage.reallocate();

	if (m_reset){
		memcpy(m_savedImage.data,image.data,pixsize);
	}
	m_reset=0;

	if(m_Yrange==0)return;

	__m64*npixes=(__m64*)image.data;
	__m64*opixes=(__m64*)m_savedImage.data;
	__m64 newpix, oldpix, m1;

	unsigned char thresh=m_Yrange-1;
	__m64 thresh8=_mm_set_pi8(thresh,thresh,thresh,thresh,
	                          thresh,thresh,thresh,thresh);

	i=pixsize/sizeof(__m64)+(pixsize%sizeof(__m64)!=0);
	while(i--){
		newpix=npixes[i];
		oldpix=opixes[i];
		// |oldpix-newpix| via the two saturating subtracts.
		m1    = _mm_subs_pu8 (newpix, oldpix);
		oldpix= _mm_subs_pu8 (oldpix, newpix);
		m1    = _mm_or_si64  (m1, oldpix); // |oldpix-newpix|
		m1    = _mm_subs_pu8 (m1, thresh8);
		// NOTE(review): signed byte compare — diffs past 0x80 read as
		// negative; confirm intended behaviour for large differences.
		m1    = _mm_cmpgt_pi8(m1, _mm_setzero_si64()); // |oldpix-newpix|>thresh8
		// Keep changed pixels, zero the background.
		npixes[i] = _mm_and_si64(m1, newpix);
	}
	_mm_empty();
}
/* Multi-word unsigned addition of two `count`-word (64-bit) operands.
 * result[i] = a[i] + b[i] + carry-in, with the carry rippling from word 0
 * upward.  Returns the final carry-out (0 or 1 in the low bit).
 * MMX has no 64-bit carry flag, so the carry out of each word is derived
 * arithmetically:  bit0carry = (a&b | (a^b)&carry_in) & 1  is the carry out
 * of bit 0, and (a>>1) + (b>>1) + bit0carry overflows into bit 63 exactly
 * when the full 64-bit add would carry out.
 */
__m64
unsigned_add3 (const __m64 * a, const __m64 * b,
	       __m64 * result, unsigned int count)
{
  __m64 carry = _mm_setzero_si64 ();
  /* Constant 1:  0 - (all-ones) == 0 - (-1).  */
  __m64 one = _mm_sub_si64 (carry, _mm_cmpeq_pi8 (carry, carry));

  for (unsigned int i = 0; i < count; i++)
    {
      const __m64 lhs = a[i];
      const __m64 rhs = b[i];

      /* Full 64-bit sum plus the incoming carry.  */
      result[i] = _mm_add_si64 (_mm_add_si64 (lhs, rhs), carry);

      /* Carry out of bit 0, masked down to a single bit.  */
      __m64 bit0carry = _mm_or_si64 (_mm_and_si64 (lhs, rhs),
				     _mm_and_si64 (_mm_xor_si64 (lhs, rhs),
						   carry));
      bit0carry = _mm_and_si64 (bit0carry, one);

      /* Recompute the high 63 bits of the sum shifted right by one; the
	 carry-out of the word lands in bit 63.  */
      carry = _mm_add_si64 (_mm_add_si64 (_mm_srli_si64 (lhs, 1),
					  _mm_srli_si64 (rhs, 1)),
			    bit0carry);
      carry = _mm_srli_si64 (carry, 63);
    }

  return carry;
}
// pix_background::processYUVMMX
// Background removal for packed YUV (UYVY byte order, per the threshold/black
// vector layout): pixels whose per-byte difference from the stored reference
// frame stays within the m_*range thresholds are replaced with YUV black
// (Y=0x00? no: U/V=0x00... the `black` vector holds 0x80,0x00 pairs —
// chroma 0x00/luma 0x80 per the set_pi8 order; confirm byte order against
// the image format), others keep their current value.
// The cmpeq_pi32 groups the decision per 32 bits, i.e. per UYVY pixel pair.
void pix_background :: processYUVMMX(imageStruct &image)
{
	long pixsize;

	pixsize = image.xsize * image.ysize * image.csize;

	// (Re)capture the reference frame when geometry/format changed.
	if(m_savedImage.xsize!=image.xsize ||
	   m_savedImage.ysize!=image.ysize ||
	   m_savedImage.format!=image.format)m_reset=1;

	m_savedImage.xsize=image.xsize;
	m_savedImage.ysize=image.ysize;
	m_savedImage.setCsizeByFormat(image.format);
	m_savedImage.reallocate();

	if (m_reset){
		memcpy(m_savedImage.data,image.data,pixsize);
		// return;
	}
	m_reset=0;

	int i=pixsize/sizeof(__m64)+(pixsize%sizeof(__m64)!=0);

	__m64*data =(__m64*)image.data;
	__m64*saved=(__m64*)m_savedImage.data;

	const __m64 thresh=_mm_set_pi8(m_Urange, m_Yrange, m_Vrange, m_Yrange,
	                               m_Urange, m_Yrange, m_Vrange, m_Yrange);
	const __m64 offset=_mm_set_pi8(1, 1, 1, 1, 1, 1, 1, 1);
	const __m64 black =_mm_set_pi8((unsigned char)0x00, (unsigned char)0x80,
	                               (unsigned char)0x00, (unsigned char)0x80,
	                               (unsigned char)0x00, (unsigned char)0x80,
	                               (unsigned char)0x00, (unsigned char)0x80);

	__m64 newpix, oldpix, m1;

	while(i--){
		newpix=*data;
		oldpix=*saved++;
		m1    = newpix;
		// |oldpix-newpix| via the two saturating subtracts.
		m1    = _mm_subs_pu8 (m1, oldpix);
		oldpix= _mm_subs_pu8 (oldpix, newpix);
		m1    = _mm_or_si64  (m1, oldpix); // |oldpix-newpix|
		m1    = _mm_adds_pu8 (m1, offset); // to make thresh=0 work correctly
		m1    = _mm_subs_pu8 (m1, thresh); // m1>thresh -> saturation -> 0
		m1    = _mm_cmpeq_pi32 (m1, _mm_setzero_si64()); // |oldpix-newpix|>thresh
		// Blend: black where unchanged, current pixel where changed.
		oldpix= black;
		oldpix= _mm_and_si64 (oldpix, m1);
		m1    = _mm_andnot_si64 (m1, newpix);
		m1    = _mm_or_si64 (m1, oldpix);
		*data++ = m1;
	}
	_mm_empty();
}
// Edge-directed 2x upscaler for one scanline (Scale2x/AdvMAME2x-style
// pattern, judging by the neighbour comparisons — confirm against the
// caller).  `src` is an 8-line ring buffer of width sc2lines_width; u/m/l
// are the lines above/at/below y.  Produces two doubled output lines dst1
// (upper) and dst2 (lower), 4 source pixels -> 8 output bytes per iteration.
// Fast path: where the up/down neighbours match as a 32-bit word, the middle
// pixel is simply doubled.  Otherwise each output pixel picks the diagonal
// neighbour when exactly one of (left==up/down, right==up/down) holds and
// the left/right and up/down pairs differ (maskall rejects ambiguity).
// NOTE(review): reads at m+i-2 / u+i-2 / l+i-2 — the ring lines must provide
// 2 bytes of left padding; confirm at the buffer's producer.
void lines_scale2(const unsigned char *src, unsigned y,
                  unsigned char *dst1, unsigned char *dst2, unsigned nPix)
{
	const unsigned char *u = src + ((y-1) & 7)*sc2lines_width,
	                    *m = src + ((y+0) & 7)*sc2lines_width,
	                    *l = src + ((y+1) & 7)*sc2lines_width;

	for (unsigned i = 0; i < nPix; i += 4) {
		if (*(unsigned*)(u+i) ^ *(unsigned*)(l+i)) {
			__m64 mm = *(__m64*)(m+i-2);
			__m64 uu = *(__m64*)(u+i-2);
			__m64 ll = *(__m64*)(l+i-2);
			__m64 md = _mm_slli_si64(mm,8);   // middle line shifted: left neighbours
			__m64 mf = _mm_srli_si64(mm,8);   // middle line shifted: right neighbours
			// Ambiguous positions: left==right or up==down.
			__m64 maskall = _mm_or_si64(_mm_cmpeq_pi8(md,mf), _mm_cmpeq_pi8(uu,ll));

			__m64 e0, e1, v1, v2;

			// Upper output line: choose `u` where the edge points there.
			e0 = _mm_cmpeq_pi8(md,uu);
			e0 = _mm_andnot_si64(maskall, e0);
			e0 = _mm_srli_si64(e0,16);
			e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());

			e1 = _mm_cmpeq_pi8(mf,uu);
			e1 = _mm_andnot_si64(maskall, e1);
			e1 = _mm_srli_si64(e1,16);
			e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);

			e0 = _mm_or_si64(e0, e1);

			v1 = _m_from_int(*(unsigned*)(m+i));
			v2 = _m_from_int(*(unsigned*)(u+i));
			v1 = _mm_unpacklo_pi8(v1,v1);     // double each middle pixel
			v2 = _mm_unpacklo_pi8(v2,v2);
			*(__m64*)(dst1 + 2*i) = _mm_or_si64( _mm_and_si64(e0,v2), _mm_andnot_si64(e0,v1) );

			// Lower output line: same, against `l`.
			e0 = _mm_cmpeq_pi8(md,ll);
			e0 = _mm_andnot_si64(maskall, e0);
			e0 = _mm_srli_si64(e0,16);
			e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());

			e1 = _mm_cmpeq_pi8(mf,ll);
			e1 = _mm_andnot_si64(maskall, e1);
			e1 = _mm_srli_si64(e1,16);
			e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);

			e0 = _mm_or_si64(e0, e1);

			v1 = _m_from_int(*(unsigned*)(m+i));
			v2 = _m_from_int(*(unsigned*)(l+i));
			v1 = _mm_unpacklo_pi8(v1,v1);
			v2 = _mm_unpacklo_pi8(v2,v2);
			*(__m64*)(dst2 + 2*i) = _mm_or_si64( _mm_and_si64(e0,v2), _mm_andnot_si64(e0,v1) );
		} else {
			// No vertical edge: plain pixel doubling for both lines.
			__m64 v1 = _m_from_int(*(unsigned*)(m+i));
			v1 = _mm_unpacklo_pi8(v1,v1);
			*(__m64*)(dst1 + 2*i) = v1;
			*(__m64*)(dst2 + 2*i) = v1;
		}
	}
}
// pix_multiply::processYUV_MMX
// Luma-only multiply for packed YUV: the chroma bytes of `right` are forced
// to 0xFF (via the OR mask), so after the (l*r)>>8 multiply the chroma of
// `image` passes through (approximately) unchanged while luma is scaled.
// The final subs/adds/subs with the three yuvclamp constants act only on the
// luma bytes — presumably clamping luma into the legal video range via
// saturation; verify the 0x10/0x24/0x14 constants against the intended
// range.  datasize is rounded up to cover a partial trailing __m64.
void pix_multiply :: processYUV_MMX(imageStruct &image, imageStruct &right)
{
	int datasize = image.xsize * image.ysize * image.csize;
	__m64*leftPix = (__m64*)image.data;
	__m64*rightPix = (__m64*)right.data;

	datasize=datasize/sizeof(__m64)+(datasize%sizeof(__m64)!=0);
	__m64 l0, r0, l1, r1;
	// 0xFF at chroma byte positions: multiply by ~1 leaves chroma alone.
	__m64 mask= _mm_setr_pi8((unsigned char)0xFF, (unsigned char)0x00,
	                         (unsigned char)0xFF, (unsigned char)0x00,
	                         (unsigned char)0xFF, (unsigned char)0x00,
	                         (unsigned char)0xFF, (unsigned char)0x00);
	__m64 yuvclamp0 = _mm_setr_pi8((unsigned char)0x00, (unsigned char)0x10,
	                               (unsigned char)0x00, (unsigned char)0x10,
	                               (unsigned char)0x00, (unsigned char)0x10,
	                               (unsigned char)0x00, (unsigned char)0x10);
	__m64 yuvclamp1 = _mm_setr_pi8((unsigned char)0x00, (unsigned char)0x24,
	                               (unsigned char)0x00, (unsigned char)0x24,
	                               (unsigned char)0x00, (unsigned char)0x24,
	                               (unsigned char)0x00, (unsigned char)0x24);
	__m64 yuvclamp2 = _mm_setr_pi8((unsigned char)0x00, (unsigned char)0x14,
	                               (unsigned char)0x00, (unsigned char)0x14,
	                               (unsigned char)0x00, (unsigned char)0x14,
	                               (unsigned char)0x00, (unsigned char)0x14);

	__m64 null64 = _mm_setzero_si64();
	while(datasize--) {
		r1=rightPix[datasize];
		l1=leftPix [datasize];

		r1=_mm_or_si64(r1, mask);

		// Widen, multiply, rescale by >>8, repack.
		l0=_mm_unpacklo_pi8(l1, null64);
		r0=_mm_unpacklo_pi8(r1, null64);
		l1=_mm_unpackhi_pi8(l1, null64);
		r1=_mm_unpackhi_pi8(r1, null64);

		l0=_mm_mullo_pi16 (l0, r0);
		l1=_mm_mullo_pi16 (l1, r1);

		l0=_mm_srli_pi16(l0, 8);
		l1=_mm_srli_pi16(l1, 8);

		l0=_mm_packs_pu16(l0, l1);

		// Saturating luma range clamp.
		l0=_mm_subs_pu8(l0, yuvclamp0);
		l0=_mm_adds_pu8(l0, yuvclamp1);
		l0=_mm_subs_pu8(l0, yuvclamp2);

		leftPix[datasize]=l0;
	}
	_mm_empty();
}
// Feature probe: exercises the SSE-extended MMX intrinsic _mm_shuffle_pi16
// (pshufw) so a build system can test for its availability.
// Fix: the original returned straight out of MMX code without EMMS, leaving
// the x87 FP state unusable for the runtime's FP code; other MMX users in
// this file consistently call _mm_empty() before returning.
int main(int, char**)
{
	__m64 a = _mm_setzero_si64();
	a = _mm_shuffle_pi16(a, 0);
	int ret = _m_to_int(a);
	_mm_empty();   // EMMS before returning through potentially FP-using code
	return ret;
}
// Feature probe: verifies that basic MMX intrinsics compile and execute
// (the result of the unpack is deliberately discarded).
// Fix: added the missing _mm_empty()/EMMS after touching MMX registers,
// matching the convention used by every other MMX routine in this file.
int main(int, char**)
{
	_mm_unpackhi_pi16(_mm_setzero_si64(), _mm_setzero_si64());
	_mm_empty();   // EMMS: restore x87 FP usability before returning
	return 0;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YV12 to RGB16.
// Planar YV12 -> packed RGB565.  Processes 8 pixels x 2 rows per inner
// iteration (the two rows share one 4-sample U/V fetch), with a scalar
// table-driven tail for widths not divisible by 8.  Negative iHeight flips
// the destination vertically.  Depends on file-scope constants/tables
// (g_mSub80, g_mUGMul, ..., g_iYYTab, _Clip, _MakeRgb16) and on
// SCALEBITS/SCALEBITS_OUT defined elsewhere.
VOID Yv12ToRgb16_mmx(PBYTE pbDstX, INT iDstXStride,
                     PBYTE pbSrcY, PBYTE pbSrcU, PBYTE pbSrcV,
                     INT iSrcYStride, INT iSrcUvStride,
                     UINT uWidth, INT iHeight)
{
	UINT x;
	INT y;
	INT iDstXDif;
	INT iSrcYDif;
	INT iSrcUvDif;
	INT yy, bu, guv, rv, r, g, b;

	M64 y0, y1, u0, v0, mz;
	M64 r0, g0, b0, r1, g1, b1;
	M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1;

	// Negative height: render bottom-up.
	if (iHeight < 0) {
		iHeight = -iHeight;
		pbDstX += (iHeight - 1) * iDstXStride;
		iDstXStride = -iDstXStride;
	}

	// Per-row pointer adjustments after the pixel loops.
	iDstXDif = iDstXStride - (uWidth * 2);
	iSrcYDif = iSrcYStride - uWidth;
	iSrcUvDif = iSrcUvStride - (uWidth / 2);

	mz = _mm_setzero_si64();
	for (y = iHeight / 2; y; y--) {
		for (x = uWidth / 8; x; x--) {
			// Calculate coefficient.
			u0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcU));	// [	| u3 u2 u1 u0]
			v0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcV));	// [	| v3 v2 v1 v0]

			u0 = _mm_unpacklo_pi8(u0, mz);		// u3 u2 u1 u0
			v0 = _mm_unpacklo_pi8(v0, mz);		// v3 v2 v1 v0

			// Centre chroma on zero and apply the fixed-point
			// conversion coefficients.
			u0 = _mm_subs_pi16(u0, g_mSub80);
			v0 = _mm_subs_pi16(v0, g_mSub80);

			gu0 = _mm_mullo_pi16(u0, g_mUGMul);
			gv0 = _mm_mullo_pi16(v0, g_mVGMul);
			bu0 = _mm_mullo_pi16(u0, g_mUBMul);
			rv0 = _mm_mullo_pi16(v0, g_mVRMul);

			// Duplicate each chroma term for its 2 horizontal pixels.
			guv0 = _mm_adds_pi16(gu0, gv0);
			guv1 = _mm_unpackhi_pi16(guv0, guv0);	// guv3 guv3 guv2 guv2
			guv0 = _mm_unpacklo_pi16(guv0, guv0);	// guv1 guv1 guv0 guv0

			bu1 = _mm_unpackhi_pi16(bu0, bu0);	// bu3 bu3 bu2 bu2
			bu0 = _mm_unpacklo_pi16(bu0, bu0);	// bu1 bu1 bu0 bu0
			rv1 = _mm_unpackhi_pi16(rv0, rv0);	// rv3 rv3 rv2 rv2
			rv0 = _mm_unpacklo_pi16(rv0, rv0);	// rv1 rv1 rv0 rv0

			// Process for row 0.
			y0 = *((PM64) pbSrcY);			// [YYYY | YYYY]; row 0
			y1 = _mm_unpackhi_pi8(y0, mz);		// y7 y6 y5 y4
			y0 = _mm_unpacklo_pi8(y0, mz);		// y3 y2 y1 y0
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);

			// Blue channel, reduced to 5 bits (bits 0-4).
			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);	// 8 bits (0 - 7)
			b0 = _mm_srai_pi16(b0, SCALEBITS);
			b1 = _mm_packs_pu16(b1, mz);		// 0 0 0 0 b7 b6 b5 b4
			b0 = _mm_packs_pu16(b0, mz);		// 0 0 0 0 b3 b2 b1 b0
			b1 = _mm_unpacklo_pi8(b1, mz);		// 0 b7 0b 6 0 b5 0 b4
			b0 = _mm_unpacklo_pi8(b0, mz);
			b1 = _mm_srli_pi16(b1, 3);
			b0 = _mm_srli_pi16(b0, 3);		// 5 bits (0 - 4)

			// Green channel, 6 bits shifted into bits 5-10.
			g1 = _mm_subs_pi16(y1, guv1);		// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);		// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);	// 8 bits (0 - 7)
			g0 = _mm_srai_pi16(g0, SCALEBITS);
			g1 = _mm_packs_pu16(g1, mz);		// 0 0 0 0 g7 g6 g5 g4
			g0 = _mm_packs_pu16(g0, mz);		// 0 0 0 0 g3 g2 g1 g0
			g1 = _mm_unpacklo_pi8(g1, mz);		// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);
			g1 = _mm_srli_pi16(g1, 2);		// 6 bits (0 - 5)
			g0 = _mm_srli_pi16(g0, 2);
			g1 = _mm_slli_pi16(g1, 5);		// 6 bits (5 - 10)
			g0 = _mm_slli_pi16(g0, 5);		// 6 bits (5 - 10)

			// Red channel, 5 bits shifted into bits 11-15.
			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);
			r1 = _mm_packs_pu16(r1, mz);		// 0 0 0 0 r7 r6 r5 r4
			r0 = _mm_packs_pu16(r0, mz);		// 0 0 0 0 r3 r2 r1 r0
			r1 = _mm_unpacklo_pi8(r1, mz);		// 0 r7 0 r6 0 r5 0 r4
			r0 = _mm_unpacklo_pi8(r0, mz);
			r1 = _mm_srli_pi16(r1, 3);		// 5 bits (0 - 4)
			r0 = _mm_srli_pi16(r0, 3);
			r1 = _mm_slli_pi16(r1, 11);		// 5 bits (11 - 15)
			r0 = _mm_slli_pi16(r0, 11);		// 5 bits (11 - 15)

			// Combine RGB.
			b0 = _mm_or_si64(g0, b0);
			b0 = _mm_or_si64(r0, b0);		// 16 bits rgb
			b1 = _mm_or_si64(g1, b1);
			b1 = _mm_or_si64(r1, b1);		// 16 bits rgb

			// Write out row 0.
			((PM64) pbDstX)[0] = b0;
			((PM64) pbDstX)[1] = b1;

			// Process for row 1 (same chroma, next luma row).
			y0 = *((PM64) (pbSrcY + iSrcYStride));	// [YYYY | YYYY]; row 1
			y1 = _mm_unpackhi_pi8(y0, mz);		// y7 y6 y5 y4
			y0 = _mm_unpacklo_pi8(y0, mz);		// y3 y2 y1 y0
			y1 = _mm_subs_pi16(y1, g_mSub10);
			y0 = _mm_subs_pi16(y0, g_mSub10);
			y1 = _mm_mullo_pi16(y1, g_mYYMul);
			y0 = _mm_mullo_pi16(y0, g_mYYMul);

			b1 = _mm_adds_pi16(y1, bu1);
			b0 = _mm_adds_pi16(y0, bu0);
			b1 = _mm_srai_pi16(b1, SCALEBITS);	// 8 bits (0 - 7)
			b0 = _mm_srai_pi16(b0, SCALEBITS);
			b1 = _mm_packs_pu16(b1, mz);		// 0 0 0 0 b7 b6 b5 b4
			b0 = _mm_packs_pu16(b0, mz);		// 0 0 0 0 b3 b2 b1 b0
			b1 = _mm_unpacklo_pi8(b1, mz);		// 0 b7 0b 6 0 b5 0 b4
			b0 = _mm_unpacklo_pi8(b0, mz);
			b1 = _mm_srli_pi16(b1, 3);
			b0 = _mm_srli_pi16(b0, 3);		// 5 bits (0 - 4)

			g1 = _mm_subs_pi16(y1, guv1);		// g7 g6 g5 g4
			g0 = _mm_subs_pi16(y0, guv0);		// g3 g2 g1 g0
			g1 = _mm_srai_pi16(g1, SCALEBITS);	// 8 bits (0 - 7)
			g0 = _mm_srai_pi16(g0, SCALEBITS);
			g1 = _mm_packs_pu16(g1, mz);		// 0 0 0 0 g7 g6 g5 g4
			g0 = _mm_packs_pu16(g0, mz);		// 0 0 0 0 g3 g2 g1 g0
			g1 = _mm_unpacklo_pi8(g1, mz);		// 0 g7 0 g6 0 g5 0 g4
			g0 = _mm_unpacklo_pi8(g0, mz);
			g1 = _mm_srli_pi16(g1, 2);		// 6 bits (0 - 5)
			g0 = _mm_srli_pi16(g0, 2);
			g1 = _mm_slli_pi16(g1, 5);		// 6 bits (5 - 10)
			g0 = _mm_slli_pi16(g0, 5);		// 6 bits (5 - 10)

			r1 = _mm_adds_pi16(y1, rv1);
			r0 = _mm_adds_pi16(y0, rv0);
			r1 = _mm_srai_pi16(r1, SCALEBITS);
			r0 = _mm_srai_pi16(r0, SCALEBITS);
			r1 = _mm_packs_pu16(r1, mz);		// 0 0 0 0 r7 r6 r5 r4
			r0 = _mm_packs_pu16(r0, mz);		// 0 0 0 0 r3 r2 r1 r0
			r1 = _mm_unpacklo_pi8(r1, mz);		// 0 r7 0 r6 0 r5 0 r4
			r0 = _mm_unpacklo_pi8(r0, mz);
			r1 = _mm_srli_pi16(r1, 3);		// 5 bits (0 - 4)
			r0 = _mm_srli_pi16(r0, 3);
			r1 = _mm_slli_pi16(r1, 11);		// 5 bits (11 - 15)
			r0 = _mm_slli_pi16(r0, 11);		// 5 bits (11 - 15)

			// Combine RGB.
			b0 = _mm_or_si64(g0, b0);
			b0 = _mm_or_si64(r0, b0);		// 16 bits rgb
			b1 = _mm_or_si64(g1, b1);
			b1 = _mm_or_si64(r1, b1);		// 16 bits rgb

			// Write out row 1.
			((PM64) (pbDstX + iDstXStride))[0] = b0;
			((PM64) (pbDstX + iDstXStride))[1] = b1;

			pbDstX += 16;
			pbSrcY += 8;
			pbSrcU += 4;
			pbSrcV += 4;
		}

		// Scalar tail: remaining (uWidth % 8) pixels, 2 columns x 2 rows
		// per iteration using the lookup tables.
		for (x = (uWidth & 7) / 2; x; x--) {
			bu = g_iBUTab[pbSrcU[0]];
			guv = g_iGUTab[pbSrcU[0]] + g_iGVTab[pbSrcV[0]];
			rv = g_iRVTab[pbSrcV[0]];

			yy = g_iYYTab[pbSrcY[0]];
			b = _Clip(((yy + bu) >> SCALEBITS_OUT));
			g = _Clip(((yy - guv) >> SCALEBITS_OUT));
			r = _Clip(((yy + rv) >> SCALEBITS_OUT));
			((PWORD) pbDstX)[0] = _MakeRgb16(r, g, b);

			yy = g_iYYTab[pbSrcY[1]];
			b = _Clip(((yy + bu) >> SCALEBITS_OUT));
			g = _Clip(((yy - guv) >> SCALEBITS_OUT));
			r = _Clip(((yy + rv) >> SCALEBITS_OUT));
			((PWORD) pbDstX)[1] = _MakeRgb16(r, g, b);

			yy = g_iYYTab[pbSrcY[iSrcYStride]];
			b = _Clip(((yy + bu) >> SCALEBITS_OUT));
			g = _Clip(((yy - guv) >> SCALEBITS_OUT));
			r = _Clip(((yy + rv) >> SCALEBITS_OUT));
			((PWORD) (pbDstX + iDstXStride))[0] = _MakeRgb16(r, g, b);

			yy = g_iYYTab[pbSrcY[iSrcYStride + 1]];
			b = _Clip(((yy + bu) >> SCALEBITS_OUT));
			g = _Clip(((yy - guv) >> SCALEBITS_OUT));
			r = _Clip(((yy + rv) >> SCALEBITS_OUT));
			((PWORD) (pbDstX + iDstXStride))[1] = _MakeRgb16(r, g, b);

			pbDstX += 4;
			pbSrcY += 2;
			pbSrcU++;
			pbSrcV++;
		}

		// Advance past the second of the two rows just produced.
		pbDstX += iDstXDif + iDstXStride;
		pbSrcY += iSrcYDif + iSrcYStride;
		pbSrcU += iSrcUvDif;
		pbSrcV += iSrcUvDif;
	}
	_mm_empty();
}
// Demonstration/diagnostic routine: widens 8 unsigned bytes to 16-bit lanes
// with unpack-against-zero, converts each 4-lane half to 4 floats, and prints
// both the short and float views to stderr.
// FLOAT_ARRAYSIZE / SHORT_ARRAYSIZE are macros defined elsewhere.
// NOTE(review): _mm_cvtpu16_ps1 is not a standard intrinsic name —
// presumably a local wrapper around _mm_cvtpu16_ps (see the "bug in
// _mm_cvtpi16_ps ?" remark below); confirm its definition.
void test_char_to_float(void)
{
	__m64 narrow = _mm_set_pi8(0, 1, 2, 3, 252, 253, 254, 255);
	__m64 zero = _mm_setzero_si64();
	__m64 loshorts, hishorts;
	__m128 lofloats, hifloats;
	float lofloatarray[FLOAT_ARRAYSIZE] __attribute__ ((aligned (16)));
	float hifloatarray[FLOAT_ARRAYSIZE] __attribute__ ((aligned (16)));
	int16_t shortarray[SHORT_ARRAYSIZE] __attribute__ ((aligned (16)));
	int i;

	/* interleave zero with narrow and return halves: essentially widening */
	/* elements from unsigned chars to unsigned shorts */
	loshorts = _mm_unpacklo_pi8(narrow, zero);
	hishorts = _mm_unpackhi_pi8(narrow, zero);

	/* now turn the 4 shorts in loshorts into floats and store them in lofloats */
	/* likewise hishorts into hifloats. */
	/* bug in _mm_cvtpi16_ps ? */
	lofloats = _mm_cvtpu16_ps1(loshorts);
	hifloats = _mm_cvtpu16_ps1(hishorts);
	_mm_store_ps(lofloatarray, lofloats);
	_mm_store_ps(hifloatarray, hifloats);

	/* we used SSE1 instructions that used __m64: add an _mm_empty */
	_mm_empty();

	/* now store loshorts in shortarray and lofloats into lofloatarray */
	/* and print them. */
	memcpy(shortarray, &loshorts, sizeof(shortarray));
	fprintf(stderr, "loshorts ");
	for(i= 0; i < SHORT_ARRAYSIZE; i++)
	{
		fprintf(stderr, "%d ", shortarray[i]);
	}
	fprintf(stderr, "\n");

	fprintf(stderr, "lofloats ");
	for(i= 0; i < FLOAT_ARRAYSIZE; i++)
	{
		fprintf(stderr, "%f ", lofloatarray[i]);
	}
	fprintf(stderr, "\n");

	/* now store hishorts in shortarray and hifloats into lofloatarray */
	/* and print them. */
	memcpy(shortarray, &hishorts, sizeof(shortarray));
	fprintf(stderr, "hishorts ");
	for(i= 0; i < SHORT_ARRAYSIZE; i++)
	{
		fprintf(stderr, "%d ", shortarray[i]);
	}
	fprintf(stderr, "\n");

	fprintf(stderr, "hifloats ");
	for(i= 0; i < FLOAT_ARRAYSIZE; i++)
	{
		fprintf(stderr, "%f ", hifloatarray[i]);
	}
	fprintf(stderr, "\n");
}
// Four parallel comb filters followed by an optional allpass stage, stereo,
// processed 4 samples per channel per iteration with MMX saturating 16-bit
// arithmetic.  The filtered output is scaled by rvol[] per channel and
// accumulated (mixed) into dp on top of sp.
//
// Fixed-point convention visible below: _mm_mulhi_pi16 gives (a*b)>>16, and
// the following adds-with-itself doubles it, i.e. a Q15 multiply by the gain.
//
// NOTE(review): x, y, ax, ay appear to be member delay-line buffers indexed
// by the write pointer yp, with max_delay a power of two (the & (max_delay-1)
// wraps) — confirm against the class declaration.
void reverb::comb_allpass4(signed short *sp, signed short *dp, const comb_param &comb_delay, const int comb_gain, const int allpass_delay, const int allpass_gain, const int *rvol, const unsigned int sz)
{
#ifdef use_intrinsics
    __m64 cg=_mm_set1_pi16(comb_gain), ag=_mm_set1_pi16(allpass_gain), rv[2];
    rv[0]=_mm_set1_pi16(rvol[0]);
    rv[1]=_mm_set1_pi16(rvol[1]);
    // Each iteration consumes 8 interleaved shorts (4 frames x 2 channels).
    for (unsigned int i=0; i<(sz>>4); i++, sp+=2<<2, dp+=2<<2)
    {
        __m64 dv[2];
        for (int c=0; c<2; c++)
        {
            // Comb: sum of four feedback combs, each y[n] = x[n] + g*y[n-d];
            // the >>2 averages the four combs into v.
            __m64 v=_mm_setzero_si64();
            for (int f=0; f<4; f++)
            {
                int yck=(yp-comb_delay[c][f])&(max_delay-1);
                __m64 xv=*(__m64 *)(&x[c][yck]),
                      yv=*(__m64 *)(&y[c][f][yck]);
                yv=_mm_mulhi_pi16(yv,cg);   // (y[n-d]*g) >> 16
                yv=_mm_adds_pi16(yv,yv);    // doubled -> Q15 gain
                yv=_mm_adds_pi16(xv,yv);    // + x[n]
                *((__m64 *)&y[c][f][yp])=yv;
                yv=_mm_srai_pi16(yv,2);     // /4 before summing the 4 combs
                v=_mm_adds_pi16(v,yv);
            }
            // Allpass (skipped entirely when allpass_delay == 0)
            if (allpass_delay)
            {
                *((__m64 *)&ax[c][yp])=v;
                int ypa=(yp-allpass_delay)&(max_delay-1);
                __m64 ayv=*(__m64 *)&ay[c][ypa],
                      xv=*(__m64 *)&x[c][yp],
                      axv=*(__m64 *)&ax[c][ypa];
                ayv=_mm_subs_pi16(ayv,xv);
                ayv=_mm_mulhi_pi16(ayv,ag);
                ayv=_mm_adds_pi16(ayv,ayv); // Q15 allpass gain
                v=_mm_adds_pi16(ayv,axv);
                *((__m64 *)&ay[c][yp])=v;
            }
            // Output: per-channel reverb volume (Q15 again)
            dv[c]=_mm_mulhi_pi16(v,rv[c]);
            dv[c]=_mm_adds_pi16(dv[c],dv[c]);
        }
        // Re-interleave the two channels and mix: dp = dp + sp + wet.
        __m64 dv1=_mm_unpacklo_pi16(dv[0],dv[1]),
              dv2=_mm_unpackhi_pi16(dv[0],dv[1]),
              d1=*(__m64 *)&dp[0], d2=*(__m64 *)&dp[4],
              s1=*(__m64 *)&sp[0], s2=*(__m64 *)&sp[4];
        d1=_mm_adds_pi16(d1,s1);
        d2=_mm_adds_pi16(d2,s2);
        d1=_mm_adds_pi16(d1,dv1);
        d2=_mm_adds_pi16(d2,dv2);
        *(__m64 *)&dp[0]=d1;
        *(__m64 *)&dp[4]=d2;
        // Advance the delay-line write pointer by the 4 frames just written.
        yp=(yp+4)&(max_delay-1);
    }
    _mm_empty();  // leave MMX state so later x87 code works
#endif
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YUY2 to RGB24.
//
// Processes 8 pixels (16 source bytes) per MMX iteration, with a scalar
// 2-pixel tail for widths not divisible by 8.  A negative iHeight flips the
// source vertically (bottom-up read).  Color math is integer fixed-point:
// coefficients g_m*Mul / tables g_i*Tab scaled by SCALEBITS / SCALEBITS_OUT.
VOID Yuy2ToRgb24_mmx(PBYTE pbDstX, INT iDstXStride, PBYTE pbSrcX, INT iSrcXStride, UINT uWidth, INT iHeight)
{
    UINT x;
    INT y;
    INT iDstXDif;
    INT iSrcXDif;
    INT yy, bu, guv, rv;
    M64 y0, y1, u0, v0, uv_temp1, uv_temp2, mz;
    M64 r0, g0, b0, r1, g1, b1;
    M64 rgb0, rgb1, rgb2, rgb3;
    M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1;

    // Negative height: start at the last source row and walk upward.
    if (iHeight < 0) {
        iHeight = -iHeight;
        pbSrcX += (iHeight - 1) * iSrcXStride;
        iSrcXStride = -iSrcXStride;
    }

    // Per-row stride remainders after writing uWidth*3 / reading uWidth*2 bytes.
    iDstXDif = iDstXStride - (uWidth * 3);
    iSrcXDif = iSrcXStride - (uWidth * 2);

    mz = _mm_setzero_si64();
    for (y = iHeight; y; y--) {
        for (x = uWidth / 8; x; x--) {
            // Load 8 pixels of YUY2 (Y0 U0 Y1 V0 ...): even bytes are luma,
            // odd bytes are interleaved chroma.
            y0 = ((PM64) pbSrcX)[0];
            y1 = ((PM64) pbSrcX)[1];
            u0 = y0;
            v0 = y1;
            y0 = _mm_and_si64(y0, g_mWord00FF);   // keep luma bytes
            y1 = _mm_and_si64(y1, g_mWord00FF);
            u0 = _mm_srli_pi16(u0, 8);            // keep chroma bytes
            v0 = _mm_srli_pi16(v0, 8);
            // Deinterleave U/V: shift/pack so u0 = u3..u0, v0 = v3..v0.
            uv_temp1 = _mm_srli_pi32(u0, 16);
            u0 = _mm_slli_pi32(u0, 16);
            u0 = _mm_srli_pi32(u0, 16);
            uv_temp2 = _mm_srli_pi32(v0, 16);
            v0 = _mm_slli_pi32(v0, 16);
            v0 = _mm_srli_pi32(v0, 16);
            u0 = _mm_packs_pi32(u0, v0);
            v0 = _mm_packs_pi32(uv_temp1, uv_temp2);

            // Calculate coefficient.
            u0 = _mm_subs_pi16(u0, g_mSub80);     // center chroma at 0
            v0 = _mm_subs_pi16(v0, g_mSub80);
            gu0 = _mm_mullo_pi16(u0, g_mUGMul);
            gv0 = _mm_mullo_pi16(v0, g_mVGMul);
            bu0 = _mm_mullo_pi16(u0, g_mUBMul);
            rv0 = _mm_mullo_pi16(v0, g_mVRMul);
            guv0 = _mm_adds_pi16(gu0, gv0);
            // Duplicate each chroma term so one term covers a 2-pixel pair.
            guv1 = _mm_unpackhi_pi16(guv0, guv0); // guv3 guv3 guv2 guv2
            guv0 = _mm_unpacklo_pi16(guv0, guv0); // guv1 guv1 guv0 guv0
            bu1 = _mm_unpackhi_pi16(bu0, bu0);    // bu3 bu3 bu2 bu2
            bu0 = _mm_unpacklo_pi16(bu0, bu0);    // bu1 bu1 bu0 bu0
            rv1 = _mm_unpackhi_pi16(rv0, rv0);    // rv3 rv3 rv2 rv2
            rv0 = _mm_unpacklo_pi16(rv0, rv0);    // rv1 rv1 rv0 rv0

            // Process for row 0.
            y1 = _mm_subs_pi16(y1, g_mSub10);
            y0 = _mm_subs_pi16(y0, g_mSub10);
            y1 = _mm_mullo_pi16(y1, g_mYYMul);
            y0 = _mm_mullo_pi16(y0, g_mYYMul);
            g1 = _mm_subs_pi16(y1, guv1);         // g7 g6 g5 g4
            g0 = _mm_subs_pi16(y0, guv0);         // g3 g2 g1 g0
            g1 = _mm_srai_pi16(g1, SCALEBITS);
            g0 = _mm_srai_pi16(g0, SCALEBITS);
            g0 = _mm_packs_pu16(g0, g1);          // g7 g6 ...g1 g0 (clamps to 0..255)
            b1 = _mm_adds_pi16(y1, bu1);
            b0 = _mm_adds_pi16(y0, bu0);
            b1 = _mm_srai_pi16(b1, SCALEBITS);
            b0 = _mm_srai_pi16(b0, SCALEBITS);
            b0 = _mm_packs_pu16(b0, b1);
            r1 = _mm_adds_pi16(y1, rv1);
            r0 = _mm_adds_pi16(y0, rv0);
            r1 = _mm_srai_pi16(r1, SCALEBITS);
            r0 = _mm_srai_pi16(r0, SCALEBITS);
            r0 = _mm_packs_pu16(r0, r1);
            // Interleave B,G,R bytes into 4-byte 0RGB groups.
            r1 = _mm_unpackhi_pi8(b0, r0);        // r7 b7 r6 b6 r5 b5 r4 b4
            r0 = _mm_unpacklo_pi8(b0, r0);        // r3 b3 r2 b2 r1 b1 r0 b0
            g1 = _mm_unpackhi_pi8(g0, mz);        // 0 g7 0 g6 0 g5 0 g4
            g0 = _mm_unpacklo_pi8(g0, mz);        // 0 g3 0 g2 0 g1 0 g0
            rgb0 = _mm_unpacklo_pi8(r0, g0);      // 0 r1 g1 b1 0 r0 g0 b0
            rgb1 = _mm_unpackhi_pi8(r0, g0);      // 0 r3 g3 b3 0 r2 g2 b2
            rgb2 = _mm_unpacklo_pi8(r1, g1);      // 0 r5 g5 b5 0 r4 g4 b4
            rgb3 = _mm_unpackhi_pi8(r1, g1);      // 0 r7 g7 b7 0 r6 g6 b6

            // Write out row 0: overlapping 4-byte stores at 3-byte steps pack
            // the 0RGB groups down to tight RGB24 (each store's high byte is
            // overwritten by the next one).
            *((PDWORD) (pbDstX + 0)) = _mm_cvtsi64_si32(rgb0);
            rgb0 = _mm_srli_si64(rgb0, 32);
            *((PDWORD) (pbDstX + 3)) = _mm_cvtsi64_si32(rgb0);
            *((PDWORD) (pbDstX + 6)) = _mm_cvtsi64_si32(rgb1);
            rgb1 = _mm_srli_si64(rgb1, 32);
            *((PDWORD) (pbDstX + 9)) = _mm_cvtsi64_si32(rgb1);
            *((PDWORD) (pbDstX + 12)) = _mm_cvtsi64_si32(rgb2);
            rgb2 = _mm_srli_si64(rgb2, 32);
            *((PDWORD) (pbDstX + 15)) = _mm_cvtsi64_si32(rgb2);
            *((PDWORD) (pbDstX + 18)) = _mm_cvtsi64_si32(rgb3);
            rgb3 = _mm_srli_si64(rgb3, 32);
            *((PDWORD) (pbDstX + 21)) = _mm_cvtsi64_si32(rgb3);
            pbDstX += 24;
            pbSrcX += 16;
        }

        // Scalar tail: remaining pixels two at a time (YUY2 pairs share U/V).
        for (x = (uWidth & 7) / 2; x; x--) {
            bu = g_iBUTab[pbSrcX[1]];
            guv = g_iGUTab[pbSrcX[1]] + g_iGVTab[pbSrcX[3]];
            rv = g_iRVTab[pbSrcX[3]];
            yy = g_iYYTab[pbSrcX[0]];
            pbDstX[0] = _Clip((yy + bu) >> SCALEBITS_OUT);
            pbDstX[1] = _Clip((yy - guv) >> SCALEBITS_OUT);
            pbDstX[2] = _Clip((yy + rv) >> SCALEBITS_OUT);
            yy = g_iYYTab[pbSrcX[2]];
            pbDstX[3] = _Clip((yy + bu) >> SCALEBITS_OUT);
            pbDstX[4] = _Clip((yy - guv) >> SCALEBITS_OUT);
            pbDstX[5] = _Clip((yy + rv) >> SCALEBITS_OUT);
            pbDstX += 6;
            pbSrcX += 4;
        }
        pbDstX += iDstXDif;
        pbSrcX += iSrcXDif;
    }
    _mm_empty();  // clear MMX state before returning to FP code
}
/* almost the same as sin_ps */
/* Vectorized cosine for 4 packed floats (Cephes-style range reduction +
   minimax polynomials).  Two code paths: SSE2 integer ops, or a pre-SSE2
   fallback that does the integer work in MMX registers (hence the
   _mm_empty() before any further float math). */
__m128 cos_ps(v4sfu *xPtr) { // any x
  __m128 x=*((__m128 *)xPtr);
  __m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, y;
#ifdef USE_SSE2
  __m128i emm0, emm2;
#else
  __m64 mm0, mm1, mm2, mm3;
#endif
  /* take the absolute value */
  x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in mm0 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);
  emm2 = _mm_sub_epi32(emm2, *(__m128i*)_pi32_2);
  /* get the swap sign flag */
  emm0 = _mm_andnot_si128(emm2, *(__m128i*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  /* get the polynom selection mask */
  emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  __m128 sign_bit = _mm_castsi128_ps(emm0);
  __m128 poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm0:mm1 (two MMX regs hold the 4 lanes) */
  xmm2 = _mm_movehl_ps(xmm2, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm2);
  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);
  y = _mm_cvtpi32x2_ps(mm2, mm3);
  mm2 = _mm_sub_pi32(mm2, *(__m64*)_pi32_2);
  mm3 = _mm_sub_pi32(mm3, *(__m64*)_pi32_2);
  /* get the swap sign flag in mm0:mm1 and the
     polynom selection mask in mm2:mm3 */
  mm0 = _mm_andnot_si64(mm2, *(__m64*)_pi32_4);
  mm1 = _mm_andnot_si64(mm3, *(__m64*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  __m128 sign_bit, poly_mask;
  COPY_MM_TO_XMM(mm0, mm1, sign_bit);
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
  _mm_empty(); /* good-bye mmx */
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(__m128*)_ps_minus_cephes_DP1;
  xmm2 = *(__m128*)_ps_minus_cephes_DP2;
  xmm3 = *(__m128*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

  /* Evaluate the first polynom  (0 <= x <= Pi/4) -- cosine branch */
  y = *(__m128*)_ps_coscof_p0;
  __m128 z = _mm_mul_ps(x,x);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(__m128*)_ps_1);

  /* Evaluate the second polynom  (Pi/4 <= x <= 0) -- sine branch */
  __m128 y2 = *(__m128*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms, per lane */
  xmm3 = poly_mask;
  y2 = _mm_and_ps(xmm3, y2); //, xmm3);
  y = _mm_andnot_ps(xmm3, y);
  y = _mm_add_ps(y,y2);
  /* update the sign */
  y = _mm_xor_ps(y, sign_bit);

  return y;
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
// Convert YV12 to RGB24.
//
// Planar YV12: one full-resolution Y plane plus quarter-resolution U and V
// planes (each U/V sample covers a 2x2 pixel block).  Each MMX iteration
// therefore converts an 8x2 pixel block (8 Y per row, 4 U, 4 V), with a
// scalar 2x2 tail for widths not divisible by 8.  Rows are consumed two at a
// time (loop runs iHeight/2 times).  A negative iHeight flips the
// destination (bottom-up write).
VOID Yv12ToRgb24_mmx(PBYTE pbDstX, INT iDstXStride, PBYTE pbSrcY, PBYTE pbSrcU, PBYTE pbSrcV, INT iSrcYStride, INT iSrcUvStride, UINT uWidth, INT iHeight)
{
    UINT x;
    INT y;
    INT iDstXDif;
    INT iSrcYDif;
    INT iSrcUvDif;
    INT yy, bu, guv, rv;
    M64 y0, y1, u0, v0, mz;
    M64 r0, g0, b0, r1, g1, b1;
    M64 rgb0, rgb1, rgb2, rgb3;
    M64 bu0, gu0, gv0, rv0, bu1, rv1, guv0, guv1;

    // Negative height: write the image bottom-up.
    if (iHeight < 0) {
        iHeight = -iHeight;
        pbDstX += (iHeight - 1) * iDstXStride;
        iDstXStride = -iDstXStride;
    }

    // Per-row stride remainders for the three planes and the destination.
    iDstXDif = iDstXStride - (uWidth * 3);
    iSrcYDif = iSrcYStride - uWidth;
    iSrcUvDif = iSrcUvStride - (uWidth / 2);

    mz = _mm_setzero_si64();
    for (y = iHeight / 2; y; y--) {
        for (x = uWidth / 8; x; x--) {
            // Calculate coefficient.
            u0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcU));    // [       | u3 u2 u1 u0]
            v0 = _mm_cvtsi32_si64(*((PDWORD) pbSrcV));    // [       | v3 v2 v1 v0]
            u0 = _mm_unpacklo_pi8(u0, mz);                // u3 u2 u1 u0 (widen to 16-bit)
            v0 = _mm_unpacklo_pi8(v0, mz);                // v3 v2 v1 v0
            u0 = _mm_subs_pi16(u0, g_mSub80);             // center chroma at 0
            v0 = _mm_subs_pi16(v0, g_mSub80);
            gu0 = _mm_mullo_pi16(u0, g_mUGMul);
            gv0 = _mm_mullo_pi16(v0, g_mVGMul);
            bu0 = _mm_mullo_pi16(u0, g_mUBMul);
            rv0 = _mm_mullo_pi16(v0, g_mVRMul);
            guv0 = _mm_adds_pi16(gu0, gv0);
            // Duplicate each chroma term to cover a 2-pixel pair.
            guv1 = _mm_unpackhi_pi16(guv0, guv0);         // guv3 guv3 guv2 guv2
            guv0 = _mm_unpacklo_pi16(guv0, guv0);         // guv1 guv1 guv0 guv0
            bu1 = _mm_unpackhi_pi16(bu0, bu0);            // bu3 bu3 bu2 bu2
            bu0 = _mm_unpacklo_pi16(bu0, bu0);            // bu1 bu1 bu0 bu0
            rv1 = _mm_unpackhi_pi16(rv0, rv0);            // rv3 rv3 rv2 rv2
            rv0 = _mm_unpacklo_pi16(rv0, rv0);            // rv1 rv1 rv0 rv0

            // Process for row 0.
            y0 = *((PM64) pbSrcY);                        // [YYYY | YYYY]; row 0
            y1 = _mm_unpackhi_pi8(y0, mz);                // y7 y6 y5 y4
            y0 = _mm_unpacklo_pi8(y0, mz);                // y3 y2 y1 y0
            y1 = _mm_subs_pi16(y1, g_mSub10);
            y0 = _mm_subs_pi16(y0, g_mSub10);
            y1 = _mm_mullo_pi16(y1, g_mYYMul);
            y0 = _mm_mullo_pi16(y0, g_mYYMul);
            g1 = _mm_subs_pi16(y1, guv1);                 // g7 g6 g5 g4
            g0 = _mm_subs_pi16(y0, guv0);                 // g3 g2 g1 g0
            g1 = _mm_srai_pi16(g1, SCALEBITS);
            g0 = _mm_srai_pi16(g0, SCALEBITS);
            g0 = _mm_packs_pu16(g0, g1);                  // g7 g6 ...g1 g0 (clamp 0..255)
            b1 = _mm_adds_pi16(y1, bu1);
            b0 = _mm_adds_pi16(y0, bu0);
            b1 = _mm_srai_pi16(b1, SCALEBITS);
            b0 = _mm_srai_pi16(b0, SCALEBITS);
            b0 = _mm_packs_pu16(b0, b1);
            r1 = _mm_adds_pi16(y1, rv1);
            r0 = _mm_adds_pi16(y0, rv0);
            r1 = _mm_srai_pi16(r1, SCALEBITS);
            r0 = _mm_srai_pi16(r0, SCALEBITS);
            r0 = _mm_packs_pu16(r0, r1);
            // Interleave B,G,R into 0RGB groups.
            r1 = _mm_unpackhi_pi8(b0, r0);                // r7 b7 r6 b6 r5 b5 r4 b4
            r0 = _mm_unpacklo_pi8(b0, r0);                // r3 b3 r2 b2 r1 b1 r0 b0
            g1 = _mm_unpackhi_pi8(g0, mz);                // 0 g7 0 g6 0 g5 0 g4
            g0 = _mm_unpacklo_pi8(g0, mz);                // 0 g3 0 g2 0 g1 0 g0
            rgb0 = _mm_unpacklo_pi8(r0, g0);              // 0 r1 g1 b1 0 r0 g0 b0
            rgb1 = _mm_unpackhi_pi8(r0, g0);              // 0 r3 g3 b3 0 r2 g2 b2
            rgb2 = _mm_unpacklo_pi8(r1, g1);              // 0 r5 g5 b5 0 r4 g4 b4
            rgb3 = _mm_unpackhi_pi8(r1, g1);              // 0 r7 g7 b7 0 r6 g6 b6

            // Write out row 0: overlapping 4-byte stores at 3-byte steps pack
            // the 0RGB groups into tight RGB24.
            *((PDWORD) (pbDstX + 0)) = _mm_cvtsi64_si32(rgb0);
            rgb0 = _mm_srli_si64(rgb0, 32);
            *((PDWORD) (pbDstX + 3)) = _mm_cvtsi64_si32(rgb0);
            *((PDWORD) (pbDstX + 6)) = _mm_cvtsi64_si32(rgb1);
            rgb1 = _mm_srli_si64(rgb1, 32);
            *((PDWORD) (pbDstX + 9)) = _mm_cvtsi64_si32(rgb1);
            *((PDWORD) (pbDstX + 12)) = _mm_cvtsi64_si32(rgb2);
            rgb2 = _mm_srli_si64(rgb2, 32);
            *((PDWORD) (pbDstX + 15)) = _mm_cvtsi64_si32(rgb2);
            *((PDWORD) (pbDstX + 18)) = _mm_cvtsi64_si32(rgb3);
            rgb3 = _mm_srli_si64(rgb3, 32);
            *((PDWORD) (pbDstX + 21)) = _mm_cvtsi64_si32(rgb3);

            // Process for row 1 (same chroma terms, next luma row).
            y0 = *((PM64) (pbSrcY + iSrcYStride));        // [YYYY | YYYY]; row 1
            y1 = _mm_unpackhi_pi8(y0, mz);                // y7 y6 y5 y4
            y0 = _mm_unpacklo_pi8(y0, mz);                // y3 y2 y1 y0
            y1 = _mm_subs_pi16(y1, g_mSub10);
            y0 = _mm_subs_pi16(y0, g_mSub10);
            y1 = _mm_mullo_pi16(y1, g_mYYMul);
            y0 = _mm_mullo_pi16(y0, g_mYYMul);
            g1 = _mm_subs_pi16(y1, guv1);                 // g7 g6 g5 g4
            g0 = _mm_subs_pi16(y0, guv0);                 // g3 g2 g1 g0
            g1 = _mm_srai_pi16(g1, SCALEBITS);
            g0 = _mm_srai_pi16(g0, SCALEBITS);
            g0 = _mm_packs_pu16(g0, g1);                  // g7 g6 ...g1 g0
            b1 = _mm_adds_pi16(y1, bu1);
            b0 = _mm_adds_pi16(y0, bu0);
            b1 = _mm_srai_pi16(b1, SCALEBITS);
            b0 = _mm_srai_pi16(b0, SCALEBITS);
            b0 = _mm_packs_pu16(b0, b1);
            r1 = _mm_adds_pi16(y1, rv1);
            r0 = _mm_adds_pi16(y0, rv0);
            r1 = _mm_srai_pi16(r1, SCALEBITS);
            r0 = _mm_srai_pi16(r0, SCALEBITS);
            r0 = _mm_packs_pu16(r0, r1);
            r1 = _mm_unpackhi_pi8(b0, r0);                // r7 b7 r6 b6 r5 b5 r4 b4
            r0 = _mm_unpacklo_pi8(b0, r0);                // r3 b3 r2 b2 r1 b1 r0 b0
            g1 = _mm_unpackhi_pi8(g0, mz);                // 0 g7 0 g6 0 g5 0 g4
            g0 = _mm_unpacklo_pi8(g0, mz);                // 0 g3 0 g2 0 g1 0 g0
            rgb0 = _mm_unpacklo_pi8(r0, g0);              // 0 r1 g1 b1 0 r0 g0 b0
            rgb1 = _mm_unpackhi_pi8(r0, g0);              // 0 r3 g3 b3 0 r2 g2 b2
            rgb2 = _mm_unpacklo_pi8(r1, g1);              // 0 r5 g5 b5 0 r4 g4 b4
            rgb3 = _mm_unpackhi_pi8(r1, g1);              // 0 r7 g7 b7 0 r6 g6 b6

            // Write out row 1.
            *((PDWORD) (pbDstX + iDstXStride + 0)) = _mm_cvtsi64_si32(rgb0);
            rgb0 = _mm_srli_si64(rgb0, 32);
            *((PDWORD) (pbDstX + iDstXStride + 3)) = _mm_cvtsi64_si32(rgb0);
            *((PDWORD) (pbDstX + iDstXStride + 6)) = _mm_cvtsi64_si32(rgb1);
            rgb1 = _mm_srli_si64(rgb1, 32);
            *((PDWORD) (pbDstX + iDstXStride + 9)) = _mm_cvtsi64_si32(rgb1);
            *((PDWORD) (pbDstX + iDstXStride + 12)) = _mm_cvtsi64_si32(rgb2);
            rgb2 = _mm_srli_si64(rgb2, 32);
            *((PDWORD) (pbDstX + iDstXStride + 15)) = _mm_cvtsi64_si32(rgb2);
            *((PDWORD) (pbDstX + iDstXStride + 18)) = _mm_cvtsi64_si32(rgb3);
            rgb3 = _mm_srli_si64(rgb3, 32);
            *((PDWORD) (pbDstX + iDstXStride + 21)) = _mm_cvtsi64_si32(rgb3);

            pbDstX += 24;
            pbSrcY += 8;
            pbSrcU += 4;
            pbSrcV += 4;
        }

        // Scalar tail: remaining pixels as 2x2 blocks sharing one U/V pair.
        for (x = (uWidth & 7) / 2; x; x--) {
            bu = g_iBUTab[pbSrcU[0]];
            guv = g_iGUTab[pbSrcU[0]] + g_iGVTab[pbSrcV[0]];
            rv = g_iRVTab[pbSrcV[0]];
            yy = g_iYYTab[pbSrcY[0]];
            pbDstX[0] = _Clip((yy + bu) >> SCALEBITS_OUT);
            pbDstX[1] = _Clip((yy - guv) >> SCALEBITS_OUT);
            pbDstX[2] = _Clip((yy + rv) >> SCALEBITS_OUT);
            yy = g_iYYTab[pbSrcY[1]];
            pbDstX[3] = _Clip((yy + bu) >> SCALEBITS_OUT);
            pbDstX[4] = _Clip((yy - guv) >> SCALEBITS_OUT);
            pbDstX[5] = _Clip((yy + rv) >> SCALEBITS_OUT);
            yy = g_iYYTab[pbSrcY[iSrcYStride]];
            pbDstX[iDstXStride + 0] = _Clip((yy + bu) >> SCALEBITS_OUT);
            pbDstX[iDstXStride + 1] = _Clip((yy - guv) >> SCALEBITS_OUT);
            pbDstX[iDstXStride + 2] = _Clip((yy + rv) >> SCALEBITS_OUT);
            yy = g_iYYTab[pbSrcY[iSrcYStride + 1]];
            pbDstX[iDstXStride + 3] = _Clip((yy + bu) >> SCALEBITS_OUT);
            pbDstX[iDstXStride + 4] = _Clip((yy - guv) >> SCALEBITS_OUT);
            pbDstX[iDstXStride + 5] = _Clip((yy + rv) >> SCALEBITS_OUT);
            pbDstX += 6;
            pbSrcY += 2;
            pbSrcU++;
            pbSrcV++;
        }
        // Skip the second of the two rows just written/read.
        pbDstX += iDstXDif + iDstXStride;
        pbSrcY += iSrcYDif + iSrcYStride;
        pbSrcU += iSrcUvDif;
        pbSrcV += iSrcUvDif;
    }
    _mm_empty();  // clear MMX state before returning to FP code
}
/*
 * 7x7 separable convolution over a 4-channel mlib_s16 image, MMX version.
 * hkernel/vkernel hold the separable row/column coefficients; scalef_expon
 * is the fixed-point scaling exponent.  The heavy lifting happens in the
 * project macros: GET_SRC_DST_PARAMETERS declares width/height/sl/dl/sll/dll
 * (and friends), GET_KERN loads the kernel into hker*/vker* (presumably also
 * setting shift/kerh_sum/_rnd), PREP_V/CONV_7x7 compute the vertical passes
 * and the 4-sample result rr -- NOTE(review): confirm against the macro
 * definitions; only their usage is visible here.
 */
mlib_status
mlib_m_sconv7x7_16nw_4(
    mlib_image *dst,
    mlib_image *src,
    mlib_s32 *hkernel,
    mlib_s32 *vkernel,
    mlib_s32 scalef_expon)
{
    GET_SRC_DST_PARAMETERS(mlib_s16);
    __m64 hker0, hker1, hker2, hker3, hker4, hker5, hker6;
    __m64 vker0, vker1, vker2, vker3, vker4, vker5, vker6;
    __m64 s0, s1, s2, s3, s4, s5, s6, v0, v1, v2, v3, v4, v5, v6, rr, rh, rl;
    __m64 zero, _rnd;
    __m64 *sp0, *sp1, *sp2, *sp3, *sp4, *sp5, *sp6, *dp;
    mlib_s32 shift, kerh_sum;
    mlib_s32 i, j;

    /* Shrink the output region by the kernel border and position the
     * destination at the first fully-covered pixel. */
    width -= KSIZE1;
    height -= KSIZE1;
    width *= NCHAN;
    dl += (KSIZE / 2) * (dll + NCHAN);

    GET_KERN();

    zero = _mm_setzero_si64();

    for (j = 0; j < height; j++) {
        /* Seven source-row pointers for the 7-tap vertical pass. */
        sp0 = (__m64 *) sl;
        sp1 = (__m64 *) (sl + sll);
        sp2 = (__m64 *) (sl + 2 * sll);
        sp3 = (__m64 *) (sl + 3 * sll);
        sp4 = (__m64 *) (sl + 4 * sll);
        sp5 = (__m64 *) (sl + 5 * sll);
        sp6 = (__m64 *) (sl + 6 * sll);
        dp = (__m64 *) dl;

        /* Prime the vertical accumulators for the first output column. */
        PREP_V(v1);
        PREP_V(v2);
        PREP_V(v3);
        PREP_V(v4);
        PREP_V(v5);
        PREP_V(v6);

        /* Full 4-sample (one __m64) output groups. */
        for (i = 0; i < width / 4; i++) {
            CONV_7x7();
            dp[i] = rr;
        }

        /* Partial final group: merge the new samples into the existing
         * destination word under a byte mask so trailing samples keep
         * their old values. */
        if (width & 3) {
            __m64 mask = ((__m64 *) mlib_mask64_arr)[2 * (width & 3)];

            CONV_7x7();
            dp[i] = _mm_or_si64(_mm_and_si64(mask, rr), _mm_andnot_si64(mask, dp[i]));
        }

        sl += sll;
        dl += dll;
    }

    _mm_empty();  /* clear MMX state before returning */
    return (MLIB_SUCCESS);
}
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of
   them.. it is almost as fast, and gives you a free cosine with your sine */
/* Computes sine and cosine of 4 packed floats simultaneously: one shared
   Cephes-style range reduction and one shared pair of polynomial
   evaluations, with per-output sign/selection masks.  Results are written
   through sptr (sine) and cptr (cosine). */
void sincos_ps(v4sfu *xptr, v4sfu *sptr, v4sfu *cptr) {
  __m128 x=*((__m128 *)xptr), *s=(__m128 *)sptr, *c=(__m128 *)cptr, xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
#ifdef USE_SSE2
  __m128i emm0, emm2, emm4;
#else
  __m64 mm0, mm1, mm2, mm3, mm4, mm5;
#endif
  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm_and_ps(sign_bit_sin, *(__m128*)_ps_sign_mask);

  /* scale by 4/Pi */
  y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);

#ifdef USE_SSE2
  /* store the integer part of y in emm2 */
  emm2 = _mm_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
  emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
  y = _mm_cvtepi32_ps(emm2);
  emm4 = emm2;   /* keep a copy of j for the cosine sign below */
  /* get the swap sign flag for the sine */
  emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
  /* get the polynom selection mask for the sine*/
  emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  __m128 poly_mask = _mm_castsi128_ps(emm2);
#else
  /* store the integer part of y in mm2:mm3 (two MMX regs hold the 4 lanes) */
  xmm3 = _mm_movehl_ps(xmm3, y);
  mm2 = _mm_cvttps_pi32(y);
  mm3 = _mm_cvttps_pi32(xmm3);

  /* j=(j+1) & (~1) (see the cephes sources) */
  mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
  mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
  mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
  mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);

  y = _mm_cvtpi32x2_ps(mm2, mm3);

  mm4 = mm2;   /* keep a copy of j for the cosine sign below */
  mm5 = mm3;

  /* get the swap sign flag for the sine */
  mm0 = _mm_and_si64(mm2, *(__m64*)_pi32_4);
  mm1 = _mm_and_si64(mm3, *(__m64*)_pi32_4);
  mm0 = _mm_slli_pi32(mm0, 29);
  mm1 = _mm_slli_pi32(mm1, 29);
  __m128 swap_sign_bit_sin;
  COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit_sin);

  /* get the polynom selection mask for the sine */
  mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
  mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);
  mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
  mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
  __m128 poly_mask;
  COPY_MM_TO_XMM(mm2, mm3, poly_mask);
#endif

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(__m128*)_ps_minus_cephes_DP1;
  xmm2 = *(__m128*)_ps_minus_cephes_DP2;
  xmm3 = *(__m128*)_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);

#ifdef USE_SSE2
  emm4 = _mm_sub_epi32(emm4, *(__m128i*)_pi32_2);
  emm4 = _mm_andnot_si128(emm4, *(__m128i*)_pi32_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
#else
  /* get the sign flag for the cosine */
  mm4 = _mm_sub_pi32(mm4, *(__m64*)_pi32_2);
  mm5 = _mm_sub_pi32(mm5, *(__m64*)_pi32_2);
  mm4 = _mm_andnot_si64(mm4, *(__m64*)_pi32_4);
  mm5 = _mm_andnot_si64(mm5, *(__m64*)_pi32_4);
  mm4 = _mm_slli_pi32(mm4, 29);
  mm5 = _mm_slli_pi32(mm5, 29);
  __m128 sign_bit_cos;
  COPY_MM_TO_XMM(mm4, mm5, sign_bit_cos);
  _mm_empty(); /* good-bye mmx */
#endif

  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynom  (0 <= x <= Pi/4) -- cosine branch */
  __m128 z = _mm_mul_ps(x,x);
  y = *(__m128*)_ps_coscof_p0;
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *(__m128*)_ps_1);

  /* Evaluate the second polynom  (Pi/4 <= x <= 0) -- sine branch */
  __m128 y2 = *(__m128*)_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);

  /* select the correct result from the two polynoms, per lane: ysin* get the
     sine-branch lanes, the remainders in y/y2 are the cosine-branch lanes */
  xmm3 = poly_mask;
  __m128 ysin2 = _mm_and_ps(xmm3, y2);
  __m128 ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2,ysin2);
  y = _mm_sub_ps(y, ysin1);

  xmm1 = _mm_add_ps(ysin1,ysin2);
  xmm2 = _mm_add_ps(y,y2);

  /* update the sign */
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}