/*Computes a 4x4 sum of absolute transformed differences between an 8-bit
   source block and an 8-bit reference block.
  src: Top-left sample of the source block; rows are systride bytes apart.
  ref: Top-left sample of the reference block; rows are rystride bytes apart.
  Return: (S + 1) >> 1, where S is the sum over all lane pairs of
   max(abs(x), abs(y)) taken after the butterfly stages below.
  When OD_CHECKASM is defined the result is compared against
   od_mc_compute_satd8_4x4_c() and a mismatch is reported on stderr.*/
int32_t od_mc_compute_satd8_4x4_sse2(const unsigned char *src, int systride,
 const unsigned char *ref, int rystride) {
  const __m64 bias = _mm_set1_pi16(0x7FFF);
  __m64 row0;
  __m64 row1;
  __m64 row2;
  __m64 row3;
  __m64 total;
  int32_t satd;
  /*One row of four 16-bit differences per register.*/
  row0 = od_load_convert_subtract_x4(src, ref);
  row1 = od_load_convert_subtract_x4(src + systride, ref + rystride);
  row2 = od_load_convert_subtract_x4(src + 2*systride, ref + 2*rystride);
  row3 = od_load_convert_subtract_x4(src + 3*systride, ref + 3*rystride);
  /*Vertical 1D transform: two butterfly stages.*/
  od_mc_butterfly_2x2_16x4(&row0, &row1, &row2, &row3);
  od_mc_butterfly_2x2_16x4(&row0, &row1, &row2, &row3);
  /*od_transpose16x4() is defined elsewhere; it sits between the vertical and
     horizontal passes.*/
  od_transpose16x4(&row0, &row1, &row2, &row3);
  /*Horizontal 1D transform: first butterfly stage only.*/
  od_mc_butterfly_2x2_16x4(&row0, &row1, &row2, &row3);
  /*The last butterfly stage is folded into the absolute-value step using
     (abs(x + y) + abs(x - y))/2 == max(abs(x), abs(y)).
    The saturating add yields 0x7FFF + min(x + y, 0), so each lane becomes
     max(abs(x), abs(y)) - 0x7FFF.*/
  row0 = _mm_sub_pi16(_mm_max_pi16(row0, row1),
   _mm_adds_pi16(_mm_add_pi16(row0, row1), bias));
  row2 = _mm_sub_pi16(_mm_max_pi16(row2, row3),
   _mm_adds_pi16(_mm_add_pi16(row2, row3), bias));
  /*Accumulate both halves, then fold the four lanes into every lane.*/
  total = _mm_add_pi16(row0, row2);
  total = _mm_add_pi16(total,
   _mm_shuffle_pi16(total, _MM_SHUFFLE(0, 1, 2, 3)));
  total = _mm_add_pi16(total,
   _mm_shuffle_pi16(total, _MM_SHUFFLE(2, 3, 0, 1)));
  /*Zero-extend lane 0 to 32 bits and move it to an integer.*/
  satd = _mm_cvtsi64_si32(_mm_unpacklo_pi16(total, _mm_setzero_si64()));
  /*Eight terms each carried a -0x7FFF bias; 8*-0x7FFF is congruent to +8
     modulo 2**16, so remove 8 before rounding.*/
  satd = (satd - 8 + 1) >> 1;
#if defined(OD_CHECKASM)
  {
    int32_t ref_satd;
    ref_satd = od_mc_compute_satd8_4x4_c(src, systride, ref, rystride);
    if (satd != ref_satd) {
      fprintf(stderr, "od_mc_compute_satd %ix%i check failed: %i!=%i\n",
       4, 4, satd, ref_satd);
    }
  }
#endif
  return satd;
}
/*
 * Horizontal 6-tap interpolation of four adjacent output samples.
 *
 * For output k (k = 0..3) the taps A..F are image[k-2] .. image[k+3], so the
 * bytes image[-2] .. image[6] are read.  Each output is
 *     (A + F + 16 + 5 * (((C + D) << 2) - (B + E))) >> 5
 * clamped to [0, 255].
 *
 * Returns the four results packed one per byte, output 0 in the least
 * significant byte.  empty() is defined elsewhere in the project.
 */
unsigned int interpolhline_mmx_2(unsigned char* image){
    /* tap[0] aliases image[-2]; output k uses tap[k] .. tap[k + 5]. */
    const unsigned char *tap = image - 2;
    const __m64 zero = _mm_setzero_si64();

    const __m64 a = _mm_set_pi16(tap[3], tap[2], tap[1], tap[0]);
    const __m64 b = _mm_set_pi16(tap[4], tap[3], tap[2], tap[1]);
    const __m64 c = _mm_set_pi16(tap[5], tap[4], tap[3], tap[2]);
    const __m64 d = _mm_set_pi16(tap[6], tap[5], tap[4], tap[3]);
    const __m64 e = _mm_set_pi16(tap[7], tap[6], tap[5], tap[4]);
    const __m64 f = _mm_set_pi16(tap[8], tap[7], tap[6], tap[5]);

    /* A + F + 16 */
    const __m64 outer = _mm_add_pi16(_mm_add_pi16(a, f), _mm_set1_pi16(16));
    /* ((C + D) << 2) - (B + E) */
    const __m64 inner = _mm_sub_pi16(_mm_slli_pi16(_mm_add_pi16(c, d), 2),
                                     _mm_add_pi16(b, e));

    __m64 acc = _mm_add_pi16(_mm_mullo_pi16(inner, _mm_set1_pi16(5)), outer);

    acc = _mm_max_pi16(acc, zero);               /* clamp below at 0 */
    acc = _mm_srai_pi16(acc, 5);
    acc = _mm_min_pi16(acc, _mm_set1_pi16(255)); /* clamp above at 255 */

    /* Narrow the four words to bytes and take the low 32 bits. */
    unsigned int packed = _mm_cvtsi64_si32(_mm_packs_pu16(acc, zero));

    empty();
    return packed;
}
/*
 * Unrounded horizontal 6-tap filter for four adjacent positions.
 *
 * For lane k (k = 0..3) the taps A..F are image[k] .. image[k+5], so the
 * bytes image[0] .. image[8] are read.  Each 16-bit lane of the result holds
 *     A + F + 5 * (((C + D) << 2) - (B + E))
 * with no rounding offset, no shift and no clamping applied here.
 */
__m64 interpolhline64_mmx(unsigned char* image){
    const __m64 five = _mm_set1_pi16(5);

    /* Outer taps: A + F */
    const __m64 outer = _mm_add_pi16(
        _mm_set_pi16(image[3], image[2], image[1], image[0]),
        _mm_set_pi16(image[8], image[7], image[6], image[5]));

    /* Second ring: B + E */
    const __m64 ring = _mm_add_pi16(
        _mm_set_pi16(image[4], image[3], image[2], image[1]),
        _mm_set_pi16(image[7], image[6], image[5], image[4]));

    /* Centre taps: C + D */
    const __m64 centre = _mm_add_pi16(
        _mm_set_pi16(image[5], image[4], image[3], image[2]),
        _mm_set_pi16(image[6], image[5], image[4], image[3]));

    /* 5 * (4 * (C + D) - (B + E)) == 20 * (C + D) - 5 * (B + E) */
    const __m64 weighted = _mm_mullo_pi16(
        _mm_sub_pi16(_mm_slli_pi16(centre, 2), ring), five);

    return _mm_add_pi16(weighted, outer);
}
/*
 * Vertical 6-tap interpolation of four adjacent columns, averaged with the
 * row below.
 *
 * Taps A..F come from rows -2 .. +3 relative to image (rows are
 * PicWidthInPix bytes apart); four consecutive bytes are read from each row.
 * The filtered value is
 *     (A + F + 16 + 5 * (((C + D) << 2) - (B + E))) >> 5
 * clamped to [0, 255], and is then combined with the D-row sample as
 * (filtered + D + 1) >> 1.
 *
 * Returns the four results packed one per byte, column 0 in the least
 * significant byte.  empty() is defined elsewhere in the project.
 */
unsigned int interpolvline_mmx_3(unsigned char* image, int PicWidthInPix){
    const unsigned char *row_a = image + (-2 * PicWidthInPix);
    const unsigned char *row_b = image + (-1 * PicWidthInPix);
    const unsigned char *row_c = image;
    const unsigned char *row_d = image + (1 * PicWidthInPix);
    const unsigned char *row_e = image + (2 * PicWidthInPix);
    const unsigned char *row_f = image + (3 * PicWidthInPix);
    const __m64 zero = _mm_setzero_si64();

    const __m64 a = _mm_set_pi16(row_a[3], row_a[2], row_a[1], row_a[0]);
    const __m64 b = _mm_set_pi16(row_b[3], row_b[2], row_b[1], row_b[0]);
    const __m64 c = _mm_set_pi16(row_c[3], row_c[2], row_c[1], row_c[0]);
    const __m64 d = _mm_set_pi16(row_d[3], row_d[2], row_d[1], row_d[0]);
    const __m64 e = _mm_set_pi16(row_e[3], row_e[2], row_e[1], row_e[0]);
    const __m64 f = _mm_set_pi16(row_f[3], row_f[2], row_f[1], row_f[0]);

    /* A + F + 16 */
    const __m64 outer = _mm_add_pi16(_mm_add_pi16(a, f), _mm_set1_pi16(16));
    /* ((C + D) << 2) - (B + E) */
    const __m64 inner = _mm_sub_pi16(_mm_slli_pi16(_mm_add_pi16(c, d), 2),
                                     _mm_add_pi16(b, e));

    __m64 acc = _mm_add_pi16(_mm_mullo_pi16(inner, _mm_set1_pi16(5)), outer);

    acc = _mm_max_pi16(acc, zero);               /* clamp below at 0 */
    acc = _mm_srai_pi16(acc, 5);
    acc = _mm_min_pi16(acc, _mm_set1_pi16(255)); /* clamp above at 255 */

    /* Byte-wise rounding average.  Both operands hold 0..255 in the low byte
     * of each word with a zero high byte, so each word averages cleanly. */
    acc = _mm_avg_pu8(acc, d);

    unsigned int packed = _mm_cvtsi64_si32(_mm_packs_pu16(acc, zero));

    empty();
    return packed;
}
/*Loads four 8-bit samples from each of src_p and ref_p and returns the four
   differences src_p[i] - ref_p[i] as signed 16-bit lanes (lane i holds
   sample i; each value lies in [-255, 255]).
  src_p: Four readable source bytes; no alignment is required.
  ref_p: Four readable reference bytes; no alignment is required.*/
OD_SIMD_INLINE __m64 od_load_convert_subtract_x4(const unsigned char *src_p,
 const unsigned char *ref_p) {
  uint32_t src_bits;
  uint32_t ref_bits;
  __m64 src_vec;
  __m64 ref_vec;
  /*Assemble each 32-bit word from individual bytes (byte 0 least
     significant) instead of dereferencing a casted uint32_t pointer.
    That cast read an unsigned char buffer through an incompatible lvalue
     type, could be misaligned, and discarded const.
    The resulting value matches the little-endian 32-bit load it replaces.*/
  src_bits = (uint32_t)src_p[0] | ((uint32_t)src_p[1] << 8)
   | ((uint32_t)src_p[2] << 16) | ((uint32_t)src_p[3] << 24);
  ref_bits = (uint32_t)ref_p[0] | ((uint32_t)ref_p[1] << 8)
   | ((uint32_t)ref_p[2] << 16) | ((uint32_t)ref_p[3] << 24);
  src_vec = _mm_cvtsi32_si64((int)src_bits);
  ref_vec = _mm_cvtsi32_si64((int)ref_bits);
  /*Interleaving gives words src | ref << 8 and ref | ref << 8; the high bytes
     cancel in the 16-bit subtraction, leaving src - ref per word.*/
  src_vec = _mm_unpacklo_pi8(src_vec, ref_vec);
  ref_vec = _mm_unpacklo_pi8(ref_vec, ref_vec);
  return _mm_sub_pi16(src_vec, ref_vec);
}
/*One add/subtract butterfly stage over four vectors of 16-bit lanes,
   performed in place:
    *t0 <- t0 + t1
    *t1 <- t2 + t3
    *t2 <- t0 - t1
    *t3 <- t2 - t3
  All four inputs are read before any output is written.
  Arithmetic wraps modulo 2**16 per lane.*/
OD_SIMD_INLINE void od_mc_butterfly_2x2_16x4(__m64 *t0, __m64 *t1,
 __m64 *t2, __m64 *t3) {
  const __m64 in0 = *t0;
  const __m64 in1 = *t1;
  const __m64 in2 = *t2;
  const __m64 in3 = *t3;
  /*x - y equals (x + y) - (y + y) in wrapping 16-bit arithmetic, so the
     differences are formed directly.*/
  *t0 = _mm_add_pi16(in0, in1);
  *t1 = _mm_add_pi16(in2, in3);
  *t2 = _mm_sub_pi16(in0, in1);
  *t3 = _mm_sub_pi16(in2, in3);
}
/*
 * Edge filter applied to two adjacent pixel columns across the boundary
 * between row pix[-xstride] (p0) and row pix[0] (q0).
 *
 * Reads two bytes from each of the rows p1 = pix[-2*xstride],
 * p0 = pix[-xstride], q0 = pix[0] and q1 = pix[xstride].
 * A column is filtered only when
 *     abs(q0 - p0) < alpha  &&  abs(p0 - p1) < beta  &&  abs(q0 - q1) < beta.
 * For such a column
 *     delta = clip(-tc0, tc0, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
 *     p0'   = clamp255(p0 + delta)
 *     q0'   = clamp255(q0 - delta)
 * Columns that fail the test are written back unchanged.
 *
 * Writes exactly two bytes at pix[-xstride] and two bytes at pix[0].
 * empty() is defined elsewhere in the project.
 * NOTE(review): this has the shape of an H.264-style normal-strength chroma
 * deblocking step; confirm against the caller whether tc0 already includes
 * any +1 adjustment.
 */
void weak_horizontal_chroma_MMX(unsigned char pix[], const int xstride,
                                const unsigned char alpha ,
                                const unsigned char beta,
                                const unsigned char tc0)
{
    const __m64 zero = _mm_setzero_si64();

    /* Two samples per row in lanes 0-1; lanes 2-3 are zero padding. */
    const __m64 mp1 = _mm_set_pi16(0, 0, pix[-2*xstride + 1], pix[-2*xstride]);
    const __m64 mp0 = _mm_set_pi16(0, 0, pix[-1*xstride + 1], pix[-1*xstride]);
    const __m64 mq0 = _mm_set_pi16(0, 0, pix[1], pix[0]);
    const __m64 mq1 = _mm_set_pi16(0, 0, pix[xstride + 1], pix[xstride]);

    /* Signed differences; the abs() tests below are two-sided compares. */
    const __m64 mdiff_q0_p0 = _mm_sub_pi16(mq0, mp0);
    const __m64 mdiff_p0_p1 = _mm_sub_pi16(mp0, mp1);
    const __m64 mdiff_q0_q1 = _mm_sub_pi16(mq0, mq1);

    const __m64 malpha  = _mm_set_pi16(0, 0, alpha, alpha);
    const __m64 malphab = _mm_set_pi16(0, 0, -alpha, -alpha);
    const __m64 mbeta   = _mm_set_pi16(0, 0, beta, beta);
    const __m64 mbetab  = _mm_set_pi16(0, 0, -beta, -beta);

    /* -bound < diff < bound, i.e. abs(diff) < bound */
    const __m64 mask0 = _mm_and_si64(_mm_cmpgt_pi16(malpha, mdiff_q0_p0),
                                     _mm_cmpgt_pi16(mdiff_q0_p0, malphab));
    const __m64 mask1 = _mm_and_si64(_mm_cmpgt_pi16(mbeta, mdiff_p0_p1),
                                     _mm_cmpgt_pi16(mdiff_p0_p1, mbetab));
    const __m64 mask2 = _mm_and_si64(_mm_cmpgt_pi16(mbeta, mdiff_q0_q1),
                                     _mm_cmpgt_pi16(mdiff_q0_q1, mbetab));
    const __m64 first_mask = _mm_and_si64(_mm_and_si64(mask0, mask1), mask2);

    /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, arithmetic shift */
    const __m64 minter_1 = _mm_slli_pi16(mdiff_q0_p0, 2);
    const __m64 minter_3 = _mm_add_pi16(_mm_sub_pi16(mp1, mq1),
                                        _mm_set1_pi16(4));
    const __m64 minter5  = _mm_srai_pi16(_mm_add_pi16(minter_3, minter_1), 3);

    /* Clip the correction to [-tc0, tc0], then zero it where the mask is off. */
    const __m64 m_tc0  = _mm_set_pi16(0, 0, tc0, tc0);
    const __m64 m_tcb0 = _mm_set_pi16(0, 0, -tc0, -tc0);
    const __m64 mres_c3 = _mm_min_pi16(_mm_max_pi16(minter5, m_tcb0), m_tc0);
    const __m64 merror2 = _mm_and_si64(mres_c3, first_mask);

    const __m64 result_p0 = _mm_add_pi16(merror2, mp0);
    const __m64 result_q0 = _mm_sub_pi16(mq0, merror2);

    /* Saturate to bytes; only the low two bytes of each word are meaningful. */
    const unsigned int p0_bytes =
        (unsigned int)_mm_cvtsi64_si32(_mm_packs_pu16(result_p0, zero));
    const unsigned int q0_bytes =
        (unsigned int)_mm_cvtsi64_si32(_mm_packs_pu16(result_q0, zero));

    /* Store byte by byte: writing through a casted unsigned short pointer
     * would access the unsigned char buffer through an incompatible type and
     * could be misaligned. */
    pix[-xstride]     = (unsigned char)(p0_bytes & 0xFFu);
    pix[-xstride + 1] = (unsigned char)((p0_bytes >> 8) & 0xFFu);
    pix[0]            = (unsigned char)(q0_bytes & 0xFFu);
    pix[1]            = (unsigned char)((q0_bytes >> 8) & 0xFFu);

    empty();
}
//n_2
/*
 * Unrounded vertical 6-tap filter for four consecutive rows of one column.
 *
 * Reads the nine samples image[r * PicWidthInPix] for r = -2 .. 6.  Lane k
 * (k = 0..3) of the result uses taps A..F from rows k-2 .. k+3 and holds
 *     A + F + 5 * (((C + D) << 2) - (B + E))
 * with no rounding offset, no shift and no clamping applied here.
 *
 * empty() is defined elsewhere in the project and is called before returning.
 * NOTE(review): if empty() resets MMX state, calling it while an __m64 value
 * is still being returned looks questionable -- confirm it is intended.
 */
__m64 interpolvline64_mmx(unsigned char* image, const unsigned short PicWidthInPix){
    short col[9];
    int k;

    /* col[k] is the sample k - 2 rows away from image. */
    for (k = 0; k < 9; ++k) {
        col[k] = image[(k - 2) * PicWidthInPix];
    }

    const __m64 a = _mm_set_pi16(col[3], col[2], col[1], col[0]);
    const __m64 b = _mm_set_pi16(col[4], col[3], col[2], col[1]);
    const __m64 c = _mm_set_pi16(col[5], col[4], col[3], col[2]);
    const __m64 d = _mm_set_pi16(col[6], col[5], col[4], col[3]);
    const __m64 e = _mm_set_pi16(col[7], col[6], col[5], col[4]);
    const __m64 f = _mm_set_pi16(col[8], col[7], col[6], col[5]);

    /* ((C + D) << 2) - (B + E) */
    const __m64 inner = _mm_sub_pi16(_mm_slli_pi16(_mm_add_pi16(c, d), 2),
                                     _mm_add_pi16(b, e));
    /* 5 * inner + (A + F) */
    const __m64 sum = _mm_add_pi16(_mm_mullo_pi16(inner, _mm_set1_pi16(5)),
                                   _mm_add_pi16(a, f));

    empty();
    return sum;
}
/* Code-generation check for _mm_sub_pi16: returns the lane-wise 16-bit
 * difference a - b.  The CHECK line is presumably a directive read by the
 * test harness -- keep it verbatim. */
__m64 test42(__m64 a, __m64 b) {
  // CHECK: psubw
  __m64 diff = _mm_sub_pi16(a, b);
  return diff;
}
//mbl test with Pocket_PC
/*
 * Edge filter applied to four adjacent pixel columns across the boundary
 * between row pix[-xstride] (p0) and row pix[0] (q0).
 *
 * Reads four bytes from each of the rows p2, p1, p0 (at -3, -2, -1 times
 * xstride) and q0, q1, q2 (at 0, +1, +2 times xstride).
 *
 * Per column, with avg = (p0 + q0 + 1) >> 1:
 *   filter = abs(q0 - p0) < alpha && abs(p0 - p1) < beta
 *                                 && abs(q0 - q1) < beta
 *   ap     = abs(p2 - p0) < beta
 *   aq     = abs(q2 - q0) < beta
 *   p1' = p1 + clip(-tc0, tc0, ((p2 + avg) >> 1) - p1)   if filter && ap
 *   q1' = q1 + clip(-tc0, tc0, ((q2 + avg) >> 1) - q1)   if filter && aq
 *   tc    = tc0 + ap + aq
 *   delta = clip(-tc, tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
 *   p0' = p0 + delta,  q0' = q0 - delta                  if filter
 * Results are saturated to [0, 255]; unfiltered samples are written back
 * unchanged.
 *
 * Writes four bytes to each of the rows p1, p0, q0 and q1, in that order.
 * empty() is defined elsewhere in the project.
 * NOTE(review): this has the shape of an H.264-style normal-strength luma
 * deblocking step -- confirm against the caller.
 */
void weak_horizontal_luma_MMX(unsigned char pix[], const int xstride, const unsigned char alpha, const unsigned char beta, const unsigned char tc0){
    const __m64 zero = _mm_setzero_si64();
    const __m64 m_1  = _mm_set1_pi16(1);

    /* One row of four samples per register, widened to 16-bit lanes. */
    const __m64 mp2 = _mm_set_pi16(pix[-3*xstride + 3], pix[-3*xstride + 2], pix[-3*xstride + 1], pix[-3*xstride]);
    const __m64 mp1 = _mm_set_pi16(pix[-2*xstride + 3], pix[-2*xstride + 2], pix[-2*xstride + 1], pix[-2*xstride]);
    const __m64 mp0 = _mm_set_pi16(pix[-1*xstride + 3], pix[-1*xstride + 2], pix[-1*xstride + 1], pix[-1*xstride]);
    const __m64 mq0 = _mm_set_pi16(pix[3], pix[2], pix[1], pix[0]);
    const __m64 mq1 = _mm_set_pi16(pix[xstride + 3], pix[xstride + 2], pix[xstride + 1], pix[xstride]);
    const __m64 mq2 = _mm_set_pi16(pix[2*xstride + 3], pix[2*xstride + 2], pix[2*xstride + 1], pix[2*xstride]);

    /* (p0 + q0 + 1) >> 1.  The byte-wise average is exact here because the
     * high byte of every word is zero. */
    const __m64 maddp0_q0 = _mm_avg_pu8(mp0, mq0);

    /* ((p2 + avg) >> 1) - p1  and  ((q2 + avg) >> 1) - q1 */
    const __m64 mp1_c = _mm_sub_pi16(
        _mm_srli_pi16(_mm_add_pi16(maddp0_q0, mp2), 1), mp1);
    const __m64 mq1_c = _mm_sub_pi16(
        _mm_srli_pi16(_mm_add_pi16(maddp0_q0, mq2), 1), mq1);

    /* Thresholds and their negations for the two-sided abs() tests. */
    const __m64 malpha  = _mm_set1_pi16(alpha);
    const __m64 malphab = _mm_set1_pi16(-alpha);
    const __m64 mbeta   = _mm_set1_pi16(beta);
    const __m64 mbetab  = _mm_set1_pi16(-beta);

    /* Signed differences. */
    const __m64 mdiff_q0_p0 = _mm_sub_pi16(mq0, mp0);
    const __m64 mdiff_p0_p1 = _mm_sub_pi16(mp0, mp1);
    const __m64 mdiff_q0_q1 = _mm_sub_pi16(mq0, mq1);
    const __m64 mdiff_p2_p0 = _mm_sub_pi16(mp2, mp0);
    const __m64 mdiff_q2_q0 = _mm_sub_pi16(mq2, mq0);

    /* -bound < diff < bound, i.e. abs(diff) < bound */
    const __m64 mask0 = _mm_and_si64(_mm_cmpgt_pi16(malpha, mdiff_q0_p0),
                                     _mm_cmpgt_pi16(mdiff_q0_p0, malphab));
    const __m64 mask1 = _mm_and_si64(_mm_cmpgt_pi16(mbeta, mdiff_p0_p1),
                                     _mm_cmpgt_pi16(mdiff_p0_p1, mbetab));
    const __m64 mask2 = _mm_and_si64(_mm_cmpgt_pi16(mbeta, mdiff_q0_q1),
                                     _mm_cmpgt_pi16(mdiff_q0_q1, mbetab));
    const __m64 mask3 = _mm_and_si64(_mm_cmpgt_pi16(mbeta, mdiff_p2_p0),
                                     _mm_cmpgt_pi16(mdiff_p2_p0, mbetab));
    const __m64 mask4 = _mm_and_si64(_mm_cmpgt_pi16(mbeta, mdiff_q2_q0),
                                     _mm_cmpgt_pi16(mdiff_q2_q0, mbetab));

    /* abs(q0 - p0) < alpha && abs(p0 - p1) < beta && abs(q0 - q1) < beta */
    const __m64 first_mask  = _mm_and_si64(_mm_and_si64(mask0, mask1), mask2);
    const __m64 second_mask = _mm_and_si64(first_mask, mask3);
    const __m64 third_mask  = _mm_and_si64(first_mask, mask4);

    /* (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, arithmetic shift */
    const __m64 minter_1 = _mm_slli_pi16(mdiff_q0_p0, 2);
    const __m64 minter_3 = _mm_add_pi16(_mm_sub_pi16(mp1, mq1),
                                        _mm_set1_pi16(4));
    const __m64 minter5  = _mm_srai_pi16(_mm_add_pi16(minter_3, minter_1), 3);

    /* Corrections for p1 / q1, clipped to [-tc0, tc0] and masked. */
    const __m64 m_tc0  = _mm_set1_pi16(tc0);
    const __m64 m_tcb0 = _mm_set1_pi16(-tc0);
    const __m64 merror0 = _mm_and_si64(
        _mm_min_pi16(_mm_max_pi16(mp1_c, m_tcb0), m_tc0), second_mask);
    const __m64 merror1 = _mm_and_si64(
        _mm_min_pi16(_mm_max_pi16(mq1_c, m_tcb0), m_tc0), third_mask);

    /* tc grows by one for each of the p2 / q2 tests that passed. */
    const __m64 m_and1 = _mm_and_si64(mask3, m_1);
    const __m64 m_and2 = _mm_and_si64(mask4, m_1);
    const __m64 m_tc   = _mm_add_pi16(m_and2, _mm_add_pi16(m_tc0, m_and1));
    const __m64 m_tcn  = _mm_sub_pi16(_mm_sub_pi16(m_tcb0, m_and1), m_and2);

    /* Correction for p0 / q0, clipped to [-tc, tc] and masked. */
    const __m64 merror2 = _mm_and_si64(
        _mm_min_pi16(_mm_max_pi16(minter5, m_tcn), m_tc), first_mask);

    const __m64 result_p1 = _mm_add_pi16(merror0, mp1);
    const __m64 result_q1 = _mm_add_pi16(merror1, mq1);
    const __m64 result_p0 = _mm_add_pi16(merror2, mp0);
    const __m64 result_q0 = _mm_sub_pi16(mq0, merror2);

    /* Saturate each row to four bytes held in the low 32 bits. */
    const unsigned int row_bytes[4] = {
        (unsigned int)_mm_cvtsi64_si32(_mm_packs_pu16(result_p1, zero)),
        (unsigned int)_mm_cvtsi64_si32(_mm_packs_pu16(result_p0, zero)),
        (unsigned int)_mm_cvtsi64_si32(_mm_packs_pu16(result_q0, zero)),
        (unsigned int)_mm_cvtsi64_si32(_mm_packs_pu16(result_q1, zero))
    };
    unsigned char *const row_ptr[4] = {
        &pix[-2*xstride], &pix[-xstride], &pix[0], &pix[xstride]
    };
    int r;
    int i;

    /* Store byte by byte, row after row in the original order: writing
     * through a casted unsigned int pointer would access the unsigned char
     * buffer through an incompatible type and could be misaligned. */
    for (r = 0; r < 4; ++r) {
        for (i = 0; i < 4; ++i) {
            row_ptr[r][i] = (unsigned char)((row_bytes[r] >> (8 * i)) & 0xFFu);
        }
    }

    empty();
}
/* Code-generation check for _mm_sub_pi16: returns the lane-wise 16-bit
 * difference a - b.  The CHECK lines are presumably directives read by the
 * test harness -- keep them verbatim. */
__m64 test_mm_sub_pi16(__m64 a, __m64 b) {
  // CHECK-LABEL: test_mm_sub_pi16
  // CHECK: call x86_mmx @llvm.x86.mmx.psub.w
  __m64 diff = _mm_sub_pi16(a, b);
  return diff;
}