void BlockMatcher::RefineMatchSubp(const int xpos, const int ypos, const MVector& mv_prediction, const float lambda) { BlockDiffParams dparams; dparams.SetBlockLimits( m_bparams , m_pic_data , xpos , ypos); m_cost_array[ypos][xpos].mvcost = GetVarUp( mv_prediction, m_mv_array[ypos][xpos]<<m_precision ); m_cost_array[ypos][xpos].SetTotal( lambda ); // Initialise to the best pixel value MvCostData best_costs( m_cost_array[ypos][xpos] ); MVector pel_mv( m_mv_array[ypos][xpos] ); MVector best_mv( pel_mv ); // If the integer value is good enough, bail out if ( best_costs.SAD < 2*dparams.Xl()*dparams.Yl() ) { m_mv_array[ypos][xpos] = m_mv_array[ypos][xpos]<<m_precision; return; } // Next, test the predictor. If that's good enough, bail out MvCostData pred_costs; pred_costs.mvcost = 0; pred_costs.SAD = m_subpeldiff[m_precision-1]->Diff( dparams, mv_prediction); pred_costs.total = pred_costs.SAD; if (pred_costs.SAD<2*dparams.Xl()*dparams.Yl() ) { m_mv_array[ypos][xpos] = mv_prediction; m_cost_array[ypos][xpos] = pred_costs; return; } // Now, let's see if we can do better than this MvCostData cand_costs; MVector cand_mv, old_best_mv; for (int i=1; i<=m_precision; ++i ) { best_mv = best_mv<<1; MVector temp_best_mv = best_mv; // Do a neighbourhood of best_mv // Stage 1 - look at the 4 nearest points cand_mv.x = best_mv.x - 1; cand_mv.y = best_mv.y; m_subpeldiff[i-1]->Diff( dparams, cand_mv , GetVarUp( mv_prediction, cand_mv<<(m_precision-i) ) , lambda , best_costs , temp_best_mv); cand_mv.x = best_mv.x + 1; cand_mv.y = best_mv.y; m_subpeldiff[i-1]->Diff( dparams, cand_mv , GetVarUp( mv_prediction, cand_mv<<(m_precision-i) ) , lambda , best_costs , temp_best_mv); cand_mv.x = best_mv.x; cand_mv.y = best_mv.y - 1; m_subpeldiff[i-1]->Diff( dparams, cand_mv , GetVarUp( mv_prediction, cand_mv<<(m_precision-i) ) , lambda , best_costs , temp_best_mv); cand_mv.x = best_mv.x; cand_mv.y = best_mv.y + 1; m_subpeldiff[i-1]->Diff( dparams, cand_mv , GetVarUp( mv_prediction, 
cand_mv<<(m_precision-i) ) , lambda , best_costs , temp_best_mv); // Stage 2. If we've done better than the original value, // look at the other two neighbours if ( temp_best_mv.x != best_mv.x ) { MVector new_best_mv = temp_best_mv; cand_mv.x = new_best_mv.x; cand_mv.y = new_best_mv.y - 1; m_subpeldiff[i-1]->Diff( dparams, cand_mv , GetVarUp( mv_prediction, cand_mv<<(m_precision-i) ) , lambda , best_costs , temp_best_mv); cand_mv.x = new_best_mv.x; cand_mv.y = new_best_mv.y + 1; m_subpeldiff[i-1]->Diff( dparams, cand_mv , GetVarUp( mv_prediction, cand_mv<<(m_precision-i) ) , lambda , best_costs , temp_best_mv); } else if ( temp_best_mv.y != best_mv.y ) { MVector new_best_mv = temp_best_mv; cand_mv.x = new_best_mv.x - 1; cand_mv.y = new_best_mv.y; m_subpeldiff[i-1]->Diff( dparams, cand_mv , GetVarUp( mv_prediction, cand_mv<<(m_precision-i) ) , lambda , best_costs , temp_best_mv); cand_mv.x = new_best_mv.x + 1; cand_mv.y = new_best_mv.y; m_subpeldiff[i-1]->Diff( dparams, cand_mv , GetVarUp( mv_prediction, cand_mv<<(m_precision-i) ) , lambda , best_costs , temp_best_mv); } best_mv = temp_best_mv; // Bail out if we can't do better than 10% worse than the predictor at // each stage if ( best_costs.total>1.1*pred_costs.total ) { m_mv_array[ypos][xpos] = mv_prediction; m_cost_array[ypos][xpos] = pred_costs; return; } }//i // Write the results in the arrays // ///////////////////////////////////// m_mv_array[ypos][xpos] = best_mv; m_cost_array[ypos][xpos] = best_costs; }
/*
 * Intra cost of a block: compute the block's mean (DC) sample value, then
 * return the sum of absolute differences between every sample and that DC.
 * Samples are processed four at a time with MMX; any columns left over
 * when the width is not a multiple of four are handled by scalar code.
 *
 * dparams  - supplies the block's top-left corner (Xp, Yp) and size (Xl, Yl)
 * pic_data - picture the block is read from
 * dc_val   - output: the block's DC value (integer-divided sample sum)
 *
 * Returns the sum over the block of |sample - dc_val|.
 */
CalcValueType simple_intra_block_diff_mmx_4 (
        const BlockDiffParams& dparams, const PicArray& pic_data,
        ValueType &dc_val)
{
    const int block_w = dparams.Xl();
    const int block_h = dparams.Yl();

    // Columns covered by whole groups of four, and the remainder
    const int simd_w = (block_w >> 2) << 2;
    const int tail_w = block_w - simd_w;

    // Step from the end of one block row to the start of the next
    const int row_skip = pic_data.LengthX() - block_w;

    const ValueType * const block_origin =
        &(pic_data[dparams.Yp()][dparams.Xp()]);

    const __m64 zero = _mm_set_pi16(0, 0, 0, 0);

    // Two 32-bit partial sums held in one MMX register
    u_mmx_val acc;
    acc.i[0] = acc.i[1] = 0;
    CalcValueType tail_sum = 0;

    // ---- Pass 1: sum of all samples, to derive the DC value ----
    const ValueType *cursor = block_origin;
    for (int row = 0; row < block_h; ++row)
    {
        for (int left = simd_w; left > 0; left -= 4)
        {
            const __m64 px = *reinterpret_cast<const __m64 *>(cursor);

            // Upper two samples: interleave with zero, then shift up and
            // arithmetically back down to sign-extend to 32 bits
            __m64 hi = _mm_unpackhi_pi16 (px, zero);
            hi = _mm_slli_pi32 (hi, 16);
            hi = _mm_srai_pi32 (hi, 16);

            // Lower two samples: duplicate each into both halves of a
            // 32-bit lane so an arithmetic shift sign-extends it
            __m64 lo = _mm_unpacklo_pi16 (px, px);
            lo = _mm_srai_pi32 (lo, 16);

            acc.m = _mm_add_pi32 (acc.m, _mm_add_pi32 (lo, hi));
            cursor += 4;
        }

        // Scalar mop-up of the remaining columns
        for (int k = 0; k < tail_w; ++k)
        {
            tail_sum += *cursor;
            ++cursor;
        }
        cursor += row_skip;
    }

    const CalcValueType int_dc =
        (acc.i[0] + acc.i[1] + tail_sum)/(block_w*block_h);
    dc_val = static_cast<ValueType>( int_dc );

    // ---- Pass 2: sum of |sample - dc| ----
    const __m64 dc_lanes = _mm_set_pi16 ( dc_val, dc_val , dc_val , dc_val);
    acc.m = zero;
    tail_sum = 0;

    cursor = block_origin;
    for (int row = 0; row < block_h; ++row)
    {
        for (int left = simd_w; left > 0; left -= 4)
        {
            const __m64 px = *reinterpret_cast<const __m64 *>(cursor);

            // 16-bit absolute difference: (d ^ sign) - sign, where sign
            // is all ones for negative lanes and zero otherwise
            __m64 diff = _mm_sub_pi16 (px, dc_lanes);
            const __m64 sign = _mm_srai_pi16 (diff, 15);
            diff = _mm_xor_si64 (diff, sign);
            diff = _mm_sub_pi16 (diff, sign);

            // Widen to 32 bits: upper pair via interleave with zero,
            // lower pair via duplicate-and-shift
            const __m64 hi = _mm_unpackhi_pi16 (diff, zero);
            __m64 lo = _mm_unpacklo_pi16 (diff, diff);
            lo = _mm_srai_pi32 (lo, 16);

            acc.m = _mm_add_pi32 (acc.m, _mm_add_pi32 (lo, hi));
            cursor += 4;
        }

        // Scalar mop-up of the remaining columns
        for (int k = 0; k < tail_w; ++k)
        {
            tail_sum += std::abs(*cursor - dc_val);
            ++cursor;
        }
        cursor += row_skip;
    }

    const CalcValueType intra_cost = acc.i[0] + acc.i[1] + tail_sum;

    // Leave MMX state before returning to the caller
    _mm_empty();

    return intra_cost;
}