/*
 * Downscale a YUV frame by 2x in each dimension (SIMD path).
 *
 * Each output pixel is the average of a 2x2 block of input pixels, computed
 * as two rounded vertical averages (+1 then >>1) followed by a truncating
 * horizontal average (>>1) — the scalar tail loops spell out exactly this
 * rounding, and the SIMD path reproduces it.
 *
 * sin:  source frame.
 * sout: destination frame; its width/height/strides define the output size.
 *
 * NOTE(review): the aligned v128 loads / v64 stores assume every address
 * &plane[row*stride + 2*j] and &plane[row*stride + j] touched by the SIMD
 * loops meets the required alignment — TODO confirm the frame allocator
 * guarantees this for all strides.
 */
void scale_frame_down2x2_simd(yuv_frame_t* sin, yuv_frame_t* sout) {
  int wo=sout->width; int ho=sout->height;       /* output luma dimensions */
  int so=sout->stride_y; int si=sin->stride_y;   /* output / input luma strides */
  int i, j;
  v128 ones = v128_dup_8(1);  /* madd multiplier: sums adjacent byte pairs */
  v128 z = v128_dup_8(0);     /* zero half fed to the s16 -> u8 pack */
  /* --- Luma plane --- */
  for (i=0; i<ho; ++i) {
    /* SIMD: 8 output pixels per iteration (16 input pixels per source row). */
    for (j=0; j<=wo-8; j+=8) {
      v128 a = v128_load_aligned(&sin->y[(2*i+0)*si+2*j]);  /* even input row */
      v128 b = v128_load_aligned(&sin->y[(2*i+1)*si+2*j]);  /* odd input row */
      /* Rounded vertical average of the two rows, per byte. */
      v128 c = v128_avg_u8(a,b);
      /* madd_us8 with all-ones sums each horizontal byte pair into 16 bits;
         >>1 then gives the truncating horizontal average: 8 s16 results. */
      v128 d = v128_shr_s16(v128_madd_us8(c,ones),1);
      /* Pack the s16 averages back to bytes and store 8 output pixels.
         NOTE(review): relies on v128_pack_s16_u8(z,d) placing the packed 'd'
         lanes in the half selected by v128_low_v64 — TODO confirm against the
         simd library's lane ordering. */
      v64_store_aligned(&sout->y[i*so+j], v128_low_v64(v128_pack_s16_u8(z,d)));
    }
    /* Scalar tail for widths not a multiple of 8 (same rounding as SIMD). */
    for (; j<wo; ++j) {
      sout->y[i*so+j]=( ((sin->y[(2*i+0)*si+(2*j+0)] + sin->y[(2*i+1)*si+(2*j+0)]+1)>>1)+ + ((sin->y[(2*i+0)*si+(2*j+1)] + sin->y[(2*i+1)*si+(2*j+1)]+1)>>1) )>>1;
    }
  }
#if USE_CHROMA
  /* --- Chroma planes: half the luma dimensions (4:2:0 subsampling). --- */
  int soc=sout->stride_c; int sic=sin->stride_c;
  ho /= 2; wo /= 2;
  for (int i=0; i<ho; ++i) {  /* note: intentionally shadows the outer 'i' */
    /* U plane: identical vertical-then-horizontal averaging as luma. */
    for (j=0; j<=wo-8; j+=8) {
      v128 a = v128_load_aligned(&sin->u[(2*i+0)*sic+2*j]);
      v128 b = v128_load_aligned(&sin->u[(2*i+1)*sic+2*j]);
      v128 c = v128_avg_u8(a,b);
      v128 d = v128_shr_s16(v128_madd_us8(c,ones),1);
      v64_store_aligned(&sout->u[i*soc+j], v128_low_v64(v128_pack_s16_u8(z,d)));
    }
    /* U scalar tail. */
    for (; j<wo; ++j) {
      sout->u[i*soc+j]=( ((sin->u[(2*i+0)*sic+(2*j+0)] + sin->u[(2*i+1)*sic+(2*j+0)]+1)>>1)+ + ((sin->u[(2*i+0)*sic+(2*j+1)] + sin->u[(2*i+1)*sic+(2*j+1)]+1)>>1) )>>1;
    }
    /* V plane: same as U. */
    for (j=0; j<=wo-8; j+=8) {
      v128 a = v128_load_aligned(&sin->v[(2*i+0)*sic+2*j]);
      v128 b = v128_load_aligned(&sin->v[(2*i+1)*sic+2*j]);
      v128 c = v128_avg_u8(a,b);
      v128 d = v128_shr_s16(v128_madd_us8(c,ones),1);
      v64_store_aligned(&sout->v[i*soc+j], v128_low_v64(v128_pack_s16_u8(z,d)));
    }
    /* V scalar tail. */
    for (; j<wo; ++j) {
      sout->v[i*soc+j]=( ((sin->v[(2*i+0)*sic+(2*j+0)] + sin->v[(2*i+1)*sic+(2*j+0)]+1)>>1)+ + ((sin->v[(2*i+0)*sic+(2*j+1)] + sin->v[(2*i+1)*sic+(2*j+1)]+1)>>1) )>>1;
    }
  }
#endif
  /* Extend the downscaled frame's borders. NOTE(review): purpose presumed
     from the name — confirm pad_yuv_frame semantics at its definition. */
  pad_yuv_frame(sout);
}
/*
 * Sum of squared differences between two size x size pixel blocks.
 *
 * a, b:             block base pointers (aligned as required by the
 *                   v64/v128 aligned loads).
 * astride, bstride: row strides of the two blocks in bytes.
 * size:             block dimension; 8 takes the v64 path, anything
 *                   larger is assumed to be a multiple of 16 and uses
 *                   the v128 path.
 *
 * Returns the accumulated SSD as an int.
 */
int ssd_calc_simd(uint8_t *a, uint8_t *b, int astride, int bstride, int size) {
  if (size == 8) {
    /* 8-wide blocks: one v64 load per row, eight rows total. */
    ssd64_internal acc = v64_ssd_u8_init();
    for (int row = 0; row < 8; row++) {
      acc = v64_ssd_u8(acc,
                       v64_load_aligned(a + row * astride),
                       v64_load_aligned(b + row * bstride));
    }
    return v64_ssd_u8_sum(acc);
  }

  /* General path: consume 16 pixels per step across each row. */
  ssd128_internal acc = v128_ssd_u8_init();
  for (int row = 0; row < size; row++) {
    for (int col = 0; col < size; col += 16) {
      acc = v128_ssd_u8(acc,
                        v128_load_aligned(a + row * astride + col),
                        v128_load_aligned(b + row * bstride + col));
    }
  }
  return v128_ssd_u8_sum(acc);
}
/*
 * Sum of absolute differences between two width x height pixel blocks.
 *
 * a, b:             block base pointers (aligned as required by the
 *                   v64/v128 aligned loads).
 * astride, bstride: row strides of the two blocks in bytes.
 * width, height:    block dimensions; width 8 takes the v64 path (which
 *                   consumes rows four at a time), wider blocks are
 *                   assumed to be a multiple of 16 wide and use the
 *                   v128 path.
 *
 * Returns the accumulated SAD as an int.
 */
int sad_calc_simd(uint8_t *a, uint8_t *b, int astride, int bstride, int width, int height) {
  if (width == 8) {
    sad64_internal acc = v64_sad_u8_init();
    /* Process four rows per outer step, advancing the base pointers
       afterwards (matches the original unrolled-by-4 structure, so
       heights are effectively rounded up to a multiple of 4). */
    for (int row = 0; row < height; row += 4) {
      for (int k = 0; k < 4; k++) {
        acc = v64_sad_u8(acc,
                         v64_load_aligned(a + k * astride),
                         v64_load_aligned(b + k * bstride));
      }
      a += 4 * astride;
      b += 4 * bstride;
    }
    return v64_sad_u8_sum(acc);
  }

  /* General path: consume 16 pixels per step across each row. */
  sad128_internal acc = v128_sad_u8_init();
  for (int row = 0; row < height; row++) {
    for (int col = 0; col < width; col += 16) {
      acc = v128_sad_u8(acc,
                        v128_load_aligned(a + row * astride + col),
                        v128_load_aligned(b + row * bstride + col));
    }
  }
  return v128_sad_u8_sum(acc);
}