unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 64, 32, sse, &sum,
                vpx_get32x32var_avx2, 32);
  return *sse - (uint32_t)(((int64_t)sum * sum) >> 11);
}
unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
                vpx_get16x16var_avx2, 16);
  return *sse - (uint32_t)(((int64_t)sum * sum) >> 8);
}
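// All of these variance wrappers compute the integer identity
//   variance = sse - (sum * sum) / (w * h),
// where the division is a right shift because w * h is a power of two
// (16 * 16 = 256 gives ">> 8", 64 * 32 = 2048 gives ">> 11"). A minimal
// scalar sketch of that identity, assuming only <stdint.h>; variance_ref
// is a hypothetical reference helper for illustration, not a library
// function:
static unsigned int variance_ref(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride, int w,
                                 int h, unsigned int *sse) {
  int64_t sum = 0;
  uint64_t sse64 = 0;
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src[i * src_stride + j] - ref[i * ref_stride + j];
      sum += diff;                       // signed sum of differences
      sse64 += (uint64_t)(diff * diff);  // sum of squared differences
    }
  }
  *sse = (unsigned int)sse64;
  return (unsigned int)(sse64 - (uint64_t)((sum * sum) / (w * h)));
}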
unsigned int aom_variance32x16_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 32, 16, sse, &sum,
                aom_get32x32var_avx2, 32);
  return *sse - (uint32_t)(((int64_t)sum * sum) >> 9);
}
unsigned int aom_variance16x16_avx2(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    unsigned int *sse) {
  int sum;
  variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
                aom_get16x16var_avx2, 16);
  return *sse - (uint32_t)(((int64_t)sum * sum) >> 8);
}
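// Note on operation order in the returns above: the square is taken in 64
// bits and shifted before narrowing to uint32_t. For a 16x16 block,
// |sum| <= 16 * 16 * 255 = 65280, so sum * sum happens to fit in 32
// unsigned bits, but for larger blocks (e.g. 64x32, where |sum| can reach
// 522240) the square exceeds 2^32; widening first keeps every block size
// on the same overflow-safe pattern.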
unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr, int source_stride,
                                    const uint8_t *ref_ptr, int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // processing 32 elements vertically in parallel
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32, &var,
                &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return var - (uint32_t)(((int64_t)avg * avg) >> 10);
}
unsigned int vp9_variance16x16_avx2(const uint8_t *src_ptr, int source_stride,
                                    const uint8_t *ref_ptr, int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var,
                &avg, vp9_get16x16var_avx2, 16);
  *sse = var;
  return var - (uint32_t)(((int64_t)avg * avg) >> 8);
}
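// Minimal usage sketch; example_usage and its constant test buffers are
// hypothetical, and this assumes <stdio.h> and <string.h> are included:
void example_usage(void) {
  uint8_t src[16 * 16];
  uint8_t ref[16 * 16];
  unsigned int sse, var;
  memset(src, 128, sizeof(src));
  memset(ref, 130, sizeof(ref));  // every pixel differs by -2
  var = vpx_variance16x16_avx2(src, 16, ref, 16, &sse);
  // Each diff is -2, so sse = 256 * 4 = 1024 and sum = -512, giving
  // var = 1024 - (512 * 512 >> 8) = 1024 - 1024 = 0.
  printf("sse=%u var=%u\n", sse, var);
}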