static void highbd_convolve_avg_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                      uint8_t *dst8, ptrdiff_t dst_stride,
                                      const InterpKernel *x_filters, int x0_q4,
                                      int x_step_q4, int w, int h, int bd) {
  int x, y;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= SUBPEL_TAPS / 2 - 1;
  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
      dst[x] = ROUND_POWER_OF_TWO(
          dst[x] + clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
          1);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}
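/* All of the high-bit-depth kernels in this collection pass uint16_t sample
 * buffers through uint8_t * interfaces via the CONVERT_TO_SHORTPTR /
 * CONVERT_TO_BYTEPTR pointer-punning macros. A minimal sketch of the assumed
 * definitions follows (taken from the libvpx/libaom common headers; the exact
 * header and casts may differ between library versions). A pointer must have
 * been produced by CONVERT_TO_BYTEPTR before CONVERT_TO_SHORTPTR is applied
 * to it. */
#define CONVERT_TO_SHORTPTR(x) ((uint16_t *)(((uintptr_t)(x)) << 1))
#define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1))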
void aom_yv12_partial_copy_v_c(const YV12_BUFFER_CONFIG *src_bc,
                               YV12_BUFFER_CONFIG *dst_bc, int hstart, int hend,
                               int vstart, int vend) {
  int row;
  const uint8_t *src = src_bc->v_buffer;
  uint8_t *dst = dst_bc->v_buffer;

  if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
    const uint16_t *src16 =
        CONVERT_TO_SHORTPTR(src + vstart * src_bc->uv_stride + hstart);
    uint16_t *dst16 =
        CONVERT_TO_SHORTPTR(dst + vstart * dst_bc->uv_stride + hstart);
    for (row = vstart; row < vend; ++row) {
      memcpy(dst16, src16, (hend - hstart) * sizeof(uint16_t));
      src16 += src_bc->uv_stride;
      dst16 += dst_bc->uv_stride;
    }
    return;
  }
  src = (src + vstart * src_bc->uv_stride + hstart);
  dst = (dst + vstart * dst_bc->uv_stride + hstart);

  for (row = vstart; row < vend; ++row) {
    memcpy(dst, src, (hend - hstart));
    src += src_bc->uv_stride;
    dst += dst_bc->uv_stride;
  }
}
static INLINE unsigned int highbd_masked_sad(const uint8_t *src8, int src_stride,
                                             const uint8_t *a8, int a_stride,
                                             const uint8_t *b8, int b_stride,
                                             const uint8_t *m, int m_stride,
                                             int width, int height) {
  int y, x;
  unsigned int sad = 0;
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++) {
      const uint16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
      sad += abs(pred - src[x]);
    }
    src += src_stride;
    a += a_stride;
    b += b_stride;
    m += m_stride;
  }
  sad = (sad + 31) >> 6;
  return sad;
}
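/* The masked-SAD and blend kernels above and below rely on the A64 blend
 * helpers. A minimal sketch of the assumed macros follows (modeled on
 * aom_dsp/blend.h and the common rounding helper; the exact definitions may
 * differ between library versions): the alpha value lies in [0, 64] and the
 * six fractional bits of the weighted sum are rounded away. */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))
#define AOM_BLEND_A64_ROUND_BITS 6
#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS)  // 64
#define AOM_BLEND_A64(a, v0, v1)                                           \
  ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
                     AOM_BLEND_A64_ROUND_BITS)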
void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride,
                                  const uint8_t *src0_8, uint32_t src0_stride,
                                  const uint8_t *src1_8, uint32_t src1_stride,
                                  const uint8_t *mask, int h, int w, int bd) {
  int i, j;
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
  (void)bd;

  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  assert(bd == 8 || bd == 10 || bd == 12);

  for (i = 0; i < h; ++i) {
    const int m = mask[i];
    for (j = 0; j < w; ++j) {
      dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
                                              src1[i * src1_stride + j]);
    }
  }
}
void vpx_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc,
                       YV12_BUFFER_CONFIG *dst_ybc) {
  int row;
  const uint8_t *src = src_ybc->y_buffer;
  uint8_t *dst = dst_ybc->y_buffer;

#if CONFIG_VP9_HIGHBITDEPTH
  if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) {
    const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
    uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
    for (row = 0; row < src_ybc->y_height; ++row) {
      memcpy(dst16, src16, src_ybc->y_width * sizeof(uint16_t));
      src16 += src_ybc->y_stride;
      dst16 += dst_ybc->y_stride;
    }
    return;
  }
#endif

  for (row = 0; row < src_ybc->y_height; ++row) {
    memcpy(dst, src, src_ybc->y_width);
    src += src_ybc->y_stride;
    dst += dst_ybc->y_stride;
  }
}
static void highbd_convolve_avg_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                     uint8_t *dst8, ptrdiff_t dst_stride,
                                     const InterpKernel *y_filters, int y0_q4,
                                     int y_step_q4, int w, int h, int bd) {
  int x, y;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_y[k * src_stride] * y_filter[k];
      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
          dst[y * dst_stride] +
              clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
          1);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}
static INLINE unsigned int highbd_masked_sad4xh_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y += 2) {
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
                           _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
                           _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
    // Zero-extend mask to 16 bits
    const __m128i m = _mm_unpacklo_epi8(
        _mm_unpacklo_epi32(
            _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
            _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
        _mm_setzero_si128());
    const __m128i m_inv = _mm_sub_epi16(mask_max, m);

    const __m128i data_l = _mm_unpacklo_epi16(a, b);
    const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
    __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
    pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i data_r = _mm_unpackhi_epi16(a, b);
    const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
    __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
    pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
    const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
    res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
static INLINE unsigned int highbd_masked_sad_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int x, y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 8) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      // Zero-extend mask to 16 bits
      const __m128i m = _mm_unpacklo_epi8(
          _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128());
      const __m128i m_inv = _mm_sub_epi16(mask_max, m);

      const __m128i data_l = _mm_unpacklo_epi16(a, b);
      const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
      __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
      pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      const __m128i data_r = _mm_unpackhi_epi16(a, b);
      const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
      __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
      pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
      // so it is safe to do signed saturation here.
      const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
      // There is no 16-bit SAD instruction, so we have to synthesize
      // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
      // and accumulating them at the end
      const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
      res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have four 32-bit partial SADs stored in 'res'.
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
static INLINE unsigned int highbd_masked_sad8xh_avx2(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int y;
  __m256i res = _mm256_setzero_si256();
  const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m256i round_const =
      _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m256i one = _mm256_set1_epi16(1);

  for (y = 0; y < height; y += 2) {
    const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
    const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
    const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
    // Zero-extend mask to 16 bits
    const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)(m_ptr)),
        _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride))));
    const __m256i m_inv = _mm256_sub_epi16(mask_max, m);

    const __m256i data_l = _mm256_unpacklo_epi16(a, b);
    const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
    __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
    pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
                               AOM_BLEND_A64_ROUND_BITS);

    const __m256i data_r = _mm256_unpackhi_epi16(a, b);
    const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
    __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
    pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
                               AOM_BLEND_A64_ROUND_BITS);

    // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
    // so it is safe to do signed saturation here.
    const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
    // There is no 16-bit SAD instruction, so we have to synthesize a
    // 16-element SAD. We do this by accumulating eight 32-bit partial SADs,
    // and summing them at the end.
    const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
    res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));

    src_ptr += src_stride << 1;
    a_ptr += a_stride << 1;
    b_ptr += b_stride << 1;
    m_ptr += m_stride << 1;
  }
  // At this point, we have eight 32-bit partial SADs stored in 'res'.
  res = _mm256_hadd_epi32(res, res);
  res = _mm256_hadd_epi32(res, res);
  int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
  return (sad + 31) >> 6;
}
unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
                          vpx_highbd_calc16x16var_sse2, 16);
  return *sse;
}
unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
                                       const uint8_t *ref8, int ref_stride,
                                       unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
                          vpx_highbd_calc8x8var_sse2, 8);
  return *sse;
}
static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride,
                                      const uint8_t *b8, int b_stride,
                                      int width, int height) {
  int y, x;
  unsigned int sad = 0;
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++) sad += abs(a[x] - b[x]);

    a += a_stride;
    b += b_stride;
  }
  return sad;
}
static void highbd_var_filter_block2d_bil_first_pass(
    const uint8_t *src_ptr8, uint16_t *output_ptr,
    unsigned int src_pixels_per_line, int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const int16_t *vp9_filter) {
  unsigned int i, j;
  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
  for (i = 0; i < output_height; i++) {
    for (j = 0; j < output_width; j++) {
      output_ptr[j] = ROUND_POWER_OF_TWO(
          (int)src_ptr[0] * vp9_filter[0] +
              (int)src_ptr[pixel_step] * vp9_filter[1],
          FILTER_BITS);
      src_ptr++;
    }

    // Next row...
    src_ptr += src_pixels_per_line - output_width;
    output_ptr += output_width;
  }
}
void vp10_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int tx_type, int bd) {
  int i, j;
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  tran_low_t temp_in[16], temp_out[16];
  const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Rows
  for (i = 0; i < 16; ++i) {
    ht.rows(input, outptr, bd);
    input += 16;
    outptr += 16;
  }

  // Columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    ht.cols(temp_in, temp_out, bd);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
    }
  }
}
void vp10_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int tx_type, int bd) {
  int i, j;
  tran_low_t out[8 * 8];
  tran_low_t *outptr = out;
  tran_low_t temp_in[8], temp_out[8];
  const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  // Inverse transform row vectors.
  for (i = 0; i < 8; ++i) {
    ht.rows(input, outptr, bd);
    input += 8;
    outptr += 8;
  }

  // Inverse transform column vectors.
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    ht.cols(temp_in, temp_out, bd);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
    }
  }
}
/* TODO: Optimize this function for SSE. */
static void copy_sb8_16(AV1_COMMON *cm, int16_t *dst, int dstride,
                        const uint8_t *src, int src_voffset, int src_hoffset,
                        int sstride, int vsize, int hsize) {
  int r, c;
  (void)cm;
#if CONFIG_AOM_HIGHBITDEPTH
  if (cm->use_highbitdepth) {
    const uint16_t *base =
        &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset];
    for (r = 0; r < vsize; r++) {
      for (c = 0; c < hsize; c++) {
        dst[r * dstride + c] = base[r * sstride + c];
      }
    }
  } else
#endif
  {
    const uint8_t *base = &src[src_voffset * sstride + src_hoffset];
    for (r = 0; r < vsize; r++) {
      for (c = 0; c < hsize; c++) {
        dst[r * dstride + c] = base[r * sstride + c];
      }
    }
  }
}
void vpx_highbd_convolve8_neon(const uint8_t *src8, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4, int w,
                               int h, int bd) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
  // + 1 to make it divisible by 4
  DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  /* Filter starting 3 lines back. The neon implementation will ignore the
   * given height and filter a multiple of 4 lines. Since this goes in to the
   * temp buffer which has lots of extra room and is subsequently discarded
   * this is safe if somewhat less than ideal. */
  vpx_highbd_convolve8_horiz_neon(CONVERT_TO_BYTEPTR(src - src_stride * 3),
                                  src_stride, CONVERT_TO_BYTEPTR(temp), w,
                                  filter_x, x_step_q4, filter_y, y_step_q4, w,
                                  intermediate_height, bd);

  /* Step into the temp buffer 3 lines to get the actual frame data */
  vpx_highbd_convolve8_vert_neon(CONVERT_TO_BYTEPTR(temp + w * 3), w, dst,
                                 dst_stride, filter_x, x_step_q4, filter_y,
                                 y_step_q4, w, h, bd);
}
void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest8,
                                     int stride, int bd) {
  const tran_low_t out0 =
      HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
  const tran_low_t out1 =
      HIGHBD_WRAPLOW(dct_const_round_shift(out0 * cospi_16_64), bd);
  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);
  const int16x8_t dc = vdupq_n_s16(a1);
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
  int i;

  if (a1 >= 0) {
    const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
    for (i = 0; i < 8; ++i) {
      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
      highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max);
    }
  } else {
    for (i = 0; i < 8; ++i) {
      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
      highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc);
    }
  }
}
void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
                              int width, int height, const uint8_t *ref8,
                              int ref_stride) {
  int i, j;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  for (i = 0; i < height; i++) {
    for (j = 0; j < width; j++) {
      const int tmp = pred[j] + ref[j];
      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}
void vp10_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int tx_type, int bd) {
  const highbd_transform_2d IHT_4[] = {
    { vpx_highbd_idct4_c, vpx_highbd_idct4_c },    // DCT_DCT  = 0
    { vpx_highbd_iadst4_c, vpx_highbd_idct4_c },   // ADST_DCT = 1
    { vpx_highbd_idct4_c, vpx_highbd_iadst4_c },   // DCT_ADST = 2
    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c }   // ADST_ADST = 3
  };
  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);

  int i, j;
  tran_low_t out[4 * 4];
  tran_low_t *outptr = out;
  tran_low_t temp_in[4], temp_out[4];

  // Inverse transform row vectors.
  for (i = 0; i < 4; ++i) {
    IHT_4[tx_type].rows(input, outptr, bd);
    input += 4;
    outptr += 4;
  }

  // Inverse transform column vectors.
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
    IHT_4[tx_type].cols(temp_in, temp_out, bd);
    for (j = 0; j < 4; ++j) {
      dest[j * stride + i] = highbd_clip_pixel_add(
          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
    }
  }
}
void aom_highbd_blend_a64_vmask_sse4_1(
    uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
    uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
    const uint8_t *mask, int w, int h, int bd) {
  typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride,
                           const uint16_t *src0, uint32_t src0_stride,
                           const uint16_t *src1, uint32_t src1_stride,
                           const uint8_t *mask, int w, int h);

  // Dimensions are: bd_index X width_index
  static const blend_fn blend[2][2] = {
    {
        // bd == 8 or 10
        blend_a64_vmask_b10_w8n_sse4_1,  // w % 8 == 0
        blend_a64_vmask_b10_w4_sse4_1,   // w == 4
    },
    {
        // bd == 12
        blend_a64_vmask_b12_w8n_sse4_1,  // w % 8 == 0
        blend_a64_vmask_b12_w4_sse4_1,   // w == 4
    }
  };

  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));

  assert(h >= 1);
  assert(w >= 1);
  assert(IS_POWER_OF_TWO(h));
  assert(IS_POWER_OF_TWO(w));

  assert(bd == 8 || bd == 10 || bd == 12);

  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
    aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride,
                                 src1_8, src1_stride, mask, w, h, bd);
  } else {
    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
    const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
    const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);

    blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1,
                                  src1_stride, mask, w, h);
  }
}
void aom_highbd_upsampled_pred_sse2(uint16_t *pred, int width, int height,
                                    const uint8_t *ref8,
                                    const int ref_stride) {
  // The reference buffer is upsampled by 8 in each dimension; copy every 8th
  // sample so that 'pred' ends up at the original block resolution.
  const int stride = ref_stride << 3;
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  int i, j;

  if (width >= 8) {
    // read 8 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 8) {
        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
        __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32));
        __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40));
        __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48));
        __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56));
        __m128i t0, t1, t2, t3;

        t0 = _mm_unpacklo_epi16(s0, s1);
        t1 = _mm_unpacklo_epi16(s2, s3);
        t2 = _mm_unpacklo_epi16(s4, s5);
        t3 = _mm_unpacklo_epi16(s6, s7);
        t0 = _mm_unpacklo_epi32(t0, t1);
        t2 = _mm_unpacklo_epi32(t2, t3);
        t0 = _mm_unpacklo_epi64(t0, t2);

        _mm_storeu_si128((__m128i *)(pred), t0);
        pred += 8;
        ref += 64;  // 8 * 8;
      }
      ref += stride - (width << 3);
    }
  } else {
    // read 4 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 4) {
        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
        __m128i t0, t1;

        t0 = _mm_unpacklo_epi16(s0, s1);
        t1 = _mm_unpacklo_epi16(s2, s3);
        t0 = _mm_unpacklo_epi32(t0, t1);

        _mm_storel_epi64((__m128i *)(pred), t0);
        pred += 4;
        ref += 4 * 8;
      }
      ref += stride - (width << 3);
    }
  }
}
void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
                                 ptrdiff_t diff_stride, const uint8_t *src8,
                                 ptrdiff_t src_stride, const uint8_t *pred8,
                                 ptrdiff_t pred_stride, int bd) {
  int r, c;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  (void)bd;

  for (r = 0; r < rows; r++) {
    for (c = 0; c < cols; c++) {
      diff[c] = src[c] - pred[c];
    }

    diff += diff_stride;
    pred += pred_stride;
    src += src_stride;
  }
}
static void aq_highbd_variance64(const uint8_t *a8, int a_stride,
                                 const uint8_t *b8, int b_stride, int w, int h,
                                 uint64_t *sse, uint64_t *sum) {
  int i, j;

  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  *sum = 0;
  *sse = 0;

  for (i = 0; i < h; i++) {
    for (j = 0; j < w; j++) {
      const int diff = a[j] - b[j];
      *sum += diff;
      *sse += diff * diff;
    }
    a += a_stride;
    b += b_stride;
  }
}
void vpx_highbd_convolve_copy_c(const uint8_t *src8, ptrdiff_t src_stride,
                                uint8_t *dst8, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int filter_x_stride,
                                const int16_t *filter_y, int filter_y_stride,
                                int w, int h, int bd) {
  int r;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  for (r = h; r > 0; --r) {
    memcpy(dst, src, w * sizeof(uint16_t));
    src += src_stride;
    dst += dst_stride;
  }
}
static double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
                               int stride_img1, int stride_img2, int width,
                               int height, unsigned int bd) {
  int i, j;
  int samples = 0;
  double ssim_total = 0;

  // sample point start with each 4x4 location
  for (i = 0; i <= height - 8;
       i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
    for (j = 0; j <= width - 8; j += 4) {
      double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
                                 CONVERT_TO_SHORTPTR(img2 + j), stride_img2,
                                 bd);
      ssim_total += v;
      samples++;
    }
  }
  ssim_total /= samples;
  return ssim_total;
}
void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
                               uint8_t *dst8, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int filter_x_stride,
                               const int16_t *filter_y, int filter_y_stride,
                               int w, int h, int bd) {
  int x, y;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
    }
    src += src_stride;
    dst += dst_stride;
  }
}
int av1_clpf_decision(int k, int l, const YV12_BUFFER_CONFIG *rec,
                      const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
                      int block_size, int w, int h, unsigned int strength,
                      unsigned int fb_size_log2, int8_t *res) {
  int m, n, sum0 = 0, sum1 = 0;

  for (m = 0; m < h; m++) {
    for (n = 0; n < w; n++) {
      int xpos = (l << fb_size_log2) + n * block_size;
      int ypos = (k << fb_size_log2) + m * block_size;
      if (fb_size_log2 == MAX_FB_SIZE_LOG2 ||
          !cm->mi_grid_visible[ypos / MI_SIZE * cm->mi_stride + xpos / MI_SIZE]
               ->mbmi.skip) {
#if CONFIG_AOM_HIGHBITDEPTH
        if (cm->use_highbitdepth) {
          aom_clpf_detect_hbd(CONVERT_TO_SHORTPTR(rec->y_buffer),
                              CONVERT_TO_SHORTPTR(org->y_buffer),
                              rec->y_stride, org->y_stride, xpos, ypos,
                              rec->y_crop_width, rec->y_crop_height, &sum0,
                              &sum1, strength, block_size, cm->bit_depth);
        } else {
          aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride,
                          org->y_stride, xpos, ypos, rec->y_crop_width,
                          rec->y_crop_height, &sum0, &sum1, strength,
                          block_size, cm->bit_depth);
        }
#else
        aom_clpf_detect(rec->y_buffer, org->y_buffer, rec->y_stride,
                        org->y_stride, xpos, ypos, rec->y_crop_width,
                        rec->y_crop_height, &sum0, &sum1, strength, block_size,
                        cm->bit_depth);
#endif
      }
    }
  }
  *res = sum1 < sum0;
  return *res;
}
void aom_yv12_copy_v_c(const YV12_BUFFER_CONFIG *src_bc,
                       YV12_BUFFER_CONFIG *dst_bc) {
  int row;
  const uint8_t *src = src_bc->v_buffer;
  uint8_t *dst = dst_bc->v_buffer;

  if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) {
    const uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
    uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
    for (row = 0; row < src_bc->uv_height; ++row) {
      memcpy(dst16, src16, src_bc->uv_width * sizeof(uint16_t));
      src16 += src_bc->uv_stride;
      dst16 += dst_bc->uv_stride;
    }
    return;
  }

  for (row = 0; row < src_bc->uv_height; ++row) {
    memcpy(dst, src, src_bc->uv_width);
    src += src_bc->uv_stride;
    dst += dst_bc->uv_stride;
  }
}
static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8,
                                            const int pre_stride,
                                            const int32_t *wsrc,
                                            const int32_t *mask,
                                            const int width, const int height) {
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - width;
  int n = 0;
  __m128i v_sad_d = _mm_setzero_si128();

  assert(width >= 8);
  assert(IS_POWER_OF_TWO(width));

  do {
    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    const __m128i v_p0_w = xx_loadl_64(pre + n);
    const __m128i v_m0_d = xx_load_128(mask + n);
    const __m128i v_w0_d = xx_load_128(wsrc + n);

    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);

    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
    const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
    const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);

    // Rounded absolute difference
    const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
    const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);

    v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
    v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);

    n += 8;

    if (n % width == 0) pre += pre_step;
  } while (n < width * height);

  return xx_hsum_epi32_si32(v_sad_d);
}