static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x0, int x_step_q4,
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h, int taps) {
  int x, y, k;

  /* NOTE: This assumes that the filter table is 256-byte aligned. */
  /* TODO(agrange) Modify to make independent of table alignment. */
  const int16_t *const filter_x_base =
      (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);

  /* Adjust base pointer address for this source line */
  src -= taps / 2 - 1;

  for (y = 0; y < h; ++y) {
    /* Initial phase offset */
    int x_q4 = (int)(filter_x0 - filter_x_base) / taps;

    for (x = 0; x < w; ++x) {
      /* Per-pixel src offset */
      const int src_x = x_q4 >> SUBPEL_BITS;
      int sum = 0;

      /* Pointer to filter to use */
      const int16_t *const filter_x =
          filter_x_base + (x_q4 & SUBPEL_MASK) * taps;

      for (k = 0; k < taps; ++k)
        sum += src[src_x + k] * filter_x[k];

      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));

      /* Move to the next source pixel */
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}
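/* For reference, the rounding and clamping helpers used throughout this
 * listing behave roughly as sketched below. This is a reader's sketch of the
 * usual libvpx definitions (see vpx_dsp/vpx_dsp_common.h for the
 * authoritative versions), not part of the original file. */
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

static INLINE uint8_t clip_pixel(int val) {
  /* Clamp an intermediate filter result into the valid 8-bit pixel range. */
  return (val > 255) ? 255 : (val < 0) ? 0 : (uint8_t)val;
}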
void vpx_highbd_convolve_avg_c(const uint8_t *src8, ptrdiff_t src_stride,
                               uint8_t *dst8, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int filter_x_stride,
                               const int16_t *filter_y, int filter_y_stride,
                               int w, int h, int bd) {
  int x, y;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  (void)filter_x;
  (void)filter_y;
  (void)filter_x_stride;
  (void)filter_y_stride;
  (void)bd;

  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
    }
    src += src_stride;
    dst += dst_stride;
  }
}
static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                  uint8_t *dst8, ptrdiff_t dst_stride,
                                  const InterpKernel *x_filters, int x0_q4,
                                  int x_step_q4, int w, int h, int bd) {
  int x, y;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; ++y) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; ++x) {
      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
      dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
      x_q4 += x_step_q4;
    }
    src += src_stride;
    dst += dst_stride;
  }
}
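/* clip_pixel_highbd clamps a value to the range implied by the bit depth bd
 * (255 for 8-bit, 1023 for 10-bit, 4095 for 12-bit). A minimal sketch of the
 * assumed behavior; see vpx_dsp/vpx_dsp_common.h for the real definition. */
static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
  const int max = (1 << bd) - 1; /* 255, 1023 or 4095 */
  return (uint16_t)(val < 0 ? 0 : (val > max ? max : val));
}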
static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride,
                          const subpel_kernel *y_filters, int y0_q4,
                          int y_step_q4, int w, int h) {
  int x, y;
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_y[k * src_stride] * y_filter[k];
      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}
static void highbd_var_filter_block2d_bil_second_pass(
    const uint16_t *src_ptr, uint16_t *output_ptr,
    unsigned int src_pixels_per_line, unsigned int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const uint8_t *filter) {
  unsigned int i, j;

  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; ++j) {
      output_ptr[j] = ROUND_POWER_OF_TWO(
          (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
          FILTER_BITS);
      ++src_ptr;
    }
    src_ptr += src_pixels_per_line - output_width;
    output_ptr += output_width;
  }
}
void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest,
                               int dest_stride) {
  int16_t out[16 * 16];
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[16], temp_out[16];

  // First transform rows
  for (i = 0; i < 16; ++i) {
    idct16_1d(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    idct16_1d(temp_in, temp_out);
    for (j = 0; j < 16; ++j)
      dest[j * dest_stride + i] =
          clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) +
                     dest[j * dest_stride + i]);
  }
}
void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
  int16_t out[8 * 8];
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[8], temp_out[8];

  // First transform rows
  for (i = 0; i < 8; ++i) {
    idct8_1d(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    idct8_1d(temp_in, temp_out);
    for (j = 0; j < 8; ++j)
      dest[j * dest_stride + i] =
          clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) +
                     dest[j * dest_stride + i]);
  }
}
void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest,
                               int dest_stride) {
  int16_t out[32 * 32];
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[32], temp_out[32];

  // Rows
  for (i = 0; i < 32; ++i) {
    idct32_1d(input, outptr);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    idct32_1d(temp_in, temp_out);
    for (j = 0; j < 32; ++j)
      dest[j * dest_stride + i] =
          clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) +
                     dest[j * dest_stride + i]);
  }
}
// Applies a 1-D 2-tap bi-linear filter to the source block in either the
// horizontal or vertical direction to produce the filtered output block.
// Used to implement the first pass of the 2-D separable filter.
//
// Produces uint16_t output to retain precision for the next pass. The two
// filter taps should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the
// filter is applied horizontally (pixel_step = 1) or vertically
// (pixel_step = stride); it is the offset required to move from one input
// to the next.
static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr,
                                              uint16_t *output_ptr,
                                              unsigned int src_pixels_per_line,
                                              int pixel_step,
                                              unsigned int output_height,
                                              unsigned int output_width,
                                              const int16_t *vp9_filter) {
  unsigned int i, j;

  for (i = 0; i < output_height; i++) {
    for (j = 0; j < output_width; j++) {
      output_ptr[j] =
          ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
                                 (int)src_ptr[pixel_step] * vp9_filter[1],
                             FILTER_BITS);
      src_ptr++;
    }
    // Next row...
    src_ptr += src_pixels_per_line - output_width;
    output_ptr += output_width;
  }
}
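/* Illustrative composition of the two bilinear passes for an 8x8 sub-pixel
 * block. The second-pass function and the bilinear_filters[] kernel table
 * referenced here are assumptions for the sketch (the second pass is assumed
 * to mirror the highbd variant above but with uint8_t output); they are not
 * guaranteed by the code in this listing. */
static void example_bilinear_8x8(const uint8_t *src, int src_stride,
                                 int x_offset, int y_offset, uint8_t *out) {
  uint16_t fdata[9 * 8]; /* one extra row so the vertical pass has H+1 inputs */
  /* Horizontal pass: pixel_step = 1, produce H+1 rows of width 8. */
  var_filter_block2d_bil_first_pass(src, fdata, src_stride, 1, 9, 8,
                                    bilinear_filters[x_offset]);
  /* Vertical pass: pixel_step = row width, produce the final 8x8 block. */
  var_filter_block2d_bil_second_pass(fdata, out, 8, 8, 8, 8,
                                     bilinear_filters[y_offset]);
}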
void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
  int16_t out[4 * 4];
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[4], temp_out[4];

  // Rows
  for (i = 0; i < 4; ++i) {
    vp9_idct4_1d(input, outptr);
    input += 4;
    outptr += 4;
  }

  // Columns
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
    vp9_idct4_1d(temp_in, temp_out);
    for (j = 0; j < 4; ++j)
      dest[j * dest_stride + i] =
          clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4) +
                     dest[j * dest_stride + i]);
  }
}
void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
                              int tx_type) {
  int i, j;
  int16_t out[16 * 16];
  int16_t *outptr = out;
  int16_t temp_in[16], temp_out[16];
  const transform_2d ht = IHT_16[tx_type];

  // Rows
  for (i = 0; i < 16; ++i) {
    ht.rows(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    ht.cols(temp_in, temp_out);
    for (j = 0; j < 16; ++j)
      dest[j * dest_stride + i] =
          clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) +
                     dest[j * dest_stride + i]);
  }
}
void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride,
                            int tx_type) {
  int i, j;
  int16_t out[8 * 8];
  int16_t *outptr = out;
  int16_t temp_in[8], temp_out[8];
  const transform_2d ht = IHT_8[tx_type];

  // Inverse transform row vectors
  for (i = 0; i < 8; ++i) {
    ht.rows(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Inverse transform column vectors
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    ht.cols(temp_in, temp_out);
    for (j = 0; j < 8; ++j)
      dest[j * dest_stride + i] =
          clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) +
                     dest[j * dest_stride + i]);
  }
}
static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
                                 uint8_t *dst8, ptrdiff_t dst_stride,
                                 const InterpKernel *y_filters, int y0_q4,
                                 int y_step_q4, int w, int h, int bd) {
  int x, y;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
  src -= src_stride * (SUBPEL_TAPS / 2 - 1);

  for (x = 0; x < w; ++x) {
    int y_q4 = y0_q4;
    for (y = 0; y < h; ++y) {
      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
      int k, sum = 0;
      for (k = 0; k < SUBPEL_TAPS; ++k)
        sum += src_y[k * src_stride] * y_filter[k];
      dst[y * dst_stride] =
          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
      y_q4 += y_step_q4;
    }
    ++src;
    ++dst;
  }
}
void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest,
                              int stride) {
  int16_t out[32 * 32];
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[32], temp_out[32];

  // Rows
  for (i = 0; i < 32; ++i) {
    int16_t zero_coeff[16];
    for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    for (j = 0; j < 8; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 4; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 2; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];

    if (zero_coeff[0] | zero_coeff[1])
      idct32(input, outptr);
    else
      vpx_memset(outptr, 0, sizeof(int16_t) * 32);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    idct32(temp_in, temp_out);
    for (j = 0; j < 32; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) +
                                        dest[j * stride + i]);
  }
}
void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[32 * 32] = { 0 };
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[32], temp_out[32];

  // Rows
  // Only the upper-left 8x8 block has non-zero coefficients.
  for (i = 0; i < 8; ++i) {
    idct32(input, outptr);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    idct32(temp_in, temp_out);
    for (j = 0; j < 32; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) +
                                        dest[j * stride + i]);
  }
}
void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[16 * 16] = { 0 };
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[16], temp_out[16];

  // First transform rows. Since all non-zero dct coefficients are in the
  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
  for (i = 0; i < 4; ++i) {
    idct16(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    idct16(temp_in, temp_out);
    for (j = 0; j < 16; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) +
                                        dest[j * stride + i]);
  }
}
void vp9_idct8x8_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
  int16_t out[8 * 8] = { 0 };
  int16_t *outptr = out;
  int i, j;
  int16_t temp_in[8], temp_out[8];

  // First transform rows.
  // Only the first 4 rows have non-zero coefficients.
  for (i = 0; i < 4; ++i) {
    idct8(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    idct8(temp_in, temp_out);
    for (j = 0; j < 8; ++j)
      dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5) +
                                        dest[j * stride + i]);
  }
}
void yuv_bgr_convert_msa (JSAMPROW p_in_y, JSAMPROW p_in_cb, JSAMPROW p_in_cr,
                          JSAMPROW p_rgb, JDIMENSION out_width)
{
  int32_t y, cb, cr;
  uint32_t col, num_cols_mul_16 = out_width >> 4;
  uint32_t remaining_wd = out_width & 0xF;
  v16u8 mask_rgb0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10};
  v16u8 mask_rgb1 = {11, 21, 12, 13, 22, 14, 15, 23, 0, 1, 24, 2, 3, 25, 4, 5};
  v16u8 mask_rgb2 = {26, 6, 7, 27, 8, 9, 28, 10, 11, 29, 12, 13, 30, 14, 15, 31};
  v16u8 tmp0, tmp1, out0, out1, out2, input_y = {0};
  v16i8 input_cb, input_cr, out_rgb0, out_rgb1, const_128 = __msa_ldi_b(128);
  v8i16 y_h0, y_h1, cb_h0, cb_h1, cr_h0, cr_h1;
  v4i32 cb_w0, cb_w1, cb_w2, cb_w3, cr_w0, cr_w1, cr_w2, cr_w3, zero = {0};
  v16i8 out_r0, out_g0, out_b0;

  for (col = num_cols_mul_16; col--;) {
    input_y = LD_UB(p_in_y);
    input_cb = LD_SB(p_in_cb);
    input_cr = LD_SB(p_in_cr);
    p_in_y += 16;
    p_in_cb += 16;
    p_in_cr += 16;
    input_cb -= const_128;
    input_cr -= const_128;

    UNPCK_UB_SH(input_y, y_h0, y_h1);
    UNPCK_SB_SH(input_cb, cb_h0, cb_h1);
    UNPCK_SB_SH(input_cr, cr_h0, cr_h1);

    CALC_G4_FRM_YUV(y_h0, y_h1, cb_h0, cb_h1, cr_h0, cr_h1, out_g0);

    UNPCK_SH_SW(cr_h0, cr_w0, cr_w1);
    UNPCK_SH_SW(cr_h1, cr_w2, cr_w3);
    CALC_R4_FRM_YUV(y_h0, y_h1, cr_w0, cr_w1, cr_w2, cr_w3, out_r0);

    UNPCK_SH_SW(cb_h0, cb_w0, cb_w1);
    UNPCK_SH_SW(cb_h1, cb_w2, cb_w3);
    CALC_B4_FRM_YUV(y_h0, y_h1, cb_w0, cb_w1, cb_w2, cb_w3, out_b0);

    ILVRL_B2_SB(out_g0, out_b0, out_rgb0, out_rgb1);
    VSHF_B2_UB(out_rgb0, out_r0, out_rgb0, out_r0, mask_rgb0, mask_rgb1,
               out0, tmp0);
    VSHF_B2_UB(out_rgb1, out_r0, out_rgb1, out_r0, mask_rgb1, mask_rgb2,
               tmp1, out2);

    out1 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) tmp1, 8);
    out1 = (v16u8) __msa_pckev_d((v2i64) out1, (v2i64) tmp0);

    ST_UB(out0, p_rgb);
    p_rgb += 16;
    ST_UB(out1, p_rgb);
    p_rgb += 16;
    ST_UB(out2, p_rgb);
    p_rgb += 16;
  }

  if (remaining_wd >= 8) {
    uint64_t in_y, in_cb, in_cr;
    v16i8 input_cbcr = {0};

    in_y = LD(p_in_y);
    in_cb = LD(p_in_cb);
    in_cr = LD(p_in_cr);
    p_in_y += 8;
    p_in_cb += 8;
    p_in_cr += 8;

    input_y = (v16u8) __msa_insert_d((v2i64) input_y, 0, in_y);
    input_cbcr = (v16i8) __msa_insert_d((v2i64) input_cbcr, 0, in_cb);
    input_cbcr = (v16i8) __msa_insert_d((v2i64) input_cbcr, 1, in_cr);
    input_cbcr -= const_128;

    y_h0 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) input_y);
    UNPCK_SB_SH(input_cbcr, cb_h0, cr_h0);
    UNPCK_SH_SW(cb_h0, cb_w0, cb_w1);
    UNPCK_SH_SW(cr_h0, cr_w0, cr_w1);

    CALC_R2_FRM_YUV(y_h0, cr_w0, cr_w1, out_r0);
    CALC_G2_FRM_YUV(y_h0, cb_h0, cr_h0, out_g0);
    CALC_B2_FRM_YUV(y_h0, cb_w0, cb_w1, out_b0);

    out_rgb0 = (v16i8) __msa_ilvr_b((v16i8) out_g0, (v16i8) out_b0);
    VSHF_B2_UB(out_rgb0, out_r0, out_rgb0, out_r0, mask_rgb0, mask_rgb1,
               out0, out1);

    ST_UB(out0, p_rgb);
    p_rgb += 16;
    ST8x1_UB(out1, p_rgb);
    p_rgb += 8;

    remaining_wd -= 8;
  }

  for (col = 0; col < remaining_wd; col++) {
    y  = (int) (p_in_y[col]);
    cb = (int) (p_in_cb[col]) - 128;
    cr = (int) (p_in_cr[col]) - 128;

    /* Range-limiting is essential due to noise introduced by DCT losses. */
    p_rgb[0] = clip_pixel(y + ROUND_POWER_OF_TWO(FIX_1_77200 * cb, 16));
    p_rgb[1] = clip_pixel(y + ROUND_POWER_OF_TWO(((-FIX_0_34414) * cb -
                                                  FIX_0_71414 * cr), 16));
    p_rgb[2] = clip_pixel(y + ROUND_POWER_OF_TWO(FIX_1_40200 * cr, 16));
    p_rgb += 3;
  }
}
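/* The FIX_* constants used in the scalar tails of the yuv_*_convert_msa
 * functions are the usual libjpeg-style YCbCr->RGB coefficients in Q16 fixed
 * point. The values shown here are assumptions (round(coeff * 65536)); check
 * the actual header for the authoritative definitions.
 *   R = Y + 1.40200 * Cr
 *   G = Y - 0.34414 * Cb - 0.71414 * Cr
 *   B = Y + 1.77200 * Cb
 */
#define FIX_1_40200  91881   /* round(1.40200 * 65536) */
#define FIX_1_77200 116130   /* round(1.77200 * 65536) */
#define FIX_0_34414  22554   /* round(0.34414 * 65536) */
#define FIX_0_71414  46802   /* round(0.71414 * 65536) */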
void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) {
  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
  out = dct_const_round_shift(out * cospi_16_64);
  output[0] = ROUND_POWER_OF_TWO(out, 6);
}
void yuv_abgr_convert_msa (JSAMPROW p_in_y, JSAMPROW p_in_cb, JSAMPROW p_in_cr,
                           JSAMPROW p_rgb, JDIMENSION out_width)
{
  int y, cb, cr;
  unsigned int col, num_cols_mul_16 = out_width >> 4;
  unsigned int remaining_wd = out_width & 0xF;
  v16i8 alpha = __msa_ldi_b(0xFF);
  v16i8 const_128 = __msa_ldi_b(128);
  v16u8 out0, out1, out2, out3, input_y = {0};
  v16i8 input_cb, input_cr, out_rgb0, out_rgb1, out_ab0, out_ab1;
  v8i16 y_h0, y_h1, cb_h0, cb_h1, cr_h0, cr_h1;
  v4i32 cb_w0, cb_w1, cb_w2, cb_w3, cr_w0, cr_w1, cr_w2, cr_w3, zero = {0};
  v16i8 out_r0, out_g0, out_b0;

  for (col = num_cols_mul_16; col--;) {
    input_y = LD_UB(p_in_y);
    input_cb = LD_SB(p_in_cb);
    input_cr = LD_SB(p_in_cr);
    p_in_y += 16;
    p_in_cb += 16;
    p_in_cr += 16;
    input_cb -= const_128;
    input_cr -= const_128;

    UNPCK_UB_SH(input_y, y_h0, y_h1);
    UNPCK_SB_SH(input_cb, cb_h0, cb_h1);
    UNPCK_SB_SH(input_cr, cr_h0, cr_h1);

    CALC_G4_FRM_YUV(y_h0, y_h1, cb_h0, cb_h1, cr_h0, cr_h1, out_g0);

    UNPCK_SH_SW(cr_h0, cr_w0, cr_w1);
    UNPCK_SH_SW(cr_h1, cr_w2, cr_w3);
    CALC_R4_FRM_YUV(y_h0, y_h1, cr_w0, cr_w1, cr_w2, cr_w3, out_r0);

    UNPCK_SH_SW(cb_h0, cb_w0, cb_w1);
    UNPCK_SH_SW(cb_h1, cb_w2, cb_w3);
    CALC_B4_FRM_YUV(y_h0, y_h1, cb_w0, cb_w1, cb_w2, cb_w3, out_b0);

    ILVRL_B2_SB(out_r0, out_g0, out_rgb0, out_rgb1);
    ILVRL_B2_SB(out_b0, alpha, out_ab0, out_ab1);
    ILVRL_H2_UB(out_rgb0, out_ab0, out0, out1);
    ILVRL_H2_UB(out_rgb1, out_ab1, out2, out3);

    ST_UB4(out0, out1, out2, out3, p_rgb, 16);
    p_rgb += 16 * 4;
  }

  if (remaining_wd >= 8) {
    uint64_t in_y, in_cb, in_cr;
    v16i8 input_cbcr = {0};

    in_y = LD(p_in_y);
    in_cb = LD(p_in_cb);
    in_cr = LD(p_in_cr);
    p_in_y += 8;
    p_in_cb += 8;
    p_in_cr += 8;

    input_y = (v16u8) __msa_insert_d((v2i64) input_y, 0, in_y);
    input_cbcr = (v16i8) __msa_insert_d((v2i64) input_cbcr, 0, in_cb);
    input_cbcr = (v16i8) __msa_insert_d((v2i64) input_cbcr, 1, in_cr);
    input_cbcr -= const_128;

    y_h0 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) input_y);
    UNPCK_SB_SH(input_cbcr, cb_h0, cr_h0);
    UNPCK_SH_SW(cb_h0, cb_w0, cb_w1);
    UNPCK_SH_SW(cr_h0, cr_w0, cr_w1);

    CALC_R2_FRM_YUV(y_h0, cr_w0, cr_w1, out_r0);
    CALC_G2_FRM_YUV(y_h0, cb_h0, cr_h0, out_g0);
    CALC_B2_FRM_YUV(y_h0, cb_w0, cb_w1, out_b0);

    out_rgb0 = (v16i8) __msa_ilvr_b((v16i8) out_r0, (v16i8) out_g0);
    out_ab0 = (v16i8) __msa_ilvr_b((v16i8) out_b0, alpha);
    ILVRL_H2_UB(out_rgb0, out_ab0, out0, out1);

    ST_UB2(out0, out1, p_rgb, 16);
    p_rgb += 16 * 2;

    remaining_wd -= 8;
  }

  for (col = 0; col < remaining_wd; col++) {
    y  = (int) (p_in_y[col]);
    cb = (int) (p_in_cb[col]) - 128;
    cr = (int) (p_in_cr[col]) - 128;

    p_rgb[0] = 0xFF;
    p_rgb[1] = clip_pixel(y + ROUND_POWER_OF_TWO(FIX_1_77200 * cb, 16));
    p_rgb[2] = clip_pixel(y + ROUND_POWER_OF_TWO(((-FIX_0_34414) * cb -
                                                  FIX_0_71414 * cr), 16));
    p_rgb[3] = clip_pixel(y + ROUND_POWER_OF_TWO(FIX_1_40200 * cr, 16));
    p_rgb += 4;
  }
}
/**
 * See av1_wedge_sse_from_residuals_c
 */
uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
                                           const uint8_t *m, int N) {
  int n = -N;
  int n8 = n + 8;

  uint64_t csse;

  const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
  const __m128i v_zext_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);

  __m128i v_acc0_q = _mm_setzero_si128();

  assert(N % 64 == 0);

  r1 += N;
  d += N;
  m += N;

  do {
    const __m128i v_r0_w = xx_load_128(r1 + n);
    const __m128i v_r1_w = xx_load_128(r1 + n8);
    const __m128i v_d0_w = xx_load_128(d + n);
    const __m128i v_d1_w = xx_load_128(d + n8);
    const __m128i v_m01_b = xx_load_128(m + n);

    const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);
    const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
    const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
    const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
    const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
    const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());

    const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);
    const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
    const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
    const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);

    const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);
    const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
    const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
    const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);

    const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);
    const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);

    const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);
    const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);

    const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
                                           _mm_srli_epi64(v_sq0_d, 32));
    const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
                                           _mm_srli_epi64(v_sq1_d, 32));

    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);

    n8 += 16;
    n += 16;
  } while (n);

  v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));

#if ARCH_X86_64
  csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
#else
  xx_storel_64(&csse, v_acc0_q);
#endif

  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
}
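/* For readers of the SSE2 kernel above: a scalar sketch of what it computes,
 * written to mirror the libaom C reference av1_wedge_sse_from_residuals_c.
 * This is a hedged reconstruction for illustration, not a verbatim copy of
 * that function. */
static uint64_t wedge_sse_from_residuals_scalar(const int16_t *r1,
                                                const int16_t *d,
                                                const uint8_t *m, int N) {
  uint64_t csse = 0;
  int i;
  for (i = 0; i < N; ++i) {
    /* Blend the residuals with the wedge mask. The SIMD path saturates the
     * intermediate to int16_t via _mm_packs_epi32, so mirror that here. */
    int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
    if (t > INT16_MAX) t = INT16_MAX;
    if (t < INT16_MIN) t = INT16_MIN;
    csse += (uint64_t)((int64_t)t * t);
  }
  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
}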