void vpx_fdct4x4_msa(const int16_t *input, int16_t *output, int32_t src_stride) { v8i16 in0, in1, in2, in3; LD_SH4(input, src_stride, in0, in1, in2, in3); /* fdct4 pre-process */ { v8i16 vec, mask; v16i8 zero = { 0 }; v16i8 one = __msa_ldi_b(1); mask = (v8i16)__msa_sldi_b(zero, one, 15); SLLI_4V(in0, in1, in2, in3, 4); vec = __msa_ceqi_h(in0, 0); vec = vec ^ 255; vec = mask & vec; in0 += vec; } VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); SRA_4V(in0, in1, in2, in3, 2); PCKEV_D2_SH(in1, in0, in3, in2, in0, in2); ST_SH2(in0, in2, output, 8); }
/*
 * Fill a 16-row block at dst with the byte vector built by __msa_ldi_b(128).
 * Rows 0-7 and rows 8-15 are each written by one ST_UB8 call that is given
 * dst_stride as the row step.
 */
static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) {
  const v16u8 fill = (v16u8)__msa_ldi_b(128);
  uint8_t *lower_half = dst + (8 * dst_stride);

  ST_UB8(fill, fill, fill, fill, fill, fill, fill, fill, dst, dst_stride);
  ST_UB8(fill, fill, fill, fill, fill, fill, fill, fill, lower_half,
         dst_stride);
}
static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) { uint32_t out; const v16i8 store = __msa_ldi_b(128); out = __msa_copy_u_w((v4i32)store, 0); SW4(out, out, out, out, dst, dst_stride); }
static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) { uint64_t out; const v16i8 store = __msa_ldi_b(128); out = __msa_copy_u_d((v2i64)store, 0); SD4(out, out, out, out, dst, dst_stride); dst += (4 * dst_stride); SD4(out, out, out, out, dst, dst_stride); }
/*
 * Fill a 32-row block at dst with the byte vector built by __msa_ldi_b(128).
 * Every row is written by one ST_UB2 call (two vectors, 16 bytes apart) and
 * dst then advances by dst_stride.
 */
static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) {
  const v16u8 fill = (v16u8)__msa_ldi_b(128);
  uint32_t rows_left = 32;

  while (rows_left--) {
    ST_UB2(fill, fill, dst, 16);
    dst += dst_stride;
  }
}
void vp9_fht4x4_msa(const int16_t *input, int16_t *output, int32_t stride, int32_t tx_type) { v8i16 in0, in1, in2, in3; LD_SH4(input, stride, in0, in1, in2, in3); /* fdct4 pre-process */ { v8i16 temp, mask; v16i8 zero = { 0 }; v16i8 one = __msa_ldi_b(1); mask = (v8i16)__msa_sldi_b(zero, one, 15); SLLI_4V(in0, in1, in2, in3, 4); temp = __msa_ceqi_h(in0, 0); temp = (v8i16)__msa_xori_b((v16u8)temp, 255); temp = mask & temp; in0 += temp; } switch (tx_type) { case DCT_DCT: VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); break; case ADST_DCT: VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); break; case DCT_ADST: VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); break; case ADST_ADST: VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); break; default: assert(0); break; } TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); SRA_4V(in0, in1, in2, in3, 2); PCKEV_D2_SH(in1, in0, in3, in2, in0, in2); ST_SH2(in0, in2, output, 8); }
void aom_plane_add_noise_msa(uint8_t *start_ptr, char *noise, char blackclamp[16], char whiteclamp[16], char bothclamp[16], uint32_t width, uint32_t height, int32_t pitch) { uint32_t i, j; for (i = 0; i < height / 2; ++i) { uint8_t *pos0_ptr = start_ptr + (2 * i) * pitch; int8_t *ref0_ptr = (int8_t *)(noise + (rand() & 0xff)); uint8_t *pos1_ptr = start_ptr + (2 * i + 1) * pitch; int8_t *ref1_ptr = (int8_t *)(noise + (rand() & 0xff)); for (j = width / 16; j--;) { v16i8 temp00_s, temp01_s; v16u8 temp00, temp01, black_clamp, white_clamp; v16u8 pos0, ref0, pos1, ref1; v16i8 const127 = __msa_ldi_b(127); pos0 = LD_UB(pos0_ptr); ref0 = LD_UB(ref0_ptr); pos1 = LD_UB(pos1_ptr); ref1 = LD_UB(ref1_ptr); black_clamp = (v16u8)__msa_fill_b(blackclamp[0]); white_clamp = (v16u8)__msa_fill_b(whiteclamp[0]); temp00 = (pos0 < black_clamp); pos0 = __msa_bmnz_v(pos0, black_clamp, temp00); temp01 = (pos1 < black_clamp); pos1 = __msa_bmnz_v(pos1, black_clamp, temp01); XORI_B2_128_UB(pos0, pos1); temp00_s = __msa_adds_s_b((v16i8)white_clamp, const127); temp00 = (v16u8)(temp00_s < pos0); pos0 = (v16u8)__msa_bmnz_v((v16u8)pos0, (v16u8)temp00_s, temp00); temp01_s = __msa_adds_s_b((v16i8)white_clamp, const127); temp01 = (temp01_s < pos1); pos1 = (v16u8)__msa_bmnz_v((v16u8)pos1, (v16u8)temp01_s, temp01); XORI_B2_128_UB(pos0, pos1); pos0 += ref0; ST_UB(pos0, pos0_ptr); pos1 += ref1; ST_UB(pos1, pos1_ptr); pos0_ptr += 16; pos1_ptr += 16; ref0_ptr += 16; ref1_ptr += 16; } } }
void yuv_abgr_convert_msa (JSAMPROW p_in_y, JSAMPROW p_in_cb, JSAMPROW p_in_cr, JSAMPROW p_rgb, JDIMENSION out_width) { int y, cb, cr; unsigned int col, num_cols_mul_16 = out_width >> 4; unsigned int remaining_wd = out_width & 0xF; v16i8 alpha = __msa_ldi_b(0xFF); v16i8 const_128 = __msa_ldi_b(128); v16u8 out0, out1, out2, out3, input_y = {0}; v16i8 input_cb, input_cr, out_rgb0, out_rgb1, out_ab0, out_ab1; v8i16 y_h0, y_h1, cb_h0, cb_h1, cr_h0, cr_h1; v4i32 cb_w0, cb_w1, cb_w2, cb_w3, cr_w0, cr_w1, cr_w2, cr_w3, zero = {0}; v16i8 out_r0, out_g0, out_b0; for (col = num_cols_mul_16; col--;) { input_y = LD_UB(p_in_y); input_cb = LD_SB(p_in_cb); input_cr = LD_SB(p_in_cr); p_in_y += 16; p_in_cb += 16; p_in_cr += 16; input_cb -= const_128; input_cr -= const_128; UNPCK_UB_SH(input_y, y_h0, y_h1); UNPCK_SB_SH(input_cb, cb_h0, cb_h1); UNPCK_SB_SH(input_cr, cr_h0, cr_h1); CALC_G4_FRM_YUV(y_h0, y_h1, cb_h0, cb_h1, cr_h0, cr_h1, out_g0); UNPCK_SH_SW(cr_h0, cr_w0, cr_w1); UNPCK_SH_SW(cr_h1, cr_w2, cr_w3); CALC_R4_FRM_YUV(y_h0, y_h1, cr_w0, cr_w1, cr_w2, cr_w3, out_r0); UNPCK_SH_SW(cb_h0, cb_w0, cb_w1); UNPCK_SH_SW(cb_h1, cb_w2, cb_w3); CALC_B4_FRM_YUV(y_h0, y_h1, cb_w0, cb_w1, cb_w2, cb_w3, out_b0); ILVRL_B2_SB(out_r0, out_g0, out_rgb0, out_rgb1); ILVRL_B2_SB(out_b0, alpha, out_ab0, out_ab1); ILVRL_H2_UB(out_rgb0, out_ab0, out0, out1); ILVRL_H2_UB(out_rgb1, out_ab1, out2, out3); ST_UB4(out0, out1, out2, out3, p_rgb, 16); p_rgb += 16 * 4; } if (remaining_wd >= 8) { uint64_t in_y, in_cb, in_cr; v16i8 input_cbcr = {0}; in_y = LD(p_in_y); in_cb = LD(p_in_cb); in_cr = LD(p_in_cr); p_in_y += 8; p_in_cb += 8; p_in_cr += 8; input_y = (v16u8) __msa_insert_d((v2i64) input_y, 0, in_y); input_cbcr = (v16i8) __msa_insert_d((v2i64) input_cbcr, 0, in_cb); input_cbcr = (v16i8) __msa_insert_d((v2i64) input_cbcr, 1, in_cr); input_cbcr -= const_128; y_h0 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) input_y); UNPCK_SB_SH(input_cbcr, cb_h0, cr_h0); UNPCK_SH_SW(cb_h0, cb_w0, cb_w1); 
UNPCK_SH_SW(cr_h0, cr_w0, cr_w1); CALC_R2_FRM_YUV(y_h0, cr_w0, cr_w1, out_r0); CALC_G2_FRM_YUV(y_h0, cb_h0, cr_h0, out_g0); CALC_B2_FRM_YUV(y_h0, cb_w0, cb_w1, out_b0); out_rgb0 = (v16i8) __msa_ilvr_b((v16i8) out_r0, (v16i8) out_g0); out_ab0 = (v16i8) __msa_ilvr_b((v16i8) out_b0, alpha); ILVRL_H2_UB(out_rgb0, out_ab0, out0, out1); ST_UB2(out0, out1, p_rgb, 16); p_rgb += 16 * 2; remaining_wd -= 8; } for (col = 0; col < remaining_wd; col++) { y = (int) (p_in_y[col]); cb = (int) (p_in_cb[col]) - 128; cr = (int) (p_in_cr[col]) - 128; p_rgb[0] = 0xFF; p_rgb[1] = clip_pixel(y + ROUND_POWER_OF_TWO(FIX_1_77200 * cb, 16)); p_rgb[2] = clip_pixel(y + ROUND_POWER_OF_TWO(((-FIX_0_34414) * cb - FIX_0_71414 * cr), 16)); p_rgb[3] = clip_pixel(y + ROUND_POWER_OF_TWO(FIX_1_40200 * cr, 16)); p_rgb += 4; } }
void yuv_bgr_convert_msa (JSAMPROW p_in_y, JSAMPROW p_in_cb, JSAMPROW p_in_cr, JSAMPROW p_rgb, JDIMENSION out_width) { int32_t y, cb, cr; uint32_t col, num_cols_mul_16 = out_width >> 4; uint32_t remaining_wd = out_width & 0xF; v16u8 mask_rgb0 = {0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, 8, 9, 20, 10}; v16u8 mask_rgb1 = {11, 21, 12, 13, 22, 14, 15, 23, 0, 1, 24, 2, 3, 25, 4, 5}; v16u8 mask_rgb2 = {26, 6, 7, 27, 8, 9, 28, 10, 11, 29, 12, 13, 30, 14, 15, 31}; v16u8 tmp0, tmp1, out0, out1, out2, input_y = {0}; v16i8 input_cb, input_cr, out_rgb0, out_rgb1, const_128 = __msa_ldi_b(128); v8i16 y_h0, y_h1, cb_h0, cb_h1, cr_h0, cr_h1; v4i32 cb_w0, cb_w1, cb_w2, cb_w3, cr_w0, cr_w1, cr_w2, cr_w3, zero = {0}; v16i8 out_r0, out_g0, out_b0; for (col = num_cols_mul_16; col--;) { input_y = LD_UB(p_in_y); input_cb = LD_SB(p_in_cb); input_cr = LD_SB(p_in_cr); p_in_y += 16; p_in_cb += 16; p_in_cr += 16; input_cb -= const_128; input_cr -= const_128; UNPCK_UB_SH(input_y, y_h0, y_h1); UNPCK_SB_SH(input_cb, cb_h0, cb_h1); UNPCK_SB_SH(input_cr, cr_h0, cr_h1); CALC_G4_FRM_YUV(y_h0, y_h1, cb_h0, cb_h1, cr_h0, cr_h1, out_g0); UNPCK_SH_SW(cr_h0, cr_w0, cr_w1); UNPCK_SH_SW(cr_h1, cr_w2, cr_w3); CALC_R4_FRM_YUV(y_h0, y_h1, cr_w0, cr_w1, cr_w2, cr_w3, out_r0); UNPCK_SH_SW(cb_h0, cb_w0, cb_w1); UNPCK_SH_SW(cb_h1, cb_w2, cb_w3); CALC_B4_FRM_YUV(y_h0, y_h1, cb_w0, cb_w1, cb_w2, cb_w3, out_b0); ILVRL_B2_SB(out_g0, out_b0, out_rgb0, out_rgb1); VSHF_B2_UB(out_rgb0, out_r0, out_rgb0, out_r0, mask_rgb0, mask_rgb1, out0, tmp0); VSHF_B2_UB(out_rgb1, out_r0, out_rgb1, out_r0, mask_rgb1, mask_rgb2, tmp1, out2); out1 = (v16u8) __msa_sldi_b((v16i8) zero, (v16i8) tmp1, 8); out1 = (v16u8) __msa_pckev_d((v2i64) out1, (v2i64) tmp0); ST_UB(out0, p_rgb); p_rgb += 16; ST_UB(out1, p_rgb); p_rgb += 16; ST_UB(out2, p_rgb); p_rgb += 16; } if (remaining_wd >= 8) { uint64_t in_y, in_cb, in_cr; v16i8 input_cbcr = {0}; in_y = LD(p_in_y); in_cb = LD(p_in_cb); in_cr = LD(p_in_cr); p_in_y += 8; p_in_cb += 8; p_in_cr += 8; 
input_y = (v16u8) __msa_insert_d((v2i64) input_y, 0, in_y); input_cbcr = (v16i8) __msa_insert_d((v2i64) input_cbcr, 0, in_cb); input_cbcr = (v16i8) __msa_insert_d((v2i64) input_cbcr, 1, in_cr); input_cbcr -= const_128; y_h0 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) input_y); UNPCK_SB_SH(input_cbcr, cb_h0, cr_h0); UNPCK_SH_SW(cb_h0, cb_w0, cb_w1); UNPCK_SH_SW(cr_h0, cr_w0, cr_w1); CALC_R2_FRM_YUV(y_h0, cr_w0, cr_w1, out_r0); CALC_G2_FRM_YUV(y_h0, cb_h0, cr_h0, out_g0); CALC_B2_FRM_YUV(y_h0, cb_w0, cb_w1, out_b0); out_rgb0 = (v16i8) __msa_ilvr_b((v16i8) out_g0, (v16i8) out_b0); VSHF_B2_UB(out_rgb0, out_r0, out_rgb0, out_r0, mask_rgb0, mask_rgb1, out0, out1); ST_UB(out0, p_rgb); p_rgb += 16; ST8x1_UB(out1, p_rgb); p_rgb += 8; remaining_wd -= 8; } for (col = 0; col < remaining_wd; col++) { y = (int) (p_in_y[col]); cb = (int) (p_in_cb[col]) - 128; cr = (int) (p_in_cr[col]) - 128; /* Range-limiting is essential due to noise introduced by DCT losses. */ p_rgb[0] = clip_pixel(y + ROUND_POWER_OF_TWO(FIX_1_77200 * cb, 16)); p_rgb[1] = clip_pixel(y + ROUND_POWER_OF_TWO(((-FIX_0_34414) * cb - FIX_0_71414 * cr), 16)); p_rgb[2] = clip_pixel(y + ROUND_POWER_OF_TWO(FIX_1_40200 * cr, 16)); p_rgb += 3; } }
/*
 * Accumulate an error value over the first 16 blocks of a macroblock
 * (MIPS MSA).
 *
 * mb - macroblock; block i contributes mb->block[i].coeff and
 *      mb->e_mbd.block[i].dqcoeff, 16 int16_t values from each.
 * dc - when equal to 1, word lane 0 of the first difference vector of every
 *      block is cleared before it is accumulated; any other value keeps it.
 *
 * Returns the accumulated sum narrowed to int32_t.
 *
 * NOTE(review): the macro names suggest the per-block quantity is the sum of
 * squared (coeff - dqcoeff) differences - confirm against the definitions of
 * HSUB_UH2_SW, DOTP_SW2_SD and DPADD_SD2_SD.
 */
int32_t vp8_mbblock_error_msa(MACROBLOCK *mb, int32_t dc) {
  int16_t *coeff_ptr, *dq_coeff_ptr;
  int32_t err = 0;
  uint32_t blk;
  v8i16 coeff_lo, coeff_hi, dq_lo, dq_hi, pair_r, pair_l;
  v4i32 diff0, diff1;
  v2i64 err0, err1;
  v16u8 zero = { 0 };
  v16u8 mask0 = (v16u8)__msa_ldi_b(255);

  /* Build a keep-mask that drops word lane 0 when dc == 1. */
  if (1 == dc) {
    mask0 = (v16u8)__msa_insve_w((v4i32)mask0, 0, (v4i32)zero);
  }

  for (blk = 0; blk < 16; ++blk) {
    coeff_ptr = mb->block[blk].coeff;
    dq_coeff_ptr = mb->e_mbd.block[blk].dqcoeff;

    /* First and second group of eight values of this block. */
    coeff_lo = LD_SH(coeff_ptr);
    dq_lo = LD_SH(dq_coeff_ptr);
    coeff_ptr += 8;
    dq_coeff_ptr += 8;
    coeff_hi = LD_SH(coeff_ptr);
    dq_hi = LD_SH(dq_coeff_ptr);

    /* First group: apply the keep-mask, then start the accumulators. */
    ILVRL_H2_SH(coeff_lo, dq_lo, pair_r, pair_l);
    HSUB_UH2_SW(pair_r, pair_l, diff0, diff1);
    diff0 = (v4i32)__msa_bmnz_v(zero, (v16u8)diff0, mask0);
    DOTP_SW2_SD(diff0, diff1, diff0, diff1, err0, err1);

    /* Second group: added on top of the accumulators. */
    ILVRL_H2_SH(coeff_hi, dq_hi, pair_r, pair_l);
    HSUB_UH2_SW(pair_r, pair_l, diff0, diff1);
    DPADD_SD2_SD(diff0, diff1, err0, err1);

    /* Fold the two doubleword lanes together and add lane 0 to the total. */
    err0 += __msa_splati_d(err0, 1);
    err1 += __msa_splati_d(err1, 1);
    err += __msa_copy_s_d(err0, 0);
    err += __msa_copy_s_d(err1, 0);
  }

  return err;
}