void vpx_fdct4x4_msa(const int16_t *input, int16_t *output, int32_t src_stride) { v8i16 in0, in1, in2, in3; LD_SH4(input, src_stride, in0, in1, in2, in3); /* fdct4 pre-process */ { v8i16 vec, mask; v16i8 zero = { 0 }; v16i8 one = __msa_ldi_b(1); mask = (v8i16)__msa_sldi_b(zero, one, 15); SLLI_4V(in0, in1, in2, in3, 4); vec = __msa_ceqi_h(in0, 0); vec = vec ^ 255; vec = mask & vec; in0 += vec; } VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); SRA_4V(in0, in1, in2, in3, 2); PCKEV_D2_SH(in1, in0, in3, in2, in0, in2); ST_SH2(in0, in2, output, 8); }
void vp9_fwht4x4_msa(const int16_t *input, int16_t *output, int32_t src_stride) { v8i16 in0, in1, in2, in3, in4; LD_SH4(input, src_stride, in0, in1, in2, in3); in0 += in1; in3 -= in2; in4 = (in0 - in3) >> 1; SUB2(in4, in1, in4, in2, in1, in2); in0 -= in2; in3 += in1; TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1); in0 += in2; in1 -= in3; in4 = (in0 - in1) >> 1; SUB2(in4, in2, in4, in3, in2, in3); in0 -= in3; in1 += in2; SLLI_4V(in0, in1, in2, in3, 2); TRANSPOSE4x4_SH_SH(in0, in3, in1, in2, in0, in3, in1, in2); ST4x2_UB(in0, output, 4); ST4x2_UB(in3, output + 4, 4); ST4x2_UB(in1, output + 8, 4); ST4x2_UB(in2, output + 12, 4); }
void vp9_fht4x4_msa(const int16_t *input, int16_t *output, int32_t stride, int32_t tx_type) { v8i16 in0, in1, in2, in3; LD_SH4(input, stride, in0, in1, in2, in3); /* fdct4 pre-process */ { v8i16 temp, mask; v16i8 zero = { 0 }; v16i8 one = __msa_ldi_b(1); mask = (v8i16)__msa_sldi_b(zero, one, 15); SLLI_4V(in0, in1, in2, in3, 4); temp = __msa_ceqi_h(in0, 0); temp = (v8i16)__msa_xori_b((v16u8)temp, 255); temp = mask & temp; in0 += temp; } switch (tx_type) { case DCT_DCT: VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); break; case ADST_DCT: VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); break; case DCT_ADST: VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); break; case ADST_ADST: VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); break; default: assert(0); break; } TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); SRA_4V(in0, in1, in2, in3, 2); PCKEV_D2_SH(in1, in0, in3, in2, in0, in2); ST_SH2(in0, in2, output, 8); }
void vp8_short_walsh4x4_msa(int16_t *input, int16_t *output, int32_t pitch) { v8i16 in0_h, in1_h, in2_h, in3_h; v4i32 in0_w, in1_w, in2_w, in3_w, temp0, temp1, temp2, temp3; LD_SH4(input, pitch / 2, in0_h, in1_h, in2_h, in3_h); TRANSPOSE4x4_SH_SH(in0_h, in1_h, in2_h, in3_h, in0_h, in1_h, in2_h, in3_h); UNPCK_R_SH_SW(in0_h, in0_w); UNPCK_R_SH_SW(in1_h, in1_w); UNPCK_R_SH_SW(in2_h, in2_w); UNPCK_R_SH_SW(in3_h, in3_w); BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1); SLLI_4V(temp0, temp1, temp2, temp3, 2); BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w); temp0 = RET_1_IF_NZERO_W(temp0); in0_w += temp0; TRANSPOSE4x4_SW_SW(in0_w, in1_w, in2_w, in3_w, in0_w, in1_w, in2_w, in3_w); BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1); BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w); in0_w += RET_1_IF_NEG_W(in0_w); in1_w += RET_1_IF_NEG_W(in1_w); in2_w += RET_1_IF_NEG_W(in2_w); in3_w += RET_1_IF_NEG_W(in3_w); ADD4(in0_w, 3, in1_w, 3, in2_w, 3, in3_w, 3, in0_w, in1_w, in2_w, in3_w); SRA_4V(in0_w, in1_w, in2_w, in3_w, 3); PCKEV_H2_SH(in1_w, in0_w, in3_w, in2_w, in0_h, in1_h); ST_SH2(in0_h, in1_h, output, 8); }
void aom_idct4x4_16_add_msa(const int16_t *input, uint8_t *dst, int32_t dst_stride) { v8i16 in0, in1, in2, in3; /* load vector elements of 4x4 block */ LD4x4_SH(input, in0, in1, in2, in3); /* rows */ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); /* columns */ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); AOM_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); /* rounding (add 2^3, divide by 2^4) */ SRARI_H4_SH(in0, in1, in2, in3, 4); ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride); }
void vp9_iht4x4_16_add_msa(const int16_t *input, uint8_t *dst, int32_t dst_stride, int32_t tx_type) { v8i16 in0, in1, in2, in3; /* load vector elements of 4x4 block */ LD4x4_SH(input, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); switch (tx_type) { case DCT_DCT: /* DCT in horizontal */ VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); /* DCT in vertical */ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); break; case ADST_DCT: /* DCT in horizontal */ VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); /* ADST in vertical */ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); break; case DCT_ADST: /* ADST in horizontal */ VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); /* DCT in vertical */ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); break; case ADST_ADST: /* ADST in horizontal */ VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); /* ADST in vertical */ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); break; default: assert(0); break; } /* final rounding (add 2^3, divide by 2^4) and shift */ SRARI_H4_SH(in0, in1, in2, in3, 4); /* add block and store 4x4 */ ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride); }
void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch) { v8i16 in0, in1, in2, in3; v8i16 temp0, temp1; v8i16 const0, const1; v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 }; v4i32 out0, out1, out2, out3; v8i16 zero = { 0 }; LD_SH4(input, pitch / 2, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); SLLI_4V(temp0, temp1, in1, in3, 3); in0 = temp0 + temp1; in2 = temp0 - temp1; SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1); temp0 = __msa_ilvr_h(in3, in1); in1 = __msa_splati_h(coeff, 3); out0 = (v4i32)__msa_ilvev_h(zero, in1); coeff = __msa_ilvl_h(zero, coeff); out1 = __msa_splati_w((v4i32)coeff, 0); DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1); out0 >>= 12; out1 >>= 12; PCKEV_H2_SH(out0, out0, out1, out1, in1, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3); in0 = temp0 + temp1 + 7; in2 = temp0 - temp1 + 7; in0 >>= 4; in2 >>= 4; ILVR_H2_SW(zero, in0, zero, in2, out0, out2); temp1 = RET_1_IF_NZERO_H(in3); ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0); SPLATI_W2_SW(coeff, 2, out3, out1); out3 += out1; out1 = __msa_splati_w((v4i32)coeff, 1); DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3); out1 >>= 16; out3 >>= 16; out1 += (v4i32)temp1; PCKEV_H2_SH(out1, out0, out3, out2, in0, in2); ST_SH2(in0, in2, output, 8); }
static void avc_idct4x4_addblk_msa( uint8_t *p_dst, int16_t *p_src, int32_t i_dst_stride ) { v8i16 src0, src1, src2, src3; v8i16 hres0, hres1, hres2, hres3; v8i16 vres0, vres1, vres2, vres3; v8i16 zeros = { 0 }; LD4x4_SH( p_src, src0, src1, src2, src3 ); AVC_ITRANS_H( src0, src1, src2, src3, hres0, hres1, hres2, hres3 ); TRANSPOSE4x4_SH_SH( hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3 ); AVC_ITRANS_H( hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3 ); SRARI_H4_SH( vres0, vres1, vres2, vres3, 6 ); ADDBLK_ST4x4_UB( vres0, vres1, vres2, vres3, p_dst, i_dst_stride ); ST_SH2( zeros, zeros, p_src, 8 ); }
static void avc_sub4x4_dct_msa( uint8_t *p_src, int32_t i_src_stride, uint8_t *p_ref, int32_t i_dst_stride, int16_t *p_dst ) { uint32_t i_src0, i_src1, i_src2, i_src3; uint32_t i_ref0, i_ref1, i_ref2, i_ref3; v16i8 src = { 0 }; v16i8 ref = { 0 }; v16u8 inp0, inp1; v8i16 diff0, diff1, diff2, diff3; v8i16 temp0, temp1, temp2, temp3; LW4( p_src, i_src_stride, i_src0, i_src1, i_src2, i_src3 ); LW4( p_ref, i_dst_stride, i_ref0, i_ref1, i_ref2, i_ref3 ); INSERT_W4_SB( i_src0, i_src1, i_src2, i_src3, src ); INSERT_W4_SB( i_ref0, i_ref1, i_ref2, i_ref3, ref ); ILVRL_B2_UB( src, ref, inp0, inp1 ); HSUB_UB2_SH( inp0, inp1, diff0, diff2 ); diff1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff0, ( v2i64 ) diff0 ); diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff2, ( v2i64 ) diff2 ); BUTTERFLY_4( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 ); diff0 = temp0 + temp1; diff1 = ( temp3 << 1 ) + temp2; diff2 = temp0 - temp1; diff3 = temp3 - ( temp2 << 1 ); TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 ); BUTTERFLY_4( temp0, temp1, temp2, temp3, diff0, diff1, diff2, diff3 ); temp0 = diff0 + diff1; temp1 = ( diff3 << 1 ) + diff2; temp2 = diff0 - diff1; temp3 = diff3 - ( diff2 << 1 ); ILVR_D2_UB( temp1, temp0, temp3, temp2, inp0, inp1 ); ST_UB2( inp0, inp1, p_dst, 8 ); }
void aom_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst, int32_t dst_stride) { v8i16 in0, in1, in2, in3; v4i32 in0_r, in1_r, in2_r, in3_r, in4_r; /* load vector elements of 4x4 block */ LD4x4_SH(input, in0, in2, in3, in1); TRANSPOSE4x4_SH_SH(in0, in2, in3, in1, in0, in2, in3, in1); UNPCK_R_SH_SW(in0, in0_r); UNPCK_R_SH_SW(in2, in2_r); UNPCK_R_SH_SW(in3, in3_r); UNPCK_R_SH_SW(in1, in1_r); SRA_4V(in0_r, in1_r, in2_r, in3_r, UNIT_QUANT_SHIFT); in0_r += in2_r; in3_r -= in1_r; in4_r = (in0_r - in3_r) >> 1; in1_r = in4_r - in1_r; in2_r = in4_r - in2_r; in0_r -= in1_r; in3_r += in2_r; TRANSPOSE4x4_SW_SW(in0_r, in1_r, in2_r, in3_r, in0_r, in1_r, in2_r, in3_r); in0_r += in1_r; in2_r -= in3_r; in4_r = (in0_r - in2_r) >> 1; in3_r = in4_r - in3_r; in1_r = in4_r - in1_r; in0_r -= in3_r; in2_r += in1_r; PCKEV_H4_SH(in0_r, in0_r, in1_r, in1_r, in2_r, in2_r, in3_r, in3_r, in0, in1, in2, in3); ADDBLK_ST4x4_UB(in0, in3, in1, in2, dst, dst_stride); }