/* DC-only inverse 4x4 DCT with reconstruction.
 *
 * Only input[0] is read. It is scaled by cospi_16_64 twice, each time with a
 * rounding shift by DCT_CONST_BITS, and then rounded-shifted by 4. Every
 * intermediate is narrowed to int16_t. The resulting value is replicated
 * into a vector and handed to ADDBLK_ST4x4_UB as all four rows, together
 * with dst and dst_stride.
 *
 * input      - coefficient buffer; only element 0 is used.
 * dst        - destination block passed to ADDBLK_ST4x4_UB.
 * dst_stride - destination stride passed to ADDBLK_ST4x4_UB.
 */
void aom_idct4x4_1_add_msa(const int16_t *input, uint8_t *dst,
                           int32_t dst_stride) {
  int16_t first_pass;
  int16_t second_pass;
  int16_t dc_value;
  v8i16 dc_vec;

  /* One cospi_16_64 scaling per 1-D pass, each narrowed to 16 bits. */
  first_pass = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
  second_pass =
      ROUND_POWER_OF_TWO((first_pass * cospi_16_64), DCT_CONST_BITS);

  /* Final rounding shift by 4. */
  dc_value = ROUND_POWER_OF_TWO(second_pass, 4);

  /* Same residual in every lane, and the same vector for every row. */
  dc_vec = __msa_fill_h(dc_value);
  ADDBLK_ST4x4_UB(dc_vec, dc_vec, dc_vec, dc_vec, dst, dst_stride);
}
/* Full inverse 4x4 DCT with reconstruction.
 *
 * Loads the 4x4 coefficient block from input, runs two transpose +
 * AOM_IDCT4x4 passes (rows, then columns), applies a rounding right shift
 * by 4 and passes the four result vectors to ADDBLK_ST4x4_UB with dst and
 * dst_stride.
 *
 * input      - 4x4 block of 16-bit coefficients.
 * dst        - destination block passed to ADDBLK_ST4x4_UB.
 * dst_stride - destination stride passed to ADDBLK_ST4x4_UB.
 */
void aom_idct4x4_16_add_msa(const int16_t *input, uint8_t *dst,
                            int32_t dst_stride) {
  v8i16 c0, c1, c2, c3;

  /* Fetch the 4x4 coefficient block into vector registers. */
  LD4x4_SH(input, c0, c1, c2, c3);

  /* Row pass. */
  TRANSPOSE4x4_SH_SH(c0, c1, c2, c3, c0, c1, c2, c3);
  AOM_IDCT4x4(c0, c1, c2, c3, c0, c1, c2, c3);

  /* Column pass. */
  TRANSPOSE4x4_SH_SH(c0, c1, c2, c3, c0, c1, c2, c3);
  AOM_IDCT4x4(c0, c1, c2, c3, c0, c1, c2, c3);

  /* Rounding shift: add 2^3, then divide by 2^4. */
  SRARI_H4_SH(c0, c1, c2, c3, 4);

  /* Combine with the destination block and store. */
  ADDBLK_ST4x4_UB(c0, c1, c2, c3, dst, dst_stride);
}
void vp9_iht4x4_16_add_msa(const int16_t *input, uint8_t *dst, int32_t dst_stride, int32_t tx_type) { v8i16 in0, in1, in2, in3; /* load vector elements of 4x4 block */ LD4x4_SH(input, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); switch (tx_type) { case DCT_DCT: /* DCT in horizontal */ VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); /* DCT in vertical */ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); break; case ADST_DCT: /* DCT in horizontal */ VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); /* ADST in vertical */ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); break; case DCT_ADST: /* ADST in horizontal */ VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); /* DCT in vertical */ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3); break; case ADST_ADST: /* ADST in horizontal */ VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); /* ADST in vertical */ TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3); break; default: assert(0); break; } /* final rounding (add 2^3, divide by 2^4) and shift */ SRARI_H4_SH(in0, in1, in2, in3, 4); /* add block and store 4x4 */ ADDBLK_ST4x4_UB(in0, in1, in2, in3, dst, dst_stride); }
/* Inverse 4x4 AVC transform with reconstruction.
 *
 * Loads the 4x4 coefficient block from p_src, applies AVC_ITRANS_H, transposes
 * the intermediate, applies AVC_ITRANS_H a second time, and rounding-shifts
 * the result right by 6. The four result vectors are handed to
 * ADDBLK_ST4x4_UB with p_dst and i_dst_stride. Finally two zero vectors are
 * stored back to p_src (at p_src and p_src + 8), overwriting the coefficient
 * buffer.
 *
 * p_dst        - destination block passed to ADDBLK_ST4x4_UB.
 * p_src        - 4x4 block of 16-bit coefficients; overwritten with zeros.
 * i_dst_stride - destination stride passed to ADDBLK_ST4x4_UB.
 */
static void avc_idct4x4_addblk_msa( uint8_t *p_dst, int16_t *p_src,
                                    int32_t i_dst_stride )
{
    v8i16 coef0, coef1, coef2, coef3;
    v8i16 pass1_0, pass1_1, pass1_2, pass1_3;
    v8i16 pass2_0, pass2_1, pass2_2, pass2_3;
    v8i16 zero_vec = { 0 };

    LD4x4_SH( p_src, coef0, coef1, coef2, coef3 );

    /* First 1-D pass, then transpose in place for the second pass. */
    AVC_ITRANS_H( coef0, coef1, coef2, coef3,
                  pass1_0, pass1_1, pass1_2, pass1_3 );
    TRANSPOSE4x4_SH_SH( pass1_0, pass1_1, pass1_2, pass1_3,
                        pass1_0, pass1_1, pass1_2, pass1_3 );

    /* Second 1-D pass followed by the rounding shift by 6. */
    AVC_ITRANS_H( pass1_0, pass1_1, pass1_2, pass1_3,
                  pass2_0, pass2_1, pass2_2, pass2_3 );
    SRARI_H4_SH( pass2_0, pass2_1, pass2_2, pass2_3, 6 );

    ADDBLK_ST4x4_UB( pass2_0, pass2_1, pass2_2, pass2_3,
                     p_dst, i_dst_stride );

    /* Clear the coefficient buffer once it has been consumed. */
    ST_SH2( zero_vec, zero_vec, p_src, 8 );
}
/* DC-only inverse 4x4 Walsh-Hadamard transform with reconstruction.
 *
 * Only input[0] is read. After the UNIT_QUANT_SHIFT right shift the value is
 * split into `half` (value >> 1) and `rest` (value - half). Lanes 0..3 of a
 * zeroed vector are set to { rest, half, half, half }. The same split is
 * then applied per lane: the halved vector is used for rows 1..3 and the
 * remainder for row 0 in the ADDBLK_ST4x4_UB call.
 *
 * input      - coefficient buffer; only element 0 is used.
 * dst        - destination block passed to ADDBLK_ST4x4_UB.
 * dst_stride - destination stride passed to ADDBLK_ST4x4_UB.
 */
void aom_iwht4x4_1_add_msa(const int16_t *input, uint8_t *dst,
                           int32_t dst_stride) {
  const int16_t dc = input[0] >> UNIT_QUANT_SHIFT;
  const int16_t half = dc >> 1;
  const int16_t rest = dc - half;
  v8i16 first_row = { 0 };
  v8i16 other_rows;

  /* First-pass output: lane 0 keeps the remainder, lanes 1..3 the half. */
  first_row = __msa_insert_h(first_row, 0, rest);
  first_row = __msa_insert_h(first_row, 1, half);
  first_row = __msa_insert_h(first_row, 2, half);
  first_row = __msa_insert_h(first_row, 3, half);

  /* Second pass: the same half/remainder split, applied lane by lane. */
  other_rows = first_row >> 1;
  first_row = first_row - other_rows;

  ADDBLK_ST4x4_UB(first_row, other_rows, other_rows, other_rows, dst,
                  dst_stride);
}
/* Full inverse 4x4 Walsh-Hadamard transform with reconstruction.
 *
 * The 4x4 block is loaded and transposed as 16-bit lanes, widened to 32-bit
 * lanes and shifted right by UNIT_QUANT_SHIFT. Two butterfly passes follow,
 * separated by a 32-bit transpose. The results are packed back to 16-bit
 * lanes and handed to ADDBLK_ST4x4_UB with dst and dst_stride.
 *
 * The vectors are deliberately used in a permuted order (note the argument
 * order of the load, the pack and the final add/store), and the butterflies
 * update values in place. Statement order therefore matters.
 *
 * input      - 4x4 block of 16-bit coefficients.
 * dst        - destination block passed to ADDBLK_ST4x4_UB.
 * dst_stride - destination stride passed to ADDBLK_ST4x4_UB.
 */
void aom_iwht4x4_16_add_msa(const int16_t *input, uint8_t *dst,
                            int32_t dst_stride) {
  v8i16 h0, h1, h2, h3;      /* 16-bit lane vectors */
  v4i32 w0, w1, w2, w3, mid; /* 32-bit lane working vectors */

  /* Load the 4x4 block; outputs are taken in the order h0, h2, h3, h1. */
  LD4x4_SH(input, h0, h2, h3, h1);
  TRANSPOSE4x4_SH_SH(h0, h2, h3, h1, h0, h2, h3, h1);

  /* Widen to 32-bit lanes and undo the unit quantizer scaling. */
  UNPCK_R_SH_SW(h0, w0);
  UNPCK_R_SH_SW(h2, w2);
  UNPCK_R_SH_SW(h3, w3);
  UNPCK_R_SH_SW(h1, w1);
  SRA_4V(w0, w1, w2, w3, UNIT_QUANT_SHIFT);

  /* First butterfly pass. */
  w0 = w0 + w2;
  w3 = w3 - w1;
  mid = (w0 - w3) >> 1;
  w1 = mid - w1;
  w2 = mid - w2;
  w0 = w0 - w1;
  w3 = w3 + w2;

  TRANSPOSE4x4_SW_SW(w0, w1, w2, w3, w0, w1, w2, w3);

  /* Second butterfly pass; the vectors play different roles than above. */
  w0 = w0 + w1;
  w2 = w2 - w3;
  mid = (w0 - w2) >> 1;
  w3 = mid - w3;
  w1 = mid - w1;
  w0 = w0 - w3;
  w2 = w2 + w1;

  /* Narrow back to 16-bit lanes, then add/store in row order h0,h3,h1,h2. */
  PCKEV_H4_SH(w0, w0, w1, w1, w2, w2, w3, w3, h0, h1, h2, h3);
  ADDBLK_ST4x4_UB(h0, h3, h1, h2, dst, dst_stride);
}