// 4x4 inverse WHT "add" path that consumes a single coefficient.
//
// Only input[0] is read. It is scaled down by UNIT_QUANT_SHIFT and split
// into two parts (value - value/2 and value/2); the same split is then
// applied once more across the vector lanes. The resulting vectors are
// handed to ADDBLK_ST4x4_UB together with dst/dst_stride.
// NOTE(review): ADDBLK_ST4x4_UB is defined elsewhere; presumably it adds the
// four vectors to the 4x4 pixel block at dst and stores the result - confirm
// against the macro definition.
void aom_iwht4x4_1_add_msa(const int16_t *input, uint8_t *dst,
                           int32_t dst_stride) {
  // First split, done on scalars.
  const int16_t scaled = input[0] >> UNIT_QUANT_SHIFT;
  const int16_t half = scaled >> 1;
  const int16_t remainder = (int16_t)(scaled - half);
  v8i16 first = { 0 };
  v8i16 others;

  // Lane 0 carries the remainder, lanes 1..3 carry the half; the upper four
  // lanes stay zero.
  first = __msa_insert_h(first, 0, remainder);
  first = __msa_insert_h(first, 1, half);
  first = __msa_insert_h(first, 2, half);
  first = __msa_insert_h(first, 3, half);

  // Second split, done lane-wise: others = v >> 1, first = v - (v >> 1).
  others = first >> 1;
  first = first - others;

  // The first vector is used once, the halved vector three times.
  ADDBLK_ST4x4_UB(first, others, others, others, dst, dst_stride);
}
// Forward Walsh-Hadamard transform (MSA version).
//
// Reads 16 values from `in`, spaced 16 elements apart (indices 0, 16, ...,
// 240), runs them through four add/sub butterfly stages separated by lane
// shuffles, shifts the result right by one and stores 16 values to `out`.
// NOTE(review): ADDSUB2, VSHF_H2_SH, SRAI_H2_SH and ST_SH2 are macros defined
// elsewhere; the stage descriptions below assume their usual meaning
// (sum/difference, two-source halfword shuffle, arithmetic shift right,
// two-vector store) - confirm against their definitions.
static void FTransformWHT(const int16_t* in, int16_t* out) {
  // Shuffle controls: lower halves / upper halves of the two sources.
  const v8i16 pick_lo = { 0, 1, 2, 3, 8, 9, 10, 11 };
  const v8i16 pick_hi = { 4, 5, 6, 7, 12, 13, 14, 15 };
  // Shuffle controls that gather every fourth lane (stride-4 regrouping).
  const v8i16 pick_s4_a = { 0, 4, 8, 12, 1, 5, 9, 13 };
  const v8i16 pick_s4_b = { 3, 7, 11, 15, 2, 6, 10, 14 };
  v8i16 vec_a = { 0 };
  v8i16 vec_b = { 0 };
  v8i16 sum, diff, mix_a, mix_b;
  v8i16 res_a, res_b;

  // vec_a: lanes 0..3 <- in[0 + 64*k], lanes 4..7 <- in[16 + 64*k].
  vec_a = __msa_insert_h(vec_a, 0, in[0]);
  vec_a = __msa_insert_h(vec_a, 1, in[64]);
  vec_a = __msa_insert_h(vec_a, 2, in[128]);
  vec_a = __msa_insert_h(vec_a, 3, in[192]);
  vec_a = __msa_insert_h(vec_a, 4, in[16]);
  vec_a = __msa_insert_h(vec_a, 5, in[80]);
  vec_a = __msa_insert_h(vec_a, 6, in[144]);
  vec_a = __msa_insert_h(vec_a, 7, in[208]);

  // vec_b: lanes 0..3 <- in[48 + 64*k], lanes 4..7 <- in[32 + 64*k].
  vec_b = __msa_insert_h(vec_b, 0, in[48]);
  vec_b = __msa_insert_h(vec_b, 1, in[112]);
  vec_b = __msa_insert_h(vec_b, 2, in[176]);
  vec_b = __msa_insert_h(vec_b, 3, in[240]);
  vec_b = __msa_insert_h(vec_b, 4, in[32]);
  vec_b = __msa_insert_h(vec_b, 5, in[96]);
  vec_b = __msa_insert_h(vec_b, 6, in[160]);
  vec_b = __msa_insert_h(vec_b, 7, in[224]);

  // Stage 1: butterfly, then regroup lower/upper halves.
  ADDSUB2(vec_a, vec_b, sum, diff);
  VSHF_H2_SH(sum, diff, sum, diff, pick_lo, pick_hi, mix_a, mix_b);

  // Stage 2: butterfly, then stride-4 regrouping back into vec_a/vec_b.
  ADDSUB2(mix_a, mix_b, sum, diff);
  VSHF_H2_SH(sum, diff, sum, diff, pick_s4_a, pick_s4_b, vec_a, vec_b);

  // Stage 3: butterfly, then regroup lower/upper halves again.
  ADDSUB2(vec_a, vec_b, sum, diff);
  VSHF_H2_SH(sum, diff, sum, diff, pick_lo, pick_hi, mix_a, mix_b);

  // Stage 4: final butterfly, halve, and write both result vectors.
  ADDSUB2(mix_a, mix_b, res_a, res_b);
  SRAI_H2_SH(res_a, res_b, 1);
  ST_SH2(res_a, res_b, out, 8);
}