static void temporal_filter_apply_16size_msa(uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr, int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) { uint32_t row; v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b; v16u8 frame_l, frame_h; v16i8 zero = { 0 }; v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h; v8i16 diff0, diff1, cnt0, cnt1; v4i32 const3, const16, filter_wt, strength; v4i32 mod0_w, mod1_w, mod2_w, mod3_w; v4i32 diff0_r, diff0_l, diff1_r, diff1_l; v4i32 frame2_0, frame2_1, frame2_2, frame2_3; v4i32 acc0, acc1, acc2, acc3; filter_wt = __msa_fill_w(filter_wt_in); strength = __msa_fill_w(strength_in); const3 = __msa_ldi_w(3); const16 = __msa_ldi_w(16); for (row = 8; row--;) { frame1_0_b = LD_SB(frame1_ptr); frame2_0_b = LD_SB(frame2_ptr); frame1_ptr += stride; frame2_ptr += 16; frame1_1_b = LD_SB(frame1_ptr); frame2_1_b = LD_SB(frame2_ptr); LD_SW2(acc, 4, acc0, acc1); LD_SW2(acc + 8, 4, acc2, acc3); LD_SH2(cnt, 8, cnt0, cnt1); ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h); HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); UNPCK_SH_SW(diff0, diff0_r, diff0_l); UNPCK_SH_SW(diff1, diff1_r, diff1_l); MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, mod1_w, mod2_w, mod3_w); SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); diff0_r = (mod0_w < const16); diff0_l = (mod1_w < const16); diff1_r = (mod2_w < const16); diff1_l = (mod3_w < const16); SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, mod0_w, mod1_w, mod2_w, mod3_w); mod0_w = diff0_r & mod0_w; mod1_w = diff0_l & mod1_w; mod2_w = diff1_r & mod2_w; mod3_w = diff1_l & mod3_w; MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h) ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); ST_SH2(mod0_h, mod1_h, cnt, 8); cnt += 16; ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h); UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, mod0_w, mod1_w, mod2_w, mod3_w); ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, mod2_w, mod3_w); ST_SW2(mod0_w, mod1_w, acc, 4); ST_SW2(mod2_w, mod3_w, acc + 8, 4); acc += 16; LD_SW2(acc, 4, acc0, acc1); LD_SW2(acc + 8, 4, acc2, acc3); LD_SH2(cnt, 8, cnt0, cnt1); ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h); HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); UNPCK_SH_SW(diff0, diff0_r, diff0_l); UNPCK_SH_SW(diff1, diff1_r, diff1_l); MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, mod1_w, mod2_w, mod3_w); SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); diff0_r = (mod0_w < const16); diff0_l = (mod1_w < const16); diff1_r = (mod2_w < const16); diff1_l = (mod3_w < const16); SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, mod0_w, mod1_w, mod2_w, mod3_w); mod0_w = diff0_r & mod0_w; mod1_w = diff0_l & mod1_w; mod2_w = diff1_r & mod2_w; mod3_w = diff1_l & mod3_w; MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); ST_SH2(mod0_h, mod1_h, cnt, 8); cnt += 16; UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h); UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, mod0_w, mod1_w, mod2_w, mod3_w); ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, mod2_w, mod3_w); ST_SW2(mod0_w, mod1_w, acc, 4); ST_SW2(mod2_w, mod3_w, acc + 8, 4); acc += 16; frame1_ptr += stride; frame2_ptr += 16; } }
void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, int32_t src_stride) { v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 in8, in9, in10, in11, in12, in13, in14, in15; v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30; v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37; v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5; v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; v8i16 coeff2 = { -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 }; LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15); SLLI_4V(in0, in1, in2, in3, 2); SLLI_4V(in4, in5, in6, in7, 2); SLLI_4V(in8, in9, in10, in11, 2); SLLI_4V(in12, in13, in14, in15, 2); ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3); ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7); FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32); SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12); SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8); tmp_ptr += 16; /* stp 1 */ ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4); ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5); cnst4 = __msa_splati_h(coeff, 0); stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4); cnst5 = __msa_splati_h(coeff, 1); cnst5 = __msa_ilvev_h(cnst5, cnst4); stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5); stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4); stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5); /* stp2 */ BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33); BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34); ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4); ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5); SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1); cnst0 = __msa_ilvev_h(cnst0, cnst1); stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0); cnst0 = __msa_splati_h(coeff, 4); cnst1 = __msa_ilvev_h(cnst1, cnst0); stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1); BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9); ILVRL_H2_SH(in15, in8, vec1, vec0); SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1); cnst0 = __msa_ilvev_h(cnst0, cnst1); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); ST_SH(in8, tmp_ptr); cnst0 = __msa_splati_h(coeff2, 0); cnst0 = __msa_ilvev_h(cnst1, cnst0); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); ST_SH(in8, tmp_ptr + 224); ILVRL_H2_SH(in14, in9, vec1, vec0); SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1); cnst1 = __msa_ilvev_h(cnst1, cnst0); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1); ST_SH(in8, tmp_ptr + 128); cnst1 = __msa_splati_h(coeff2, 2); cnst0 = __msa_ilvev_h(cnst0, cnst1); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); ST_SH(in8, tmp_ptr + 96); SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1); cnst1 = __msa_ilvev_h(cnst1, cnst0); stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1); cnst1 = __msa_splati_h(coeff, 3); cnst1 = __msa_ilvev_h(cnst0, cnst1); stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1); /* stp4 */ ADD2(stp34, stp25, stp33, stp22, in13, in10); ILVRL_H2_SH(in13, in10, vec1, vec0); SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1); cnst0 = __msa_ilvev_h(cnst0, cnst1); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); ST_SH(in8, tmp_ptr + 64); cnst0 = __msa_splati_h(coeff2, 1); cnst0 = __msa_ilvev_h(cnst1, cnst0); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); ST_SH(in8, tmp_ptr + 160); SUB2(stp34, stp25, stp33, stp22, in12, in11); ILVRL_H2_SH(in12, in11, vec1, vec0); SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1); cnst1 = __msa_ilvev_h(cnst1, cnst0); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1); ST_SH(in8, tmp_ptr + 192); cnst1 = __msa_splati_h(coeff2, 3); cnst0 = __msa_ilvev_h(cnst0, cnst1); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); ST_SH(in8, tmp_ptr + 32); }
static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr, int16_t *out) { v8i16 in16, in17, in18, in19, in20, in21, in22, in23; v8i16 in24, in25, in26, in27, in28, in29, in30, in31; v8i16 vec4, vec5; in20 = LD_SH(temp + 32); in21 = LD_SH(temp + 40); in26 = LD_SH(temp + 80); in27 = LD_SH(temp + 88); DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); FDCT_POSTPROC_2V_NEG_H(in20, in21); FDCT_POSTPROC_2V_NEG_H(in26, in27); in18 = LD_SH(temp + 16); in19 = LD_SH(temp + 24); in28 = LD_SH(temp + 96); in29 = LD_SH(temp + 104); FDCT_POSTPROC_2V_NEG_H(in18, in19); FDCT_POSTPROC_2V_NEG_H(in28, in29); vec4 = in19 - in20; ST_SH(vec4, interm_ptr + 32); vec4 = in18 - in21; ST_SH(vec4, interm_ptr + 88); vec4 = in29 - in26; ST_SH(vec4, interm_ptr + 64); vec4 = in28 - in27; ST_SH(vec4, interm_ptr + 56); ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); in22 = LD_SH(temp + 48); in23 = LD_SH(temp + 56); in24 = LD_SH(temp + 64); in25 = LD_SH(temp + 72); DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); FDCT_POSTPROC_2V_NEG_H(in22, in23); FDCT_POSTPROC_2V_NEG_H(in24, in25); in16 = LD_SH(temp); in17 = LD_SH(temp + 8); in30 = LD_SH(temp + 112); in31 = LD_SH(temp + 120); FDCT_POSTPROC_2V_NEG_H(in16, in17); FDCT_POSTPROC_2V_NEG_H(in30, in31); vec4 = in17 - in22; ST_SH(vec4, interm_ptr + 40); vec4 = in30 - in25; ST_SH(vec4, interm_ptr + 48); vec4 = in31 - in24; ST_SH(vec4, interm_ptr + 72); vec4 = in16 - in23; ST_SH(vec4, interm_ptr + 80); ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); ADD2(in27, in26, in25, in24, in23, in20); DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); ST_SH(vec5, out); ST_SH(vec4, out + 120); SUB2(in27, in26, in25, in24, in22, in21); DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); ST_SH(vec5, out + 112); ST_SH(vec4, out + 8); SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); SUB2(in26, in27, in24, in25, in23, in20); DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); ST_SH(vec4, out + 16); ST_SH(vec5, out + 104); ADD2(in26, in27, in24, in25, in22, in21); DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); ST_SH(vec4, out + 24); ST_SH(vec5, out + 96); in20 = LD_SH(interm_ptr + 32); in21 = LD_SH(interm_ptr + 88); in27 = LD_SH(interm_ptr + 56); in26 = LD_SH(interm_ptr + 64); in16 = in20; in17 = in21; DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); in22 = LD_SH(interm_ptr + 40); in25 = LD_SH(interm_ptr + 48); in24 = LD_SH(interm_ptr + 72); in23 = LD_SH(interm_ptr + 80); SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); in16 = in28 + in29; in19 = in31 + in30; DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); ST_SH(vec5, out + 32); ST_SH(vec4, out + 88); SUB2(in28, in29, in31, in30, in17, in18); DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); ST_SH(vec5, out + 40); ST_SH(vec4, out + 80); ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); SUB2(in29, in28, in30, in31, in16, in19); DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); ST_SH(vec5, out + 72); ST_SH(vec4, out + 48); ADD2(in29, in28, in30, in31, in17, in18); DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); ST_SH(vec4, out + 56); ST_SH(vec5, out + 64); }
static void temporal_filter_apply_8size_msa(uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr, int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) { uint32_t row; uint64_t f0, f1, f2, f3, f4, f5, f6, f7; v16i8 frame1 = { 0 }; v16i8 frame2 = { 0 }; v16i8 frame3 = { 0 }; v16i8 frame4 = { 0 }; v16u8 frame_l, frame_h; v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h; v8i16 diff0, diff1, cnt0, cnt1; v4i32 const3, const16; v4i32 filter_wt, strength; v4i32 mod0_w, mod1_w, mod2_w, mod3_w; v4i32 diff0_r, diff0_l, diff1_r, diff1_l; v4i32 frame2_0, frame2_1, frame2_2, frame2_3; v4i32 acc0, acc1, acc2, acc3; filter_wt = __msa_fill_w(filter_wt_in); strength = __msa_fill_w(strength_in); const3 = __msa_ldi_w(3); const16 = __msa_ldi_w(16); for (row = 2; row--;) { LD2(frame1_ptr, stride, f0, f1); frame1_ptr += (2 * stride); LD2(frame2_ptr, 8, f2, f3); frame2_ptr += 16; LD2(frame1_ptr, stride, f4, f5); frame1_ptr += (2 * stride); LD2(frame2_ptr, 8, f6, f7); frame2_ptr += 16; LD_SW2(acc, 4, acc0, acc1); LD_SW2(acc + 8, 4, acc2, acc3); LD_SH2(cnt, 8, cnt0, cnt1); INSERT_D2_SB(f0, f1, frame1); INSERT_D2_SB(f2, f3, frame2); INSERT_D2_SB(f4, f5, frame3); INSERT_D2_SB(f6, f7, frame4); ILVRL_B2_UB(frame1, frame2, frame_l, frame_h); HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); UNPCK_SH_SW(diff0, diff0_r, diff0_l); UNPCK_SH_SW(diff1, diff1_r, diff1_l); MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, mod1_w, mod2_w, mod3_w); SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); diff0_r = (mod0_w < const16); diff0_l = (mod1_w < const16); diff1_r = (mod2_w < const16); diff1_l = (mod3_w < const16); SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, mod0_w, mod1_w, mod2_w, mod3_w); mod0_w = diff0_r & mod0_w; mod1_w = diff0_l & mod1_w; mod2_w = diff1_r & mod2_w; mod3_w = diff1_l & mod3_w; MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); ST_SH2(mod0_h, mod1_h, cnt, 8); cnt += 16; UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h); UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, mod0_w, mod1_w, mod2_w, mod3_w); ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, mod2_w, mod3_w); ST_SW2(mod0_w, mod1_w, acc, 4); ST_SW2(mod2_w, mod3_w, acc + 8, 4); acc += 16; LD_SW2(acc, 4, acc0, acc1); LD_SW2(acc + 8, 4, acc2, acc3); LD_SH2(cnt, 8, cnt0, cnt1); ILVRL_B2_UB(frame3, frame4, frame_l, frame_h); HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); UNPCK_SH_SW(diff0, diff0_r, diff0_l); UNPCK_SH_SW(diff1, diff1_r, diff1_l); MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, mod1_w, mod2_w, mod3_w); SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); diff0_r = (mod0_w < const16); diff0_l = (mod1_w < const16); diff1_r = (mod2_w < const16); diff1_l = (mod3_w < const16); SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, mod0_w, mod1_w, mod2_w, mod3_w); mod0_w = diff0_r & mod0_w; mod1_w = diff0_l & mod1_w; mod2_w = diff1_r & mod2_w; mod3_w = diff1_l & mod3_w; MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); ST_SH2(mod0_h, mod1_h, cnt, 8); cnt += 16; UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h); UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, mod0_w, mod1_w, mod2_w, mod3_w); ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, mod2_w, mod3_w); ST_SW2(mod0_w, mod1_w, acc, 4); ST_SW2(mod2_w, mod3_w, acc + 8, 4); acc += 16; } }
static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) { v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 in8, in9, in10, in11, in12, in13, in14, in15; v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; /* fdct32 even */ /* stage 2 */ LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, in15); FDCT_POSTPROC_2V_NEG_H(vec0, vec1); FDCT_POSTPROC_2V_NEG_H(vec2, vec3); FDCT_POSTPROC_2V_NEG_H(vec4, vec5); FDCT_POSTPROC_2V_NEG_H(vec6, vec7); FDCT_POSTPROC_2V_NEG_H(in8, in9); FDCT_POSTPROC_2V_NEG_H(in10, in11); FDCT_POSTPROC_2V_NEG_H(in12, in13); FDCT_POSTPROC_2V_NEG_H(in14, in15); /* Stage 3 */ ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); temp0 = in0 + in3; in0 = in0 - in3; in3 = in1 + in2; in1 = in1 - in2; DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0); ST_SH(temp0, out); ST_SH(temp1, out + 8); DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); ST_SH(temp0, out + 16); ST_SH(temp1, out + 24); SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); ADD2(vec4, vec5, vec7, vec6, vec0, vec1); DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); ST_SH(temp0, out + 32); ST_SH(temp1, out + 56); SUB2(vec4, vec5, vec7, vec6, vec4, vec7); DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); ST_SH(temp0, out + 40); ST_SH(temp1, out + 48); DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); ADD2(in0, in1, in2, in3, vec0, vec7); DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); ST_SH(temp0, out + 64); ST_SH(temp1, out + 120); SUB2(in0, in1, in2, in3, in0, in2); DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); ST_SH(temp0, out + 72); ST_SH(temp1, out + 112); SUB2(in9, vec2, in14, vec5, vec2, vec5); DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); ST_SH(temp0, out + 80); ST_SH(temp1, out + 104); ADD2(in3, in2, in0, in1, vec3, vec4); DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); ST_SH(temp0, out + 96); ST_SH(temp1, out + 88); }
static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) { v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 in8, in9, in10, in11, in12, in13, in14, in15; v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v8i16 temp0, temp1; /* fdct even */ LD_SH4(input, 8, in0, in1, in2, in3); LD_SH4(input + 96, 8, in12, in13, in14, in15); BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2, vec3, in12, in13, in14, in15); LD_SH4(input + 32, 8, in4, in5, in6, in7); LD_SH4(input + 64, 8, in8, in9, in10, in11); BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7, in8, in9, in10, in11); /* Stage 3 */ ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp); ST_SH(temp1, temp + 512); DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp + 256); ST_SH(temp1, temp + 768); SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4); DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); ADD2(vec4, vec5, vec7, vec6, vec0, vec1); DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp + 128); ST_SH(temp1, temp + 896); SUB2(vec4, vec5, vec7, vec6, vec4, vec7); DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp + 640); ST_SH(temp1, temp + 384); DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); ADD2(in0, in1, in2, in3, vec0, vec7); DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp + 64); ST_SH(temp1, temp + 960); SUB2(in0, in1, in2, in3, in0, in2); DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp + 576); ST_SH(temp1, temp + 448); SUB2(in9, vec2, in14, vec5, vec2, vec5); DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp + 320); ST_SH(temp1, temp + 704); ADD2(in3, in2, in0, in1, vec3, vec4); DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp + 192); ST_SH(temp1, temp + 832); }
static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, int16_t *out) { v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 in8, in9, in10, in11, in12, in13, in14, in15; v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l; v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r; v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w; /* fdct32 even */ /* stage 2 */ LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, in15); ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8); ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8); /* Stage 3 */ UNPCK_SH_SW(vec0, vec0_l, vec0_r); UNPCK_SH_SW(vec1, vec1_l, vec1_r); UNPCK_SH_SW(vec2, vec2_l, vec2_r); UNPCK_SH_SW(vec3, vec3_l, vec3_r); UNPCK_SH_SW(vec4, vec4_l, vec4_r); UNPCK_SH_SW(vec5, vec5_l, vec5_r); UNPCK_SH_SW(vec6, vec6_l, vec6_r); UNPCK_SH_SW(vec7, vec7_l, vec7_r); ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w, tmp1_w, tmp2_w, tmp3_w); BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r); ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r, vec1_r, vec2_r, vec3_r); tmp3_w = vec0_r + vec3_r; vec0_r = vec0_r - vec3_r; vec3_r = vec1_r + vec2_r; vec1_r = vec1_r - vec2_r; DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r); FDCT32_POSTPROC_NEG_W(vec4_r); FDCT32_POSTPROC_NEG_W(tmp3_w); FDCT32_POSTPROC_NEG_W(vec6_r); FDCT32_POSTPROC_NEG_W(vec3_r); PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); ST_SH2(vec5, vec4, out, 8); DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64, vec4_r, tmp3_w, vec6_r, vec3_r); FDCT32_POSTPROC_NEG_W(vec4_r); FDCT32_POSTPROC_NEG_W(tmp3_w); FDCT32_POSTPROC_NEG_W(vec6_r); FDCT32_POSTPROC_NEG_W(vec3_r); PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); ST_SH2(vec5, vec4, out + 16, 8); LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); ADD2(vec4, vec5, vec7, vec6, vec0, vec1); DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 32); ST_SH(in5, out + 56); SUB2(vec4, vec5, vec7, vec6, vec4, vec7); DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 40); ST_SH(in5, out + 48); LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); ADD2(in0, in1, in2, in3, vec0, vec7); DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 64); ST_SH(in5, out + 120); SUB2(in0, in1, in2, in3, in0, in2); DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 72); ST_SH(in5, out + 112); SUB2(in9, vec2, in14, vec5, vec2, vec5); DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 80); ST_SH(in5, out + 104); ADD2(in3, in2, in0, in1, vec3, vec4); DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 96); ST_SH(in5, out + 88); }
static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) { v8i16 in16, in17, in18, in19, in20, in21, in22, in23; v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; in20 = LD_SH(input + 32); in21 = LD_SH(input + 40); in26 = LD_SH(input + 80); in27 = LD_SH(input + 88); DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); in18 = LD_SH(input + 16); in19 = LD_SH(input + 24); in28 = LD_SH(input + 96); in29 = LD_SH(input + 104); vec4 = in19 - in20; ST_SH(vec4, input + 32); vec4 = in18 - in21; ST_SH(vec4, input + 40); vec4 = in29 - in26; ST_SH(vec4, input + 80); vec4 = in28 - in27; ST_SH(vec4, input + 88); in21 = in18 + in21; in20 = in19 + in20; in27 = in28 + in27; in26 = in29 + in26; LD_SH4(input + 48, 8, in22, in23, in24, in25); DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); in16 = LD_SH(input); in17 = LD_SH(input + 8); in30 = LD_SH(input + 112); in31 = LD_SH(input + 120); vec4 = in17 - in22; ST_SH(vec4, input + 16); vec4 = in16 - in23; ST_SH(vec4, input + 24); vec4 = in31 - in24; ST_SH(vec4, input + 96); vec4 = in30 - in25; ST_SH(vec4, input + 104); ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); ADD2(in27, in26, in25, in24, in23, in20); DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); FDCT32_POSTPROC_2V_POS_H(vec5, vec4); ST_SH(vec5, temp_ptr); ST_SH(vec4, temp_ptr + 960); SUB2(in27, in26, in25, in24, in22, in21); DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); FDCT32_POSTPROC_2V_POS_H(vec5, vec4); ST_SH(vec5, temp_ptr + 448); ST_SH(vec4, temp_ptr + 512); SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); SUB2(in26, in27, in24, in25, in23, in20); DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); FDCT32_POSTPROC_2V_POS_H(vec5, vec4); ST_SH(vec4, temp_ptr + 704); ST_SH(vec5, temp_ptr + 256); ADD2(in26, in27, in24, in25, in22, in21); DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); FDCT32_POSTPROC_2V_POS_H(vec5, vec4); ST_SH(vec4, temp_ptr + 192); ST_SH(vec5, temp_ptr + 768); LD_SH4(input + 16, 8, in22, in23, in20, in21); LD_SH4(input + 80, 8, in26, in27, in24, in25); in16 = in20; in17 = in21; DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); ADD2(in28, in29, in31, in30, in16, in19); DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); FDCT32_POSTPROC_2V_POS_H(vec5, vec4); ST_SH(vec5, temp_ptr + 832); ST_SH(vec4, temp_ptr + 128); SUB2(in28, in29, in31, in30, in17, in18); DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); FDCT32_POSTPROC_2V_POS_H(vec5, vec4); ST_SH(vec5, temp_ptr + 320); ST_SH(vec4, temp_ptr + 640); ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); SUB2(in29, in28, in30, in31, in16, in19); DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); FDCT32_POSTPROC_2V_POS_H(vec5, vec4); ST_SH(vec5, temp_ptr + 576); ST_SH(vec4, temp_ptr + 384); ADD2(in29, in28, in30, in31, in17, in18); DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); FDCT32_POSTPROC_2V_POS_H(vec5, vec4); ST_SH(vec5, temp_ptr + 64); ST_SH(vec4, temp_ptr + 896); }
static int32_t avc_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias ) { int32_t non_zero = 0; v8i16 dct0, dct1, dct2, dct3; v8i16 zero = { 0 }; v8i16 dct0_mask, dct1_mask, dct2_mask, dct3_mask; v8i16 dct_h0, dct_h1, dct_h2, dct_h3, mf_h0, mf_h1, mf_h2, mf_h3; v8i16 bias_h0, bias_h1, bias_h2, bias_h3; v4i32 dct_w0, dct_w1, dct_w2, dct_w3, dct_w4, dct_w5, dct_w6, dct_w7; v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3; v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7; v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3; v4i32 mf_vec4, mf_vec5, mf_vec6, mf_vec7; v4i32 bias0, bias1, bias2, bias3, bias4, bias5, bias6, bias7; LD_SH4( p_dct, 8, dct0, dct1, dct2, dct3 ); dct0_mask = __msa_clei_s_h( dct0, 0 ); dct1_mask = __msa_clei_s_h( dct1, 0 ); dct2_mask = __msa_clei_s_h( dct2, 0 ); dct3_mask = __msa_clei_s_h( dct3, 0 ); UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 ); UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 ); LD_SH4( p_bias, 8, bias_h0, bias_h1, bias_h2, bias_h3 ); ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, bias0, bias2, bias4, bias6 ); ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, bias1, bias3, bias5, bias7 ); LD_SH4( p_mf, 8, mf_h0, mf_h1, mf_h2, mf_h3 ); ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, mf_vec0, mf_vec2, mf_vec4, mf_vec6 ); ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, mf_vec1, mf_vec3, mf_vec5, mf_vec7 ); dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 ); dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 ); dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 ); dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 ); dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 ); dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 ); dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 ); dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 ); dct_w0 *= mf_vec0; dct_w1 *= mf_vec1; dct_w2 *= mf_vec2; dct_w3 *= mf_vec3; dct_w4 *= mf_vec4; dct_w5 *= mf_vec5; dct_w6 *= mf_vec6; dct_w7 *= mf_vec7; SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 ); PCKEV_H4_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_w5, dct_w4, dct_w7, dct_w6, dct_h0, dct_h1, dct_h2, dct_h3 ); SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3, dct0, dct1, dct2, dct3 ); dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0, ( v16u8 ) dct0_mask ); dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1, ( v16u8 ) dct1_mask ); dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2, ( v16u8 ) dct2, ( v16u8 ) dct2_mask ); dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3, ( v16u8 ) dct3, ( v16u8 ) dct3_mask ); non_zero = HADD_SW_S32( ( v4u32 )( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) ); ST_SH4( dct0, dct1, dct2, dct3, p_dct, 8 ); LD_SH4( p_dct + 32, 8, dct0, dct1, dct2, dct3 ); dct0_mask = __msa_clei_s_h( dct0, 0 ); dct1_mask = __msa_clei_s_h( dct1, 0 ); dct2_mask = __msa_clei_s_h( dct2, 0 ); dct3_mask = __msa_clei_s_h( dct3, 0 ); UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 ); UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 ); UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 ); UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 ); LD_SH4( p_bias + 32, 8, bias_h0, bias_h1, bias_h2, bias_h3 ); ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, bias0, bias2, bias4, bias6 ); ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3, bias1, bias3, bias5, bias7 ); LD_SH4( p_mf + 32, 8, mf_h0, mf_h1, mf_h2, mf_h3 ); ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, mf_vec0, mf_vec2, mf_vec4, mf_vec6 ); ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3, mf_vec1, mf_vec3, mf_vec5, mf_vec7 ); dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 ); dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 ); dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 ); dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 ); dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 ); dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 ); dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 ); dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 ); dct_w0 *= mf_vec0; dct_w1 *= mf_vec1; dct_w2 *= mf_vec2; dct_w3 *= mf_vec3; dct_w4 *= mf_vec4; dct_w5 *= mf_vec5; dct_w6 *= mf_vec6; dct_w7 *= mf_vec7; SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 ); SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 ); PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 ); PCKEV_H2_SH( dct_w5, dct_w4, dct_w7, dct_w6, dct_h2, dct_h3 ); SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3, dct0, dct1, dct2, dct3 ); dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0, ( v16u8 ) dct0_mask ); dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1, ( v16u8 ) dct1_mask ); dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2, ( v16u8 ) dct2, ( v16u8 ) dct2_mask ); dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3, ( v16u8 ) dct3, ( v16u8 ) dct3_mask ); non_zero += HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) ); ST_SH4( dct0, dct1, dct2, dct3, p_dct + 32, 8 ); return !!non_zero; }