void fdct16x8_1d_row(int16_t *input, int16_t *output) { v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 in8, in9, in10, in11, in12, in13, in14, in15; LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, in10, in11, in12, in13, in14, in15); ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7); ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11); ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15); SRA_4V(in0, in1, in2, in3, 2); SRA_4V(in4, in5, in6, in7, 2); SRA_4V(in8, in9, in10, in11, 2); SRA_4V(in12, in13, in14, in15, 2); BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, in15); ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16); FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15); FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3); ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16); TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7); ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16); }
void vp8_short_walsh4x4_msa(int16_t *input, int16_t *output, int32_t pitch) { v8i16 in0_h, in1_h, in2_h, in3_h; v4i32 in0_w, in1_w, in2_w, in3_w, temp0, temp1, temp2, temp3; LD_SH4(input, pitch / 2, in0_h, in1_h, in2_h, in3_h); TRANSPOSE4x4_SH_SH(in0_h, in1_h, in2_h, in3_h, in0_h, in1_h, in2_h, in3_h); UNPCK_R_SH_SW(in0_h, in0_w); UNPCK_R_SH_SW(in1_h, in1_w); UNPCK_R_SH_SW(in2_h, in2_w); UNPCK_R_SH_SW(in3_h, in3_w); BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1); SLLI_4V(temp0, temp1, temp2, temp3, 2); BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w); temp0 = RET_1_IF_NZERO_W(temp0); in0_w += temp0; TRANSPOSE4x4_SW_SW(in0_w, in1_w, in2_w, in3_w, in0_w, in1_w, in2_w, in3_w); BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1); BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w); in0_w += RET_1_IF_NEG_W(in0_w); in1_w += RET_1_IF_NEG_W(in1_w); in2_w += RET_1_IF_NEG_W(in2_w); in3_w += RET_1_IF_NEG_W(in3_w); ADD4(in0_w, 3, in1_w, 3, in2_w, 3, in3_w, 3, in0_w, in1_w, in2_w, in3_w); SRA_4V(in0_w, in1_w, in2_w, in3_w, 3); PCKEV_H2_SH(in1_w, in0_w, in3_w, in2_w, in0_h, in1_h); ST_SH2(in0_h, in1_h, output, 8); }
void vpx_fdct4x4_msa(const int16_t *input, int16_t *output, int32_t src_stride) { v8i16 in0, in1, in2, in3; LD_SH4(input, src_stride, in0, in1, in2, in3); /* fdct4 pre-process */ { v8i16 vec, mask; v16i8 zero = { 0 }; v16i8 one = __msa_ldi_b(1); mask = (v8i16)__msa_sldi_b(zero, one, 15); SLLI_4V(in0, in1, in2, in3, 4); vec = __msa_ceqi_h(in0, 0); vec = vec ^ 255; vec = mask & vec; in0 += vec; } VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); SRA_4V(in0, in1, in2, in3, 2); PCKEV_D2_SH(in1, in0, in3, in2, in0, in2); ST_SH2(in0, in2, output, 8); }
static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in, uint8_t* dst) { v8i16 input0, input1; v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; v4i32 res0, res1, res2, res3; v16i8 dest0, dest1, dest2, dest3; const v16i8 zero = { 0 }; LD_SH2(in, 8, input0, input1); UNPCK_SH_SW(input0, in0, in1); UNPCK_SH_SW(input1, in2, in3); IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3); TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3); IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3); SRARI_W4_SW(vt0, vt1, vt2, vt3, 3); TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3); LD_SB4(ref, BPS, dest0, dest1, dest2, dest3); ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1, res2, res3); ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1, res2, res3); ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3); CLIP_SW4_0_255(res0, res1, res2, res3); PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1); res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1); ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS); }
void vpx_fdct8x8_1_msa(const int16_t *input, int16_t *out, int32_t stride) { v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v4i32 vec_w; LD_SH8(input, stride, in0, in1, in2, in3, in4, in5, in6, in7); ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6); ADD2(in0, in2, in4, in6, in0, in4); vec_w = __msa_hadd_s_w(in0, in0); vec_w += __msa_hadd_s_w(in4, in4); out[0] = HADD_SW_S32(vec_w); out[1] = 0; }
void vp9_fht4x4_msa(const int16_t *input, int16_t *output, int32_t stride, int32_t tx_type) { v8i16 in0, in1, in2, in3; LD_SH4(input, stride, in0, in1, in2, in3); /* fdct4 pre-process */ { v8i16 temp, mask; v16i8 zero = { 0 }; v16i8 one = __msa_ldi_b(1); mask = (v8i16)__msa_sldi_b(zero, one, 15); SLLI_4V(in0, in1, in2, in3, 4); temp = __msa_ceqi_h(in0, 0); temp = (v8i16)__msa_xori_b((v16u8)temp, 255); temp = mask & temp; in0 += temp; } switch (tx_type) { case DCT_DCT: VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); break; case ADST_DCT: VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); break; case DCT_ADST: VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); break; case ADST_ADST: VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); VP9_FADST4(in0, in1, in2, in3, in0, in1, in2, in3); break; default: assert(0); break; } TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3); ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); SRA_4V(in0, in1, in2, in3, 2); PCKEV_D2_SH(in1, in0, in3, in2, in0, in2); ST_SH2(in0, in2, output, 8); }
static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left, const uint8_t* top) { if (left != NULL) { if (top != NULL) { int j; v8i16 d1, d2; const v16i8 zero = { 0 }; const v8i16 TL = (v8i16)__msa_fill_h(left[-1]); const v16u8 T = LD_UB(top); ILVRL_B2_SH(zero, T, d1, d2); SUB2(d1, TL, d2, TL, d1, d2); for (j = 0; j < 16; j += 4) { v16i8 t0, t1, t2, t3; v8i16 r0, r1, r2, r3, r4, r5, r6, r7; const v8i16 L0 = (v8i16)__msa_fill_h(left[j + 0]); const v8i16 L1 = (v8i16)__msa_fill_h(left[j + 1]); const v8i16 L2 = (v8i16)__msa_fill_h(left[j + 2]); const v8i16 L3 = (v8i16)__msa_fill_h(left[j + 3]); ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3); ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7); CLIP_SH4_0_255(r0, r1, r2, r3); CLIP_SH4_0_255(r4, r5, r6, r7); PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3); ST_SB4(t0, t1, t2, t3, dst, BPS); dst += 4 * BPS; } } else { HorizontalPred16x16(dst, left); } } else { if (top != NULL) { VerticalPred16x16(dst, top); } else { const v16u8 out = (v16u8)__msa_fill_b(0x81); STORE16x16(out, dst); } } }
static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride) { uint8_t *temp_dst = dst; uint64_t dst0, dst1, dst2, dst3; v2i64 dst_vec0 = { 0 }; v2i64 dst_vec1 = { 0 }; v8i16 dst_r0, dst_l0, dst_r1, dst_l1; v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v16u8 zeros = { 0 }; LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7); LD4(temp_dst, stride, dst0, dst1, dst2, dst3); temp_dst += (4 * stride); INSERT_D2_SD(dst0, dst1, dst_vec0); INSERT_D2_SD(dst2, dst3, dst_vec1); ILVRL_B2_SH(zeros, dst_vec0, dst_r0, dst_l0); ILVRL_B2_SH(zeros, dst_vec1, dst_r1, dst_l1); ADD4(dst_r0, in0, dst_l0, in1, dst_r1, in2, dst_l1, in3, dst_r0, dst_l0, dst_r1, dst_l1); CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1); PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1); ST8x4_UB(dst_r0, dst_r1, dst, stride); dst += (4 * stride); LD4(temp_dst, stride, dst0, dst1, dst2, dst3); INSERT_D2_SD(dst0, dst1, dst_vec0); INSERT_D2_SD(dst2, dst3, dst_vec1); UNPCK_UB_SH(dst_vec0, dst_r0, dst_l0); UNPCK_UB_SH(dst_vec1, dst_r1, dst_l1); ADD4(dst_r0, in4, dst_l0, in5, dst_r1, in6, dst_l1, in7, dst_r0, dst_l0, dst_r1, dst_l1); CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1); PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1); ST8x4_UB(dst_r0, dst_r1, dst, stride); }
static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) { const v16i8 zero = { 0 }; const v8i16 TL = (v8i16)__msa_fill_h(top[-1]); const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]); const v8i16 L1 = (v8i16)__msa_fill_h(top[-3]); const v8i16 L2 = (v8i16)__msa_fill_h(top[-4]); const v8i16 L3 = (v8i16)__msa_fill_h(top[-5]); const v16u8 T1 = LD_UB(top); const v8i16 T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1); const v8i16 d = T - TL; v8i16 r0, r1, r2, r3; ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3); CLIP_SH4_0_255(r0, r1, r2, r3); PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS); }
uint32_t vp10_avg_8x8_msa(const uint8_t *src, int32_t src_stride) { uint32_t sum_out; v16u8 src0, src1, src2, src3, src4, src5, src6, src7; v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; v4u32 sum = { 0 }; LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3); HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7); ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6); ADD2(sum0, sum2, sum4, sum6, sum0, sum4); sum0 += sum4; sum = __msa_hadd_u_w(sum0, sum0); sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum); sum = __msa_hadd_u_w(sum0, sum0); sum = (v4u32)__msa_srari_w((v4i32)sum, 6); sum_out = __msa_copy_u_w((v4i32)sum, 0); return sum_out; }
void vpx_fdct16x16_1_msa(const int16_t *input, int16_t *out, int32_t stride) { int sum, i; v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v4i32 vec_w = { 0 }; for (i = 0; i < 4; ++i) { LD_SH2(input, 8, in0, in1); input += stride; LD_SH2(input, 8, in2, in3); input += stride; LD_SH2(input, 8, in4, in5); input += stride; LD_SH2(input, 8, in6, in7); input += stride; ADD4(in0, in1, in2, in3, in4, in5, in6, in7, in0, in2, in4, in6); ADD2(in0, in2, in4, in6, in0, in4); vec_w += __msa_hadd_s_w(in0, in0); vec_w += __msa_hadd_s_w(in4, in4); } sum = HADD_SW_S32(vec_w); out[0] = (int16_t)(sum >> 1); }
static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) { v8i16 in16, in17, in18, in19, in20, in21, in22, in23; v8i16 in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5; in20 = LD_SH(input + 32); in21 = LD_SH(input + 40); in26 = LD_SH(input + 80); in27 = LD_SH(input + 88); DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); in18 = LD_SH(input + 16); in19 = LD_SH(input + 24); in28 = LD_SH(input + 96); in29 = LD_SH(input + 104); vec4 = in19 - in20; ST_SH(vec4, input + 32); vec4 = in18 - in21; ST_SH(vec4, input + 40); vec4 = in29 - in26; ST_SH(vec4, input + 80); vec4 = in28 - in27; ST_SH(vec4, input + 88); in21 = in18 + in21; in20 = in19 + in20; in27 = in28 + in27; in26 = in29 + in26; LD_SH4(input + 48, 8, in22, in23, in24, in25); DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); in16 = LD_SH(input); in17 = LD_SH(input + 8); in30 = LD_SH(input + 112); in31 = LD_SH(input + 120); vec4 = in17 - in22; ST_SH(vec4, input + 16); vec4 = in16 - in23; ST_SH(vec4, input + 24); vec4 = in31 - in24; ST_SH(vec4, input + 96); vec4 = in30 - in25; ST_SH(vec4, input + 104); ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); ADD2(in27, in26, in25, in24, in23, in20); DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); FDCT32_POSTPROC_2V_POS_H(vec5, vec4); ST_SH(vec5, temp_ptr); ST_SH(vec4, temp_ptr + 960); SUB2(in27, in26, in25, in24, in22, in21); DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); FDCT32_POSTPROC_2V_POS_H(vec5, vec4); ST_SH(vec5, temp_ptr + 448); ST_SH(vec4, temp_ptr + 512); SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); SUB2(in26, in27, in24, in25, in23, in20); DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); FDCT32_POSTPROC_2V_POS_H(vec5, vec4); ST_SH(vec4, temp_ptr + 704); ST_SH(vec5, temp_ptr + 256); ADD2(in26, in27, in24, in25, in22, in21); DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); FDCT32_POSTPROC_2V_POS_H(vec5, vec4); ST_SH(vec4, temp_ptr + 192); ST_SH(vec5, temp_ptr + 768); LD_SH4(input + 16, 8, in22, in23, in20, in21); LD_SH4(input + 80, 8, in26, in27, in24, in25); in16 = in20; in17 = in21; DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); ADD2(in28, in29, in31, in30, in16, in19); DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); FDCT32_POSTPROC_2V_POS_H(vec5, vec4); ST_SH(vec5, temp_ptr + 832); ST_SH(vec4, temp_ptr + 128); SUB2(in28, in29, in31, in30, in17, in18); DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); FDCT32_POSTPROC_2V_POS_H(vec5, vec4); ST_SH(vec5, temp_ptr + 320); ST_SH(vec4, temp_ptr + 640); ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); SUB2(in29, in28, in30, in31, in16, in19); DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); FDCT32_POSTPROC_2V_POS_H(vec5, vec4); ST_SH(vec5, temp_ptr + 576); ST_SH(vec4, temp_ptr + 384); ADD2(in29, in28, in30, in31, in17, in18); DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); FDCT32_POSTPROC_2V_POS_H(vec5, vec4); ST_SH(vec5, temp_ptr + 64); ST_SH(vec4, temp_ptr + 896); }
static void temporal_filter_apply_16size_msa(uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr, int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) { uint32_t row; v16i8 frame1_0_b, frame1_1_b, frame2_0_b, frame2_1_b; v16u8 frame_l, frame_h; v16i8 zero = { 0 }; v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h; v8i16 diff0, diff1, cnt0, cnt1; v4i32 const3, const16, filter_wt, strength; v4i32 mod0_w, mod1_w, mod2_w, mod3_w; v4i32 diff0_r, diff0_l, diff1_r, diff1_l; v4i32 frame2_0, frame2_1, frame2_2, frame2_3; v4i32 acc0, acc1, acc2, acc3; filter_wt = __msa_fill_w(filter_wt_in); strength = __msa_fill_w(strength_in); const3 = __msa_ldi_w(3); const16 = __msa_ldi_w(16); for (row = 8; row--;) { frame1_0_b = LD_SB(frame1_ptr); frame2_0_b = LD_SB(frame2_ptr); frame1_ptr += stride; frame2_ptr += 16; frame1_1_b = LD_SB(frame1_ptr); frame2_1_b = LD_SB(frame2_ptr); LD_SW2(acc, 4, acc0, acc1); LD_SW2(acc + 8, 4, acc2, acc3); LD_SH2(cnt, 8, cnt0, cnt1); ILVRL_B2_UB(frame1_0_b, frame2_0_b, frame_l, frame_h); HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); UNPCK_SH_SW(diff0, diff0_r, diff0_l); UNPCK_SH_SW(diff1, diff1_r, diff1_l); MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, mod1_w, mod2_w, mod3_w); SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); diff0_r = (mod0_w < const16); diff0_l = (mod1_w < const16); diff1_r = (mod2_w < const16); diff1_l = (mod3_w < const16); SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, mod0_w, mod1_w, mod2_w, mod3_w); mod0_w = diff0_r & mod0_w; mod1_w = diff0_l & mod1_w; mod2_w = diff1_r & mod2_w; mod3_w = diff1_l & mod3_w; MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h) ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); ST_SH2(mod0_h, mod1_h, cnt, 8); cnt += 16; ILVRL_B2_SH(zero, frame2_0_b, frame2_0_h, frame2_1_h); UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, mod0_w, mod1_w, mod2_w, mod3_w); ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, mod2_w, mod3_w); ST_SW2(mod0_w, mod1_w, acc, 4); ST_SW2(mod2_w, mod3_w, acc + 8, 4); acc += 16; LD_SW2(acc, 4, acc0, acc1); LD_SW2(acc + 8, 4, acc2, acc3); LD_SH2(cnt, 8, cnt0, cnt1); ILVRL_B2_UB(frame1_1_b, frame2_1_b, frame_l, frame_h); HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); UNPCK_SH_SW(diff0, diff0_r, diff0_l); UNPCK_SH_SW(diff1, diff1_r, diff1_l); MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, mod1_w, mod2_w, mod3_w); SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); diff0_r = (mod0_w < const16); diff0_l = (mod1_w < const16); diff1_r = (mod2_w < const16); diff1_l = (mod3_w < const16); SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, mod0_w, mod1_w, mod2_w, mod3_w); mod0_w = diff0_r & mod0_w; mod1_w = diff0_l & mod1_w; mod2_w = diff1_r & mod2_w; mod3_w = diff1_l & mod3_w; MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); ST_SH2(mod0_h, mod1_h, cnt, 8); cnt += 16; UNPCK_UB_SH(frame2_1_b, frame2_0_h, frame2_1_h); UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, mod0_w, mod1_w, mod2_w, mod3_w); ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, mod2_w, mod3_w); ST_SW2(mod0_w, mod1_w, acc, 4); ST_SW2(mod2_w, mod3_w, acc + 8, 4); acc += 16; frame1_ptr += stride; frame2_ptr += 16; } }
static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) { v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 in8, in9, in10, in11, in12, in13, in14, in15; v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v8i16 temp0, temp1; /* fdct even */ LD_SH4(input, 8, in0, in1, in2, in3); LD_SH4(input + 96, 8, in12, in13, in14, in15); BUTTERFLY_8(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1, vec2, vec3, in12, in13, in14, in15); LD_SH4(input + 32, 8, in4, in5, in6, in7); LD_SH4(input + 64, 8, in8, in9, in10, in11); BUTTERFLY_8(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6, vec7, in8, in9, in10, in11); /* Stage 3 */ ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); BUTTERFLY_4(in0, in1, in2, in3, temp0, in4, in1, in0); DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp); ST_SH(temp1, temp + 512); DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp + 256); ST_SH(temp1, temp + 768); SUB4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7, vec6, vec5, vec4); DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); ADD2(vec4, vec5, vec7, vec6, vec0, vec1); DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp + 128); ST_SH(temp1, temp + 896); SUB2(vec4, vec5, vec7, vec6, vec4, vec7); DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp + 640); ST_SH(temp1, temp + 384); DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); ADD2(in0, in1, in2, in3, vec0, vec7); DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp + 64); ST_SH(temp1, temp + 960); SUB2(in0, in1, in2, in3, in0, in2); DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp + 576); ST_SH(temp1, temp + 448); SUB2(in9, vec2, in14, vec5, vec2, vec5); DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp + 320); ST_SH(temp1, temp + 704); ADD2(in3, in2, in0, in1, vec3, vec4); DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); FDCT32_POSTPROC_2V_POS_H(temp0, temp1); ST_SH(temp0, temp + 192); ST_SH(temp1, temp + 832); }
static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr, int16_t *out) { v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 in8, in9, in10, in11, in12, in13, in14, in15; v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7; v4i32 vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l; v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r; v4i32 tmp0_w, tmp1_w, tmp2_w, tmp3_w; /* fdct32 even */ /* stage 2 */ LD_SH8(input, 8, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(input + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, in15); ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, interm_ptr, 8); ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, interm_ptr + 64, 8); /* Stage 3 */ UNPCK_SH_SW(vec0, vec0_l, vec0_r); UNPCK_SH_SW(vec1, vec1_l, vec1_r); UNPCK_SH_SW(vec2, vec2_l, vec2_r); UNPCK_SH_SW(vec3, vec3_l, vec3_r); UNPCK_SH_SW(vec4, vec4_l, vec4_r); UNPCK_SH_SW(vec5, vec5_l, vec5_r); UNPCK_SH_SW(vec6, vec6_l, vec6_r); UNPCK_SH_SW(vec7, vec7_l, vec7_r); ADD4(vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r, vec3_r, vec4_r, tmp0_w, tmp1_w, tmp2_w, tmp3_w); BUTTERFLY_4(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r, vec5_r); ADD4(vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l, vec3_l, vec4_l, vec0_r, vec1_r, vec2_r, vec3_r); tmp3_w = vec0_r + vec3_r; vec0_r = vec0_r - vec3_r; vec3_r = vec1_r + vec2_r; vec1_r = vec1_r - vec2_r; DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64, vec4_r, tmp3_w, vec6_r, vec3_r); FDCT32_POSTPROC_NEG_W(vec4_r); FDCT32_POSTPROC_NEG_W(tmp3_w); FDCT32_POSTPROC_NEG_W(vec6_r); FDCT32_POSTPROC_NEG_W(vec3_r); PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); ST_SH2(vec5, vec4, out, 8); DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64, vec4_r, tmp3_w, vec6_r, vec3_r); FDCT32_POSTPROC_NEG_W(vec4_r); FDCT32_POSTPROC_NEG_W(tmp3_w); FDCT32_POSTPROC_NEG_W(vec6_r); FDCT32_POSTPROC_NEG_W(vec3_r); PCKEV_H2_SH(vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5); ST_SH2(vec5, vec4, out + 16, 8); LD_SH8(interm_ptr, 8, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7); SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); ADD2(vec4, vec5, vec7, vec6, vec0, vec1); DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 32); ST_SH(in5, out + 56); SUB2(vec4, vec5, vec7, vec6, vec4, vec7); DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 40); ST_SH(in5, out + 48); LD_SH8(interm_ptr + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); ADD2(in0, in1, in2, in3, vec0, vec7); DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 64); ST_SH(in5, out + 120); SUB2(in0, in1, in2, in3, in0, in2); DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 72); ST_SH(in5, out + 112); SUB2(in9, vec2, in14, vec5, vec2, vec5); DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 80); ST_SH(in5, out + 104); ADD2(in3, in2, in0, in1, vec3, vec4); DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5); FDCT_POSTPROC_2V_NEG_H(in4, in5); ST_SH(in4, out + 96); ST_SH(in5, out + 88); }
void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr, int32_t src_stride) { v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 in8, in9, in10, in11, in12, in13, in14, in15; v8i16 stp21, stp22, stp23, stp24, stp25, stp26, stp30; v8i16 stp31, stp32, stp33, stp34, stp35, stp36, stp37; v8i16 vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5; v8i16 coeff = { cospi_16_64, -cospi_16_64, cospi_8_64, cospi_24_64, -cospi_8_64, -cospi_24_64, cospi_12_64, cospi_20_64 }; v8i16 coeff1 = { cospi_2_64, cospi_30_64, cospi_14_64, cospi_18_64, cospi_10_64, cospi_22_64, cospi_6_64, cospi_26_64 }; v8i16 coeff2 = { -cospi_2_64, -cospi_10_64, -cospi_18_64, -cospi_26_64, 0, 0, 0, 0 }; LD_SH16(input, src_stride, in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15); SLLI_4V(in0, in1, in2, in3, 2); SLLI_4V(in4, in5, in6, in7, 2); SLLI_4V(in8, in9, in10, in11, 2); SLLI_4V(in12, in13, in14, in15, 2); ADD4(in0, in15, in1, in14, in2, in13, in3, in12, tmp0, tmp1, tmp2, tmp3); ADD4(in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5, tmp6, tmp7); FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); ST_SH8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp_ptr, 32); SUB4(in0, in15, in1, in14, in2, in13, in3, in12, in15, in14, in13, in12); SUB4(in4, in11, in5, in10, in6, in9, in7, in8, in11, in10, in9, in8); tmp_ptr += 16; /* stp 1 */ ILVL_H2_SH(in10, in13, in11, in12, vec2, vec4); ILVR_H2_SH(in10, in13, in11, in12, vec3, vec5); cnst4 = __msa_splati_h(coeff, 0); stp25 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4); cnst5 = __msa_splati_h(coeff, 1); cnst5 = __msa_ilvev_h(cnst5, cnst4); stp22 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5); stp24 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4); stp23 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5); /* stp2 */ BUTTERFLY_4(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33); BUTTERFLY_4(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34); ILVL_H2_SH(stp36, stp31, stp35, stp32, vec2, vec4); ILVR_H2_SH(stp36, stp31, stp35, stp32, vec3, vec5); SPLATI_H2_SH(coeff, 2, 3, cnst0, cnst1); cnst0 = __msa_ilvev_h(cnst0, cnst1); stp26 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0); cnst0 = __msa_splati_h(coeff, 4); cnst1 = __msa_ilvev_h(cnst1, cnst0); stp21 = DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1); BUTTERFLY_4(stp30, stp37, stp26, stp21, in8, in15, in14, in9); ILVRL_H2_SH(in15, in8, vec1, vec0); SPLATI_H2_SH(coeff1, 0, 1, cnst0, cnst1); cnst0 = __msa_ilvev_h(cnst0, cnst1); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); ST_SH(in8, tmp_ptr); cnst0 = __msa_splati_h(coeff2, 0); cnst0 = __msa_ilvev_h(cnst1, cnst0); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); ST_SH(in8, tmp_ptr + 224); ILVRL_H2_SH(in14, in9, vec1, vec0); SPLATI_H2_SH(coeff1, 2, 3, cnst0, cnst1); cnst1 = __msa_ilvev_h(cnst1, cnst0); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1); ST_SH(in8, tmp_ptr + 128); cnst1 = __msa_splati_h(coeff2, 2); cnst0 = __msa_ilvev_h(cnst0, cnst1); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); ST_SH(in8, tmp_ptr + 96); SPLATI_H2_SH(coeff, 2, 5, cnst0, cnst1); cnst1 = __msa_ilvev_h(cnst1, cnst0); stp25 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1); cnst1 = __msa_splati_h(coeff, 3); cnst1 = __msa_ilvev_h(cnst0, cnst1); stp22 = DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1); /* stp4 */ ADD2(stp34, stp25, stp33, stp22, in13, in10); ILVRL_H2_SH(in13, in10, vec1, vec0); SPLATI_H2_SH(coeff1, 4, 5, cnst0, cnst1); cnst0 = __msa_ilvev_h(cnst0, cnst1); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); ST_SH(in8, tmp_ptr + 64); cnst0 = __msa_splati_h(coeff2, 1); cnst0 = __msa_ilvev_h(cnst1, cnst0); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); ST_SH(in8, tmp_ptr + 160); SUB2(stp34, stp25, stp33, stp22, in12, in11); ILVRL_H2_SH(in12, in11, vec1, vec0); SPLATI_H2_SH(coeff1, 6, 7, cnst0, cnst1); cnst1 = __msa_ilvev_h(cnst1, cnst0); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1); ST_SH(in8, tmp_ptr + 192); cnst1 = __msa_splati_h(coeff2, 3); cnst0 = __msa_ilvev_h(cnst0, cnst1); in8 = DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0); ST_SH(in8, tmp_ptr + 32); }
static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) { v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 in8, in9, in10, in11, in12, in13, in14, in15; v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1; /* fdct32 even */ /* stage 2 */ LD_SH8(temp, 8, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8(temp + 64, 8, in8, in9, in10, in11, in12, in13, in14, in15); BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14, in15); FDCT_POSTPROC_2V_NEG_H(vec0, vec1); FDCT_POSTPROC_2V_NEG_H(vec2, vec3); FDCT_POSTPROC_2V_NEG_H(vec4, vec5); FDCT_POSTPROC_2V_NEG_H(vec6, vec7); FDCT_POSTPROC_2V_NEG_H(in8, in9); FDCT_POSTPROC_2V_NEG_H(in10, in11); FDCT_POSTPROC_2V_NEG_H(in12, in13); FDCT_POSTPROC_2V_NEG_H(in14, in15); /* Stage 3 */ ADD4(vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0, in1, in2, in3); temp0 = in0 + in3; in0 = in0 - in3; in3 = in1 + in2; in1 = in1 - in2; DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0); ST_SH(temp0, out); ST_SH(temp1, out + 8); DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0); ST_SH(temp0, out + 16); ST_SH(temp1, out + 24); SUB4(vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4, vec5, vec6, vec7); DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6); ADD2(vec4, vec5, vec7, vec6, vec0, vec1); DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0); ST_SH(temp0, out + 32); ST_SH(temp1, out + 56); SUB2(vec4, vec5, vec7, vec6, vec4, vec7); DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0); ST_SH(temp0, out + 40); ST_SH(temp1, out + 48); DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5); DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4); ADD4(in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0, vec1, vec6, in2); DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3); ADD2(in0, in1, in2, in3, vec0, vec7); DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0); ST_SH(temp0, out + 64); ST_SH(temp1, out + 120); SUB2(in0, in1, in2, in3, in0, in2); DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0); ST_SH(temp0, out + 72); ST_SH(temp1, out + 112); SUB2(in9, vec2, in14, vec5, vec2, vec5); DOTP_CONST_PAIR((-vec2), vec5, cospi_24_64, cospi_8_64, in2, in1); SUB4(in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0, vec2, vec5); DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0); ST_SH(temp0, out + 80); ST_SH(temp1, out + 104); ADD2(in3, in2, in0, in1, vec3, vec4); DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1); ST_SH(temp0, out + 96); ST_SH(temp1, out + 88); }
static void avc_idct8_addblk_msa( uint8_t *p_dst, int16_t *p_src, int32_t i_dst_stride ) { v8i16 src0, src1, src2, src3, src4, src5, src6, src7; v8i16 vec0, vec1, vec2, vec3; v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; v8i16 res0, res1, res2, res3, res4, res5, res6, res7; v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r; v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l; v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l; v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r; v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l; v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; v16i8 zeros = { 0 }; p_src[ 0 ] += 32; LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 ); vec0 = src0 + src4; vec1 = src0 - src4; vec2 = src2 >> 1; vec2 = vec2 - src6; vec3 = src6 >> 1; vec3 = src2 + vec3; BUTTERFLY_4( vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3 ); vec0 = src7 >> 1; vec0 = src5 - vec0 - src3 - src7; vec1 = src3 >> 1; vec1 = src1 - vec1 + src7 - src3; vec2 = src5 >> 1; vec2 = vec2 - src1 + src7 + src5; vec3 = src1 >> 1; vec3 = vec3 + src3 + src5 + src1; tmp4 = vec3 >> 2; tmp4 += vec0; tmp5 = vec2 >> 2; tmp5 += vec1; tmp6 = vec1 >> 2; tmp6 -= vec2; tmp7 = vec0 >> 2; tmp7 = vec3 - tmp7; BUTTERFLY_8( tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, res0, res1, res2, res3, res4, res5, res6, res7 ); TRANSPOSE8x8_SH_SH( res0, res1, res2, res3, res4, res5, res6, res7, res0, res1, res2, res3, res4, res5, res6, res7 ); UNPCK_SH_SW( res0, tmp0_r, tmp0_l ); UNPCK_SH_SW( res1, tmp1_r, tmp1_l ); UNPCK_SH_SW( res2, tmp2_r, tmp2_l ); UNPCK_SH_SW( res3, tmp3_r, tmp3_l ); UNPCK_SH_SW( res4, tmp4_r, tmp4_l ); UNPCK_SH_SW( res5, tmp5_r, tmp5_l ); UNPCK_SH_SW( res6, tmp6_r, tmp6_l ); UNPCK_SH_SW( res7, tmp7_r, tmp7_l ); BUTTERFLY_4( tmp0_r, tmp0_l, tmp4_l, tmp4_r, vec0_r, vec0_l, vec1_l, vec1_r ); vec2_r = tmp2_r >> 1; vec2_l = tmp2_l >> 1; vec2_r -= tmp6_r; vec2_l -= tmp6_l; vec3_r = tmp6_r >> 1; vec3_l = tmp6_l >> 1; vec3_r += tmp2_r; vec3_l += tmp2_l; BUTTERFLY_4( vec0_r, vec1_r, vec2_r, vec3_r, tmp0_r, tmp2_r, tmp4_r, tmp6_r ); BUTTERFLY_4( vec0_l, vec1_l, vec2_l, vec3_l, tmp0_l, tmp2_l, tmp4_l, tmp6_l ); vec0_r = tmp7_r >> 1; vec0_l = tmp7_l >> 1; vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r; vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l; vec1_r = tmp3_r >> 1; vec1_l = tmp3_l >> 1; vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r; vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l; vec2_r = tmp5_r >> 1; vec2_l = tmp5_l >> 1; vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r; vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l; vec3_r = tmp1_r >> 1; vec3_l = tmp1_l >> 1; vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r; vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l; tmp1_r = vec3_r >> 2; tmp1_l = vec3_l >> 2; tmp1_r += vec0_r; tmp1_l += vec0_l; tmp3_r = vec2_r >> 2; tmp3_l = vec2_l >> 2; tmp3_r += vec1_r; tmp3_l += vec1_l; tmp5_r = vec1_r >> 2; tmp5_l = vec1_l >> 2; tmp5_r -= vec2_r; tmp5_l -= vec2_l; tmp7_r = vec0_r >> 2; tmp7_l = vec0_l >> 2; tmp7_r = vec3_r - tmp7_r; tmp7_l = vec3_l - tmp7_l; BUTTERFLY_4( tmp0_r, tmp0_l, tmp7_l, tmp7_r, res0_r, res0_l, res7_l, res7_r ); BUTTERFLY_4( tmp2_r, tmp2_l, tmp5_l, tmp5_r, res1_r, res1_l, res6_l, res6_r ); BUTTERFLY_4( tmp4_r, tmp4_l, tmp3_l, tmp3_r, res2_r, res2_l, res5_l, res5_r ); BUTTERFLY_4( tmp6_r, tmp6_l, tmp1_l, tmp1_r, res3_r, res3_l, res4_l, res4_r ); SRA_4V( res0_r, res0_l, res1_r, res1_l, 6 ); SRA_4V( res2_r, res2_l, res3_r, res3_l, 6 ); SRA_4V( res4_r, res4_l, res5_r, res5_l, 6 ); SRA_4V( res6_r, res6_l, res7_r, res7_l, 6 ); PCKEV_H4_SH( res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r, res0, res1, res2, res3 ); PCKEV_H4_SH( res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r, res4, res5, res6, res7 ); LD_SB8( p_dst, i_dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7 ); ILVR_B4_SH( zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3, tmp0, tmp1, tmp2, tmp3 ); ILVR_B4_SH( zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7, tmp4, tmp5, tmp6, tmp7 ); ADD4( res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3, res0, res1, res2, res3 ); ADD4( res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7, res4, res5, res6, res7 ); CLIP_SH4_0_255( res0, res1, res2, res3 ); CLIP_SH4_0_255( res4, res5, res6, res7 ); PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6, dst0, dst1, dst2, dst3 ); ST8x4_UB( dst0, dst1, p_dst, i_dst_stride ); p_dst += ( 4 * i_dst_stride ); ST8x4_UB( dst2, dst3, p_dst, i_dst_stride ); }
static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr, int16_t *out) { v8i16 in16, in17, in18, in19, in20, in21, in22, in23; v8i16 in24, in25, in26, in27, in28, in29, in30, in31; v8i16 vec4, vec5; in20 = LD_SH(temp + 32); in21 = LD_SH(temp + 40); in26 = LD_SH(temp + 80); in27 = LD_SH(temp + 88); DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27); DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26); FDCT_POSTPROC_2V_NEG_H(in20, in21); FDCT_POSTPROC_2V_NEG_H(in26, in27); in18 = LD_SH(temp + 16); in19 = LD_SH(temp + 24); in28 = LD_SH(temp + 96); in29 = LD_SH(temp + 104); FDCT_POSTPROC_2V_NEG_H(in18, in19); FDCT_POSTPROC_2V_NEG_H(in28, in29); vec4 = in19 - in20; ST_SH(vec4, interm_ptr + 32); vec4 = in18 - in21; ST_SH(vec4, interm_ptr + 88); vec4 = in29 - in26; ST_SH(vec4, interm_ptr + 64); vec4 = in28 - in27; ST_SH(vec4, interm_ptr + 56); ADD4(in18, in21, in19, in20, in28, in27, in29, in26, in21, in20, in27, in26); in22 = LD_SH(temp + 48); in23 = LD_SH(temp + 56); in24 = LD_SH(temp + 64); in25 = LD_SH(temp + 72); DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25); DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24); FDCT_POSTPROC_2V_NEG_H(in22, in23); FDCT_POSTPROC_2V_NEG_H(in24, in25); in16 = LD_SH(temp); in17 = LD_SH(temp + 8); in30 = LD_SH(temp + 112); in31 = LD_SH(temp + 120); FDCT_POSTPROC_2V_NEG_H(in16, in17); FDCT_POSTPROC_2V_NEG_H(in30, in31); vec4 = in17 - in22; ST_SH(vec4, interm_ptr + 40); vec4 = in30 - in25; ST_SH(vec4, interm_ptr + 48); vec4 = in31 - in24; ST_SH(vec4, interm_ptr + 72); vec4 = in16 - in23; ST_SH(vec4, interm_ptr + 80); ADD4(in16, in23, in17, in22, in30, in25, in31, in24, in16, in17, in30, in31); DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29); DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28); ADD4(in16, in19, in17, in18, in30, in29, in31, in28, in27, in22, in21, in25); DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24); ADD2(in27, in26, in25, in24, in23, in20); DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5); ST_SH(vec5, out); ST_SH(vec4, out + 120); SUB2(in27, in26, in25, in24, in22, in21); DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4); ST_SH(vec5, out + 112); ST_SH(vec4, out + 8); SUB4(in17, in18, in16, in19, in31, in28, in30, in29, in23, in26, in24, in20); DOTP_CONST_PAIR((-in23), in20, cospi_28_64, cospi_4_64, in27, in25); SUB2(in26, in27, in24, in25, in23, in20); DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5); ST_SH(vec4, out + 16); ST_SH(vec5, out + 104); ADD2(in26, in27, in24, in25, in22, in21); DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5); ST_SH(vec4, out + 24); ST_SH(vec5, out + 96); in20 = LD_SH(interm_ptr + 32); in21 = LD_SH(interm_ptr + 88); in27 = LD_SH(interm_ptr + 56); in26 = LD_SH(interm_ptr + 64); in16 = in20; in17 = in21; DOTP_CONST_PAIR(-in16, in27, cospi_24_64, cospi_8_64, in20, in27); DOTP_CONST_PAIR(-in17, in26, cospi_24_64, cospi_8_64, in21, in26); in22 = LD_SH(interm_ptr + 40); in25 = LD_SH(interm_ptr + 48); in24 = LD_SH(interm_ptr + 72); in23 = LD_SH(interm_ptr + 80); SUB4(in23, in20, in22, in21, in25, in26, in24, in27, in28, in17, in18, in31); DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30); in16 = in28 + in29; in19 = in31 + in30; DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4); ST_SH(vec5, out + 32); ST_SH(vec4, out + 88); SUB2(in28, in29, in31, in30, in17, in18); DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4); ST_SH(vec5, out + 40); ST_SH(vec4, out + 80); ADD4(in22, in21, in23, in20, in24, in27, in25, in26, in16, in29, in30, in19); DOTP_CONST_PAIR(-in16, in19, cospi_12_64, cospi_20_64, in28, in31); SUB2(in29, in28, in30, in31, in16, in19); DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4); ST_SH(vec5, out + 72); ST_SH(vec4, out + 48); ADD2(in29, in28, in30, in31, in17, in18); DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4); ST_SH(vec4, out + 56); ST_SH(vec5, out + 64); }
static void temporal_filter_apply_8size_msa(uint8_t *frame1_ptr, uint32_t stride, uint8_t *frame2_ptr, int32_t strength_in, int32_t filter_wt_in, uint32_t *acc, uint16_t *cnt) { uint32_t row; uint64_t f0, f1, f2, f3, f4, f5, f6, f7; v16i8 frame1 = { 0 }; v16i8 frame2 = { 0 }; v16i8 frame3 = { 0 }; v16i8 frame4 = { 0 }; v16u8 frame_l, frame_h; v8i16 frame2_0_h, frame2_1_h, mod0_h, mod1_h; v8i16 diff0, diff1, cnt0, cnt1; v4i32 const3, const16; v4i32 filter_wt, strength; v4i32 mod0_w, mod1_w, mod2_w, mod3_w; v4i32 diff0_r, diff0_l, diff1_r, diff1_l; v4i32 frame2_0, frame2_1, frame2_2, frame2_3; v4i32 acc0, acc1, acc2, acc3; filter_wt = __msa_fill_w(filter_wt_in); strength = __msa_fill_w(strength_in); const3 = __msa_ldi_w(3); const16 = __msa_ldi_w(16); for (row = 2; row--;) { LD2(frame1_ptr, stride, f0, f1); frame1_ptr += (2 * stride); LD2(frame2_ptr, 8, f2, f3); frame2_ptr += 16; LD2(frame1_ptr, stride, f4, f5); frame1_ptr += (2 * stride); LD2(frame2_ptr, 8, f6, f7); frame2_ptr += 16; LD_SW2(acc, 4, acc0, acc1); LD_SW2(acc + 8, 4, acc2, acc3); LD_SH2(cnt, 8, cnt0, cnt1); INSERT_D2_SB(f0, f1, frame1); INSERT_D2_SB(f2, f3, frame2); INSERT_D2_SB(f4, f5, frame3); INSERT_D2_SB(f6, f7, frame4); ILVRL_B2_UB(frame1, frame2, frame_l, frame_h); HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); UNPCK_SH_SW(diff0, diff0_r, diff0_l); UNPCK_SH_SW(diff1, diff1_r, diff1_l); MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, mod1_w, mod2_w, mod3_w); SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); diff0_r = (mod0_w < const16); diff0_l = (mod1_w < const16); diff1_r = (mod2_w < const16); diff1_l = (mod3_w < const16); SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, mod0_w, mod1_w, mod2_w, mod3_w); mod0_w = diff0_r & mod0_w; mod1_w = diff0_l & mod1_w; mod2_w = diff1_r & mod2_w; mod3_w = diff1_l & mod3_w; MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); ST_SH2(mod0_h, mod1_h, cnt, 8); cnt += 16; UNPCK_UB_SH(frame2, frame2_0_h, frame2_1_h); UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, mod0_w, mod1_w, mod2_w, mod3_w); ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, mod2_w, mod3_w); ST_SW2(mod0_w, mod1_w, acc, 4); ST_SW2(mod2_w, mod3_w, acc + 8, 4); acc += 16; LD_SW2(acc, 4, acc0, acc1); LD_SW2(acc + 8, 4, acc2, acc3); LD_SH2(cnt, 8, cnt0, cnt1); ILVRL_B2_UB(frame3, frame4, frame_l, frame_h); HSUB_UB2_SH(frame_l, frame_h, diff0, diff1); UNPCK_SH_SW(diff0, diff0_r, diff0_l); UNPCK_SH_SW(diff1, diff1_r, diff1_l); MUL4(diff0_r, diff0_r, diff0_l, diff0_l, diff1_r, diff1_r, diff1_l, diff1_l, mod0_w, mod1_w, mod2_w, mod3_w); MUL4(mod0_w, const3, mod1_w, const3, mod2_w, const3, mod3_w, const3, mod0_w, mod1_w, mod2_w, mod3_w); SRAR_W4_SW(mod0_w, mod1_w, mod2_w, mod3_w, strength); diff0_r = (mod0_w < const16); diff0_l = (mod1_w < const16); diff1_r = (mod2_w < const16); diff1_l = (mod3_w < const16); SUB4(const16, mod0_w, const16, mod1_w, const16, mod2_w, const16, mod3_w, mod0_w, mod1_w, mod2_w, mod3_w); mod0_w = diff0_r & mod0_w; mod1_w = diff0_l & mod1_w; mod2_w = diff1_r & mod2_w; mod3_w = diff1_l & mod3_w; MUL4(mod0_w, filter_wt, mod1_w, filter_wt, mod2_w, filter_wt, mod3_w, filter_wt, mod0_w, mod1_w, mod2_w, mod3_w); PCKEV_H2_SH(mod1_w, mod0_w, mod3_w, mod2_w, mod0_h, mod1_h); ADD2(mod0_h, cnt0, mod1_h, cnt1, mod0_h, mod1_h); ST_SH2(mod0_h, mod1_h, cnt, 8); cnt += 16; UNPCK_UB_SH(frame4, frame2_0_h, frame2_1_h); UNPCK_SH_SW(frame2_0_h, frame2_0, frame2_1); UNPCK_SH_SW(frame2_1_h, frame2_2, frame2_3); MUL4(mod0_w, frame2_0, mod1_w, frame2_1, mod2_w, frame2_2, mod3_w, frame2_3, mod0_w, mod1_w, mod2_w, mod3_w); ADD4(mod0_w, acc0, mod1_w, acc1, mod2_w, acc2, mod3_w, acc3, mod0_w, mod1_w, mod2_w, mod3_w); ST_SW2(mod0_w, mod1_w, acc, 4); ST_SW2(mod2_w, mod3_w, acc + 8, 4); acc += 16; } }