void fdct16x8_1d_row(int16_t *input, int16_t *output) { v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; v8i16 in0, in1, in2, in3, in4, in5, in6, in7; v8i16 in8, in9, in10, in11, in12, in13, in14, in15; LD_SH8(input, 16, in0, in1, in2, in3, in4, in5, in6, in7); LD_SH8((input + 8), 16, in8, in9, in10, in11, in12, in13, in14, in15); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, in10, in11, in12, in13, in14, in15); ADD4(in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3); ADD4(in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7); ADD4(in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10, in11); ADD4(in12, 1, in13, 1, in14, 1, in15, 1, in12, in13, in14, in15); SRA_4V(in0, in1, in2, in3, 2); SRA_4V(in4, in5, in6, in7, 2); SRA_4V(in8, in9, in10, in11, 2); SRA_4V(in12, in13, in14, in15, 2); BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, in15); ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, input, 16); FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); LD_SH8(input, 16, in8, in9, in10, in11, in12, in13, in14, in15); FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3); ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, output, 16); TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7); ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, output + 8, 16); }
static void postproc_fdct16x8_1d_row(int16_t *intermediate, int16_t *output) { int16_t *temp = intermediate; int16_t *out = output; v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11; v8i16 in12, in13, in14, in15; LD_SH8(temp, 16, in0, in1, in2, in3, in4, in5, in6, in7); temp = intermediate + 8; LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15); TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9, in10, in11, in12, in13, in14, in15); FDCT_POSTPROC_2V_NEG_H(in0, in1); FDCT_POSTPROC_2V_NEG_H(in2, in3); FDCT_POSTPROC_2V_NEG_H(in4, in5); FDCT_POSTPROC_2V_NEG_H(in6, in7); FDCT_POSTPROC_2V_NEG_H(in8, in9); FDCT_POSTPROC_2V_NEG_H(in10, in11); FDCT_POSTPROC_2V_NEG_H(in12, in13); FDCT_POSTPROC_2V_NEG_H(in14, in15); BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14, in15); temp = intermediate; ST_SH8(in8, in9, in10, in11, in12, in13, in14, in15, temp, 16); FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); temp = intermediate; LD_SH8(temp, 16, in8, in9, in10, in11, in12, in13, in14, in15); FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3, in4, in5, in6, in7); TRANSPOSE8x8_SH_SH(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3); ST_SH8(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, out, 16); TRANSPOSE8x8_SH_SH(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7); out = output + 8; ST_SH8(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, out, 16); }