void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  // DC-only inverse 4x4 transform: the quantized block has a single
  // non-zero (DC) coefficient, so every output pixel receives the same
  // reconstructed value, added to the destination.
  //
  // Fix: cast input[0] to int16_t before the multiply. In high-bitdepth
  // builds tran_low_t is 32 bits wide, and the uncast product with
  // cospi_16_64 can overflow; the cast also matches the sibling
  // vpx_idct32x32_1_add_neon in this file.
  const int16_t out0 =
      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
  // Second 1-D pass applied to the DC term.
  const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
  // Final rounding shift for the 4x4 transform (by 2^4).
  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
  const int16x8_t dc = vdupq_n_s16(a1);
  uint32x2_t d = vdup_n_u32(0);

  // The kernel transfers 32-bit lanes, so both the destination pointer and
  // the stride must be 4-byte aligned.
  assert(!((intptr_t)dest % sizeof(uint32_t)));
  assert(!(stride % sizeof(uint32_t)));

  // Two kernel calls together cover the 4x4 block; the kernel advances
  // dest (passed by address) as it goes.
  idct4x4_1_add_kernel(&dest, stride, dc, &d);
  idct4x4_1_add_kernel(&dest, stride, dc, &d);
}
void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  // DC-only inverse 32x32 transform: the block reduces to a single value
  // that is applied uniformly to all 32 destination rows.
  const int16_t dc_pass1 =
      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
  const int16_t dc_pass2 =
      WRAPLOW(dct_const_round_shift(dc_pass1 * cospi_16_64));
  // Final rounding shift for the 32x32 transform (by 2^6).
  const int16_t a1 = ROUND_POWER_OF_TWO(dc_pass2, 6);
  int row;

  if (a1 >= 0) {
    // Non-negative DC: the "pos" kernel applies |a1| to each row.
    const uint8x16_t dc = create_dcq(a1);
    for (row = 0; row < 32; ++row) {
      idct32x32_1_add_pos_kernel(&dest, stride, dc);
    }
  } else {
    // Negative DC: hand the magnitude to the "neg" kernel instead.
    const uint8x16_t dc = create_dcq(-a1);
    for (row = 0; row < 32; ++row) {
      idct32x32_1_add_neg_kernel(&dest, stride, dc);
    }
  }
}