// DC-only inverse 4x4 DCT: all AC coefficients are zero, so compute the DC
// value (scaled by cospi_16_64 once per transform dimension, then rounded
// with a right shift of 4) and add it to every pixel of the 4x4 block.
void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  const int16_t out0 =
      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
  const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4);
  const int16x8_t dc = vdupq_n_s16(a1);
  uint32x2_t d = vdup_n_u32(0);

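  // The kernel accesses the destination 32 bits (4 pixels) at a time, so
  // both the pointer and the stride must be multiples of sizeof(uint32_t).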
  assert(!((intptr_t)dest % sizeof(uint32_t)));
  assert(!(stride % sizeof(uint32_t)));

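  // Each kernel call adds the DC value to two rows; two calls cover the
  // 4x4 block.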
  idct4x4_1_add_kernel(&dest, stride, dc, &d);
  idct4x4_1_add_kernel(&dest, stride, dc, &d);
}

// DC-only inverse 32x32 DCT: compute the DC value (scaled by cospi_16_64
// once per dimension, then rounded with a right shift of 6) and add it to
// every pixel of the 32x32 block.
void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  int i;
  const int16_t out0 =
      WRAPLOW(dct_const_round_shift((int16_t)input[0] * cospi_16_64));
  const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64));
  const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6);

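  // The destination pixels are unsigned, so a nonnegative DC is broadcast
  // and added directly, while a negative DC is negated and its magnitude
  // subtracted.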
  if (a1 >= 0) {
    const uint8x16_t dc = create_dcq(a1);
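    // One row per kernel call; 32 iterations cover the whole block.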
    for (i = 0; i < 32; i++) {
      idct32x32_1_add_pos_kernel(&dest, stride, dc);
    }
  } else {
    const uint8x16_t dc = create_dcq(-a1);
    for (i = 0; i < 32; i++) {
      idct32x32_1_add_neg_kernel(&dest, stride, dc);
    }
  }
}