static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16, int16x4_t *d2s16,
                              int16x8_t *q8s16, int16x8_t *q9s16) {
  int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
  int16x4_t d26s16, d27s16, d28s16, d29s16;
  int32x4_t q10s32, q13s32, q14s32, q15s32;
  int16x8_t q13s16, q14s16;

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);

  d23s16 = vadd_s16(d16s16, d18s16);
  d24s16 = vsub_s16(d16s16, d18s16);

  q15s32 = vmull_s16(d17s16, *d2s16);
  q10s32 = vmull_s16(d17s16, *d0s16);
  q13s32 = vmull_s16(d23s16, *d1s16);
  q14s32 = vmull_s16(d24s16, *d1s16);
  q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
  q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);

  d26s16 = vqrshrn_n_s32(q13s32, 14);
  d27s16 = vqrshrn_n_s32(q14s32, 14);
  d29s16 = vqrshrn_n_s32(q15s32, 14);
  d28s16 = vqrshrn_n_s32(q10s32, 14);

  q13s16 = vcombine_s16(d26s16, d27s16);
  q14s16 = vcombine_s16(d28s16, d29s16);
  *q8s16 = vaddq_s16(q13s16, q14s16);
  *q9s16 = vsubq_s16(q13s16, q14s16);
  *q9s16 = vcombine_s16(vget_high_s16(*q9s16), vget_low_s16(*q9s16));  // vswp
  return;
}
int64_t av1_block_error_fp_neon(const int16_t *coeff, const int16_t *dqcoeff,
                                int block_size) {
  int64x2_t error = vdupq_n_s64(0);

  assert(block_size >= 8);
  assert((block_size % 8) == 0);

  do {
    const int16x8_t c = vld1q_s16(coeff);
    const int16x8_t d = vld1q_s16(dqcoeff);
    const int16x8_t diff = vsubq_s16(c, d);
    const int16x4_t diff_lo = vget_low_s16(diff);
    const int16x4_t diff_hi = vget_high_s16(diff);
    // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before
    // accumulating them in 64-bits.
    const int32x4_t err0 = vmull_s16(diff_lo, diff_lo);
    const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi);
    const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1));
    error = vaddq_s64(error, err2);
    coeff += 8;
    dqcoeff += 8;
    block_size -= 8;
  } while (block_size != 0);

  return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1);
}
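The loop above is a vectorized sum of squared differences between the quantized and dequantized coefficients. A scalar sketch of the same quantity (hypothetical helper name, same multiple-of-8 block_size contract) that the NEON path can be checked against:

// Scalar reference for the routine above (illustrative only, not part of AV1).
static int64_t block_error_fp_scalar(const int16_t *coeff, const int16_t *dqcoeff,
                                     int block_size) {
  int64_t error = 0;
  int i;
  for (i = 0; i < block_size; ++i) {
    const int diff = coeff[i] - dqcoeff[i];  // 16-bit inputs, fits in 32 bits
    error += (int64_t)diff * diff;
  }
  return error;
}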
static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
                                     uint16_t* dst, int len) {
  int i;
  const int16x8_t zero = vdupq_n_s16(0);
  const int16x8_t max = vdupq_n_s16(MAX_Y);
  uint64x2_t sum = vdupq_n_u64(0);
  uint64_t diff;

  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
    const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
    const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
    const int16x8_t D = vsubq_s16(A, B);  // diff_y
    const int16x8_t F = vaddq_s16(C, D);  // new_y
    const uint16x8_t H =
        vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
    const int16x8_t I = vabsq_s16(D);  // abs(diff_y)
    vst1q_u16(dst + i, H);
    sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
  }
  diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
  for (; i < len; ++i) {
    const int diff_y = ref[i] - src[i];
    const int new_y = (int)(dst[i]) + diff_y;
    dst[i] = clip_y(new_y);
    diff += (uint64_t)(abs(diff_y));
  }
  return diff;
}
static inline void char_to_float_vectors(const unsigned char * sourcep,
                                         float32x4_t *mp0, float32x4_t * mp1)
{
  uint8x8_t rawpixels;    /* source pixels as {[YUYV]0 [YUYV]1} */
  int16x8_t widerpixels;  /* rawpixels promoted to shorts per component */
  int16x4_t high16, low16;
  int32x4_t high32, low32;
  const int16x8_t uvbias = {0, 128, 0, 128, 0, 128, 0, 128};

  rawpixels = vld1_u8(sourcep);
  widerpixels = vreinterpretq_s16_u16(vmovl_u8(rawpixels));

  /* subtract uvbias from widerpixels */
  widerpixels = vsubq_s16(widerpixels, uvbias);

  /* now take widerpixels apart into (low16, high16) and */
  /* then expand those into (low32, high32) */
  low16 = vget_low_s16(widerpixels);
  high16 = vget_high_s16(widerpixels);
  high32 = vmovl_s16(high16);
  low32 = vmovl_s16(low16);

  /* now convert low32 and high32 into floats and store them in */
  /* *mp0, *mp1 */
  *mp0 = vcvtq_f32_s32(low32);
  *mp1 = vcvtq_f32_s32(high32);
}
void test_vsubQs16 (void)
{
  int16x8_t out_int16x8_t;
  int16x8_t arg0_int16x8_t;
  int16x8_t arg1_int16x8_t;

  /* Compile-only intrinsics test: the arguments are deliberately left
     uninitialized; only code generation for vsubq_s16 is exercised.  */
  out_int16x8_t = vsubq_s16 (arg0_int16x8_t, arg1_int16x8_t);
}
static inline int16x8_t qvsource_over_s16(int16x8_t src16, int16x8_t dst16,
                                          int16x8_t half, int16x8_t full)
{
  const int16x4_t alpha16_high = vdup_lane_s16(vget_high_s16(src16), 3);
  const int16x4_t alpha16_low = vdup_lane_s16(vget_low_s16(src16), 3);
  const int16x8_t alpha16 = vsubq_s16(full, vcombine_s16(alpha16_low, alpha16_high));
  return vaddq_s16(src16, qvbyte_mul_s16(dst16, alpha16, half));
}
void aom_hadamard_16x16_neon(const int16_t *src_diff, int src_stride,
                             int16_t *coeff) {
  int i;

  /* Rearrange 16x16 to 8x32 and remove stride.
   * Top left first. */
  aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
  /* Top right. */
  aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
  /* Bottom left. */
  aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
  /* Bottom right. */
  aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);

  for (i = 0; i < 64; i += 8) {
    const int16x8_t a0 = vld1q_s16(coeff + 0);
    const int16x8_t a1 = vld1q_s16(coeff + 64);
    const int16x8_t a2 = vld1q_s16(coeff + 128);
    const int16x8_t a3 = vld1q_s16(coeff + 192);

    const int16x8_t b0 = vhaddq_s16(a0, a1);
    const int16x8_t b1 = vhsubq_s16(a0, a1);
    const int16x8_t b2 = vhaddq_s16(a2, a3);
    const int16x8_t b3 = vhsubq_s16(a2, a3);

    const int16x8_t c0 = vaddq_s16(b0, b2);
    const int16x8_t c1 = vaddq_s16(b1, b3);
    const int16x8_t c2 = vsubq_s16(b0, b2);
    const int16x8_t c3 = vsubq_s16(b1, b3);

    vst1q_s16(coeff + 0, c0);
    vst1q_s16(coeff + 64, c1);
    vst1q_s16(coeff + 128, c2);
    vst1q_s16(coeff + 192, c3);

    coeff += 8;
  }
}
void rfx_decode_YCbCr_to_RGB_NEON(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer)
{
  int16x8_t zero = vdupq_n_s16(0);
  int16x8_t max = vdupq_n_s16(255);
  int16x8_t y_add = vdupq_n_s16(128);

  int16x8_t* y_r_buf = (int16x8_t*)y_r_buffer;
  int16x8_t* cb_g_buf = (int16x8_t*)cb_g_buffer;
  int16x8_t* cr_b_buf = (int16x8_t*)cr_b_buffer;

  int i;
  for (i = 0; i < 4096 / 8; i++)
  {
    int16x8_t y = vld1q_s16((sint16*)&y_r_buf[i]);
    y = vaddq_s16(y, y_add);

    int16x8_t cr = vld1q_s16((sint16*)&cr_b_buf[i]);

    // r = between((y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5)), 0, 255);
    int16x8_t r = vaddq_s16(y, cr);
    r = vaddq_s16(r, vshrq_n_s16(cr, 2));
    r = vaddq_s16(r, vshrq_n_s16(cr, 3));
    r = vaddq_s16(r, vshrq_n_s16(cr, 5));
    r = vminq_s16(vmaxq_s16(r, zero), max);
    vst1q_s16((sint16*)&y_r_buf[i], r);

    // cb = cb_g_buf[i];
    int16x8_t cb = vld1q_s16((sint16*)&cb_g_buf[i]);

    // g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255);
    int16x8_t g = vsubq_s16(y, vshrq_n_s16(cb, 2));
    g = vsubq_s16(g, vshrq_n_s16(cb, 4));
    g = vsubq_s16(g, vshrq_n_s16(cb, 5));
    g = vsubq_s16(g, vshrq_n_s16(cr, 1));
    g = vsubq_s16(g, vshrq_n_s16(cr, 3));
    g = vsubq_s16(g, vshrq_n_s16(cr, 4));
    g = vsubq_s16(g, vshrq_n_s16(cr, 5));
    g = vminq_s16(vmaxq_s16(g, zero), max);
    vst1q_s16((sint16*)&cb_g_buf[i], g);

    // b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)), 0, 255);
    int16x8_t b = vaddq_s16(y, cb);
    b = vaddq_s16(b, vshrq_n_s16(cb, 1));
    b = vaddq_s16(b, vshrq_n_s16(cb, 2));
    b = vaddq_s16(b, vshrq_n_s16(cb, 6));
    b = vminq_s16(vmaxq_s16(b, zero), max);
    vst1q_s16((sint16*)&cr_b_buf[i], b);
  }
}
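The shift chains above are a fixed-point approximation of the BT.601 coefficients, e.g. cr + cr/4 + cr/8 + cr/32 is roughly 1.40 * cr for the red channel. A scalar per-sample sketch of that red computation (illustrative helper, not part of FreeRDP; y is the decoded luma before the +128 bias, cr the centred chroma):

/* Illustrative scalar equivalent of the red-channel math above (assumed
 * helper, not in the original source). */
static sint16 rfx_red_scalar(sint16 y, sint16 cr)
{
  int r;
  y = (sint16)(y + 128);                           /* y_add */
  r = y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5);  /* approx. y + 1.403 * cr */
  if (r < 0) r = 0;
  if (r > 255) r = 255;
  return (sint16)r;
}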
static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
                                   int16_t* dst, int len) {
  int i;
  for (i = 0; i + 8 <= len; i += 8) {
    const int16x8_t A = vld1q_s16(ref + i);
    const int16x8_t B = vld1q_s16(src + i);
    const int16x8_t C = vld1q_s16(dst + i);
    const int16x8_t D = vsubq_s16(A, B);  // diff_uv
    const int16x8_t E = vaddq_s16(C, D);  // new_uv
    vst1q_s16(dst + i, E);
  }
  for (; i < len; ++i) {
    const int diff_uv = ref[i] - src[i];
    dst[i] += diff_uv;
  }
}
// ref, src = [0, 510] - max diff = 16-bits
// bwl = {2, 3, 4}, width = {16, 32, 64}
int vp9_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) {
  int width = 4 << bwl;
  int32x4_t sse = vdupq_n_s32(0);
  int16x8_t total = vdupq_n_s16(0);

  assert(width >= 8);
  assert((width % 8) == 0);

  do {
    const int16x8_t r = vld1q_s16(ref);
    const int16x8_t s = vld1q_s16(src);
    const int16x8_t diff = vsubq_s16(r, s);  // [-510, 510], 10 bits.
    const int16x4_t diff_lo = vget_low_s16(diff);
    const int16x4_t diff_hi = vget_high_s16(diff);

    sse = vmlal_s16(sse, diff_lo, diff_lo);  // dynamic range 26 bits.
    sse = vmlal_s16(sse, diff_hi, diff_hi);
    total = vaddq_s16(total, diff);  // dynamic range 16 bits.

    ref += 8;
    src += 8;
    width -= 8;
  } while (width != 0);

  {
    // Note: 'total''s pairwise addition could be implemented similarly to
    // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired
    // with the summation of 'sse' performed better on a Cortex-A15.
    const int32x4_t t0 = vpaddlq_s16(total);  // cascading summation of 'total'
    const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0));
    const int32x2_t t2 = vpadd_s32(t1, t1);
    const int t = vget_lane_s32(t2, 0);
    const int64x2_t s0 = vpaddlq_s32(sse);  // cascading summation of 'sse'.
    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
                                  vreinterpret_s32_s64(vget_high_s64(s0)));
    const int s = vget_lane_s32(s1, 0);
    const int shift_factor = bwl + 2;
    return s - ((t * t) >> shift_factor);
  }
}
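The reduction at the end is the usual sse - sum*sum/N form of an unnormalized variance, with N = 4 << bwl a power of two so the division becomes the >> (bwl + 2) shift. A scalar sketch of the same value (hypothetical name) for reference:

// Scalar reference for vp9_vector_var_neon (illustrative only).
static int vector_var_scalar(const int16_t *ref, const int16_t *src, int bwl) {
  const int width = 4 << bwl;
  int sum = 0, sse = 0;
  int i;
  for (i = 0; i < width; ++i) {
    const int diff = ref[i] - src[i];
    sum += diff;
    sse += diff * diff;
  }
  return sse - ((sum * sum) >> (bwl + 2));  // width is a power of two
}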
/* s16x8 sub */
void mw_neon_mm_sub_s16x8(short * A, int Row, int Col, short * B, short * C)
{
  int16x8_t neon_a, neon_b, neon_c;

  int size = Row * Col;
  int i = 0;
  int k = 0;

  for (i = 8; i <= size; i += 8)
  {
    k = i - 8;
    neon_a = vld1q_s16(A + k);
    neon_b = vld1q_s16(B + k);
    neon_c = vsubq_s16(neon_a, neon_b);
    vst1q_s16(C + k, neon_c);
  }

  k = i - 8;
  for (i = 0; i < size % 8; i++)
  {
    C[k + i] = A[k + i] - B[k + i];
  }
}
static void
rfx_dwt_2d_decode_block_vert_NEON(INT16 * l, INT16 * h, INT16 * dst, int subband_width)
{
  int x, n;
  INT16 * l_ptr = l;
  INT16 * h_ptr = h;
  INT16 * dst_ptr = dst;
  int total_width = subband_width + subband_width;

  /* Even coefficients */
  for (n = 0; n < subband_width; n++)
  {
    for (x = 0; x < total_width; x += 8)
    {
      // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
      int16x8_t l_n = vld1q_s16(l_ptr);
      int16x8_t h_n = vld1q_s16(h_ptr);
      int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));
      if (n == 0)
        tmp_n = vaddq_s16(tmp_n, h_n);
      else
      {
        int16x8_t h_n_m = vld1q_s16(h_ptr - total_width);
        tmp_n = vaddq_s16(tmp_n, h_n_m);
      }
      tmp_n = vshrq_n_s16(tmp_n, 1);

      int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
      vst1q_s16(dst_ptr, dst_n);

      l_ptr += 8;
      h_ptr += 8;
      dst_ptr += 8;
    }
    dst_ptr += total_width;
  }

  h_ptr = h;
  dst_ptr = dst + total_width;

  /* Odd coefficients */
  for (n = 0; n < subband_width; n++)
  {
    for (x = 0; x < total_width; x += 8)
    {
      // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
      int16x8_t h_n = vld1q_s16(h_ptr);
      int16x8_t dst_n_m = vld1q_s16(dst_ptr - total_width);
      h_n = vshlq_n_s16(h_n, 1);

      int16x8_t tmp_n = dst_n_m;
      if (n == subband_width - 1)
        tmp_n = vaddq_s16(tmp_n, dst_n_m);
      else
      {
        int16x8_t dst_n_p = vld1q_s16(dst_ptr + total_width);
        tmp_n = vaddq_s16(tmp_n, dst_n_p);
      }
      tmp_n = vshrq_n_s16(tmp_n, 1);

      int16x8_t dst_n = vaddq_s16(tmp_n, h_n);
      vst1q_s16(dst_ptr, dst_n);

      h_ptr += 8;
      dst_ptr += 8;
    }
    dst_ptr += total_width;
  }
}
static void
rfx_dwt_2d_decode_block_horiz_NEON(INT16 * l, INT16 * h, INT16 * dst, int subband_width)
{
  int y, n;
  INT16 * l_ptr = l;
  INT16 * h_ptr = h;
  INT16 * dst_ptr = dst;

  for (y = 0; y < subband_width; y++)
  {
    /* Even coefficients */
    for (n = 0; n < subband_width; n += 8)
    {
      // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1);
      int16x8_t l_n = vld1q_s16(l_ptr);
      int16x8_t h_n = vld1q_s16(h_ptr);
      int16x8_t h_n_m = vld1q_s16(h_ptr - 1);
      if (n == 0)
      {
        int16_t first = vgetq_lane_s16(h_n_m, 1);
        h_n_m = vsetq_lane_s16(first, h_n_m, 0);
      }
      int16x8_t tmp_n = vaddq_s16(h_n, h_n_m);
      tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1));
      tmp_n = vshrq_n_s16(tmp_n, 1);

      int16x8_t dst_n = vsubq_s16(l_n, tmp_n);
      vst1q_s16(l_ptr, dst_n);

      l_ptr += 8;
      h_ptr += 8;
    }
    l_ptr -= subband_width;
    h_ptr -= subband_width;

    /* Odd coefficients */
    for (n = 0; n < subband_width; n += 8)
    {
      // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1);
      int16x8_t h_n = vld1q_s16(h_ptr);
      h_n = vshlq_n_s16(h_n, 1);

      int16x8x2_t dst_n;
      dst_n.val[0] = vld1q_s16(l_ptr);
      int16x8_t dst_n_p = vld1q_s16(l_ptr + 1);
      if (n == subband_width - 8)
      {
        int16_t last = vgetq_lane_s16(dst_n_p, 6);
        dst_n_p = vsetq_lane_s16(last, dst_n_p, 7);
      }
      dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]);
      dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1);
      dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n);

      vst2q_s16(dst_ptr, dst_n);

      l_ptr += 8;
      h_ptr += 8;
      dst_ptr += 16;
    }
  }
}
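Both inverse-DWT routines above implement the lifting steps quoted in their comments; they differ only in whether the neighbouring samples sit one element or one row apart. A scalar, single-row sketch of those steps with the same mirrored boundary handling (hypothetical helper, not part of the original file):

/* Illustrative scalar form of the lifting steps used above (assumed helper,
 * not in the original source).  Reconstructs 2*subband_width samples from the
 * low band l[] and high band h[]; boundaries mirror the nearest valid sample,
 * matching the n == 0 and last-block special cases in the NEON code. */
static void rfx_idwt_1d_scalar(const INT16* l, const INT16* h, INT16* dst, int subband_width)
{
  int n;
  for (n = 0; n < subband_width; n++)
  {
    INT16 h_prev = (n == 0) ? h[0] : h[n - 1];
    dst[2 * n] = (INT16)(l[n] - ((h_prev + h[n] + 1) >> 1));
  }
  for (n = 0; n < subband_width; n++)
  {
    INT16 d_next = (n == subband_width - 1) ? dst[2 * n] : dst[2 * n + 2];
    dst[2 * n + 1] = (INT16)((h[n] << 1) + ((dst[2 * n] + d_next) >> 1));
  }
}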
void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { uint8x8_t d26u8, d27u8; uint32x2_t d26u32, d27u32; uint16x8_t q8u16, q9u16; int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16; int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16; int16x8_t q8s16, q9s16, q13s16, q14s16; int32x4_t q1s32, q13s32, q14s32, q15s32; int16x4x2_t d0x2s16, d1x2s16; int32x4x2_t q0x2s32; uint8_t *d; d26u32 = d27u32 = vdup_n_u32(0); q8s16 = vld1q_s16(input); q9s16 = vld1q_s16(input + 8); d16s16 = vget_low_s16(q8s16); d17s16 = vget_high_s16(q8s16); d18s16 = vget_low_s16(q9s16); d19s16 = vget_high_s16(q9s16); d0x2s16 = vtrn_s16(d16s16, d17s16); d1x2s16 = vtrn_s16(d18s16, d19s16); q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); d20s16 = vdup_n_s16((int16_t)cospi_8_64); d21s16 = vdup_n_s16((int16_t)cospi_16_64); q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); d22s16 = vdup_n_s16((int16_t)cospi_24_64); // stage 1 d23s16 = vadd_s16(d16s16, d18s16); d24s16 = vsub_s16(d16s16, d18s16); q15s32 = vmull_s16(d17s16, d22s16); q1s32 = vmull_s16(d17s16, d20s16); q13s32 = vmull_s16(d23s16, d21s16); q14s32 = vmull_s16(d24s16, d21s16); q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); q1s32 = vmlal_s16(q1s32, d19s16, d22s16); d26s16 = vqrshrn_n_s32(q13s32, 14); d27s16 = vqrshrn_n_s32(q14s32, 14); d29s16 = vqrshrn_n_s32(q15s32, 14); d28s16 = vqrshrn_n_s32(q1s32, 14); q13s16 = vcombine_s16(d26s16, d27s16); q14s16 = vcombine_s16(d28s16, d29s16); // stage 2 q8s16 = vaddq_s16(q13s16, q14s16); q9s16 = vsubq_s16(q13s16, q14s16); d16s16 = vget_low_s16(q8s16); d17s16 = vget_high_s16(q8s16); d18s16 = vget_high_s16(q9s16); // vswp d18 d19 d19s16 = vget_low_s16(q9s16); d0x2s16 = vtrn_s16(d16s16, d17s16); d1x2s16 = vtrn_s16(d18s16, d19s16); q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); // do the transform on columns // stage 1 d23s16 = vadd_s16(d16s16, d18s16); d24s16 = vsub_s16(d16s16, d18s16); q15s32 = vmull_s16(d17s16, d22s16); q1s32 = vmull_s16(d17s16, d20s16); q13s32 = vmull_s16(d23s16, d21s16); q14s32 = vmull_s16(d24s16, d21s16); q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); q1s32 = vmlal_s16(q1s32, d19s16, d22s16); d26s16 = vqrshrn_n_s32(q13s32, 14); d27s16 = vqrshrn_n_s32(q14s32, 14); d29s16 = vqrshrn_n_s32(q15s32, 14); d28s16 = vqrshrn_n_s32(q1s32, 14); q13s16 = vcombine_s16(d26s16, d27s16); q14s16 = vcombine_s16(d28s16, d29s16); // stage 2 q8s16 = vaddq_s16(q13s16, q14s16); q9s16 = vsubq_s16(q13s16, q14s16); q8s16 = vrshrq_n_s16(q8s16, 4); q9s16 = vrshrq_n_s16(q9s16, 4); d = dest; d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0); d += dest_stride; d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1); d += dest_stride; d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1); d += dest_stride; d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0); q8u16 = 
vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d = dest; vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0); d += dest_stride; vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1); d += dest_stride; vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1); d += dest_stride; vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0); return; }
PRIM_STATIC pstatus_t neon_yCbCrToRGB_16s16s_P3P3(
  const INT16 *pSrc[3], int srcStep,
  INT16 *pDst[3], int dstStep,
  const prim_size_t *roi)  /* region of interest */
{
  /* TODO: If necessary, check alignments and call the general version. */
  int16x8_t zero = vdupq_n_s16(0);
  int16x8_t max = vdupq_n_s16(255);
  int16x8_t y_add = vdupq_n_s16(128);

  int16x8_t* y_buf = (int16x8_t*) pSrc[0];
  int16x8_t* cb_buf = (int16x8_t*) pSrc[1];
  int16x8_t* cr_buf = (int16x8_t*) pSrc[2];
  int16x8_t* r_buf = (int16x8_t*) pDst[0];
  int16x8_t* g_buf = (int16x8_t*) pDst[1];
  int16x8_t* b_buf = (int16x8_t*) pDst[2];

  int srcbump = srcStep / sizeof(int16x8_t);
  int dstbump = dstStep / sizeof(int16x8_t);
  int yp;
  int imax = roi->width * sizeof(INT16) / sizeof(int16x8_t);

  for (yp=0; yp<roi->height; ++yp)
  {
    int i;
    for (i=0; i<imax; i++)
    {
      int16x8_t y = vld1q_s16((INT16*) (y_buf+i));
      y = vaddq_s16(y, y_add);

      int16x8_t cr = vld1q_s16((INT16*) (cr_buf+i));

      /* r = between((y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5)),
       * 0, 255);
       */
      int16x8_t r = vaddq_s16(y, cr);
      r = vaddq_s16(r, vshrq_n_s16(cr, 2));
      r = vaddq_s16(r, vshrq_n_s16(cr, 3));
      r = vaddq_s16(r, vshrq_n_s16(cr, 5));
      r = vminq_s16(vmaxq_s16(r, zero), max);
      vst1q_s16((INT16*) (r_buf+i), r);

      /* cb = cb_g_buf[i]; */
      int16x8_t cb = vld1q_s16((INT16*) (cb_buf+i));

      /* g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1)
       * - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255);
       */
      int16x8_t g = vsubq_s16(y, vshrq_n_s16(cb, 2));
      g = vsubq_s16(g, vshrq_n_s16(cb, 4));
      g = vsubq_s16(g, vshrq_n_s16(cb, 5));
      g = vsubq_s16(g, vshrq_n_s16(cr, 1));
      g = vsubq_s16(g, vshrq_n_s16(cr, 3));
      g = vsubq_s16(g, vshrq_n_s16(cr, 4));
      g = vsubq_s16(g, vshrq_n_s16(cr, 5));
      g = vminq_s16(vmaxq_s16(g, zero), max);
      vst1q_s16((INT16*) (g_buf+i), g);

      /* b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)),
       * 0, 255);
       */
      int16x8_t b = vaddq_s16(y, cb);
      b = vaddq_s16(b, vshrq_n_s16(cb, 1));
      b = vaddq_s16(b, vshrq_n_s16(cb, 2));
      b = vaddq_s16(b, vshrq_n_s16(cb, 6));
      b = vminq_s16(vmaxq_s16(b, zero), max);
      vst1q_s16((INT16*) (b_buf+i), b);
    }
    y_buf += srcbump;
    cb_buf += srcbump;
    cr_buf += srcbump;
    r_buf += dstbump;
    g_buf += dstbump;
    b_buf += dstbump;
  }
  return PRIMITIVES_SUCCESS;
}
// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 34 non-zero // coefficients as follows: // 0 1 2 3 4 5 6 7 // 0 0 2 5 10 17 25 // 1 1 4 8 15 22 30 // 2 3 7 12 18 28 // 3 6 11 16 23 31 // 4 9 14 19 29 // 5 13 20 26 // 6 21 27 33 // 7 24 32 void vpx_idct32_6_neon(const tran_low_t *input, int16_t *output) { int16x8_t in[8], s1[32], s2[32], s3[32]; in[0] = load_tran_low_to_s16q(input); input += 32; in[1] = load_tran_low_to_s16q(input); input += 32; in[2] = load_tran_low_to_s16q(input); input += 32; in[3] = load_tran_low_to_s16q(input); input += 32; in[4] = load_tran_low_to_s16q(input); input += 32; in[5] = load_tran_low_to_s16q(input); input += 32; in[6] = load_tran_low_to_s16q(input); input += 32; in[7] = load_tran_low_to_s16q(input); transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], &in[7]); // stage 1 // input[1] * cospi_31_64 - input[31] * cospi_1_64 (but input[31] == 0) s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64); // input[1] * cospi_1_64 + input[31] * cospi_31_64 (but input[31] == 0) s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64); s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64); s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64); s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64); s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64); // stage 2 s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64); s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64); // stage 3 s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64); s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64); s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31], cospi_28_64); s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31], cospi_4_64); s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64, s1[27], cospi_12_64); s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27], cospi_20_64); s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64, s1[24], -cospi_20_64); s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64, s1[24], cospi_12_64); // stage 4 s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64); s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15], cospi_24_64); s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15], cospi_8_64); s2[20] = vsubq_s16(s1[23], s1[20]); s2[21] = vsubq_s16(s1[22], s1[21]); s2[22] = vaddq_s16(s1[21], s1[22]); s2[23] = vaddq_s16(s1[20], s1[23]); s2[24] = vaddq_s16(s1[24], s1[27]); s2[25] = vaddq_s16(s1[25], s1[26]); s2[26] = vsubq_s16(s1[25], s1[26]); s2[27] = vsubq_s16(s1[24], s1[27]); // stage 5 s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64); s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64); s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[17], -cospi_8_64, s1[30], cospi_24_64); s1[29] = multiply_accumulate_shift_and_narrow_s16(s1[17], cospi_24_64, s1[30], cospi_8_64); s1[19] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_8_64, s1[31], cospi_24_64); s1[28] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_24_64, s1[31], cospi_8_64); s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64, s2[27], -cospi_8_64); s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27], cospi_24_64); s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64, s2[26], -cospi_8_64); s1[26] = 
multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26], cospi_24_64); // stage 6 s2[0] = vaddq_s16(s1[0], s1[7]); s2[1] = vaddq_s16(s1[0], s1[6]); s2[2] = vaddq_s16(s1[0], s1[5]); s2[3] = vaddq_s16(s1[0], s1[4]); s2[4] = vsubq_s16(s1[0], s1[4]); s2[5] = vsubq_s16(s1[0], s1[5]); s2[6] = vsubq_s16(s1[0], s1[6]); s2[7] = vsubq_s16(s1[0], s1[7]); s2[10] = sub_multiply_shift_and_narrow_s16(s2[14], s2[9], cospi_16_64); s2[13] = add_multiply_shift_and_narrow_s16(s2[9], s2[14], cospi_16_64); s2[11] = sub_multiply_shift_and_narrow_s16(s2[15], s2[8], cospi_16_64); s2[12] = add_multiply_shift_and_narrow_s16(s2[8], s2[15], cospi_16_64); s2[16] = vaddq_s16(s1[16], s2[23]); s2[17] = vaddq_s16(s1[17], s2[22]); s2[18] = vaddq_s16(s1[18], s1[21]); s2[19] = vaddq_s16(s1[19], s1[20]); s2[20] = vsubq_s16(s1[19], s1[20]); s2[21] = vsubq_s16(s1[18], s1[21]); s2[22] = vsubq_s16(s1[17], s2[22]); s2[23] = vsubq_s16(s1[16], s2[23]); s3[24] = vsubq_s16(s1[31], s2[24]); s3[25] = vsubq_s16(s1[30], s2[25]); s3[26] = vsubq_s16(s1[29], s1[26]); s3[27] = vsubq_s16(s1[28], s1[27]); s2[28] = vaddq_s16(s1[27], s1[28]); s2[29] = vaddq_s16(s1[26], s1[29]); s2[30] = vaddq_s16(s2[25], s1[30]); s2[31] = vaddq_s16(s2[24], s1[31]); // stage 7 s1[0] = vaddq_s16(s2[0], s2[15]); s1[1] = vaddq_s16(s2[1], s2[14]); s1[2] = vaddq_s16(s2[2], s2[13]); s1[3] = vaddq_s16(s2[3], s2[12]); s1[4] = vaddq_s16(s2[4], s2[11]); s1[5] = vaddq_s16(s2[5], s2[10]); s1[6] = vaddq_s16(s2[6], s2[9]); s1[7] = vaddq_s16(s2[7], s2[8]); s1[8] = vsubq_s16(s2[7], s2[8]); s1[9] = vsubq_s16(s2[6], s2[9]); s1[10] = vsubq_s16(s2[5], s2[10]); s1[11] = vsubq_s16(s2[4], s2[11]); s1[12] = vsubq_s16(s2[3], s2[12]); s1[13] = vsubq_s16(s2[2], s2[13]); s1[14] = vsubq_s16(s2[1], s2[14]); s1[15] = vsubq_s16(s2[0], s2[15]); s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64); s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64); s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64); s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64); s1[22] = sub_multiply_shift_and_narrow_s16(s3[25], s2[22], cospi_16_64); s1[25] = add_multiply_shift_and_narrow_s16(s2[22], s3[25], cospi_16_64); s1[23] = sub_multiply_shift_and_narrow_s16(s3[24], s2[23], cospi_16_64); s1[24] = add_multiply_shift_and_narrow_s16(s2[23], s3[24], cospi_16_64); // final stage vst1q_s16(output, vaddq_s16(s1[0], s2[31])); output += 8; vst1q_s16(output, vaddq_s16(s1[1], s2[30])); output += 8; vst1q_s16(output, vaddq_s16(s1[2], s2[29])); output += 8; vst1q_s16(output, vaddq_s16(s1[3], s2[28])); output += 8; vst1q_s16(output, vaddq_s16(s1[4], s1[27])); output += 8; vst1q_s16(output, vaddq_s16(s1[5], s1[26])); output += 8; vst1q_s16(output, vaddq_s16(s1[6], s1[25])); output += 8; vst1q_s16(output, vaddq_s16(s1[7], s1[24])); output += 8; vst1q_s16(output, vaddq_s16(s1[8], s1[23])); output += 8; vst1q_s16(output, vaddq_s16(s1[9], s1[22])); output += 8; vst1q_s16(output, vaddq_s16(s1[10], s1[21])); output += 8; vst1q_s16(output, vaddq_s16(s1[11], s1[20])); output += 8; vst1q_s16(output, vaddq_s16(s1[12], s2[19])); output += 8; vst1q_s16(output, vaddq_s16(s1[13], s2[18])); output += 8; vst1q_s16(output, vaddq_s16(s1[14], s2[17])); output += 8; vst1q_s16(output, vaddq_s16(s1[15], s2[16])); output += 8; vst1q_s16(output, vsubq_s16(s1[15], s2[16])); output += 8; vst1q_s16(output, vsubq_s16(s1[14], s2[17])); output += 8; vst1q_s16(output, vsubq_s16(s1[13], s2[18])); output += 8; vst1q_s16(output, vsubq_s16(s1[12], s2[19])); output += 8; 
vst1q_s16(output, vsubq_s16(s1[11], s1[20])); output += 8; vst1q_s16(output, vsubq_s16(s1[10], s1[21])); output += 8; vst1q_s16(output, vsubq_s16(s1[9], s1[22])); output += 8; vst1q_s16(output, vsubq_s16(s1[8], s1[23])); output += 8; vst1q_s16(output, vsubq_s16(s1[7], s1[24])); output += 8; vst1q_s16(output, vsubq_s16(s1[6], s1[25])); output += 8; vst1q_s16(output, vsubq_s16(s1[5], s1[26])); output += 8; vst1q_s16(output, vsubq_s16(s1[4], s1[27])); output += 8; vst1q_s16(output, vsubq_s16(s1[3], s2[28])); output += 8; vst1q_s16(output, vsubq_s16(s1[2], s2[29])); output += 8; vst1q_s16(output, vsubq_s16(s1[1], s2[30])); output += 8; vst1q_s16(output, vsubq_s16(s1[0], s2[31])); }
void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { // TODO(jingning) Decide the need of these arguments after the // quantization process is completed. (void)zbin_ptr; (void)quant_shift_ptr; (void)scan; if (!skip_block) { // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. int i; const int16x8_t v_zero = vdupq_n_s16(0); const int16x8_t v_one = vdupq_n_s16(1); int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); int16x8_t v_round = vmovq_n_s16(round_ptr[1]); int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); // adjust for dc v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); // process dc and the first seven ac coeffs { const int16x8_t v_iscan = vld1q_s16(&iscan[0]); const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff); v_round = vmovq_n_s16(round_ptr[1]); v_quant = vmovq_n_s16(quant_ptr[1]); v_dequant = vmovq_n_s16(dequant_ptr[1]); } // now process the rest of the ac coeffs for (i = 8; i < count; i += 8) { const int16x8_t v_iscan = vld1q_s16(&iscan[i]); const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i); const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff); store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff); } { const int16x4_t v_eobmax_3210 = vmax_s16( vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210)); const int64x1_t v_eobmax_xx32 = vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 
32); const int16x4_t v_eobmax_tmp = vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); const int64x1_t v_eobmax_xxx3 = vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); const int16x4_t v_eobmax_final = vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); } } else { memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); *eob_ptr = 0; } }
inline int16x8_t vsubq(const int16x8_t & v0, const int16x8_t & v1) { return vsubq_s16(v0, v1); }
void vpx_idct32_8_neon(const int16_t *input, void *const output, int stride, const int highbd_flag) { int16x8_t in[8], s1[32], s2[32], s3[32], out[32]; load_and_transpose_s16_8x8(input, 8, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], &in[7]); // stage 1 s1[16] = multiply_shift_and_narrow_s16(in[1], cospi_31_64); s1[31] = multiply_shift_and_narrow_s16(in[1], cospi_1_64); // Different for _8_ s1[19] = multiply_shift_and_narrow_s16(in[7], -cospi_25_64); s1[28] = multiply_shift_and_narrow_s16(in[7], cospi_7_64); s1[20] = multiply_shift_and_narrow_s16(in[5], cospi_27_64); s1[27] = multiply_shift_and_narrow_s16(in[5], cospi_5_64); s1[23] = multiply_shift_and_narrow_s16(in[3], -cospi_29_64); s1[24] = multiply_shift_and_narrow_s16(in[3], cospi_3_64); // stage 2 s2[8] = multiply_shift_and_narrow_s16(in[2], cospi_30_64); s2[15] = multiply_shift_and_narrow_s16(in[2], cospi_2_64); s2[11] = multiply_shift_and_narrow_s16(in[6], -cospi_26_64); s2[12] = multiply_shift_and_narrow_s16(in[6], cospi_6_64); // stage 3 s1[4] = multiply_shift_and_narrow_s16(in[4], cospi_28_64); s1[7] = multiply_shift_and_narrow_s16(in[4], cospi_4_64); s1[17] = multiply_accumulate_shift_and_narrow_s16(s1[16], -cospi_4_64, s1[31], cospi_28_64); s1[30] = multiply_accumulate_shift_and_narrow_s16(s1[16], cospi_28_64, s1[31], cospi_4_64); // Different for _8_ s1[18] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_28_64, s1[28], -cospi_4_64); s1[29] = multiply_accumulate_shift_and_narrow_s16(s1[19], -cospi_4_64, s1[28], cospi_28_64); s1[21] = multiply_accumulate_shift_and_narrow_s16(s1[20], -cospi_20_64, s1[27], cospi_12_64); s1[26] = multiply_accumulate_shift_and_narrow_s16(s1[20], cospi_12_64, s1[27], cospi_20_64); s1[22] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_12_64, s1[24], -cospi_20_64); s1[25] = multiply_accumulate_shift_and_narrow_s16(s1[23], -cospi_20_64, s1[24], cospi_12_64); // stage 4 s1[0] = multiply_shift_and_narrow_s16(in[0], cospi_16_64); s2[9] = multiply_accumulate_shift_and_narrow_s16(s2[8], -cospi_8_64, s2[15], cospi_24_64); s2[14] = multiply_accumulate_shift_and_narrow_s16(s2[8], cospi_24_64, s2[15], cospi_8_64); s2[10] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_24_64, s2[12], -cospi_8_64); s2[13] = multiply_accumulate_shift_and_narrow_s16(s2[11], -cospi_8_64, s2[12], cospi_24_64); s2[16] = vaddq_s16(s1[16], s1[19]); s2[17] = vaddq_s16(s1[17], s1[18]); s2[18] = vsubq_s16(s1[17], s1[18]); s2[19] = vsubq_s16(s1[16], s1[19]); s2[20] = vsubq_s16(s1[23], s1[20]); s2[21] = vsubq_s16(s1[22], s1[21]); s2[22] = vaddq_s16(s1[21], s1[22]); s2[23] = vaddq_s16(s1[20], s1[23]); s2[24] = vaddq_s16(s1[24], s1[27]); s2[25] = vaddq_s16(s1[25], s1[26]); s2[26] = vsubq_s16(s1[25], s1[26]); s2[27] = vsubq_s16(s1[24], s1[27]); s2[28] = vsubq_s16(s1[31], s1[28]); s2[29] = vsubq_s16(s1[30], s1[29]); s2[30] = vaddq_s16(s1[29], s1[30]); s2[31] = vaddq_s16(s1[28], s1[31]); // stage 5 s1[5] = sub_multiply_shift_and_narrow_s16(s1[7], s1[4], cospi_16_64); s1[6] = add_multiply_shift_and_narrow_s16(s1[4], s1[7], cospi_16_64); s1[8] = vaddq_s16(s2[8], s2[11]); s1[9] = vaddq_s16(s2[9], s2[10]); s1[10] = vsubq_s16(s2[9], s2[10]); s1[11] = vsubq_s16(s2[8], s2[11]); s1[12] = vsubq_s16(s2[15], s2[12]); s1[13] = vsubq_s16(s2[14], s2[13]); s1[14] = vaddq_s16(s2[13], s2[14]); s1[15] = vaddq_s16(s2[12], s2[15]); s1[18] = multiply_accumulate_shift_and_narrow_s16(s2[18], -cospi_8_64, s2[29], cospi_24_64); s1[29] = multiply_accumulate_shift_and_narrow_s16(s2[18], cospi_24_64, s2[29], cospi_8_64); s1[19] = 
multiply_accumulate_shift_and_narrow_s16(s2[19], -cospi_8_64, s2[28], cospi_24_64); s1[28] = multiply_accumulate_shift_and_narrow_s16(s2[19], cospi_24_64, s2[28], cospi_8_64); s1[20] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_24_64, s2[27], -cospi_8_64); s1[27] = multiply_accumulate_shift_and_narrow_s16(s2[20], -cospi_8_64, s2[27], cospi_24_64); s1[21] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_24_64, s2[26], -cospi_8_64); s1[26] = multiply_accumulate_shift_and_narrow_s16(s2[21], -cospi_8_64, s2[26], cospi_24_64); // stage 6 s2[0] = vaddq_s16(s1[0], s1[7]); s2[1] = vaddq_s16(s1[0], s1[6]); s2[2] = vaddq_s16(s1[0], s1[5]); s2[3] = vaddq_s16(s1[0], s1[4]); s2[4] = vsubq_s16(s1[0], s1[4]); s2[5] = vsubq_s16(s1[0], s1[5]); s2[6] = vsubq_s16(s1[0], s1[6]); s2[7] = vsubq_s16(s1[0], s1[7]); s2[10] = sub_multiply_shift_and_narrow_s16(s1[13], s1[10], cospi_16_64); s2[13] = add_multiply_shift_and_narrow_s16(s1[10], s1[13], cospi_16_64); s2[11] = sub_multiply_shift_and_narrow_s16(s1[12], s1[11], cospi_16_64); s2[12] = add_multiply_shift_and_narrow_s16(s1[11], s1[12], cospi_16_64); s1[16] = vaddq_s16(s2[16], s2[23]); s1[17] = vaddq_s16(s2[17], s2[22]); s2[18] = vaddq_s16(s1[18], s1[21]); s2[19] = vaddq_s16(s1[19], s1[20]); s2[20] = vsubq_s16(s1[19], s1[20]); s2[21] = vsubq_s16(s1[18], s1[21]); s1[22] = vsubq_s16(s2[17], s2[22]); s1[23] = vsubq_s16(s2[16], s2[23]); s3[24] = vsubq_s16(s2[31], s2[24]); s3[25] = vsubq_s16(s2[30], s2[25]); s3[26] = vsubq_s16(s1[29], s1[26]); s3[27] = vsubq_s16(s1[28], s1[27]); s2[28] = vaddq_s16(s1[27], s1[28]); s2[29] = vaddq_s16(s1[26], s1[29]); s2[30] = vaddq_s16(s2[25], s2[30]); s2[31] = vaddq_s16(s2[24], s2[31]); // stage 7 s1[0] = vaddq_s16(s2[0], s1[15]); s1[1] = vaddq_s16(s2[1], s1[14]); s1[2] = vaddq_s16(s2[2], s2[13]); s1[3] = vaddq_s16(s2[3], s2[12]); s1[4] = vaddq_s16(s2[4], s2[11]); s1[5] = vaddq_s16(s2[5], s2[10]); s1[6] = vaddq_s16(s2[6], s1[9]); s1[7] = vaddq_s16(s2[7], s1[8]); s1[8] = vsubq_s16(s2[7], s1[8]); s1[9] = vsubq_s16(s2[6], s1[9]); s1[10] = vsubq_s16(s2[5], s2[10]); s1[11] = vsubq_s16(s2[4], s2[11]); s1[12] = vsubq_s16(s2[3], s2[12]); s1[13] = vsubq_s16(s2[2], s2[13]); s1[14] = vsubq_s16(s2[1], s1[14]); s1[15] = vsubq_s16(s2[0], s1[15]); s1[20] = sub_multiply_shift_and_narrow_s16(s3[27], s2[20], cospi_16_64); s1[27] = add_multiply_shift_and_narrow_s16(s2[20], s3[27], cospi_16_64); s1[21] = sub_multiply_shift_and_narrow_s16(s3[26], s2[21], cospi_16_64); s1[26] = add_multiply_shift_and_narrow_s16(s2[21], s3[26], cospi_16_64); s2[22] = sub_multiply_shift_and_narrow_s16(s3[25], s1[22], cospi_16_64); s1[25] = add_multiply_shift_and_narrow_s16(s1[22], s3[25], cospi_16_64); s2[23] = sub_multiply_shift_and_narrow_s16(s3[24], s1[23], cospi_16_64); s1[24] = add_multiply_shift_and_narrow_s16(s1[23], s3[24], cospi_16_64); // final stage out[0] = final_add(s1[0], s2[31]); out[1] = final_add(s1[1], s2[30]); out[2] = final_add(s1[2], s2[29]); out[3] = final_add(s1[3], s2[28]); out[4] = final_add(s1[4], s1[27]); out[5] = final_add(s1[5], s1[26]); out[6] = final_add(s1[6], s1[25]); out[7] = final_add(s1[7], s1[24]); out[8] = final_add(s1[8], s2[23]); out[9] = final_add(s1[9], s2[22]); out[10] = final_add(s1[10], s1[21]); out[11] = final_add(s1[11], s1[20]); out[12] = final_add(s1[12], s2[19]); out[13] = final_add(s1[13], s2[18]); out[14] = final_add(s1[14], s1[17]); out[15] = final_add(s1[15], s1[16]); out[16] = final_sub(s1[15], s1[16]); out[17] = final_sub(s1[14], s1[17]); out[18] = final_sub(s1[13], s2[18]); out[19] = 
final_sub(s1[12], s2[19]); out[20] = final_sub(s1[11], s1[20]); out[21] = final_sub(s1[10], s1[21]); out[22] = final_sub(s1[9], s2[22]); out[23] = final_sub(s1[8], s2[23]); out[24] = final_sub(s1[7], s1[24]); out[25] = final_sub(s1[6], s1[25]); out[26] = final_sub(s1[5], s1[26]); out[27] = final_sub(s1[4], s1[27]); out[28] = final_sub(s1[3], s2[28]); out[29] = final_sub(s1[2], s2[29]); out[30] = final_sub(s1[1], s2[30]); out[31] = final_sub(s1[0], s2[31]); if (highbd_flag) { highbd_add_and_store_bd8(out, output, stride); } else { uint8_t *const outputT = (uint8_t *)output; add_and_store_u8_s16(out[0], out[1], out[2], out[3], out[4], out[5], out[6], out[7], outputT, stride); add_and_store_u8_s16(out[8], out[9], out[10], out[11], out[12], out[13], out[14], out[15], outputT + (8 * stride), stride); add_and_store_u8_s16(out[16], out[17], out[18], out[19], out[20], out[21], out[22], out[23], outputT + (16 * stride), stride); add_and_store_u8_s16(out[24], out[25], out[26], out[27], out[28], out[29], out[30], out[31], outputT + (24 * stride), stride); } }
void vp8_short_fdct8x4_neon( int16_t *input, int16_t *output, int pitch) { int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; int16x4_t d16s16, d17s16, d26s16, d27s16, d28s16, d29s16; uint16x4_t d28u16, d29u16; uint16x8_t q14u16; int16x8_t q0s16, q1s16, q2s16, q3s16; int16x8_t q11s16, q12s16, q13s16, q14s16, q15s16, qEmptys16; int32x4_t q9s32, q10s32, q11s32, q12s32; int16x8x2_t v2tmp0, v2tmp1; int32x4x2_t v2tmp2, v2tmp3; d16s16 = vdup_n_s16(5352); d17s16 = vdup_n_s16(2217); q9s32 = vdupq_n_s32(14500); q10s32 = vdupq_n_s32(7500); // Part one pitch >>= 1; q0s16 = vld1q_s16(input); input += pitch; q1s16 = vld1q_s16(input); input += pitch; q2s16 = vld1q_s16(input); input += pitch; q3s16 = vld1q_s16(input); v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16), vreinterpretq_s32_s16(q2s16)); v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16), vreinterpretq_s32_s16(q3s16)); v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0 vreinterpretq_s16_s32(v2tmp3.val[0])); // q1 v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2 vreinterpretq_s16_s32(v2tmp3.val[1])); // q3 q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]); q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]); q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]); q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]); q11s16 = vshlq_n_s16(q11s16, 3); q12s16 = vshlq_n_s16(q12s16, 3); q13s16 = vshlq_n_s16(q13s16, 3); q14s16 = vshlq_n_s16(q14s16, 3); q0s16 = vaddq_s16(q11s16, q12s16); q2s16 = vsubq_s16(q11s16, q12s16); q11s32 = q9s32; q12s32 = q10s32; d26s16 = vget_low_s16(q13s16); d27s16 = vget_high_s16(q13s16); d28s16 = vget_low_s16(q14s16); d29s16 = vget_high_s16(q14s16); q9s32 = vmlal_s16(q9s32, d28s16, d16s16); q10s32 = vmlal_s16(q10s32, d28s16, d17s16); q11s32 = vmlal_s16(q11s32, d29s16, d16s16); q12s32 = vmlal_s16(q12s32, d29s16, d17s16); q9s32 = vmlal_s16(q9s32, d26s16, d17s16); q10s32 = vmlsl_s16(q10s32, d26s16, d16s16); q11s32 = vmlal_s16(q11s32, d27s16, d17s16); q12s32 = vmlsl_s16(q12s32, d27s16, d16s16); d2s16 = vshrn_n_s32(q9s32, 12); d6s16 = vshrn_n_s32(q10s32, 12); d3s16 = vshrn_n_s32(q11s32, 12); d7s16 = vshrn_n_s32(q12s32, 12); q1s16 = vcombine_s16(d2s16, d3s16); q3s16 = vcombine_s16(d6s16, d7s16); // Part two q9s32 = vdupq_n_s32(12000); q10s32 = vdupq_n_s32(51000); v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16), vreinterpretq_s32_s16(q2s16)); v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16), vreinterpretq_s32_s16(q3s16)); v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0 vreinterpretq_s16_s32(v2tmp3.val[0])); // q1 v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2 vreinterpretq_s16_s32(v2tmp3.val[1])); // q3 q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]); q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]); q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]); q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]); q15s16 = vdupq_n_s16(7); q11s16 = vaddq_s16(q11s16, q15s16); q0s16 = vaddq_s16(q11s16, q12s16); q1s16 = vsubq_s16(q11s16, q12s16); q11s32 = q9s32; q12s32 = q10s32; d0s16 = vget_low_s16(q0s16); d1s16 = vget_high_s16(q0s16); d2s16 = vget_low_s16(q1s16); d3s16 = vget_high_s16(q1s16); d0s16 = vshr_n_s16(d0s16, 4); d4s16 = vshr_n_s16(d1s16, 4); d2s16 = vshr_n_s16(d2s16, 4); d6s16 = vshr_n_s16(d3s16, 4); d26s16 = vget_low_s16(q13s16); d27s16 = vget_high_s16(q13s16); d28s16 = vget_low_s16(q14s16); d29s16 = vget_high_s16(q14s16); q9s32 = vmlal_s16(q9s32, d28s16, d16s16); q10s32 = vmlal_s16(q10s32, d28s16, d17s16); q11s32 = vmlal_s16(q11s32, d29s16, d16s16); q12s32 = vmlal_s16(q12s32, d29s16, 
d17s16); q9s32 = vmlal_s16(q9s32, d26s16, d17s16); q10s32 = vmlsl_s16(q10s32, d26s16, d16s16); q11s32 = vmlal_s16(q11s32, d27s16, d17s16); q12s32 = vmlsl_s16(q12s32, d27s16, d16s16); d1s16 = vshrn_n_s32(q9s32, 16); d3s16 = vshrn_n_s32(q10s32, 16); d5s16 = vshrn_n_s32(q11s32, 16); d7s16 = vshrn_n_s32(q12s32, 16); qEmptys16 = vdupq_n_s16(0); q14u16 = vceqq_s16(q14s16, qEmptys16); q14u16 = vmvnq_u16(q14u16); d28u16 = vget_low_u16(q14u16); d29u16 = vget_high_u16(q14u16); d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d28u16)); d5s16 = vsub_s16(d5s16, vreinterpret_s16_u16(d29u16)); q0s16 = vcombine_s16(d0s16, d1s16); q1s16 = vcombine_s16(d2s16, d3s16); q2s16 = vcombine_s16(d4s16, d5s16); q3s16 = vcombine_s16(d6s16, d7s16); vst1q_s16(output, q0s16); vst1q_s16(output + 8, q1s16); vst1q_s16(output + 16, q2s16); vst1q_s16(output + 24, q3s16); return; }
void vpx_fdct4x4_neon(const int16_t *input, tran_low_t *final_output, int stride) { int i; // input[M * stride] * 16 int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4); int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4); int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4); int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4); // If the very first value != 0, then add 1. if (input[0] != 0) { const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1)); input_0 = vadd_s16(input_0, one); } for (i = 0; i < 2; ++i) { const int16x8_t input_01 = vcombine_s16(input_0, input_1); const int16x8_t input_32 = vcombine_s16(input_3, input_2); // in_0 +/- in_3, in_1 +/- in_2 const int16x8_t s_01 = vaddq_s16(input_01, input_32); const int16x8_t s_32 = vsubq_s16(input_01, input_32); // step_0 +/- step_1, step_2 +/- step_3 const int16x4_t s_0 = vget_low_s16(s_01); const int16x4_t s_1 = vget_high_s16(s_01); const int16x4_t s_2 = vget_high_s16(s_32); const int16x4_t s_3 = vget_low_s16(s_32); // (s_0 +/- s_1) * cospi_16_64 // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c. const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1); const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1); const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64); const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64); // fdct_round_shift int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS); int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS); // s_3 * cospi_8_64 + s_2 * cospi_24_64 // s_3 * cospi_24_64 - s_2 * cospi_8_64 const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64); const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64); const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64); const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64); // fdct_round_shift int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS); int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS); transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3); input_0 = out_0; input_1 = out_1; input_2 = out_2; input_3 = out_3; } { // Not quite a rounding shift. Only add 1 despite shifting by 2. const int16x8_t one = vdupq_n_s16(1); int16x8_t out_01 = vcombine_s16(input_0, input_1); int16x8_t out_23 = vcombine_s16(input_2, input_3); out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2); out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2); store_s16q_to_tran_low(final_output + 0 * 8, out_01); store_s16q_to_tran_low(final_output + 1 * 8, out_23); } }
void vpx_idct8x8_12_add_neon( int16_t *input, uint8_t *dest, int dest_stride) { uint8_t *d1, *d2; uint8x8_t d0u8, d1u8, d2u8, d3u8; int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; int16x4_t d26s16, d27s16, d28s16, d29s16; uint64x1_t d0u64, d1u64, d2u64, d3u64; int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; uint16x8_t q8u16, q9u16, q10u16, q11u16; int32x4_t q9s32, q10s32, q11s32, q12s32; q8s16 = vld1q_s16(input); q9s16 = vld1q_s16(input + 8); q10s16 = vld1q_s16(input + 16); q11s16 = vld1q_s16(input + 24); q12s16 = vld1q_s16(input + 32); q13s16 = vld1q_s16(input + 40); q14s16 = vld1q_s16(input + 48); q15s16 = vld1q_s16(input + 56); TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); // First transform rows // stage 1 q0s16 = vdupq_n_s16(cospi_28_64 * 2); q1s16 = vdupq_n_s16(cospi_4_64 * 2); q4s16 = vqrdmulhq_s16(q9s16, q0s16); q0s16 = vdupq_n_s16(-cospi_20_64 * 2); q7s16 = vqrdmulhq_s16(q9s16, q1s16); q1s16 = vdupq_n_s16(cospi_12_64 * 2); q5s16 = vqrdmulhq_s16(q11s16, q0s16); q0s16 = vdupq_n_s16(cospi_16_64 * 2); q6s16 = vqrdmulhq_s16(q11s16, q1s16); // stage 2 & stage 3 - even half q1s16 = vdupq_n_s16(cospi_24_64 * 2); q9s16 = vqrdmulhq_s16(q8s16, q0s16); q0s16 = vdupq_n_s16(cospi_8_64 * 2); q13s16 = vqrdmulhq_s16(q10s16, q1s16); q15s16 = vqrdmulhq_s16(q10s16, q0s16); // stage 3 -odd half q0s16 = vaddq_s16(q9s16, q15s16); q1s16 = vaddq_s16(q9s16, q13s16); q2s16 = vsubq_s16(q9s16, q13s16); q3s16 = vsubq_s16(q9s16, q15s16); // stage 2 - odd half q13s16 = vsubq_s16(q4s16, q5s16); q4s16 = vaddq_s16(q4s16, q5s16); q14s16 = vsubq_s16(q7s16, q6s16); q7s16 = vaddq_s16(q7s16, q6s16); d26s16 = vget_low_s16(q13s16); d27s16 = vget_high_s16(q13s16); d28s16 = vget_low_s16(q14s16); d29s16 = vget_high_s16(q14s16); d16s16 = vdup_n_s16(cospi_16_64); q9s32 = vmull_s16(d28s16, d16s16); q10s32 = vmull_s16(d29s16, d16s16); q11s32 = vmull_s16(d28s16, d16s16); q12s32 = vmull_s16(d29s16, d16s16); q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); q11s32 = vmlal_s16(q11s32, d26s16, d16s16); q12s32 = vmlal_s16(q12s32, d27s16, d16s16); d10s16 = vqrshrn_n_s32(q9s32, 14); d11s16 = vqrshrn_n_s32(q10s32, 14); d12s16 = vqrshrn_n_s32(q11s32, 14); d13s16 = vqrshrn_n_s32(q12s32, 14); q5s16 = vcombine_s16(d10s16, d11s16); q6s16 = vcombine_s16(d12s16, d13s16); // stage 4 q8s16 = vaddq_s16(q0s16, q7s16); q9s16 = vaddq_s16(q1s16, q6s16); q10s16 = vaddq_s16(q2s16, q5s16); q11s16 = vaddq_s16(q3s16, q4s16); q12s16 = vsubq_s16(q3s16, q4s16); q13s16 = vsubq_s16(q2s16, q5s16); q14s16 = vsubq_s16(q1s16, q6s16); q15s16 = vsubq_s16(q0s16, q7s16); TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); q8s16 = vrshrq_n_s16(q8s16, 5); q9s16 = vrshrq_n_s16(q9s16, 5); q10s16 = vrshrq_n_s16(q10s16, 5); q11s16 = vrshrq_n_s16(q11s16, 5); q12s16 = vrshrq_n_s16(q12s16, 5); q13s16 = vrshrq_n_s16(q13s16, 5); q14s16 = vrshrq_n_s16(q14s16, 5); q15s16 = vrshrq_n_s16(q15s16, 5); d1 = d2 = dest; d0u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d1u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d2u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d3u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); q10u16 = 
vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); d2 += dest_stride; q8s16 = q12s16; q9s16 = q13s16; q10s16 = q14s16; q11s16 = q15s16; d0u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d1u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d2u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d3u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); d2 += dest_stride; return; }
void yuv422rgb_neon_int(const unsigned char * sourcep, int source_byte_count, unsigned char * destp) { const unsigned char *source_endp; const unsigned char *vector_endp; int remainder; const int16x8_t u_coeff = {0, -22, 113, 0, 0, -22, 113, 0}; const int16x8_t v_coeff = {90, -46, 0, 0, 90, -46, 0, 0}; const uint8x8_t zeroalpha = {0x0, 0x0, 0x0, 0xFF, 0x0, 0x0, 0x0, 0xFF}; const int16x8_t uvbias = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; int16x8_t mp0_rgba; /* macropixel 0's resulting RGBA RGBA pixels */ int16x8_t mp1_rgba; /* macropixel 1's resulting RGBA RGBA pixels */ uint8x8_t rawpixels; /* source pixels as {[YUYV]0 [YUYV]1} */ uint8x8_t rgba0, rgba1; /* rgba values as bytes */ uint8x16_t bothrgba; uint8_t * destinationp; /* pointer into output buffer destp */ int16x8_t widerpixels; /* rawpixels promoted to shorts per component */ const uint8x8_t yselect = {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}; /* we're working with things in 4-byte macropixels */ remainder = source_byte_count % 4; source_endp = sourcep + source_byte_count; vector_endp = source_endp - remainder; destinationp = (uint8_t *)destp; while (sourcep < vector_endp) { /* pull YUYV from 2 four byte macropixels starting at sourcep. */ /* we'll increment sourcep as we go to save the array dereference */ /* and separate increment instruction at the end of the loop */ /* load rawpixels with {[YUYV]0 [YUYV]1 } with byte components */ rawpixels = vld1_u8(sourcep); sourcep += sizeof(rawpixels); widerpixels = vreinterpretq_s16_u16(vmovl_u8(rawpixels)); /* ---------- process macropixel 0 --------------- */ /* take macropixel zero ([YUYV]0) from rawpixels and */ /* compute the two RGBA pixels that come from it. store */ /* those two pixels in mp0_rgba */ { int16x8_t wider_yalpha; int16x8_t u_vec, v_vec, uv_vec; uint8x8_t narrow_yalpha; uint8x8_t y0_vec, y1_vec; int16x4_t yuyv; /* narrow_yalpha is drawn from [YUYV]0 and formed into */ /* {Y0, Y0, Y0, alpha, Y1, Y1, Y1, alpha} */ /* this would have been a nice place for vtbx1_u8, but i */ /* can't get it to work. so i'll have to use vbsl_u8 instead. */ y0_vec = vdup_lane_u8(rawpixels, MP0_Y0); y1_vec = vdup_lane_u8(rawpixels, MP0_Y1); narrow_yalpha = vbsl_u8(yselect, y0_vec, y1_vec); /* store ALPHA in elements 3 and 7 (after the RGB components) */ narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 3); narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 7); /* use vmovl_u8 to go from being unsigned 8-bit to */ /* unsigned 16-bit, the use vreinterpretq_s16_u16 to */ /* change interpretation from unsigned 16-bit to signed */ /* 16-bit. */ wider_yalpha = vreinterpretq_s16_u16(vmovl_u8(narrow_yalpha)); yuyv = vget_low_s16(widerpixels); /* form a vector of the U component from MP0 */ u_vec = vdupq_lane_s16(yuyv, MP0_U); /* subtract uvbias from u_vec */ u_vec = vsubq_s16(u_vec, uvbias); /* form a vector of the V component from MP0 */ v_vec = vdupq_lane_s16(yuyv, MP0_V); /* subtract uvbias from v_vec */ v_vec = vsubq_s16(v_vec, uvbias); /* Multiply eight 16-bit values in u_vec by eight 16-bit */ /* values in u_coeff and store the results in u_vec. 
*/ u_vec = vmulq_s16(u_vec, u_coeff); /* likewise multiply eight 16-bit values in v_vec by */ /* v_coeff and store the results in v_vec */ v_vec = vmulq_s16(v_vec, v_coeff); /* form uv_vec as the sum of u_vec & v_vec, then shift 6 places */ /* (dividing by 64) */ uv_vec = vaddq_s16(u_vec, v_vec); uv_vec = vshrq_n_s16(uv_vec, 6); /* now mp0_rgba = y_vec + u_vec + v_vec */ mp0_rgba = vaddq_s16(wider_yalpha, uv_vec); } /* ---------- process macropixel 1 --------------- */ /* take macropixel one ([YUYV]1) from rawpixels and */ /* compute the two RGBA pixels that come from it. store */ /* those two pixels in mp1_rgba */ { int16x8_t wider_yalpha; int16x8_t u_vec, v_vec, uv_vec; uint8x8_t narrow_yalpha; uint8x8_t y0_vec, y1_vec; int16x4_t yuyv; /* narrow_yalpha is drawn from [YUYV]1 and formed into */ /* {Y0, Y0, Y0, alpha, Y1, Y1, Y1, alpha} */ /* this would have been a nice place for vtbx1_u8, but i */ /* can't get it to work. so i'll have to use vbsl_u8 instead. */ y0_vec = vdup_lane_u8(rawpixels, MP1_Y0); y1_vec = vdup_lane_u8(rawpixels, MP1_Y1); narrow_yalpha = vbsl_u8(yselect, y0_vec, y1_vec); narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 3); narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 7); /* use vmovl_u8 to go from being unsigned 8-bit to */ /* unsigned 16-bit, the use vreinterpretq_s16_u16 to */ wider_yalpha = vreinterpretq_s16_u16(vmovl_u8(narrow_yalpha)); yuyv = vget_high_s16(widerpixels); u_vec = vdupq_lane_s16(yuyv, 1); u_vec = vsubq_s16(u_vec, uvbias); v_vec = vdupq_lane_s16(yuyv, 3); v_vec = vsubq_s16(v_vec, uvbias); /* Multiply eight 16-bit values in u_vec by eight 16-bit */ /* values in u_coeff and store the results in u_vec. */ u_vec = vmulq_s16(u_vec, u_coeff); /* likewise multiply eight 16-bit values in v_vec by */ /* v_coeff and store the results in v_vec */ v_vec = vmulq_s16(v_vec, v_coeff); /* form uv_vec as the sum of u_vec & v_vec, then shift 6 places */ /* (dividing by 64) */ uv_vec = vaddq_s16(u_vec, v_vec); uv_vec = vshrq_n_s16(uv_vec, 6); /* now mp1_rgba = y_vec + u_vec + v_vec */ mp1_rgba = vaddq_s16(wider_yalpha, uv_vec); } /* turn mp0_rgba from a vector of shorts to a vector of */ /* unsigned unsigned chars. this will saturate: clipping */ /* the values between 0 and 255. */ rgba0 = vqmovun_s16(mp0_rgba); rgba1 = vqmovun_s16(mp1_rgba); /* make it faster to copy these back out of vector registers into */ /* memory by combining rgba0 and rgba1 into the larger bothrgba. */ /* then store that back into memory at destinationp. */ bothrgba = vcombine_u8(rgba0, rgba1); vst1q_u8(destinationp, bothrgba); destinationp += 16; } }
bool decode_yuv_neon(unsigned char* out, unsigned char const* y, unsigned char const* uv, int width, int height, unsigned char fill_alpha=0xff) { // pre-condition : width, height must be even if (0!=(width&1) || width<2 || 0!=(height&1) || height<2 || !out || !y || !uv) return false; // in & out pointers unsigned char* dst = out; // constants int const stride = width*trait::bytes_per_pixel; int const itHeight = height>>1; int const itWidth = width>>3; uint8x8_t const Yshift = vdup_n_u8(16); int16x8_t const half = vdupq_n_u16(128); int32x4_t const rounding = vdupq_n_s32(128); // tmp variable uint16x8_t t; // pixel block to temporary store 8 pixels typename trait::PixelBlock pblock = trait::init_pixelblock(fill_alpha); for (int j=0; j<itHeight; ++j, y+=width, dst+=stride) { for (int i=0; i<itWidth; ++i, y+=8, uv+=8, dst+=(8*trait::bytes_per_pixel)) { t = vmovl_u8(vqsub_u8(vld1_u8(y), Yshift)); int32x4_t const Y00 = vmulq_n_u32(vmovl_u16(vget_low_u16(t)), 298); int32x4_t const Y01 = vmulq_n_u32(vmovl_u16(vget_high_u16(t)), 298); t = vmovl_u8(vqsub_u8(vld1_u8(y+width), Yshift)); int32x4_t const Y10 = vmulq_n_u32(vmovl_u16(vget_low_u16(t)), 298); int32x4_t const Y11 = vmulq_n_u32(vmovl_u16(vget_high_u16(t)), 298); // trait::loadvu pack 4 sets of uv into a uint8x8_t, layout : { v0,u0, v1,u1, v2,u2, v3,u3 } t = vsubq_s16((int16x8_t)vmovl_u8(trait::loadvu(uv)), half); // UV.val[0] : v0, v1, v2, v3 // UV.val[1] : u0, u1, u2, u3 int16x4x2_t const UV = vuzp_s16(vget_low_s16(t), vget_high_s16(t)); // tR : 128+409V // tG : 128-100U-208V // tB : 128+516U int32x4_t const tR = vmlal_n_s16(rounding, UV.val[0], 409); int32x4_t const tG = vmlal_n_s16(vmlal_n_s16(rounding, UV.val[0], -208), UV.val[1], -100); int32x4_t const tB = vmlal_n_s16(rounding, UV.val[1], 516); int32x4x2_t const R = vzipq_s32(tR, tR); // [tR0, tR0, tR1, tR1] [ tR2, tR2, tR3, tR3] int32x4x2_t const G = vzipq_s32(tG, tG); // [tG0, tG0, tG1, tG1] [ tG2, tG2, tG3, tG3] int32x4x2_t const B = vzipq_s32(tB, tB); // [tB0, tB0, tB1, tB1] [ tB2, tB2, tB3, tB3] // upper 8 pixels trait::store_pixel_block(dst, pblock, vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y00)), vqmovun_s32(vaddq_s32(R.val[1], Y01))), 8), vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y00)), vqmovun_s32(vaddq_s32(G.val[1], Y01))), 8), vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y00)), vqmovun_s32(vaddq_s32(B.val[1], Y01))), 8)); // lower 8 pixels trait::store_pixel_block(dst+stride, pblock, vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y10)), vqmovun_s32(vaddq_s32(R.val[1], Y11))), 8), vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y10)), vqmovun_s32(vaddq_s32(G.val[1], Y11))), 8), vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y10)), vqmovun_s32(vaddq_s32(B.val[1], Y11))), 8)); } } return true; }
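The constants 298, 409, 100, 208 and 516 above are the common 8.8 fixed-point BT.601 coefficients, with 128 added for rounding before the final shift by 8. A scalar per-pixel sketch of the same conversion (illustrative only; the clamp stands in for the vqmovun/vshrn saturation):

// Illustrative scalar equivalent of the fixed-point conversion above
// (assumed helper names, not part of the original code).
static unsigned char clamp_u8(int v) {
  return (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void yuv_to_rgb_pixel(unsigned char Y, unsigned char U, unsigned char V,
                             unsigned char* r, unsigned char* g, unsigned char* b) {
  const int C = 298 * ((Y < 16 ? 16 : Y) - 16);  // vqsub_u8(y, Yshift) saturates at 0
  const int D = (int)U - 128;                    // centred chroma
  const int E = (int)V - 128;
  *r = clamp_u8((C + 409 * E + 128) >> 8);
  *g = clamp_u8((C - 100 * D - 208 * E + 128) >> 8);
  *b = clamp_u8((C + 516 * D + 128) >> 8);
}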
static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, int16x8_t *q10s16,
                              int16x8_t *q11s16, int16x8_t *q12s16, int16x8_t *q13s16,
                              int16x8_t *q14s16, int16x8_t *q15s16) {
  int16x4_t d0s16, d1s16, d2s16, d3s16;
  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;

  d0s16 = vdup_n_s16(cospi_28_64);
  d1s16 = vdup_n_s16(cospi_4_64);
  d2s16 = vdup_n_s16(cospi_12_64);
  d3s16 = vdup_n_s16(cospi_20_64);

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);
  d20s16 = vget_low_s16(*q10s16);
  d21s16 = vget_high_s16(*q10s16);
  d22s16 = vget_low_s16(*q11s16);
  d23s16 = vget_high_s16(*q11s16);
  d24s16 = vget_low_s16(*q12s16);
  d25s16 = vget_high_s16(*q12s16);
  d26s16 = vget_low_s16(*q13s16);
  d27s16 = vget_high_s16(*q13s16);
  d28s16 = vget_low_s16(*q14s16);
  d29s16 = vget_high_s16(*q14s16);
  d30s16 = vget_low_s16(*q15s16);
  d31s16 = vget_high_s16(*q15s16);

  q2s32 = vmull_s16(d18s16, d0s16);
  q3s32 = vmull_s16(d19s16, d0s16);
  q5s32 = vmull_s16(d26s16, d2s16);
  q6s32 = vmull_s16(d27s16, d2s16);

  q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
  q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
  q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
  q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);

  d8s16 = vqrshrn_n_s32(q2s32, 14);
  d9s16 = vqrshrn_n_s32(q3s32, 14);
  d10s16 = vqrshrn_n_s32(q5s32, 14);
  d11s16 = vqrshrn_n_s32(q6s32, 14);
  q4s16 = vcombine_s16(d8s16, d9s16);
  q5s16 = vcombine_s16(d10s16, d11s16);

  q2s32 = vmull_s16(d18s16, d1s16);
  q3s32 = vmull_s16(d19s16, d1s16);
  q9s32 = vmull_s16(d26s16, d3s16);
  q13s32 = vmull_s16(d27s16, d3s16);

  q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
  q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
  q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
  q13s32 = vmlal_s16(q13s32, d23s16, d2s16);

  d14s16 = vqrshrn_n_s32(q2s32, 14);
  d15s16 = vqrshrn_n_s32(q3s32, 14);
  d12s16 = vqrshrn_n_s32(q9s32, 14);
  d13s16 = vqrshrn_n_s32(q13s32, 14);
  q6s16 = vcombine_s16(d12s16, d13s16);
  q7s16 = vcombine_s16(d14s16, d15s16);

  d0s16 = vdup_n_s16(cospi_16_64);

  q2s32 = vmull_s16(d16s16, d0s16);
  q3s32 = vmull_s16(d17s16, d0s16);
  q13s32 = vmull_s16(d16s16, d0s16);
  q15s32 = vmull_s16(d17s16, d0s16);

  q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
  q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
  q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
  q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);

  d0s16 = vdup_n_s16(cospi_24_64);
  d1s16 = vdup_n_s16(cospi_8_64);

  d18s16 = vqrshrn_n_s32(q2s32, 14);
  d19s16 = vqrshrn_n_s32(q3s32, 14);
  d22s16 = vqrshrn_n_s32(q13s32, 14);
  d23s16 = vqrshrn_n_s32(q15s32, 14);
  *q9s16 = vcombine_s16(d18s16, d19s16);
  *q11s16 = vcombine_s16(d22s16, d23s16);

  q2s32 = vmull_s16(d20s16, d0s16);
  q3s32 = vmull_s16(d21s16, d0s16);
  q8s32 = vmull_s16(d20s16, d1s16);
  q12s32 = vmull_s16(d21s16, d1s16);

  q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
  q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
  q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
  q12s32 = vmlal_s16(q12s32, d29s16, d0s16);

  d26s16 = vqrshrn_n_s32(q2s32, 14);
  d27s16 = vqrshrn_n_s32(q3s32, 14);
  d30s16 = vqrshrn_n_s32(q8s32, 14);
  d31s16 = vqrshrn_n_s32(q12s32, 14);
  *q13s16 = vcombine_s16(d26s16, d27s16);
  *q15s16 = vcombine_s16(d30s16, d31s16);

  q0s16 = vaddq_s16(*q9s16, *q15s16);
  q1s16 = vaddq_s16(*q11s16, *q13s16);
  q2s16 = vsubq_s16(*q11s16, *q13s16);
  q3s16 = vsubq_s16(*q9s16, *q15s16);

  *q13s16 = vsubq_s16(q4s16, q5s16);
  q4s16 = vaddq_s16(q4s16, q5s16);
  *q14s16 = vsubq_s16(q7s16, q6s16);
  q7s16 = vaddq_s16(q7s16, q6s16);
  d26s16 = vget_low_s16(*q13s16);
  d27s16 = vget_high_s16(*q13s16);
  d28s16 = vget_low_s16(*q14s16);
  d29s16 = vget_high_s16(*q14s16);

  d16s16 = vdup_n_s16(cospi_16_64);

  q9s32 = vmull_s16(d28s16, d16s16);
  q10s32 = vmull_s16(d29s16, d16s16);
  q11s32 = vmull_s16(d28s16, d16s16);
  q12s32 = vmull_s16(d29s16, d16s16);

  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);

  d10s16 = vqrshrn_n_s32(q9s32, 14);
  d11s16 = vqrshrn_n_s32(q10s32, 14);
  d12s16 = vqrshrn_n_s32(q11s32, 14);
  d13s16 = vqrshrn_n_s32(q12s32, 14);
  q5s16 = vcombine_s16(d10s16, d11s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  *q8s16 = vaddq_s16(q0s16, q7s16);
  *q9s16 = vaddq_s16(q1s16, q6s16);
  *q10s16 = vaddq_s16(q2s16, q5s16);
  *q11s16 = vaddq_s16(q3s16, q4s16);
  *q12s16 = vsubq_s16(q3s16, q4s16);
  *q13s16 = vsubq_s16(q2s16, q5s16);
  *q14s16 = vsubq_s16(q1s16, q6s16);
  *q15s16 = vsubq_s16(q0s16, q7s16);
  return;
}
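The cospi_k_64 constants paired with the repeated vqrshrn_n_s32(x, 14) narrows follow one fixed-point convention: in libvpx these constants are cos(k*pi/64) scaled by 2^14, so each product of a coefficient and a 16-bit sample fits in 32 bits and is brought back to 16 bits with a rounding shift by 14. The snippet below is only a sketch of where those values come from; the real constants are precomputed integers in the codec's headers, not computed at run time.

#include <math.h>

/* sketch: how a Q14 cosine constant pairs with the ", 14" in vqrshrn_n_s32 */
static short make_cospi(int k) {  /* e.g. k = 28 -> the value behind cospi_28_64 */
  const double pi = 3.14159265358979323846;
  return (short)lrint(cos(k * pi / 64.0) * (1 << 14));
}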
static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, int16x8_t *a3,
                                 int16x8_t *a4, int16x8_t *a5, int16x8_t *a6, int16x8_t *a7) {
  const int16x8_t b0 = vaddq_s16(*a0, *a1);
  const int16x8_t b1 = vsubq_s16(*a0, *a1);
  const int16x8_t b2 = vaddq_s16(*a2, *a3);
  const int16x8_t b3 = vsubq_s16(*a2, *a3);
  const int16x8_t b4 = vaddq_s16(*a4, *a5);
  const int16x8_t b5 = vsubq_s16(*a4, *a5);
  const int16x8_t b6 = vaddq_s16(*a6, *a7);
  const int16x8_t b7 = vsubq_s16(*a6, *a7);

  const int16x8_t c0 = vaddq_s16(b0, b2);
  const int16x8_t c1 = vaddq_s16(b1, b3);
  const int16x8_t c2 = vsubq_s16(b0, b2);
  const int16x8_t c3 = vsubq_s16(b1, b3);
  const int16x8_t c4 = vaddq_s16(b4, b6);
  const int16x8_t c5 = vaddq_s16(b5, b7);
  const int16x8_t c6 = vsubq_s16(b4, b6);
  const int16x8_t c7 = vsubq_s16(b5, b7);

  *a0 = vaddq_s16(c0, c4);
  *a1 = vsubq_s16(c2, c6);
  *a2 = vsubq_s16(c0, c4);
  *a3 = vaddq_s16(c2, c6);
  *a4 = vaddq_s16(c3, c7);
  *a5 = vsubq_s16(c3, c7);
  *a6 = vsubq_s16(c1, c5);
  *a7 = vaddq_s16(c1, c5);
}
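hadamard8x8_one_pass performs a single 1-D butterfly stage across eight rows held in registers. A full 2-D 8x8 Hadamard transform runs the pass once, transposes the eight vectors, and runs it again. The driver below is only a sketch of that pattern: transpose_s16_8x8 is assumed to be an in-register 8x8 transpose helper defined elsewhere, and the output layout is simply row-major.

#include <arm_neon.h>

/* sketch: 2-D 8x8 Hadamard from two one-pass calls around a transpose;
   transpose_s16_8x8 is an assumed helper, not shown here */
static void hadamard8x8_2d_sketch(const int16_t *src, int src_stride, int16_t *coeff) {
  int16x8_t a0 = vld1q_s16(src + 0 * src_stride);
  int16x8_t a1 = vld1q_s16(src + 1 * src_stride);
  int16x8_t a2 = vld1q_s16(src + 2 * src_stride);
  int16x8_t a3 = vld1q_s16(src + 3 * src_stride);
  int16x8_t a4 = vld1q_s16(src + 4 * src_stride);
  int16x8_t a5 = vld1q_s16(src + 5 * src_stride);
  int16x8_t a6 = vld1q_s16(src + 6 * src_stride);
  int16x8_t a7 = vld1q_s16(src + 7 * src_stride);

  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);  /* column pass */
  transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);     /* assumed helper */
  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);  /* row pass */

  vst1q_s16(coeff + 0,  a0);
  vst1q_s16(coeff + 8,  a1);
  vst1q_s16(coeff + 16, a2);
  vst1q_s16(coeff + 24, a3);
  vst1q_s16(coeff + 32, a4);
  vst1q_s16(coeff + 40, a5);
  vst1q_s16(coeff + 48, a6);
  vst1q_s16(coeff + 56, a7);
}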