// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
                            const uint8_t *left, int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x16_t A0 = vld1q_u8(above);  // top row
    const uint8x16_t A1 = vld1q_u8(above + 16);
    const uint16x8_t p0 = vpaddlq_u8(A0);  // cascading summation of the top
    const uint16x8_t p1 = vpaddlq_u8(A1);
    const uint16x8_t p2 = vaddq_u16(p0, p1);
    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
    const uint16x4_t p4 = vpadd_u16(p3, p3);
    const uint16x4_t p5 = vpadd_u16(p4, p4);
    sum_top = vcombine_u16(p5, p5);
  }

  if (do_left) {
    const uint8x16_t L0 = vld1q_u8(left);  // left row
    const uint8x16_t L1 = vld1q_u8(left + 16);
    const uint16x8_t p0 = vpaddlq_u8(L0);  // cascading summation of the left
    const uint16x8_t p1 = vpaddlq_u8(L1);
    const uint16x8_t p2 = vaddq_u16(p0, p1);
    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
    const uint16x4_t p4 = vpadd_u16(p3, p3);
    const uint16x4_t p5 = vpadd_u16(p4, p4);
    sum_left = vcombine_u16(p5, p5);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 6);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 5);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 5);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 32; ++i) {
      vst1q_u8(dst + i * stride, dc);
      vst1q_u8(dst + i * stride + 16, dc);
    }
  }
}
void test_vcombineu16 (void)
{
  uint16x8_t out_uint16x8_t;
  uint16x4_t arg0_uint16x4_t;
  uint16x4_t arg1_uint16x4_t;

  out_uint16x8_t = vcombine_u16 (arg0_uint16x4_t, arg1_uint16x4_t);
}
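// A minimal standalone sketch (not part of the test suite above) showing what
// vcombine_u16 does: the first operand becomes lanes 0-3 of the result and the
// second operand becomes lanes 4-7. Runnable on any ARM target with NEON.
#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  const uint16_t lo_data[4] = { 1, 2, 3, 4 };
  const uint16_t hi_data[4] = { 5, 6, 7, 8 };
  const uint16x8_t all = vcombine_u16(vld1_u16(lo_data), vld1_u16(hi_data));

  uint16_t out[8];
  vst1q_u16(out, all);
  for (int i = 0; i < 8; ++i) printf("%u ", out[i]);  // prints: 1 2 3 4 5 6 7 8
  printf("\n");
  return 0;
}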
// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
                            const uint8_t *left, int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x16_t A = vld1q_u8(above);  // top row
    const uint16x8_t p0 = vpaddlq_u8(A);   // cascading summation of the top
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_top = vcombine_u16(p3, p3);
  }

  if (do_left) {
    const uint8x16_t L = vld1q_u8(left);  // left row
    const uint16x8_t p0 = vpaddlq_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_left = vcombine_u16(p3, p3);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 5);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 4);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 4);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 16; ++i) {
      vst1q_u8(dst + i * stride, dc);
    }
  }
}
// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
                          const uint8_t *left, int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x8_t A = vld1_u8(above);  // top row
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_top = vcombine_u16(p2, p2);
  }

  if (do_left) {
    const uint8x8_t L = vld1_u8(left);   // left border
    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_left = vcombine_u16(p2, p2);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 4);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 3);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 3);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 8; ++i) {
      vst1_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc));
    }
  }
}
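// The three dc_NxN kernels above share one idiom: a "cascading summation" that
// folds a row into a single total with vpaddl/vpadd, leaving the sum in every
// lane so lane 0 can be broadcast afterwards. A minimal sketch of just that
// step, under a hypothetical helper name:
#include <arm_neon.h>

static inline uint16_t horizontal_sum_u8x8(const uint8x8_t v) {
  const uint16x4_t p0 = vpaddl_u8(v);       // widen and add adjacent pairs: 8x8 -> 4x16
  const uint16x4_t p1 = vpadd_u16(p0, p0);  // fold 4 lanes down to 2 distinct partial sums
  const uint16x4_t p2 = vpadd_u16(p1, p1);  // every lane now holds the full sum
  return vget_lane_u16(p2, 0);
}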
// res is in reverse row order
static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest, const int stride,
                                                const int16x8_t res,
                                                const int16x8_t max) {
  const uint16x4_t a0 = vld1_u16(*dest);
  const uint16x4_t a1 = vld1_u16(*dest + stride);
  const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a1, a0));
  // Note: In some profile tests, res is quite close to +/-32767.
  // We use saturating addition.
  const int16x8_t b = vqaddq_s16(res, a);
  const int16x8_t c = vminq_s16(b, max);
  const uint16x8_t d = vqshluq_n_s16(c, 0);
  vst1_u16(*dest, vget_high_u16(d));
  *dest += stride;
  vst1_u16(*dest, vget_low_u16(d));
  *dest += stride;
}
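// Scalar model (an illustration, not libvpx code) of the clamp performed above:
// vqaddq_s16 saturates the sum, vminq_s16 caps it at the bit-depth maximum, and
// vqshluq_n_s16(c, 0) converts to unsigned while saturating negatives to zero,
// so the net effect per pixel is a clamp to [0, max].
#include <stdint.h>

static inline uint16_t clamp_pixel(int32_t sum, int16_t max) {
  if (sum < 0) return 0;
  if (sum > max) return (uint16_t)max;
  return (uint16_t)sum;
}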
void compare_neon_ge(float *psrc1, float src2, uchar *pdst, int size)
{
    int remainder = size - 7;
    float32x4_t vsrc2 = vdupq_n_f32(src2);

    int i = 0;
    for (; i < remainder; i += 8) {
        float32x4_t vsrc1_32x4 = vld1q_f32(psrc1 + i);
        float32x4_t vsrc2_32x4 = vld1q_f32(psrc1 + i + 4);
        uint32x4_t vdst1 = vcgeq_f32(vsrc1_32x4, vsrc2);
        uint32x4_t vdst2 = vcgeq_f32(vsrc2_32x4, vsrc2);
        uint16x4_t vdst1_16x4 = vmovn_u32(vdst1);
        uint16x4_t vdst2_16x4 = vmovn_u32(vdst2);
        uint16x8_t vdst_16x8 = vcombine_u16(vdst1_16x4, vdst2_16x4);
        uint8x8_t vdst_8x8 = vmovn_u16(vdst_16x8);
        vst1_u8(pdst + i, vdst_8x8);
    }
    for (; i < size; i++) {
        pdst[i] = (psrc1[i] >= src2) ? 255 : 0;
    }
}
static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R, const uint8x8_t G,
                                    const uint8x8_t B) {
  const uint16x8_t r = vmovl_u8(R);
  const uint16x8_t g = vmovl_u8(G);
  const uint16x8_t b = vmovl_u8(B);
  const uint16x4_t r_lo = vget_low_u16(r);
  const uint16x4_t r_hi = vget_high_u16(r);
  const uint16x4_t g_lo = vget_low_u16(g);
  const uint16x4_t g_hi = vget_high_u16(g);
  const uint16x4_t b_lo = vget_low_u16(b);
  const uint16x4_t b_hi = vget_high_u16(b);
  const uint32x4_t tmp0_lo = vmull_n_u16(r_lo, 16839u);
  const uint32x4_t tmp0_hi = vmull_n_u16(r_hi, 16839u);
  const uint32x4_t tmp1_lo = vmlal_n_u16(tmp0_lo, g_lo, 33059u);
  const uint32x4_t tmp1_hi = vmlal_n_u16(tmp0_hi, g_hi, 33059u);
  const uint32x4_t tmp2_lo = vmlal_n_u16(tmp1_lo, b_lo, 6420u);
  const uint32x4_t tmp2_hi = vmlal_n_u16(tmp1_hi, b_hi, 6420u);
  const uint16x8_t Y1 = vcombine_u16(vrshrn_n_u32(tmp2_lo, 16),
                                     vrshrn_n_u32(tmp2_hi, 16));
  const uint16x8_t Y2 = vaddq_u16(Y1, vdupq_n_u16(16));
  return vqmovn_u16(Y2);
}
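// Scalar reference (an illustration, not libwebp code) for the fixed-point
// constants above: they are 16.16 approximations of the BT.601 studio-swing
// luma weights, and 16839 + 33059 + 6420 = 56318 ~= (219/255) * 65536, so a
// white input maps to roughly 219 + 16 = 235.
#include <stdint.h>

static uint8_t ConvertRGBToY_scalar(uint8_t r, uint8_t g, uint8_t b) {
  const uint32_t luma = 16839u * r + 33059u * g + 6420u * b;
  const uint32_t y = ((luma + (1u << 15)) >> 16) + 16u;  // vrshrn_n_u32(., 16), then + 16
  return (y > 255u) ? 255u : (uint8_t)y;                 // vqmovn_u16 saturation
}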
inline void vnst(u8* dst, uint32x4_t v1, uint32x4_t v2)
{
    vst1_u8(dst, vmovn_u16(vcombine_u16(vmovn_u32(v1), vmovn_u32(v2))));
}
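// Usage sketch for vnst (assumes the inline above is in scope; the typedef
// below mirrors the library's u8). Note that vmovn_u32/vmovn_u16 truncate
// rather than saturate, so every input lane must already fit in 8 bits.
#include <arm_neon.h>
#include <stdint.h>

typedef unsigned char u8;

void narrow_eight_u32_example(const uint32_t *src, u8 *dst) {
    vnst(dst, vld1q_u32(src), vld1q_u32(src + 4));  // stores 8 bytes at dst
}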
bool decode_yuv_neon(unsigned char* out, unsigned char const* y, unsigned char const* uv,
                     int width, int height, unsigned char fill_alpha=0xff)
{
    // pre-condition : width and height must be even
    if (0!=(width&1) || width<2 || 0!=(height&1) || height<2 || !out || !y || !uv)
        return false;

    // in & out pointers
    unsigned char* dst = out;

    // constants
    int const stride = width*trait::bytes_per_pixel;
    int const itHeight = height>>1;
    int const itWidth = width>>3;

    uint8x8_t const Yshift = vdup_n_u8(16);
    int16x8_t const half = vdupq_n_s16(128);
    int32x4_t const rounding = vdupq_n_s32(128);

    // tmp variable
    uint16x8_t t;

    // pixel block to temporarily store 8 pixels
    typename trait::PixelBlock pblock = trait::init_pixelblock(fill_alpha);

    for (int j=0; j<itHeight; ++j, y+=width, dst+=stride) {
        for (int i=0; i<itWidth; ++i, y+=8, uv+=8, dst+=(8*trait::bytes_per_pixel)) {
            t = vmovl_u8(vqsub_u8(vld1_u8(y), Yshift));
            int32x4_t const Y00 = vreinterpretq_s32_u32(vmulq_n_u32(vmovl_u16(vget_low_u16(t)), 298));
            int32x4_t const Y01 = vreinterpretq_s32_u32(vmulq_n_u32(vmovl_u16(vget_high_u16(t)), 298));

            t = vmovl_u8(vqsub_u8(vld1_u8(y+width), Yshift));
            int32x4_t const Y10 = vreinterpretq_s32_u32(vmulq_n_u32(vmovl_u16(vget_low_u16(t)), 298));
            int32x4_t const Y11 = vreinterpretq_s32_u32(vmulq_n_u32(vmovl_u16(vget_high_u16(t)), 298));

            // trait::loadvu packs 4 sets of uv into a uint8x8_t, layout : { v0,u0, v1,u1, v2,u2, v3,u3 }
            int16x8_t const vu = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(trait::loadvu(uv))), half);

            // UV.val[0] : v0, v1, v2, v3
            // UV.val[1] : u0, u1, u2, u3
            int16x4x2_t const UV = vuzp_s16(vget_low_s16(vu), vget_high_s16(vu));

            // tR : 128+409V
            // tG : 128-100U-208V
            // tB : 128+516U
            int32x4_t const tR = vmlal_n_s16(rounding, UV.val[0], 409);
            int32x4_t const tG = vmlal_n_s16(vmlal_n_s16(rounding, UV.val[0], -208), UV.val[1], -100);
            int32x4_t const tB = vmlal_n_s16(rounding, UV.val[1], 516);

            int32x4x2_t const R = vzipq_s32(tR, tR);  // [tR0, tR0, tR1, tR1] [tR2, tR2, tR3, tR3]
            int32x4x2_t const G = vzipq_s32(tG, tG);  // [tG0, tG0, tG1, tG1] [tG2, tG2, tG3, tG3]
            int32x4x2_t const B = vzipq_s32(tB, tB);  // [tB0, tB0, tB1, tB1] [tB2, tB2, tB3, tB3]

            // upper 8 pixels
            trait::store_pixel_block(dst, pblock,
                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y00)), vqmovun_s32(vaddq_s32(R.val[1], Y01))), 8),
                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y00)), vqmovun_s32(vaddq_s32(G.val[1], Y01))), 8),
                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y00)), vqmovun_s32(vaddq_s32(B.val[1], Y01))), 8));

            // lower 8 pixels
            trait::store_pixel_block(dst+stride, pblock,
                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y10)), vqmovun_s32(vaddq_s32(R.val[1], Y11))), 8),
                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y10)), vqmovun_s32(vaddq_s32(G.val[1], Y11))), 8),
                vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y10)), vqmovun_s32(vaddq_s32(B.val[1], Y11))), 8));
        }
    }
    return true;
}
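// Scalar reference (illustration only) for the constants used in the NEON path
// above -- the standard 8-bit BT.601 video-range YUV -> RGB conversion:
#include <stdint.h>

static uint8_t clamp255(int32_t v) { return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); }

static void yuv_to_rgb_scalar(uint8_t y, uint8_t u, uint8_t v,
                              uint8_t *r, uint8_t *g, uint8_t *b) {
    const int32_t c = 298 * ((y < 16 ? 16 : y) - 16);  // matches vqsub_u8(Y, 16), then * 298
    const int32_t d = (int32_t)u - 128;
    const int32_t e = (int32_t)v - 128;
    *r = clamp255((c + 409 * e + 128) >> 8);           // 128 is the 'rounding' vector
    *g = clamp255((c - 100 * d - 208 * e + 128) >> 8);
    *b = clamp255((c + 516 * d + 128) >> 8);
}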
// CHECK-LABEL: define <8 x i16> @test_vcombine_u16(<4 x i16> %low, <4 x i16> %high) #0 {
// CHECK:   [[SHUFFLE_I:%.*]] = shufflevector <4 x i16> %low, <4 x i16> %high, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
// CHECK:   ret <8 x i16> [[SHUFFLE_I]]
uint16x8_t test_vcombine_u16(uint16x4_t low, uint16x4_t high) {
  return vcombine_u16(low, high);
}
void computeNetwork0new_neon(const float *dataf, const float *weightsf, uint8_t *d)
{
    const int16_t *data = (const int16_t *)dataf;
    const int16_t *weights = (const int16_t *)weightsf;

    int32x4_t accum0 = { 0, 0, 0, 0 };
    int32x4_t accum1 = accum0;
    int32x4_t accum2 = accum0;
    int32x4_t accum3 = accum0;

    for (int i = 0; i < 128/2; i += 8) {
        int16x4x2_t d0 = vld2_s16(data + i);

        int16x4x2_t w0 = vld2_s16(weights + i * 4);
        int16x4x2_t w1 = vld2_s16(weights + i * 4 + 8);
        int16x4x2_t w2 = vld2_s16(weights + i * 4 + 16);
        int16x4x2_t w3 = vld2_s16(weights + i * 4 + 24);

        accum0 = vmlal_s16(accum0, d0.val[0], w0.val[0]);
        accum0 = vmlal_s16(accum0, d0.val[1], w0.val[1]);

        accum1 = vmlal_s16(accum1, d0.val[0], w1.val[0]);
        accum1 = vmlal_s16(accum1, d0.val[1], w1.val[1]);

        accum2 = vmlal_s16(accum2, d0.val[0], w2.val[0]);
        accum2 = vmlal_s16(accum2, d0.val[1], w2.val[1]);

        accum3 = vmlal_s16(accum3, d0.val[0], w3.val[0]);
        accum3 = vmlal_s16(accum3, d0.val[1], w3.val[1]);
    }

    int32x2_t sum0 = vpadd_s32(vget_low_s32(accum0), vget_high_s32(accum0));
    int32x2_t sum1 = vpadd_s32(vget_low_s32(accum1), vget_high_s32(accum1));
    int32x2_t sum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
    int32x2_t sum3 = vpadd_s32(vget_low_s32(accum3), vget_high_s32(accum3));
    sum0 = vpadd_s32(sum0, sum1);
    sum1 = vpadd_s32(sum2, sum3);
    int32x4_t sum = vcombine_s32(sum0, sum1);

    float32x4_t m0 = vcvtq_f32_s32(sum);
    m0 = vmulq_f32(m0, vld1q_f32(weightsf + 512/4));
    m0 = vaddq_f32(m0, vld1q_f32(weightsf + 528/4));

    float32x4_t m1, m2, m3, m4;

    // sign_bits_f, ones_f, zeroes_f and reciprocal() are file-scope helpers
    m1 = m0;
    m0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m0), sign_bits_f));
    m0 = vaddq_f32(m0, ones_f);
    m0 = vmulq_f32(reciprocal(m0), m1);

    m1 = vdupq_lane_f32(vget_low_f32(m0), 0);
    m2 = vdupq_lane_f32(vget_low_f32(m0), 1);
    m3 = vdupq_lane_f32(vget_high_f32(m0), 0);
    m4 = vdupq_lane_f32(vget_high_f32(m0), 1);

    m1 = vmulq_f32(m1, vld1q_f32(weightsf + 544/4));
    m2 = vmulq_f32(m2, vld1q_f32(weightsf + 560/4));
    m3 = vmulq_f32(m3, vld1q_f32(weightsf + 576/4));
    m4 = vmulq_f32(m4, vld1q_f32(weightsf + 592/4));

    m1 = vaddq_f32(m1, m2);
    m3 = vaddq_f32(m3, m4);
    m1 = vaddq_f32(m1, m3);
    m1 = vaddq_f32(m1, vld1q_f32(weightsf + 608/4));

    uint32x4_t gte = vcgeq_f32(m1, zeroes_f);
    uint16x4_t gte_u16 = vmovn_u32(gte);
    // the high half of the combine is only filler; just the low 4 bytes are stored
    uint8x8_t gte_u8 = vmovn_u16(vcombine_u16(gte_u16, vget_low_u16(vreinterpretq_u16_u32(sign_bits_f))));
    gte_u8 = vshr_n_u8(gte_u8, 7);
    vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(gte_u8), 0);
}
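// Minimal sketch (not from the original source) of the mask idiom that ends
// computeNetwork0new_neon: a 32-bit comparison mask (all-ones or zero per
// lane) is narrowed twice down to bytes, then a logical shift right by 7
// turns 0xFF/0x00 into 1/0 per lane.
#include <arm_neon.h>

static inline uint8x8_t mask_to_01(const uint32x4_t m_lo, const uint32x4_t m_hi) {
    const uint16x8_t m16 = vcombine_u16(vmovn_u32(m_lo), vmovn_u32(m_hi));
    return vshr_n_u8(vmovn_u16(m16), 7);  // 0xFF -> 1, 0x00 -> 0
}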
void vpx_highbd_convolve_avg_neon(const uint8_t *src8, ptrdiff_t src_stride,
                                  uint8_t *dst8, ptrdiff_t dst_stride,
                                  const int16_t *filter_x, int filter_x_stride,
                                  const int16_t *filter_y, int filter_y_stride,
                                  int w, int h, int bd) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);

  (void)filter_x;
  (void)filter_x_stride;
  (void)filter_y;
  (void)filter_y_stride;
  (void)bd;

  if (w < 8) {  // avg4
    uint16x4_t s0, s1, d0, d1;
    uint16x8_t s01, d01;
    do {
      s0 = vld1_u16(src);
      d0 = vld1_u16(dst);
      src += src_stride;
      s1 = vld1_u16(src);
      d1 = vld1_u16(dst + dst_stride);
      src += src_stride;
      s01 = vcombine_u16(s0, s1);
      d01 = vcombine_u16(d0, d1);
      d01 = vrhaddq_u16(s01, d01);
      vst1_u16(dst, vget_low_u16(d01));
      dst += dst_stride;
      vst1_u16(dst, vget_high_u16(d01));
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w == 8) {  // avg8
    uint16x8_t s0, s1, d0, d1;
    do {
      s0 = vld1q_u16(src);
      d0 = vld1q_u16(dst);
      src += src_stride;
      s1 = vld1q_u16(src);
      d1 = vld1q_u16(dst + dst_stride);
      src += src_stride;

      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);

      vst1q_u16(dst, d0);
      dst += dst_stride;
      vst1q_u16(dst, d1);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w < 32) {  // avg16
    uint16x8_t s0l, s0h, s1l, s1h, d0l, d0h, d1l, d1h;
    do {
      s0l = vld1q_u16(src);
      s0h = vld1q_u16(src + 8);
      d0l = vld1q_u16(dst);
      d0h = vld1q_u16(dst + 8);
      src += src_stride;
      s1l = vld1q_u16(src);
      s1h = vld1q_u16(src + 8);
      d1l = vld1q_u16(dst + dst_stride);
      d1h = vld1q_u16(dst + dst_stride + 8);
      src += src_stride;

      d0l = vrhaddq_u16(s0l, d0l);
      d0h = vrhaddq_u16(s0h, d0h);
      d1l = vrhaddq_u16(s1l, d1l);
      d1h = vrhaddq_u16(s1h, d1h);

      vst1q_u16(dst, d0l);
      vst1q_u16(dst + 8, d0h);
      dst += dst_stride;
      vst1q_u16(dst, d1l);
      vst1q_u16(dst + 8, d1h);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else if (w == 32) {  // avg32
    uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
    do {
      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      d0 = vld1q_u16(dst);
      d1 = vld1q_u16(dst + 8);
      d2 = vld1q_u16(dst + 16);
      d3 = vld1q_u16(dst + 24);
      src += src_stride;

      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);

      vst1q_u16(dst, d0);
      vst1q_u16(dst + 8, d1);
      vst1q_u16(dst + 16, d2);
      vst1q_u16(dst + 24, d3);
      dst += dst_stride;

      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      d0 = vld1q_u16(dst);
      d1 = vld1q_u16(dst + 8);
      d2 = vld1q_u16(dst + 16);
      d3 = vld1q_u16(dst + 24);
      src += src_stride;

      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);

      vst1q_u16(dst, d0);
      vst1q_u16(dst + 8, d1);
      vst1q_u16(dst + 16, d2);
      vst1q_u16(dst + 24, d3);
      dst += dst_stride;
      h -= 2;
    } while (h > 0);
  } else {  // avg64
    uint16x8_t s0, s1, s2, s3, d0, d1, d2, d3;
    do {
      s0 = vld1q_u16(src);
      s1 = vld1q_u16(src + 8);
      s2 = vld1q_u16(src + 16);
      s3 = vld1q_u16(src + 24);
      d0 = vld1q_u16(dst);
      d1 = vld1q_u16(dst + 8);
      d2 = vld1q_u16(dst + 16);
      d3 = vld1q_u16(dst + 24);

      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);

      vst1q_u16(dst, d0);
      vst1q_u16(dst + 8, d1);
      vst1q_u16(dst + 16, d2);
      vst1q_u16(dst + 24, d3);

      s0 = vld1q_u16(src + 32);
      s1 = vld1q_u16(src + 40);
      s2 = vld1q_u16(src + 48);
      s3 = vld1q_u16(src + 56);
      d0 = vld1q_u16(dst + 32);
      d1 = vld1q_u16(dst + 40);
      d2 = vld1q_u16(dst + 48);
      d3 = vld1q_u16(dst + 56);

      d0 = vrhaddq_u16(s0, d0);
      d1 = vrhaddq_u16(s1, d1);
      d2 = vrhaddq_u16(s2, d2);
      d3 = vrhaddq_u16(s3, d3);

      vst1q_u16(dst + 32, d0);
      vst1q_u16(dst + 40, d1);
      vst1q_u16(dst + 48, d2);
      vst1q_u16(dst + 56, d3);

      src += src_stride;
      dst += dst_stride;
    } while (--h);
  }
}
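// Scalar model (illustration only) of the rounding halving add used throughout
// vpx_highbd_convolve_avg_neon: vrhaddq_u16 computes (a + b + 1) >> 1 per lane
// without intermediate overflow, i.e. the round-to-nearest average of the
// source and destination predictions.
#include <stdint.h>

static inline uint16_t rhadd_u16(uint16_t a, uint16_t b) {
  return (uint16_t)(((uint32_t)a + b + 1) >> 1);
}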
inline uint16x8_t vcombine(const uint16x4_t & v0, const uint16x4_t & v1)
{
    return vcombine_u16(v0, v1);
}
void phase(const Size2D &size,
           const s16 * src0Base, ptrdiff_t src0Stride,
           const s16 * src1Base, ptrdiff_t src1Stride,
           u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    FASTATAN2CONST(256.0f / 360.0f)
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    float32x4_t v_05 = vdupq_n_f32(0.5f);

    for (size_t i = 0; i < size.height; ++i)
    {
        const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        for (; j < roiw16; j += 16)
        {
            internal::prefetch(src0 + j);
            internal::prefetch(src1 + j);

            int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8);
            int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);

            // 0
            float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
            float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));
            float32x4_t v_dst32f0;
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)

            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));
            float32x4_t v_dst32f1;
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)

            uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
                                                vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));

            // 1
            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01)));
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src11)));
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)

            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src01)));
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11)));
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)

            uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
                                                vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));

            vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst16s0),
                                          vmovn_u16(v_dst16s1)));
        }

        for (; j < roiw8; j += 8)
        {
            int16x8_t v_src0 = vld1q_s16(src0 + j);
            int16x8_t v_src1 = vld1q_s16(src1 + j);

            float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src0)));
            float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1)));
            float32x4_t v_dst32f0;
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)

            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src0)));
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1)));
            float32x4_t v_dst32f1;
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)

            uint16x8_t v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
                                            vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));

            vst1_u8(dst + j, vmovn_u16(v_dst));
        }

        for (; j < size.width; j++)
        {
            f32 x = src0[j], y = src1[j];
            f32 a;
            FASTATAN2SCALAR(y, x, a)
            dst[j] = (u8)(s32)floor(a + 0.5f);
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
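// Rough scalar reference (an approximation for illustration; the kernel above
// uses carotene's FASTATAN2* macros, which trade accuracy for speed): the
// angle is scaled by 256/360 so a full turn maps onto the 8-bit range.
#include <math.h>
#include <stdint.h>

static uint8_t phase_scalar(int16_t x, int16_t y) {
    float a = atan2f((float)y, (float)x) * (180.0f / 3.14159265358979f);
    if (a < 0.0f) a += 360.0f;                                    // [0, 360)
    return (uint8_t)(int32_t)floorf(a * 256.0f / 360.0f + 0.5f);  // wraps 256 -> 0
}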
void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride,
                             const int16_t *filter_x,  // unused
                             int x_step_q4,            // unused
                             const int16_t *filter_y, int y_step_q4,
                             int w, int h) {
  int height;
  const uint8_t *s;
  uint8_t *d;
  uint32x2_t d2u32, d3u32;
  uint32x2_t d16u32, d18u32, d20u32, d22u32, d24u32, d26u32;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16;
  uint16x4_t d2u16, d3u16, d4u16, d5u16;
  int16x8_t q0s16;
  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
  int32x4_t q1s32, q2s32, q14s32, q15s32;

  assert(y_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_x;

  src -= src_stride * 3;
  q0s16 = vld1q_s16(filter_y);
  for (; w > 0; w -= 4, src += 4, dst += 4) {  // loop_vert_h
    s = src;
    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 0);
    s += src_stride;
    d16u32 = vld1_lane_u32((const uint32_t *)s, d16u32, 1);
    s += src_stride;
    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 0);
    s += src_stride;
    d18u32 = vld1_lane_u32((const uint32_t *)s, d18u32, 1);
    s += src_stride;
    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 0);
    s += src_stride;
    d20u32 = vld1_lane_u32((const uint32_t *)s, d20u32, 1);
    s += src_stride;
    d22u32 = vld1_lane_u32((const uint32_t *)s, d22u32, 0);
    s += src_stride;

    q8u16 = vmovl_u8(vreinterpret_u8_u32(d16u32));
    q9u16 = vmovl_u8(vreinterpret_u8_u32(d18u32));
    q10u16 = vmovl_u8(vreinterpret_u8_u32(d20u32));
    q11u16 = vmovl_u8(vreinterpret_u8_u32(d22u32));

    d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
    d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
    d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
    d = dst;
    for (height = h; height > 0; height -= 4) {  // loop_vert
      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 0);
      s += src_stride;
      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 0);
      s += src_stride;
      d26u32 = vld1_lane_u32((const uint32_t *)s, d26u32, 1);
      s += src_stride;
      d24u32 = vld1_lane_u32((const uint32_t *)s, d24u32, 1);
      s += src_stride;

      q12u16 = vmovl_u8(vreinterpret_u8_u32(d24u32));
      q13u16 = vmovl_u8(vreinterpret_u8_u32(d26u32));

      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
      d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
      d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

      __builtin_prefetch(d);
      __builtin_prefetch(d + dst_stride);
      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d18s16, d19s16, d20s16, d21s16,
                             d22s16, d24s16, q0s16);
      __builtin_prefetch(d + dst_stride * 2);
      __builtin_prefetch(d + dst_stride * 3);
      q2s32 = MULTIPLY_BY_Q0(d17s16, d18s16, d19s16, d20s16, d21s16, d22s16,
                             d24s16, d26s16, q0s16);
      __builtin_prefetch(s);
      __builtin_prefetch(s + src_stride);
      q14s32 = MULTIPLY_BY_Q0(d18s16, d19s16, d20s16, d21s16, d22s16, d24s16,
                              d26s16, d27s16, q0s16);
      __builtin_prefetch(s + src_stride * 2);
      __builtin_prefetch(s + src_stride * 3);
      q15s32 = MULTIPLY_BY_Q0(d19s16, d20s16, d21s16, d22s16, d24s16, d26s16,
                              d27s16, d25s16, q0s16);

      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
      d4u16 = vqrshrun_n_s32(q14s32, 7);
      d5u16 = vqrshrun_n_s32(q15s32, 7);

      q1u16 = vcombine_u16(d2u16, d3u16);
      q2u16 = vcombine_u16(d4u16, d5u16);

      d2u32 = vreinterpret_u32_u8(vqmovn_u16(q1u16));
      d3u32 = vreinterpret_u32_u8(vqmovn_u16(q2u16));

      vst1_lane_u32((uint32_t *)d, d2u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d2u32, 1);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 1);
      d += dst_stride;

      q8u16 = q10u16;
      d18s16 = d22s16;
      d19s16 = d24s16;
      q10u16 = q13u16;
      d22s16 = d25s16;
    }
  }
  return;
}
void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y,  // unused
                              int y_step_q4,            // unused
                              int w, int h) {
  int width;
  const uint8_t *s, *psrc;
  uint8_t *d, *pdst;
  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
  uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
  uint8x16_t q12u8, q13u8, q14u8, q15u8;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16;
  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
  int16x8_t q0s16;
  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
  int32x4_t q1s32, q2s32, q14s32, q15s32;
  uint16x8x2_t q0x2u16;
  uint8x8x2_t d0x2u8, d1x2u8;
  uint32x2x2_t d0x2u32;
  uint16x4x2_t d0x2u16, d1x2u16;
  uint32x4x2_t q0x2u32;

  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_y;

  q0s16 = vld1q_s16(filter_x);

  src -= 3;  // adjust for taps
  for (; h > 0; h -= 4, src += src_stride * 4, dst += dst_stride * 4) {  // loop_horiz_v
    s = src;
    d24u8 = vld1_u8(s);
    s += src_stride;
    d25u8 = vld1_u8(s);
    s += src_stride;
    d26u8 = vld1_u8(s);
    s += src_stride;
    d27u8 = vld1_u8(s);

    q12u8 = vcombine_u8(d24u8, d25u8);
    q13u8 = vcombine_u8(d26u8, d27u8);

    q0x2u16 = vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
    d0x2u8 = vtrn_u8(d24u8, d25u8);
    d1x2u8 = vtrn_u8(d26u8, d27u8);

    __builtin_prefetch(src + src_stride * 4);
    __builtin_prefetch(src + src_stride * 5);
    __builtin_prefetch(src + src_stride * 6);

    q8u16 = vmovl_u8(d0x2u8.val[0]);
    q9u16 = vmovl_u8(d0x2u8.val[1]);
    q10u16 = vmovl_u8(d1x2u8.val[0]);
    q11u16 = vmovl_u8(d1x2u8.val[1]);

    d16u16 = vget_low_u16(q8u16);
    d17u16 = vget_high_u16(q8u16);
    d18u16 = vget_low_u16(q9u16);
    d19u16 = vget_high_u16(q9u16);
    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
    q9u16 = vcombine_u16(d17u16, d19u16);

    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
    for (width = w, psrc = src + 7, pdst = dst; width > 0;
         width -= 4, psrc += 4, pdst += 4) {  // loop_horiz
      s = psrc;
      d28u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d29u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d31u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d30u32 = vld1_dup_u32((const uint32_t *)s);

      __builtin_prefetch(psrc + 64);

      d0x2u16 = vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
      d1x2u16 = vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30

      __builtin_prefetch(psrc + 64 + src_stride);

      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
      q0x2u32 = vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));

      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
      q12u16 = vmovl_u8(d28u8);
      q13u16 = vmovl_u8(d29u8);

      __builtin_prefetch(psrc + 64 + src_stride * 2);

      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
                             d23s16, d24s16, q0s16);
      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
                             d24s16, d26s16, q0s16);
      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
                              d26s16, d27s16, q0s16);
      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
                              d27s16, d25s16, q0s16);

      __builtin_prefetch(psrc + 60 + src_stride * 3);

      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
      d4u16 = vqrshrun_n_s32(q14s32, 7);
      d5u16 = vqrshrun_n_s32(q15s32, 7);

      q1u16 = vcombine_u16(d2u16, d3u16);
      q2u16 = vcombine_u16(d4u16, d5u16);

      d2u8 = vqmovn_u16(q1u16);
      d3u8 = vqmovn_u16(q2u16);

      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
                         vreinterpret_u32_u16(d0x2u16.val[1]));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
                       vreinterpret_u8_u32(d0x2u32.val[1]));

      d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
      d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);

      d = pdst;
      vst1_lane_u32((uint32_t *)d, d2u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d2u32, 1);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 1);

      q8u16 = q9u16;
      d20s16 = d23s16;
      q11u16 = q12u16;
      q9u16 = q13u16;
      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    }
  }
  return;
}
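// Scalar model (illustration only) of vqrshrun_n_s32(x, 7) as used by both
// convolution kernels above: add the rounding constant 1 << 6, shift right by
// FILTER_BITS (7), and saturate the signed result to the unsigned 16-bit range.
#include <stdint.h>

static inline uint16_t qrshrun7(int32_t x) {
  const int32_t r = (x + 64) >> 7;
  return r < 0 ? 0 : (r > 65535 ? 65535 : (uint16_t)r);
}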
//
// box blur a square array of pixels (power of 2, actually)
// if we insist on powers of 2, we don't need to special case some end-of-row/col conditions
// to a specific blur width
//
// also, we're using NEON to vectorize our arithmetic.
// we need to do a division along the way, but NEON doesn't support integer division.
// so rather than divide by, say "w", we multiply by magic(w).
// magic(w) is chosen so that the result of multiplying by it will be the same as
// dividing by w, except that the result will be in the high half of the result.
// yes, dorothy... this is what compilers do, too...
void NEONboxBlur(pixel *src, pixel *dest, unsigned int size, unsigned int blurRad)
{
    unsigned int wid = 2 * blurRad + 1;

    // because NEON doesn't have integer division, we use "magic constants" that will give
    // us the result of division by multiplication -- the upper half of the result will be
    // (more or less) the result of the division.
    // for this, we need to compute the magic numbers corresponding to a given divisor
    struct magicu_info minfo = compute_unsigned_magic_info(wid, 16);

    int16x8_t preshift = vdupq_n_s16(-minfo.pre_shift);           // negative means shift right
    int32x4_t postshift = vdupq_n_s32(-(minfo.post_shift + 16));  // negative means shift right
    uint16x4_t magic = vdup_n_u16(minfo.multiplier);

    // fprintf(stderr,"width %5d, preshift %d, postshift %d + 16, increment %d, magic %d\n", wid,
    //         minfo.pre_shift, minfo.post_shift, minfo.increment, minfo.multiplier);
    // if (minfo.pre_shift > 0) fprintf(stderr,"hey, not an odd number!\n");

    int i, j, k, ch;
    for (i = 0; i < size; i += 8) {
        // first, initialize the sum so that we can loop from 0 to size-1
        // we'll initialize boxsum for index -1, so that we can move into 0 as part of our loop
        uint16x8x4_t boxsum;
        uint8x8x4_t firstpixel = vld4_u8((uint8_t *)(src + 0 * size + i));
        for (ch = 0; ch < 4; ch++) {
            // boxsum[ch] = (blurRad + 2) * srcpixel[ch]: in the clamped window around
            // index -1, pixel 0 appears blurRad + 2 times
            boxsum.val[ch] = vmulq_n_u16(vmovl_u8(firstpixel.val[ch]), (blurRad + 1) + 1);
        }
        for (k = 1; k < blurRad; k++) {
            uint8x8x4_t srcpixel = vld4_u8((uint8_t *)(src + k * size + i));
            for (ch = 0; ch < 4; ch++) {
                boxsum.val[ch] = vaddw_u8(boxsum.val[ch], srcpixel.val[ch]);
            }
        }

        int right = blurRad - 1;
        int left = -blurRad - 1;

        if (minfo.increment) {
            for (k = 0; k < size; k++) {
                // move to next pixel
                unsigned int l = (left < 0) ? 0 : left;  // take off the old left
                left++;
                right++;
                unsigned int r = (right < size) ? right : (size - 1);  // but add the new right
                uint8x8x4_t addpixel = vld4_u8((uint8_t *)(src + r * size + i));
                uint8x8x4_t subpixel = vld4_u8((uint8_t *)(src + l * size + i));
                for (ch = 0; ch < 4; ch++) {
                    // boxsum[ch] += addpixel[ch] - subpixel[ch];
                    boxsum.val[ch] = vsubw_u8(vaddw_u8(boxsum.val[ch], addpixel.val[ch]),
                                              subpixel.val[ch]);
                }

                uint8x8x4_t destpixel;
                for (ch = 0; ch < 4; ch++) {
                    // compute: destpixel = boxsum / wid
                    // since 16bit multiplication leads to 32bit results, we need to
                    // split our task into two chunks, for the hi and low half of our vector
                    // (because otherwise, it won't all fit into 128 bits)
                    // this is the meat of the magic division algorithm (see the include file...)
                    uint16x8_t bsum_preshifted = vshlq_u16(boxsum.val[ch], preshift);

                    // multiply by the magic number
                    uint32x4_t res_hi = vmull_u16(vget_high_u16(bsum_preshifted), magic);
                    res_hi = vaddw_u16(res_hi, magic);  // the 'increment' variant: m*(n+1)
                    // take the high half and post-shift
                    uint16x4_t q_hi = vmovn_u32(vshlq_u32(res_hi, postshift));

                    // pre-shift and multiply by the magic number
                    uint32x4_t res_lo = vmull_u16(vget_low_u16(bsum_preshifted), magic);
                    res_lo = vaddw_u16(res_lo, magic);
                    // take the high half and post-shift
                    uint16x4_t q_lo = vmovn_u32(vshlq_u32(res_lo, postshift));

                    destpixel.val[ch] = vqmovn_u16(vcombine_u16(q_lo, q_hi));
                }
                pixel block[8];
                vst4_u8((uint8_t *)&block, destpixel);
                for (j = 0; j < 8; j++) {
                    dest[(i + j) * size + k] = block[j];
                }
                // vst4_u8((uint8_t *)(dest + k * size + i), destpixel);
            }
        } else {
            for (k = 0; k < size; k++) {
                // move to next pixel
                unsigned int l = (left < 0) ? 0 : left;  // take off the old left
                left++;
                right++;
                unsigned int r = (right < size) ? right : (size - 1);  // but add the new right
                uint8x8x4_t addpixel = vld4_u8((uint8_t *)(src + r * size + i));
                uint8x8x4_t subpixel = vld4_u8((uint8_t *)(src + l * size + i));
                for (ch = 0; ch < 4; ch++) {
                    // boxsum[ch] += addpixel[ch] - subpixel[ch];
                    boxsum.val[ch] = vsubw_u8(vaddw_u8(boxsum.val[ch], addpixel.val[ch]),
                                              subpixel.val[ch]);
                }

                uint8x8x4_t destpixel;
                for (ch = 0; ch < 4; ch++) {
                    // compute: destpixel = boxsum / wid
                    // since 16bit multiplication leads to 32bit results, we need to
                    // split our task into two chunks, for the hi and low half of our vector
                    // (because otherwise, it won't all fit into 128 bits)
                    // this is the meat of the magic division algorithm (see the include file...)
                    uint16x8_t bsum_preshifted = vshlq_u16(boxsum.val[ch], preshift);

                    // multiply by the magic number
                    // take the high half and post-shift
                    uint32x4_t res_hi = vmull_u16(vget_high_u16(bsum_preshifted), magic);
                    uint16x4_t q_hi = vmovn_u32(vshlq_u32(res_hi, postshift));

                    // multiply by the magic number
                    // take the high half and post-shift
                    uint32x4_t res_lo = vmull_u16(vget_low_u16(bsum_preshifted), magic);
                    uint16x4_t q_lo = vmovn_u32(vshlq_u32(res_lo, postshift));

                    destpixel.val[ch] = vqmovn_u16(vcombine_u16(q_lo, q_hi));
                }
                pixel block[8];
                vst4_u8((uint8_t *)&block, destpixel);
                for (j = 0; j < 8; j++) {
                    dest[(i + j) * size + k] = block[j];
                }
                // vst4_u8((uint8_t *)(dest + k * size + i), destpixel);
            }
        }
    }
}
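// A standalone check (an illustration; the library derives its constants with
// compute_unsigned_magic_info) of the division-by-multiplication trick the
// comments above describe. For divisor 9 -- i.e. blurRad = 4, wid = 2*4+1 --
// one valid pair is multiplier 58255 = floor(2^19 / 9) + 1 with a total shift
// of 19: the ">> 16" is the "take the high half" step and the remaining
// ">> 3" is the post-shift.
#include <stdint.h>
#include <stdio.h>

int main(void) {
    for (uint32_t n = 0; n <= 65535u; ++n) {
        const uint32_t q = (uint32_t)(((uint64_t)n * 58255u) >> 19);
        if (q != n / 9u) { printf("mismatch at %u\n", n); return 1; }
    }
    printf("n / 9 == (n * 58255) >> 19 for every 16-bit n\n");
    return 0;
}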
static INLINE void mbloop_filter_neon(uint8x8_t dblimit,   // mblimit
                                      uint8x8_t dlimit,    // limit
                                      uint8x8_t dthresh,   // thresh
                                      uint8x8_t d3u8,      // p3
                                      uint8x8_t d4u8,      // p2
                                      uint8x8_t d5u8,      // p1
                                      uint8x8_t d6u8,      // p0
                                      uint8x8_t d7u8,      // q0
                                      uint8x8_t d16u8,     // q1
                                      uint8x8_t d17u8,     // q2
                                      uint8x8_t d18u8,     // q3
                                      uint8x8_t *d0ru8,    // p2
                                      uint8x8_t *d1ru8,    // p1
                                      uint8x8_t *d2ru8,    // p0
                                      uint8x8_t *d3ru8,    // q0
                                      uint8x8_t *d4ru8,    // q1
                                      uint8x8_t *d5ru8) {  // q2
  uint32_t flat;
  uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
  uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
  int16x8_t q15s16;
  uint16x8_t q10u16, q14u16;
  int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;

  d19u8 = vabd_u8(d3u8, d4u8);
  d20u8 = vabd_u8(d4u8, d5u8);
  d21u8 = vabd_u8(d5u8, d6u8);
  d22u8 = vabd_u8(d16u8, d7u8);
  d23u8 = vabd_u8(d17u8, d16u8);
  d24u8 = vabd_u8(d18u8, d17u8);

  d19u8 = vmax_u8(d19u8, d20u8);
  d20u8 = vmax_u8(d21u8, d22u8);
  d25u8 = vabd_u8(d6u8, d4u8);
  d23u8 = vmax_u8(d23u8, d24u8);
  d26u8 = vabd_u8(d7u8, d17u8);
  d19u8 = vmax_u8(d19u8, d20u8);
  d24u8 = vabd_u8(d6u8, d7u8);
  d27u8 = vabd_u8(d3u8, d6u8);
  d28u8 = vabd_u8(d18u8, d7u8);
  d19u8 = vmax_u8(d19u8, d23u8);
  d23u8 = vabd_u8(d5u8, d16u8);
  d24u8 = vqadd_u8(d24u8, d24u8);

  d19u8 = vcge_u8(dlimit, d19u8);

  d25u8 = vmax_u8(d25u8, d26u8);
  d26u8 = vmax_u8(d27u8, d28u8);
  d23u8 = vshr_n_u8(d23u8, 1);
  d25u8 = vmax_u8(d25u8, d26u8);
  d24u8 = vqadd_u8(d24u8, d23u8);
  d20u8 = vmax_u8(d20u8, d25u8);

  d23u8 = vdup_n_u8(1);
  d24u8 = vcge_u8(dblimit, d24u8);
  d21u8 = vcgt_u8(d21u8, dthresh);
  d20u8 = vcge_u8(d23u8, d20u8);
  d19u8 = vand_u8(d19u8, d24u8);
  d23u8 = vcgt_u8(d22u8, dthresh);
  d20u8 = vand_u8(d20u8, d19u8);
  d22u8 = vdup_n_u8(0x80);
  d23u8 = vorr_u8(d21u8, d23u8);

  q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8));
  d30u8 = vshrn_n_u16(q10u16, 4);
  flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);

  if (flat == 0xffffffff) {  // Check for all 1's, power_branch_only
    d27u8 = vdup_n_u8(3);
    d21u8 = vdup_n_u8(2);
    q14u16 = vaddl_u8(d6u8, d7u8);
    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
    q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
    q14u16 = vaddw_u8(q14u16, d5u8);
    *d0ru8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d4u8);
    q14u16 = vaddw_u8(q14u16, d5u8);
    q14u16 = vaddw_u8(q14u16, d16u8);
    *d1ru8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d5u8);
    q14u16 = vaddw_u8(q14u16, d6u8);
    q14u16 = vaddw_u8(q14u16, d17u8);
    *d2ru8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d6u8);
    q14u16 = vaddw_u8(q14u16, d7u8);
    q14u16 = vaddw_u8(q14u16, d18u8);
    *d3ru8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d4u8);
    q14u16 = vsubw_u8(q14u16, d7u8);
    q14u16 = vaddw_u8(q14u16, d16u8);
    q14u16 = vaddw_u8(q14u16, d18u8);
    *d4ru8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d5u8);
    q14u16 = vsubw_u8(q14u16, d16u8);
    q14u16 = vaddw_u8(q14u16, d17u8);
    q14u16 = vaddw_u8(q14u16, d18u8);
    *d5ru8 = vqrshrn_n_u16(q14u16, 3);
  } else {
    d21u8 = veor_u8(d7u8, d22u8);
    d24u8 = veor_u8(d6u8, d22u8);
    d25u8 = veor_u8(d5u8, d22u8);
    d26u8 = veor_u8(d16u8, d22u8);

    d27u8 = vdup_n_u8(3);

    d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
    d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));

    q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));

    d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));

    q15s16 = vaddw_s8(q15s16, d29s8);

    d29u8 = vdup_n_u8(4);

    d28s8 = vqmovn_s16(q15s16);

    d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));

    d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
    d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
    d30s8 = vshr_n_s8(d30s8, 3);
    d29s8 = vshr_n_s8(d29s8, 3);

    d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
    d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);

    d29s8 = vrshr_n_s8(d29s8, 1);
    d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));

    d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
    d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);

    if (flat == 0) {  // filter_branch_only
      *d0ru8 = d4u8;
      *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
      *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
      *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
      *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
      *d5ru8 = d17u8;
      return;
    }

    d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
    d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
    d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
    d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);

    d23u8 = vdup_n_u8(2);
    q14u16 = vaddl_u8(d6u8, d7u8);
    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
    q14u16 = vmlal_u8(q14u16, d4u8, d23u8);

    d0u8 = vbsl_u8(d20u8, dblimit, d4u8);

    q14u16 = vaddw_u8(q14u16, d5u8);

    d1u8 = vbsl_u8(d20u8, dlimit, d25u8);

    d30u8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d4u8);
    q14u16 = vaddw_u8(q14u16, d5u8);
    q14u16 = vaddw_u8(q14u16, d16u8);

    d2u8 = vbsl_u8(d20u8, dthresh, d24u8);

    d31u8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d5u8);
    q14u16 = vaddw_u8(q14u16, d6u8);
    q14u16 = vaddw_u8(q14u16, d17u8);

    *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);

    d23u8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d6u8);
    q14u16 = vaddw_u8(q14u16, d7u8);

    *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);

    q14u16 = vaddw_u8(q14u16, d18u8);

    *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);

    d22u8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d4u8);
    q14u16 = vsubw_u8(q14u16, d7u8);
    q14u16 = vaddw_u8(q14u16, d16u8);

    d3u8 = vbsl_u8(d20u8, d3u8, d21u8);

    q14u16 = vaddw_u8(q14u16, d18u8);

    d4u8 = vbsl_u8(d20u8, d4u8, d26u8);

    d6u8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d5u8);
    q14u16 = vsubw_u8(q14u16, d16u8);
    q14u16 = vaddw_u8(q14u16, d17u8);
    q14u16 = vaddw_u8(q14u16, d18u8);

    d5u8 = vbsl_u8(d20u8, d5u8, d17u8);

    d7u8 = vqrshrn_n_u16(q14u16, 3);

    *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
    *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
    *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
  }
  return;
}
int neon_new(DATA32* _p0, DATA32* _p1, DATA32* _p2, DATA32* _p3,
             DATA32* _ax, DATA32 _ay, DATA32* result, int len)
{
    int ay = _ay;
    int i;
    DATA32* pbuf = result;
    uint16x4_t ay_16x4;
    uint16x4_t p0_16x4;
    uint16x4_t p2_16x4;
    uint16x8_t ax_16x8;
    uint16x8_t p0_p2_16x8;
    uint16x8_t p1_p3_16x8;
    uint16x8_t x255_16x8;
    uint32x2_t p0_p2_32x2;
    uint32x2_t p1_p3_32x2;
    uint32x2_t res_32x2;
    uint8x8_t p0_p2_8x8;
    uint8x8_t p1_p3_8x8;
    uint8x8_t p2_8x8;

    ay_16x4 = vdup_n_u16(ay);
    x255_16x8 = vdupq_n_u16(0xff);

    for (i = 0; i < len; i++) {
        DATA32 p0 = *_p0++;
        DATA32 p1 = *_p1++;
        DATA32 p2 = *_p2++;
        DATA32 p3 = *_p3++;
        int ax = *_ax++;

        if (p0 | p1 | p2 | p3) {
            ax_16x8 = vdupq_n_u16(ax);

            p0_p2_32x2 = vset_lane_u32(p0, p0_p2_32x2, 0);
            p0_p2_32x2 = vset_lane_u32(p2, p0_p2_32x2, 1);
            p1_p3_32x2 = vset_lane_u32(p1, p1_p3_32x2, 0);
            p1_p3_32x2 = vset_lane_u32(p3, p1_p3_32x2, 1);

            p0_p2_8x8 = vreinterpret_u8_u32(p0_p2_32x2);
            p1_p3_8x8 = vreinterpret_u8_u32(p1_p3_32x2);
            p1_p3_16x8 = vmovl_u8(p1_p3_8x8);
            p0_p2_16x8 = vmovl_u8(p0_p2_8x8);

            // horizontal interpolation: p + (((p' - p) * ax) >> 8), per channel
            p1_p3_16x8 = vsubq_u16(p1_p3_16x8, p0_p2_16x8);
            p1_p3_16x8 = vmulq_u16(p1_p3_16x8, ax_16x8);
            p1_p3_16x8 = vshrq_n_u16(p1_p3_16x8, 8);
            p1_p3_16x8 = vaddq_u16(p1_p3_16x8, p0_p2_16x8);
            p1_p3_16x8 = vandq_u16(p1_p3_16x8, x255_16x8);

            // vertical interpolation between the two horizontal results
            p0_16x4 = vget_low_u16(p1_p3_16x8);
            p2_16x4 = vget_high_u16(p1_p3_16x8);
            p2_16x4 = vsub_u16(p2_16x4, p0_16x4);
            p2_16x4 = vmul_u16(p2_16x4, ay_16x4);
            p2_16x4 = vshr_n_u16(p2_16x4, 8);
            p2_16x4 = vadd_u16(p2_16x4, p0_16x4);

            // only lane 1 (the high half, from p2_16x4) is stored below, so the
            // low half reuses p0_16x4 to keep the vector fully initialized
            p1_p3_16x8 = vcombine_u16(p0_16x4, p2_16x4);
            p2_8x8 = vmovn_u16(p1_p3_16x8);
            res_32x2 = vreinterpret_u32_u8(p2_8x8);
            vst1_lane_u32(pbuf++, res_32x2, 1);
        }
        else
            *pbuf++ = p0;
    }

    return 0;
}