static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int w, int h, unsigned int *sse, int *sum) { int i, j; int16x8_t v_sum = vdupq_n_s16(0); int32x4_t v_sse_lo = vdupq_n_s32(0); int32x4_t v_sse_hi = vdupq_n_s32(0); for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 8) { const uint8x8_t v_a = vld1_u8(&a[j]); const uint8x8_t v_b = vld1_u8(&b[j]); const uint16x8_t v_diff = vsubl_u8(v_a, v_b); const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); v_sum = vaddq_s16(v_sum, sv_diff); v_sse_lo = vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff)); v_sse_hi = vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff)); } a += a_stride; b += b_stride; } *sum = horizontal_add_s16x8(v_sum); *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); }
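/* The horizontal-add helpers used above are not shown in this snippet. A
   minimal sketch of what they are assumed to do (libvpx keeps equivalents in
   vpx_dsp/arm/sum_neon.h; exact names and location may differ). Assumes
   <arm_neon.h>. */
static inline int horizontal_add_s16x8(const int16x8_t v) {
  const int32x4_t a = vpaddlq_s16(v);   /* pairwise widen-add to 32 bits */
  const int64x2_t b = vpaddlq_s32(a);   /* pairwise widen-add to 64 bits */
  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
                               vreinterpret_s32_s64(vget_high_s64(b)));
  return vget_lane_s32(c, 0);           /* final scalar sum */
}

static inline int horizontal_add_s32x4(const int32x4_t v) {
  const int64x2_t b = vpaddlq_s32(v);
  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
                               vreinterpret_s32_s64(vget_high_s64(b)));
  return vget_lane_s32(c, 0);
}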
static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v, int src_width, int do_store) { int i; for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) { const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]); const uint16x8_t R = vpaddlq_u8(RGB.val[2]); // pair-wise adds const uint16x8_t G = vpaddlq_u8(RGB.val[1]); const uint16x8_t B = vpaddlq_u8(RGB.val[0]); int16x8_t U_tmp, V_tmp; CONVERT_RGB_TO_UV(R, G, B, 1, U_tmp, V_tmp); { const uint8x8_t U = vqrshrun_n_s16(U_tmp, 1); const uint8x8_t V = vqrshrun_n_s16(V_tmp, 1); if (do_store) { vst1_u8(u, U); vst1_u8(v, V); } else { const uint8x8_t prev_u = vld1_u8(u); const uint8x8_t prev_v = vld1_u8(v); vst1_u8(u, vrhadd_u8(U, prev_u)); vst1_u8(v, vrhadd_u8(V, prev_v)); } } } if (i < src_width) { // left-over WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store); } }
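/* CONVERT_RGB_TO_UV is a libwebp macro that is not reproduced here. In the
   routine above it operates on the pair-wise sums R/G/B (values up to 510),
   and the extra factor of two is removed afterwards by vqrshrun_n_s16(., 1).
   The scalar reference below is only a hypothetical illustration of the
   per-pixel arithmetic being approximated, using generic BT.601-style
   integer coefficients, NOT the library's exact fixed-point constants. */
static inline void rgb_to_uv_reference(int r, int g, int b, int *u, int *v) {
  /* r, g, b in [0, 255]; integer approximation of
     U = -0.1687*R - 0.3313*G + 0.5*B + 128
     V =  0.5*R    - 0.4187*G - 0.0813*B + 128 */
  *u = (-43 * r - 85 * g + 128 * b + 128 * 256 + 128) >> 8;
  *v = (128 * r - 107 * g - 21 * b + 128 * 256 + 128) >> 8;
}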
#include <arm_neon.h>
#include <stdlib.h>

int main(void) {
  uint8_t v1_init[8] = {1, 1, 1, 1, 1, 1, 1, 1};
  uint8_t v2_init[8] = {2, 2, 2, 2, 2, 2, 2, 2};
  uint8x8_t v1 = vld1_u8 (v1_init);
  uint8x8_t v2 = vld1_u8 (v2_init);
  uint8x8x2_t vd1, vd2;
  union {uint8x8_t v; uint8_t buf[8];} d1, d2, d3, d4;
  int i;
  uint8_t odd, even;

  vd1 = vzip_u8(v1, vdup_n_u8(0));
  vd2 = vzip_u8(v2, vdup_n_u8(0));

  vst1_u8(d1.buf, vd1.val[0]);
  vst1_u8(d2.buf, vd1.val[1]);
  vst1_u8(d3.buf, vd2.val[0]);
  vst1_u8(d4.buf, vd2.val[1]);

#ifdef __ARMEL__
  odd = 1;
  even = 0;
#else
  odd = 0;
  even = 1;
#endif

  for (i = 0; i < 8; i++)
    if ((i % 2 == even && d4.buf[i] != 2) || (i % 2 == odd && d4.buf[i] != 0))
      abort ();

  return 0;
}
void byte2float48_neon(const uint8_t *t, const int pitch, float *p) {
  uint16x8_t m0, m1, m2, m3, m4, m5;
  /* Initialize before the lane loads so vld1_lane_u32 does not read an
     indeterminate vector. */
  uint32x2_t temp1 = vdup_n_u32(0), temp4 = vdup_n_u32(0);

  m0 = vmovl_u8(vld1_u8(t));
  temp1 = vld1_lane_u32((const uint32_t *)(t + 8), temp1, 0);
  temp1 = vld1_lane_u32((const uint32_t *)(t + pitch * 2), temp1, 1);
  m1 = vmovl_u8(vreinterpret_u8_u32(temp1));
  m2 = vmovl_u8(vld1_u8(t + pitch * 2 + 4));

  t += pitch * 4;

  m3 = vmovl_u8(vld1_u8(t));
  temp4 = vld1_lane_u32((const uint32_t *)(t + 8), temp4, 0);
  temp4 = vld1_lane_u32((const uint32_t *)(t + pitch * 2), temp4, 1);
  m4 = vmovl_u8(vreinterpret_u8_u32(temp4));
  m5 = vmovl_u8(vld1_u8(t + pitch * 2 + 4));

  vst1q_f32(p, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m0))));
  vst1q_f32(p + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m0))));
  vst1q_f32(p + 8, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m1))));
  vst1q_f32(p + 12, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m1))));
  vst1q_f32(p + 16, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m2))));
  vst1q_f32(p + 20, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m2))));
  vst1q_f32(p + 24, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m3))));
  vst1q_f32(p + 28, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m3))));
  vst1q_f32(p + 32, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m4))));
  vst1q_f32(p + 36, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m4))));
  vst1q_f32(p + 40, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m5))));
  vst1q_f32(p + 44, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m5))));
}
static void ConvertBGRAToRGBA(const uint32_t* src, int num_pixels, uint8_t* dst) { const uint32_t* const end = src + (num_pixels & ~1); const uint8x8_t shuffle = vld1_u8(kRGBAShuffle); for (; src < end; src += 2) { const uint8x8_t pixels = vld1_u8((uint8_t*)src); vst1_u8(dst, vtbl1_u8(pixels, shuffle)); dst += 8; } VP8LConvertBGRAToRGBA_C(src, num_pixels & 1, dst); // left-overs }
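/* kRGBAShuffle is assumed to be the 8-byte vtbl index table that swaps the
   B and R bytes of two little-endian BGRA pixels in place (per-pixel byte
   layout B,G,R,A), leaving G and A untouched: */
static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };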
unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) { uint8x8_t v_s0 = vld1_u8(s); const uint8x8_t v_s1 = vld1_u8(s + p); uint16x8_t v_sum = vaddl_u8(v_s0, v_s1); v_s0 = vld1_u8(s + 2 * p); v_sum = vaddw_u8(v_sum, v_s0); v_s0 = vld1_u8(s + 3 * p); v_sum = vaddw_u8(v_sum, v_s0); v_s0 = vld1_u8(s + 4 * p); v_sum = vaddw_u8(v_sum, v_s0); v_s0 = vld1_u8(s + 5 * p); v_sum = vaddw_u8(v_sum, v_s0); v_s0 = vld1_u8(s + 6 * p); v_sum = vaddw_u8(v_sum, v_s0); v_s0 = vld1_u8(s + 7 * p); v_sum = vaddw_u8(v_sum, v_s0); return (horizontal_add_u16x8(v_sum) + 32) >> 6; }
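/* Assumed sketch of the horizontal_add_u16x8 helper referenced above
   (libvpx has an equivalent in vpx_dsp/arm/sum_neon.h): */
static inline unsigned int horizontal_add_u16x8(const uint16x8_t v) {
  const uint32x4_t a = vpaddlq_u16(v);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}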
static INLINE uint8x8x4_t read_4x8(unsigned char *src, int pitch) { uint8x8x4_t x; const uint8x8_t a = vld1_u8(src); const uint8x8_t b = vld1_u8(src + pitch * 1); const uint8x8_t c = vld1_u8(src + pitch * 2); const uint8x8_t d = vld1_u8(src + pitch * 3); const uint8x8_t e = vld1_u8(src + pitch * 4); const uint8x8_t f = vld1_u8(src + pitch * 5); const uint8x8_t g = vld1_u8(src + pitch * 6); const uint8x8_t h = vld1_u8(src + pitch * 7); const uint32x2x2_t r04_u32 = vtrn_u32(vreinterpret_u32_u8(a), vreinterpret_u32_u8(e)); const uint32x2x2_t r15_u32 = vtrn_u32(vreinterpret_u32_u8(b), vreinterpret_u32_u8(f)); const uint32x2x2_t r26_u32 = vtrn_u32(vreinterpret_u32_u8(c), vreinterpret_u32_u8(g)); const uint32x2x2_t r37_u32 = vtrn_u32(vreinterpret_u32_u8(d), vreinterpret_u32_u8(h)); const uint16x4x2_t r02_u16 = vtrn_u16(vreinterpret_u16_u32(r04_u32.val[0]), vreinterpret_u16_u32(r26_u32.val[0])); const uint16x4x2_t r13_u16 = vtrn_u16(vreinterpret_u16_u32(r15_u32.val[0]), vreinterpret_u16_u32(r37_u32.val[0])); const uint8x8x2_t r01_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[0]), vreinterpret_u8_u16(r13_u16.val[0])); const uint8x8x2_t r23_u8 = vtrn_u8(vreinterpret_u8_u16(r02_u16.val[1]), vreinterpret_u8_u16(r13_u16.val[1])); /* * after vtrn_u32 00 01 02 03 | 40 41 42 43 10 11 12 13 | 50 51 52 53 20 21 22 23 | 60 61 62 63 30 31 32 33 | 70 71 72 73 --- * after vtrn_u16 00 01 20 21 | 40 41 60 61 02 03 22 23 | 42 43 62 63 10 11 30 31 | 50 51 70 71 12 13 32 33 | 52 52 72 73 00 01 20 21 | 40 41 60 61 10 11 30 31 | 50 51 70 71 02 03 22 23 | 42 43 62 63 12 13 32 33 | 52 52 72 73 --- * after vtrn_u8 00 10 20 30 | 40 50 60 70 01 11 21 31 | 41 51 61 71 02 12 22 32 | 42 52 62 72 03 13 23 33 | 43 53 63 73 */ x.val[0] = r01_u8.val[0]; x.val[1] = r01_u8.val[1]; x.val[2] = r23_u8.val[0]; x.val[3] = r23_u8.val[1]; return x; }
static inline void char_to_float_vectors(const unsigned char * sourcep, float32x4_t *mp0, float32x4_t * mp1) { uint8x8_t rawpixels; /* source pixels as {[YUYV]0 [YUYV]1} */ int16x8_t widerpixels; /* rawpixels promoted to shorts per component */ int16x4_t high16, low16; int32x4_t high32, low32; const int16x8_t uvbias = {0, 128, 0, 128, 0, 128, 0, 128}; rawpixels = vld1_u8(sourcep); widerpixels = vreinterpretq_s16_u16(vmovl_u8(rawpixels)); /* subtract uvbias from widerpixels */ widerpixels = vsubq_s16(widerpixels, uvbias); /* now take widerpixels apart into (low16, high16) and */ /* then expand those into (low32, high32) */ low16 = vget_low_s16(widerpixels); high16 = vget_high_s16(widerpixels); high32 = vmovl_s16(high16); low32 = vmovl_s16(low16); /* now convert low32 and high32 into floats and store them in */ /* *mp0, *mp1 */ *mp0 = vcvtq_f32_s32(low32); *mp1 = vcvtq_f32_s32(high32); }
void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint8x8_t XABCD_u8 = vld1_u8(above - 1); const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); const uint32x2_t zero = vdup_n_u32(0); const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0); const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL); const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8)); const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC); const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8)); const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16)); const uint8_t D = vget_lane_u8(XABCD_u8, 4); const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6); const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC); const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8); const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_); const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); const uint32x2_t r3 = vreinterpret_u32_u8(avg2); const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); }
void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int i; uint8x8_t d0u8 = vdup_n_u8(0); (void)left; d0u8 = vld1_u8(above); for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8); }
void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 }; static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 }; const uint8x8_t sh_12345677 = vld1_u8(shuffle1); const uint8x8_t sh_23456777 = vld1_u8(shuffle2); const uint8x8_t A0 = vld1_u8(above); // top row const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677); const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777); const uint8x8_t avg1 = vhadd_u8(A0, A2); uint8x8_t row = vrhadd_u8(avg1, A1); int i; (void)left; for (i = 0; i < 7; ++i) { vst1_u8(dst + i * stride, row); row = vtbl1_u8(row, sh_12345677); } vst1_u8(dst + i * stride, row); }
void byte2word64_neon(const uint8_t *t, const int pitch, float *pf) { uint16_t *p = (uint16_t *)pf; vst1q_u16(p, vmovl_u8(vld1_u8(t))); vst1q_u16(p + 8, vmovl_u8(vld1_u8(t + 8))); vst1q_u16(p + 16, vmovl_u8(vld1_u8(t + pitch * 2))); vst1q_u16(p + 24, vmovl_u8(vld1_u8(t + pitch * 2 + 8))); vst1q_u16(p + 32, vmovl_u8(vld1_u8(t + pitch * 4))); vst1q_u16(p + 40, vmovl_u8(vld1_u8(t + pitch * 4 + 8))); vst1q_u16(p + 48, vmovl_u8(vld1_u8(t + pitch * 6))); vst1q_u16(p + 56, vmovl_u8(vld1_u8(t + pitch * 6 + 8))); }
unsigned int vpx_get4x4sse_cs_neon( const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int recon_stride) { int16x4_t d22s16, d24s16, d26s16, d28s16; int64x1_t d0s64; uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; int32x4_t q7s32, q8s32, q9s32, q10s32; uint16x8_t q11u16, q12u16, q13u16, q14u16; int64x2_t q1s64; d0u8 = vld1_u8(src_ptr); src_ptr += source_stride; d4u8 = vld1_u8(ref_ptr); ref_ptr += recon_stride; d1u8 = vld1_u8(src_ptr); src_ptr += source_stride; d5u8 = vld1_u8(ref_ptr); ref_ptr += recon_stride; d2u8 = vld1_u8(src_ptr); src_ptr += source_stride; d6u8 = vld1_u8(ref_ptr); ref_ptr += recon_stride; d3u8 = vld1_u8(src_ptr); src_ptr += source_stride; d7u8 = vld1_u8(ref_ptr); ref_ptr += recon_stride; q11u16 = vsubl_u8(d0u8, d4u8); q12u16 = vsubl_u8(d1u8, d5u8); q13u16 = vsubl_u8(d2u8, d6u8); q14u16 = vsubl_u8(d3u8, d7u8); d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16)); d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16)); d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16)); d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16)); q7s32 = vmull_s16(d22s16, d22s16); q8s32 = vmull_s16(d24s16, d24s16); q9s32 = vmull_s16(d26s16, d26s16); q10s32 = vmull_s16(d28s16, d28s16); q7s32 = vaddq_s32(q7s32, q8s32); q9s32 = vaddq_s32(q9s32, q10s32); q9s32 = vaddq_s32(q7s32, q9s32); q1s64 = vpaddlq_s32(q9s32); d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64)); return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0); }
// 'do_above' and 'do_left' facilitate branch removal when inlined. static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int do_above, int do_left) { uint16x8_t sum_top; uint16x8_t sum_left; uint8x8_t dc0; if (do_above) { const uint8x8_t A = vld1_u8(above); // top row const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top const uint16x4_t p1 = vpadd_u16(p0, p0); const uint16x4_t p2 = vpadd_u16(p1, p1); sum_top = vcombine_u16(p2, p2); } if (do_left) { const uint8x8_t L = vld1_u8(left); // left border const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left const uint16x4_t p1 = vpadd_u16(p0, p0); const uint16x4_t p2 = vpadd_u16(p1, p1); sum_left = vcombine_u16(p2, p2); } if (do_above && do_left) { const uint16x8_t sum = vaddq_u16(sum_left, sum_top); dc0 = vrshrn_n_u16(sum, 4); } else if (do_above) { dc0 = vrshrn_n_u16(sum_top, 3); } else if (do_left) { dc0 = vrshrn_n_u16(sum_left, 3); } else { dc0 = vdup_n_u8(0x80); } { const uint8x8_t dc = vdup_lane_u8(dc0, 0); int i; for (i = 0; i < 8; ++i) { vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc)); } } }
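/* dc_8x8 is normally wrapped by thin entry points that pin do_above/do_left
   to compile-time constants so the branches fold away when inlined. A sketch
   of the assumed wrappers, following the libvpx naming convention (not
   necessarily the exact upstream source): */
void vpx_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  dc_8x8(dst, stride, above, left, 1, 1);   /* use both borders */
}
void vpx_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)left;
  dc_8x8(dst, stride, above, NULL, 1, 0);   /* top row only */
}
void vpx_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  (void)above;
  dc_8x8(dst, stride, NULL, left, 0, 1);    /* left column only */
}
void vpx_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  (void)above;
  (void)left;
  dc_8x8(dst, stride, NULL, NULL, 0, 0);    /* neither border: constant 0x80 */
}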
// Process a block exactly 8 wide and any height. static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *output_ptr, unsigned int src_pixels_per_line, int pixel_step, unsigned int output_height, const uint8_t *filter) { const uint8x8_t f0 = vdup_n_u8(filter[0]); const uint8x8_t f1 = vdup_n_u8(filter[1]); unsigned int i; for (i = 0; i < output_height; ++i) { const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]); const uint16x8_t a = vmull_u8(src_0, f0); const uint16x8_t b = vmlal_u8(a, src_1, f1); const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); vst1_u8(output_ptr, out); src_ptr += src_pixels_per_line; output_ptr += 8; } }
void vp9_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { int j; uint16x8_t q0u16, q3u16, q10u16; int16x8_t q0s16; uint16x4_t d20u16; uint8x8_t d0u8, d2u8, d30u8; d0u8 = vld1_dup_u8(above - 1); d30u8 = vld1_u8(left); d2u8 = vld1_u8(above); q10u16 = vmovl_u8(d30u8); q3u16 = vsubl_u8(d2u8, d0u8); d20u16 = vget_low_u16(q10u16); for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { q0u16 = vdupq_lane_u16(d20u16, 0); q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); d0u8 = vqmovun_s16(q0s16); vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); dst += stride; q0u16 = vdupq_lane_u16(d20u16, 1); q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); d0u8 = vqmovun_s16(q0s16); vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); dst += stride; q0u16 = vdupq_lane_u16(d20u16, 2); q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); d0u8 = vqmovun_s16(q0s16); vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); dst += stride; q0u16 = vdupq_lane_u16(d20u16, 3); q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), vreinterpretq_s16_u16(q0u16)); d0u8 = vqmovun_s16(q0s16); vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); dst += stride; } }
void vp8_copy_mem8x8_neon(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) { uint8x8_t vtmp; int r; for (r = 0; r < 8; ++r) { vtmp = vld1_u8(src); vst1_u8(dst, vtmp); src += src_stride; dst += dst_stride; } }
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *output_ptr, unsigned int src_pixels_per_line, int pixel_step, unsigned int output_height, unsigned int output_width, const uint16_t *vpx_filter) { const uint8x8_t f0 = vmov_n_u8((uint8_t)vpx_filter[0]); const uint8x8_t f1 = vmov_n_u8((uint8_t)vpx_filter[1]); unsigned int i; for (i = 0; i < output_height; ++i) { const uint8x8_t src_0 = vld1_u8(&src_ptr[0]); const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]); const uint16x8_t a = vmull_u8(src_0, f0); const uint16x8_t b = vmlal_u8(a, src_1, f1); const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS); vst1_u8(&output_ptr[0], out); // Next row... src_ptr += src_pixels_per_line; output_ptr += output_width; } }
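/* For both var_filter_block2d_bil_w8 variants above, FILTER_BITS is assumed
   to be 7, i.e. the two bilinear taps sum to 1 << FILTER_BITS = 128, so
   vrshrn_n_u16(b, FILTER_BITS) divides the accumulated sum by 128 with
   rounding. The tap table and the two-pass call pattern below are a sketch
   of the assumed usage, not the exact upstream definitions: */
#define FILTER_BITS 7
static const uint16_t bilinear_taps[8][2] = {
  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 }
};
/* Typical use (hypothetical caller): a horizontal pass with pixel_step == 1
   into a (height + 1) x 8 temporary buffer, then a vertical pass over that
   buffer with pixel_step == 8, before computing the variance of the result. */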
static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) { const uint32_t* const end = argb_data + (num_pixels & ~3); const uint8x8_t shuffle = vld1_u8(kGreenShuffle); for (; argb_data < end; argb_data += 4) { const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data); const uint8x16_t greens = vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle), vtbl1_u8(vget_high_u8(argb), shuffle)); vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens)); } // fallthrough and finish off with plain-C VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3); }
void byte2word48_neon(const uint8_t *t, const int pitch, float *pf) {
  uint16_t *p = (uint16_t *)pf;
  /* m1 and m4 are built up with lane loads, so give them a defined value
     first to avoid reading an indeterminate vector. */
  uint8x8_t m0, m1 = vdup_n_u8(0), m2, m3, m4 = vdup_n_u8(0), m5;

  m0 = vld1_u8(t);
  m1 = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)(t + 8), vreinterpret_u32_u8(m1), 0));
  m1 = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)(t + pitch * 2), vreinterpret_u32_u8(m1), 1));
  m2 = vld1_u8(t + pitch * 2 + 4);

  t += pitch * 4;

  m3 = vld1_u8(t);
  m4 = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)(t + 8), vreinterpret_u32_u8(m4), 0));
  m4 = vreinterpret_u8_u32(vld1_lane_u32((const uint32_t *)(t + pitch * 2), vreinterpret_u32_u8(m4), 1));
  m5 = vld1_u8(t + pitch * 2 + 4);

  vst1q_u16(p, vmovl_u8(m0));
  vst1q_u16(p + 8, vmovl_u8(m1));
  vst1q_u16(p + 16, vmovl_u8(m2));
  vst1q_u16(p + 24, vmovl_u8(m3));
  vst1q_u16(p + 32, vmovl_u8(m4));
  vst1q_u16(p + 40, vmovl_u8(m5));
}
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const uint32_t* const end = argb_data + (num_pixels & ~3);
#ifdef USE_VTBLQ
  const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
#else
  const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
#endif
  for (; argb_data < end; argb_data += 4) {
    const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
    const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
    vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
  }
  // fallthrough and finish off with plain-C
  VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);
}
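/* The kGreenShuffle table and DoGreenShuffle helper used by the two
   green-transform routines above are assumed to look like the sketch below
   (non-VTBLQ path only). vtbl1_u8 returns 0 for out-of-range indexes, so the
   255 lanes zero the green/alpha byte positions and only B and R are changed
   by the subsequent add/subtract: */
static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };

static inline uint8x16_t DoGreenShuffle(const uint8x16_t argb,
                                        const uint8x8_t shuffle) {
  return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
                     vtbl1_u8(vget_high_u8(argb), shuffle));
}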
int
test_vdupb_lane_u8 ()
{
  uint8x8_t a;
  uint8_t b;
  uint8_t c[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };

  a = vld1_u8 (c);

  b = wrap_vdupb_lane_u8_0 (a, a);
  if (c[0] != b)
    return 1;
  b = wrap_vdupb_lane_u8_1 (a);
  if (c[1] != b)
    return 1;

  return 0;
}
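/* The two wrappers are assumed to be noinline shims around the AArch64
   scalar-dup intrinsic vdupb_lane_u8; the unused first argument of the _0
   variant only influences register allocation in the original test. This is
   a sketch, not the exact testsuite source: */
__attribute__ ((noinline)) uint8_t
wrap_vdupb_lane_u8_0 (uint8x8_t dummy, uint8x8_t a)
{
  return vdupb_lane_u8 (a, 0);
}

__attribute__ ((noinline)) uint8_t
wrap_vdupb_lane_u8_1 (uint8x8_t a)
{
  return vdupb_lane_u8 (a, 1);
}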
void vpx_lpf_horizontal_4_dual_neon( uint8_t *s, int p /* pitch */, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1) { uint8x8_t dblimit0, dlimit0, dthresh0, dblimit1, dlimit1, dthresh1; uint8x16_t qblimit, qlimit, qthresh; uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8; dblimit0 = vld1_u8(blimit0); dlimit0 = vld1_u8(limit0); dthresh0 = vld1_u8(thresh0); dblimit1 = vld1_u8(blimit1); dlimit1 = vld1_u8(limit1); dthresh1 = vld1_u8(thresh1); qblimit = vcombine_u8(dblimit0, dblimit1); qlimit = vcombine_u8(dlimit0, dlimit1); qthresh = vcombine_u8(dthresh0, dthresh1); s -= (p << 2); q3u8 = vld1q_u8(s); s += p; q4u8 = vld1q_u8(s); s += p; q5u8 = vld1q_u8(s); s += p; q6u8 = vld1q_u8(s); s += p; q7u8 = vld1q_u8(s); s += p; q8u8 = vld1q_u8(s); s += p; q9u8 = vld1q_u8(s); s += p; q10u8 = vld1q_u8(s); loop_filter_neon_16(qblimit, qlimit, qthresh, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, &q5u8, &q6u8, &q7u8, &q8u8); s -= (p * 5); vst1q_u8(s, q5u8); s += p; vst1q_u8(s, q6u8); s += p; vst1q_u8(s, q7u8); s += p; vst1q_u8(s, q8u8); return; }
static void ConvertBGRAToBGR(const uint32_t* src, int num_pixels, uint8_t* dst) { const uint32_t* const end = src + (num_pixels & ~7); const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]); const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]); const uint8x8_t shuffle2 = vld1_u8(kBGRShuffle[2]); for (; src < end; src += 8) { uint8x8x4_t pixels; INIT_VECTOR4(pixels, vld1_u8((const uint8_t*)(src + 0)), vld1_u8((const uint8_t*)(src + 2)), vld1_u8((const uint8_t*)(src + 4)), vld1_u8((const uint8_t*)(src + 6))); vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0)); vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1)); vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2)); dst += 8 * 3; } VP8LConvertBGRAToBGR_C(src, num_pixels & 7, dst); // left-overs }
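/* INIT_VECTOR4 and kBGRShuffle are defined elsewhere in libwebp; assumed
   forms below. The three 8-byte tables select the B, G, R bytes from eight
   little-endian BGRA pixels (32 loaded bytes), i.e. they drop every fourth
   byte to pack 24 bytes of BGR output: */
#define INIT_VECTOR4(v, a, b, c, d) do {  \
  (v).val[0] = (a);                       \
  (v).val[1] = (b);                       \
  (v).val[2] = (c);                       \
  (v).val[3] = (d);                       \
} while (0)

static const uint8_t kBGRShuffle[3][8] = {
  {  0,  1,  2,  4,  5,  6,  8,  9 },
  { 10, 12, 13, 14, 16, 17, 18, 20 },
  { 21, 22, 24, 25, 26, 28, 29, 30 }
};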
void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above)); // top row const uint64x1_t A1 = vshr_n_u64(A0, 8); const uint64x1_t A2 = vshr_n_u64(A0, 16); const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0); const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1); const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2); const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00); const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0); const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); const uint32x2_t r0 = vreinterpret_u32_u8(avg2); const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); (void)left; vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); dst[3 * stride + 3] = above[7]; }
void aom_lpf_horizontal_8_neon(uint8_t *src, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; uint8_t *s, *psrc; uint8x8_t dblimit, dlimit, dthresh; uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; uint8x8_t d16u8, d17u8, d18u8; dblimit = vld1_u8(blimit); dlimit = vld1_u8(limit); dthresh = vld1_u8(thresh); psrc = src - (pitch << 2); for (i = 0; i < 1; i++) { s = psrc + i * 8; d3u8 = vld1_u8(s); s += pitch; d4u8 = vld1_u8(s); s += pitch; d5u8 = vld1_u8(s); s += pitch; d6u8 = vld1_u8(s); s += pitch; d7u8 = vld1_u8(s); s += pitch; d16u8 = vld1_u8(s); s += pitch; d17u8 = vld1_u8(s); s += pitch; d18u8 = vld1_u8(s); mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8); s -= (pitch * 6); vst1_u8(s, d0u8); s += pitch; vst1_u8(s, d1u8); s += pitch; vst1_u8(s, d2u8); s += pitch; vst1_u8(s, d3u8); s += pitch; vst1_u8(s, d4u8); s += pitch; vst1_u8(s, d5u8); } return; }
void aom_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh) { int i; uint8_t *s; uint8x8_t dblimit, dlimit, dthresh; uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; uint8x8_t d16u8, d17u8, d18u8; uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3; uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7; uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11; uint8x8x4_t d4Result; uint8x8x2_t d2Result; dblimit = vld1_u8(blimit); dlimit = vld1_u8(limit); dthresh = vld1_u8(thresh); for (i = 0; i < 1; i++) { s = src + (i * (pitch << 3)) - 4; d3u8 = vld1_u8(s); s += pitch; d4u8 = vld1_u8(s); s += pitch; d5u8 = vld1_u8(s); s += pitch; d6u8 = vld1_u8(s); s += pitch; d7u8 = vld1_u8(s); s += pitch; d16u8 = vld1_u8(s); s += pitch; d17u8 = vld1_u8(s); s += pitch; d18u8 = vld1_u8(s); d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8), vreinterpret_u32_u8(d7u8)); d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8), vreinterpret_u32_u8(d16u8)); d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8), vreinterpret_u32_u8(d17u8)); d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8), vreinterpret_u32_u8(d18u8)); d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]), vreinterpret_u16_u32(d2tmp2.val[0])); d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]), vreinterpret_u16_u32(d2tmp3.val[0])); d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]), vreinterpret_u16_u32(d2tmp2.val[1])); d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]), vreinterpret_u16_u32(d2tmp3.val[1])); d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]), vreinterpret_u8_u16(d2tmp5.val[0])); d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]), vreinterpret_u8_u16(d2tmp5.val[1])); d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]), vreinterpret_u8_u16(d2tmp7.val[0])); d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]), vreinterpret_u8_u16(d2tmp7.val[1])); d3u8 = d2tmp8.val[0]; d4u8 = d2tmp8.val[1]; d5u8 = d2tmp9.val[0]; d6u8 = d2tmp9.val[1]; d7u8 = d2tmp10.val[0]; d16u8 = d2tmp10.val[1]; d17u8 = d2tmp11.val[0]; d18u8 = d2tmp11.val[1]; mbloop_filter_neon(dblimit, dlimit, dthresh, d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8, &d0u8, &d1u8, &d2u8, &d3u8, &d4u8, &d5u8); d4Result.val[0] = d0u8; d4Result.val[1] = d1u8; d4Result.val[2] = d2u8; d4Result.val[3] = d3u8; d2Result.val[0] = d4u8; d2Result.val[1] = d5u8; s = src - 3; vst4_lane_u8(s, d4Result, 0); s += pitch; vst4_lane_u8(s, d4Result, 1); s += pitch; vst4_lane_u8(s, d4Result, 2); s += pitch; vst4_lane_u8(s, d4Result, 3); s += pitch; vst4_lane_u8(s, d4Result, 4); s += pitch; vst4_lane_u8(s, d4Result, 5); s += pitch; vst4_lane_u8(s, d4Result, 6); s += pitch; vst4_lane_u8(s, d4Result, 7); s = src + 1; vst2_lane_u8(s, d2Result, 0); s += pitch; vst2_lane_u8(s, d2Result, 1); s += pitch; vst2_lane_u8(s, d2Result, 2); s += pitch; vst2_lane_u8(s, d2Result, 3); s += pitch; vst2_lane_u8(s, d2Result, 4); s += pitch; vst2_lane_u8(s, d2Result, 5); s += pitch; vst2_lane_u8(s, d2Result, 6); s += pitch; vst2_lane_u8(s, d2Result, 7); } return; }
int crypto_stream_xor( unsigned char *c, const unsigned char *m,unsigned long long mlen, const unsigned char *n, const unsigned char *k ) { const uint32x4_t abab = {-1,0,-1,0}; const uint64x1_t nextblock = {1}; uint32x4_t k0k1k2k3 = (uint32x4_t) vld1q_u8((uint8_t *) k); uint32x4_t k4k5k6k7 = (uint32x4_t) vld1q_u8((uint8_t *) (k + 16)); uint32x4_t start0 = (uint32x4_t) vld1q_u8((uint8_t *) sigma); uint32x2_t n0n1 = (uint32x2_t) vld1_u8((uint8_t *) n); uint32x2_t n2n3 = {0,0}; uint32x2_t k0k1 = vget_low_u32(k0k1k2k3); uint32x2_t k2k3 = vget_high_u32(k0k1k2k3); uint32x2_t k4k5 = vget_low_u32(k4k5k6k7); uint32x2_t k6k7 = vget_high_u32(k4k5k6k7); uint32x2_t n1n0 = vext_u32(n0n1,n0n1,1); uint32x2_t n3n2; uint32x2_t n0k4 = vext_u32(n1n0,k4k5,1); uint32x2_t k5k0 = vext_u32(k4k5,k0k1,1); uint32x2_t k1n1 = vext_u32(k0k1,n1n0,1); uint32x2_t n2k6; uint32x2_t k7k2 = vext_u32(k6k7,k2k3,1); uint32x2_t k3n3; uint32x4_t start1 = vcombine_u32(k5k0,n0k4); uint32x4_t start2; uint32x4_t start3; register uint32x4_t diag0; register uint32x4_t diag1; register uint32x4_t diag2; register uint32x4_t diag3; uint32x4_t next_start2; uint32x4_t next_start3; register uint32x4_t next_diag0; register uint32x4_t next_diag1; register uint32x4_t next_diag2; register uint32x4_t next_diag3; uint32x4_t x0x5x10x15; uint32x4_t x12x1x6x11; uint32x4_t x8x13x2x7; uint32x4_t x4x9x14x3; uint32x4_t x0x1x10x11; uint32x4_t x12x13x6x7; uint32x4_t x8x9x2x3; uint32x4_t x4x5x14x15; uint32x4_t x0x1x2x3; uint32x4_t x4x5x6x7; uint32x4_t x8x9x10x11; uint32x4_t x12x13x14x15; uint32x4_t m0m1m2m3; uint32x4_t m4m5m6m7; uint32x4_t m8m9m10m11; uint32x4_t m12m13m14m15; register uint32x4_t a0; register uint32x4_t a1; register uint32x4_t a2; register uint32x4_t a3; register uint32x4_t b0; register uint32x4_t b1; register uint32x4_t b2; register uint32x4_t b3; register uint32x4_t next_a0; register uint32x4_t next_a1; register uint32x4_t next_a2; register uint32x4_t next_a3; register uint32x4_t next_b0; register uint32x4_t next_b1; register uint32x4_t next_b2; register uint32x4_t next_b3; unsigned char block[64]; unsigned char *savec; int i; int flagm = (m != 0); if (!mlen) return 0; if (mlen < 128) goto mlenatleast1; mlenatleast128: n3n2 = vext_u32(n2n3,n2n3,1); n2k6 = vext_u32(n3n2,k6k7,1); k3n3 = vext_u32(k2k3,n3n2,1); start2 = vcombine_u32(n2k6,k1n1); start3 = vcombine_u32(k3n3,k7k2); n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3); diag0 = start0; diag1 = start1; diag2 = start2; diag3 = start3; n3n2 = vext_u32(n2n3,n2n3,1); n2k6 = vext_u32(n3n2,k6k7,1); k3n3 = vext_u32(k2k3,n3n2,1); next_start2 = vcombine_u32(n2k6,k1n1); next_start3 = vcombine_u32(k3n3,k7k2); n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3); next_diag0 = start0; next_diag1 = start1; next_diag2 = next_start2; next_diag3 = next_start3; for (i = ROUNDS;i > 0;i -= 2) { a0 = diag1 + diag0; b0 = vshlq_n_u32(a0,7); next_a0 = next_diag1 + next_diag0; a0 = vsriq_n_u32(b0,a0,25); next_b0 = vshlq_n_u32(next_a0,7); diag3 ^= a0; next_a0 = vsriq_n_u32(next_b0,next_a0,25); a1 = diag0 + diag3; next_diag3 ^= next_a0; b1 = vshlq_n_u32(a1,9); next_a1 = next_diag0 + next_diag3; a1 = vsriq_n_u32(b1,a1,23); next_b1 = vshlq_n_u32(next_a1,9); diag2 ^= a1; next_a1 = vsriq_n_u32(next_b1,next_a1,23); a2 = diag3 + diag2; diag3 = vextq_u32(diag3,diag3,3); next_diag2 ^= next_a1; b2 = vshlq_n_u32(a2,13); next_a2 = next_diag3 + next_diag2; next_diag3 = vextq_u32(next_diag3,next_diag3,3); a2 = vsriq_n_u32(b2,a2,19); next_b2 = vshlq_n_u32(next_a2,13); diag1 ^= a2; next_a2 = 
vsriq_n_u32(next_b2,next_a2,19); a3 = diag2 + diag1; diag2 = vextq_u32(diag2,diag2,2); next_diag1 ^= next_a2; b3 = vshlq_n_u32(a3,18); diag1 = vextq_u32(diag1,diag1,1); next_a3 = next_diag2 + next_diag1; next_diag2 = vextq_u32(next_diag2,next_diag2,2); a3 = vsriq_n_u32(b3,a3,14); next_b3 = vshlq_n_u32(next_a3,18); next_diag1 = vextq_u32(next_diag1,next_diag1,1); diag0 ^= a3; next_a3 = vsriq_n_u32(next_b3,next_a3,14); a0 = diag3 + diag0; next_diag0 ^= next_a3; b0 = vshlq_n_u32(a0,7); next_a0 = next_diag3 + next_diag0; a0 = vsriq_n_u32(b0,a0,25); next_b0 = vshlq_n_u32(next_a0,7); diag1 ^= a0; next_a0 = vsriq_n_u32(next_b0,next_a0,25); a1 = diag0 + diag1; next_diag1 ^= next_a0; b1 = vshlq_n_u32(a1,9); next_a1 = next_diag0 + next_diag1; a1 = vsriq_n_u32(b1,a1,23); next_b1 = vshlq_n_u32(next_a1,9); diag2 ^= a1; next_a1 = vsriq_n_u32(next_b1,next_a1,23); a2 = diag1 + diag2; diag1 = vextq_u32(diag1,diag1,3); next_diag2 ^= next_a1; b2 = vshlq_n_u32(a2,13); next_a2 = next_diag1 + next_diag2; next_diag1 = vextq_u32(next_diag1,next_diag1,3); a2 = vsriq_n_u32(b2,a2,19); next_b2 = vshlq_n_u32(next_a2,13); diag3 ^= a2; next_a2 = vsriq_n_u32(next_b2,next_a2,19); a3 = diag2 + diag3; diag2 = vextq_u32(diag2,diag2,2); next_diag3 ^= next_a2; b3 = vshlq_n_u32(a3,18); diag3 = vextq_u32(diag3,diag3,1); next_a3 = next_diag2 + next_diag3; next_diag2 = vextq_u32(next_diag2,next_diag2,2); a3 = vsriq_n_u32(b3,a3,14); next_b3 = vshlq_n_u32(next_a3,18); next_diag3 = vextq_u32(next_diag3,next_diag3,1); diag0 ^= a3; next_a3 = vsriq_n_u32(next_b3,next_a3,14); next_diag0 ^= next_a3; } x0x5x10x15 = diag0 + start0; x12x1x6x11 = diag1 + start1; x8x13x2x7 = diag2 + start2; x4x9x14x3 = diag3 + start3; if (flagm) m0m1m2m3 = (uint32x4_t) vld1q_u8((uint8_t *) m); if (flagm) m4m5m6m7 = (uint32x4_t) vld1q_u8(16 + (uint8_t *) m); if (flagm) m8m9m10m11 = (uint32x4_t) vld1q_u8(32 + (uint8_t *) m); if (flagm) m12m13m14m15 = (uint32x4_t) vld1q_u8(48 + (uint8_t *) m); x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11); x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7); x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3); x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15); x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3)); x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7)); x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11)); x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15)); if (flagm) x0x1x2x3 ^= m0m1m2m3; if (flagm) x4x5x6x7 ^= m4m5m6m7; if (flagm) x8x9x10x11 ^= m8m9m10m11; if (flagm) x12x13x14x15 ^= m12m13m14m15; vst1q_u8((uint8_t *) c,(uint8x16_t) x0x1x2x3); vst1q_u8(16 + (uint8_t *) c,(uint8x16_t) x4x5x6x7); vst1q_u8(32 + (uint8_t *) c,(uint8x16_t) x8x9x10x11); vst1q_u8(48 + (uint8_t *) c,(uint8x16_t) x12x13x14x15); x0x5x10x15 = next_diag0 + start0; x12x1x6x11 = next_diag1 + start1; x8x13x2x7 = next_diag2 + next_start2; x4x9x14x3 = next_diag3 + next_start3; if (flagm) m0m1m2m3 = (uint32x4_t) vld1q_u8(64 + (uint8_t *) m); if (flagm) m4m5m6m7 = (uint32x4_t) vld1q_u8(80 + (uint8_t *) m); if (flagm) m8m9m10m11 = (uint32x4_t) vld1q_u8(96 + (uint8_t *) m); if (flagm) m12m13m14m15 = (uint32x4_t) vld1q_u8(112 + (uint8_t *) m); x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11); x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7); x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3); x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15); x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3)); x4x5x6x7 = 
vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7)); x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11)); x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15)); if (flagm) x0x1x2x3 ^= m0m1m2m3; if (flagm) x4x5x6x7 ^= m4m5m6m7; if (flagm) x8x9x10x11 ^= m8m9m10m11; if (flagm) x12x13x14x15 ^= m12m13m14m15; vst1q_u8(64 + (uint8_t *) c,(uint8x16_t) x0x1x2x3); vst1q_u8(80 + (uint8_t *) c,(uint8x16_t) x4x5x6x7); vst1q_u8(96 + (uint8_t *) c,(uint8x16_t) x8x9x10x11); vst1q_u8(112 + (uint8_t *) c,(uint8x16_t) x12x13x14x15); mlen -= 128; c += 128; if (flagm) m += 128; if (mlen >= 128) goto mlenatleast128; mlenatleast1: if (mlen < 64) { if (flagm) for (i = 0;i < 64;++i) block[i] = 0; if (flagm) for (i = 0;i < mlen;++i) block[i] = m[i]; savec = c; c = block; if (flagm) m = block; } n3n2 = vext_u32(n2n3,n2n3,1); n2k6 = vext_u32(n3n2,k6k7,1); k3n3 = vext_u32(k2k3,n3n2,1); start2 = vcombine_u32(n2k6,k1n1); start3 = vcombine_u32(k3n3,k7k2); diag0 = start0; diag1 = start1; diag2 = start2; diag3 = start3; for (i = ROUNDS;i > 0;i -= 2) { a0 = diag1 + diag0; b0 = vshlq_n_u32(a0,7); a0 = vsriq_n_u32(b0,a0,25); diag3 ^= a0; a1 = diag0 + diag3; b1 = vshlq_n_u32(a1,9); a1 = vsriq_n_u32(b1,a1,23); diag2 ^= a1; a2 = diag3 + diag2; diag3 = vextq_u32(diag3,diag3,3); b2 = vshlq_n_u32(a2,13); a2 = vsriq_n_u32(b2,a2,19); diag1 ^= a2; a3 = diag2 + diag1; diag2 = vextq_u32(diag2,diag2,2); b3 = vshlq_n_u32(a3,18); diag1 = vextq_u32(diag1,diag1,1); a3 = vsriq_n_u32(b3,a3,14); diag0 ^= a3; a0 = diag3 + diag0; b0 = vshlq_n_u32(a0,7); a0 = vsriq_n_u32(b0,a0,25); diag1 ^= a0; a1 = diag0 + diag1; b1 = vshlq_n_u32(a1,9); a1 = vsriq_n_u32(b1,a1,23); diag2 ^= a1; a2 = diag1 + diag2; diag1 = vextq_u32(diag1,diag1,3); b2 = vshlq_n_u32(a2,13); a2 = vsriq_n_u32(b2,a2,19); diag3 ^= a2; a3 = diag2 + diag3; diag2 = vextq_u32(diag2,diag2,2); b3 = vshlq_n_u32(a3,18); diag3 = vextq_u32(diag3,diag3,1); a3 = vsriq_n_u32(b3,a3,14); diag0 ^= a3; } x0x5x10x15 = diag0 + start0; x12x1x6x11 = diag1 + start1; x8x13x2x7 = diag2 + start2; x4x9x14x3 = diag3 + start3; if (flagm) m0m1m2m3 = (uint32x4_t) vld1q_u8((uint8_t *) m); if (flagm) m4m5m6m7 = (uint32x4_t) vld1q_u8(16 + (uint8_t *) m); if (flagm) m8m9m10m11 = (uint32x4_t) vld1q_u8(32 + (uint8_t *) m); if (flagm) m12m13m14m15 = (uint32x4_t) vld1q_u8(48 + (uint8_t *) m); x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11); x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7); x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3); x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15); x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3)); x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7)); x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11)); x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15)); if (flagm) x0x1x2x3 ^= m0m1m2m3; if (flagm) x4x5x6x7 ^= m4m5m6m7; if (flagm) x8x9x10x11 ^= m8m9m10m11; if (flagm) x12x13x14x15 ^= m12m13m14m15; vst1q_u8((uint8_t *) c,(uint8x16_t) x0x1x2x3); vst1q_u8(16 + (uint8_t *) c,(uint8x16_t) x4x5x6x7); vst1q_u8(32 + (uint8_t *) c,(uint8x16_t) x8x9x10x11); vst1q_u8(48 + (uint8_t *) c,(uint8x16_t) x12x13x14x15); if (mlen < 64) { for (i = 0;i < mlen;++i) savec[i] = c[i]; } if (mlen <= 64) return 0; n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3); mlen -= 64; c += 64; if (flagm) m += 64; goto mlenatleast1; }
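/* The Salsa20 constants referenced by crypto_stream_xor above ('sigma' and
   ROUNDS) are assumed to be the standard ones; the 16-character string fills
   the array exactly, with no trailing NUL: */
static const unsigned char sigma[16] = "expand 32-byte k";
#define ROUNDS 20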
void ne10_img_hresize_4channels_linear_neon (const unsigned char** src, int** dst, int count, const int* xofs, const short* alpha, int swidth, int dwidth, int cn, int xmin, int xmax) { int dx, k; int dx0 = 0; int16x4x2_t alpha_vec; uint8x8_t dS0_vec, dS1_vec; int16x8_t qS0_vec, qS1_vec; int16x4_t dS0_0123, dS0_4567, dS1_0123, dS1_4567; int32x4_t qT0_vec, qT1_vec; int16x4_t dCoeff; dCoeff = vdup_n_s16 (INTER_RESIZE_COEF_SCALE); for (k = 0; k <= count - 2; k++) { const unsigned char *S0 = src[k], *S1 = src[k + 1]; int *D0 = dst[k], *D1 = dst[k + 1]; for (dx = dx0; dx < xmax; dx += 4) { int sx = xofs[dx]; alpha_vec = vld2_s16 (&alpha[dx * 2]); dS0_vec = vld1_u8 (&S0[sx]); dS1_vec = vld1_u8 (&S1[sx]); qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec)); qS1_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS1_vec)); dS0_0123 = vget_low_s16 (qS0_vec); dS0_4567 = vget_high_s16 (qS0_vec); dS1_0123 = vget_low_s16 (qS1_vec); dS1_4567 = vget_high_s16 (qS1_vec); qT0_vec = vmull_s16 (dS0_0123, alpha_vec.val[0]); qT1_vec = vmull_s16 (dS1_0123, alpha_vec.val[0]); qT0_vec = vmlal_s16 (qT0_vec, dS0_4567, alpha_vec.val[1]); qT1_vec = vmlal_s16 (qT1_vec, dS1_4567, alpha_vec.val[1]); vst1q_s32 (&D0[dx], qT0_vec); vst1q_s32 (&D1[dx], qT1_vec); } for (; dx < dwidth; dx += 4) { int sx = xofs[dx]; dS0_vec = vld1_u8 (&S0[sx]); dS1_vec = vld1_u8 (&S1[sx]); qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec)); qS1_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS1_vec)); dS0_0123 = vget_low_s16 (qS0_vec); dS1_0123 = vget_low_s16 (qS1_vec); qT0_vec = vmull_s16 (dS0_0123, dCoeff); qT1_vec = vmull_s16 (dS1_0123, dCoeff); vst1q_s32 (&D0[dx], qT0_vec); vst1q_s32 (&D1[dx], qT1_vec); } } for (; k < count; k++) { const unsigned char *S = src[k]; int *D = dst[k]; for (dx = 0; dx < xmax; dx += 4) { int sx = xofs[dx]; alpha_vec = vld2_s16 (&alpha[dx * 2]); dS0_vec = vld1_u8 (&S[sx]); qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec)); dS0_0123 = vget_low_s16 (qS0_vec); dS0_4567 = vget_high_s16 (qS0_vec); qT0_vec = vmull_s16 (dS0_0123, alpha_vec.val[0]); qT0_vec = vmlal_s16 (qT0_vec, dS0_4567, alpha_vec.val[1]); vst1q_s32 (&D[dx], qT0_vec); } for (; dx < dwidth; dx += 4) { int sx = xofs[dx]; dS0_vec = vld1_u8 (&S[sx]); qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec)); dS0_0123 = vget_low_s16 (qS0_vec); qT0_vec = vmull_s16 (dS0_0123, dCoeff); vst1q_s32 (&D[dx], qT0_vec); } } }
void ne10_img_vresize_linear_neon (const int** src, unsigned char* dst, const short* beta, int width) { const int *S0 = src[0], *S1 = src[1]; int32x4_t qS0_0123, qS0_4567, qS1_0123, qS1_4567; int32x4_t qT_0123, qT_4567; int16x4_t dT_0123, dT_4567; uint16x8_t qT_01234567; uint8x8_t dT_01234567, dDst_01234567; int32x2_t dBeta; dBeta = vset_lane_s32 ( (int) (beta[0]), dBeta, 0); dBeta = vset_lane_s32 ( (int) (beta[1]), dBeta, 1); int32x4_t qDelta, qMin, qMax; qDelta = vdupq_n_s32 (DELTA); qMin = vdupq_n_s32 (0); qMax = vdupq_n_s32 (255); int x = 0; for (; x <= width - 8; x += 8) { qS0_0123 = vld1q_s32 (&S0[x]); qS0_4567 = vld1q_s32 (&S0[x + 4]); qS1_0123 = vld1q_s32 (&S1[x]); qS1_4567 = vld1q_s32 (&S1[x + 4]); qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0); qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0); qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1); qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1); qT_0123 = vaddq_s32 (qT_0123, qDelta); qT_4567 = vaddq_s32 (qT_4567, qDelta); qT_0123 = vshrq_n_s32 (qT_0123, BITS); qT_4567 = vshrq_n_s32 (qT_4567, BITS); qT_0123 = vmaxq_s32 (qT_0123, qMin); qT_4567 = vmaxq_s32 (qT_4567, qMin); qT_0123 = vminq_s32 (qT_0123, qMax); qT_4567 = vminq_s32 (qT_4567, qMax); dT_0123 = vmovn_s32 (qT_0123); dT_4567 = vmovn_s32 (qT_4567); qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567)); dT_01234567 = vmovn_u16 (qT_01234567); vst1_u8 (&dst[x], dT_01234567); } if (x < width) { uint8x8_t dMask; dMask = vld1_u8 ( (uint8_t *) (&ne10_img_vresize_linear_mask_residual_table[ (width - x - 1)])); dDst_01234567 = vld1_u8 (&dst[x]); qS0_0123 = vld1q_s32 (&S0[x]); qS0_4567 = vld1q_s32 (&S0[x + 4]); qS1_0123 = vld1q_s32 (&S1[x]); qS1_4567 = vld1q_s32 (&S1[x + 4]); qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0); qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0); qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1); qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1); qT_0123 = vaddq_s32 (qT_0123, qDelta); qT_4567 = vaddq_s32 (qT_4567, qDelta); qT_0123 = vshrq_n_s32 (qT_0123, BITS); qT_4567 = vshrq_n_s32 (qT_4567, BITS); qT_0123 = vmaxq_s32 (qT_0123, qMin); qT_4567 = vmaxq_s32 (qT_4567, qMin); qT_0123 = vminq_s32 (qT_0123, qMax); qT_4567 = vminq_s32 (qT_4567, qMax); dT_0123 = vmovn_s32 (qT_0123); dT_4567 = vmovn_s32 (qT_4567); qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567)); dT_01234567 = vmovn_u16 (qT_01234567); dMask = vbsl_u8 (dMask, dT_01234567, dDst_01234567); vst1_u8 (&dst[x], dMask); } }
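/* The fixed-point constants and the residual-mask table used by the two Ne10
   resize routines above are assumed to follow the usual OpenCV-style scheme:
   the horizontal weights (alpha) and vertical weights (beta) each sum to
   INTER_RESIZE_COEF_SCALE, so the vertical pass shifts the accumulated value
   back down by twice that many bits. All names and values below are
   assumptions for illustration, not the verified Ne10 definitions: */
#define INTER_RESIZE_COEF_BITS 11
#define INTER_RESIZE_COEF_SCALE (1 << INTER_RESIZE_COEF_BITS)
#define BITS (INTER_RESIZE_COEF_BITS * 2)
#define DELTA (1 << (BITS - 1))

/* Assumed residual-mask table: entry i selects the first i+1 output bytes so
   the tail store only overwrites the pixels that are actually left. */
static const uint8_t ne10_img_vresize_linear_mask_residual_table[7][8] = {
  { 0xff, 0,    0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0,    0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0,    0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
};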