/* u16x8 mv mul */ void mw_neon_mv_mul_u16x8(unsigned short * A, int Row, int T, unsigned short * B, unsigned short * C) { int i = 0; int k = 0; uint16x8_t neon_b, neon_c; uint16x8_t neon_a0, neon_a1, neon_a2, neon_a3, neon_a4, neon_a5, neon_a6, neon_a7; uint16x8_t neon_b0, neon_b1, neon_b2, neon_b3, neon_b4, neon_b5, neon_b6, neon_b7; for (i = 0; i < Row; i+=8) { neon_c = vmovq_n_u16(0); for (k = 0; k < T; k+=8) { int j = k * T + i; neon_a0 = vld1q_u16(A + j); j+=Row; neon_a1 = vld1q_u16(A + j); j+=Row; neon_a2 = vld1q_u16(A + j); j+=Row; neon_a3 = vld1q_u16(A + j); j+=Row; neon_a4 = vld1q_u16(A + j); j+=Row; neon_a5 = vld1q_u16(A + j); j+=Row; neon_a6 = vld1q_u16(A + j); j+=Row; neon_a7 = vld1q_u16(A + j); neon_b = vld1q_u16(B + k); neon_b0 = vdupq_n_u16(vgetq_lane_u16(neon_b, 0)); neon_b1 = vdupq_n_u16(vgetq_lane_u16(neon_b, 1)); neon_b2 = vdupq_n_u16(vgetq_lane_u16(neon_b, 2)); neon_b3 = vdupq_n_u16(vgetq_lane_u16(neon_b, 3)); neon_b4 = vdupq_n_u16(vgetq_lane_u16(neon_b, 4)); neon_b5 = vdupq_n_u16(vgetq_lane_u16(neon_b, 5)); neon_b6 = vdupq_n_u16(vgetq_lane_u16(neon_b, 6)); neon_b7 = vdupq_n_u16(vgetq_lane_u16(neon_b, 7)); neon_c = vaddq_u16(vmulq_u16(neon_a0, neon_b0), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a1, neon_b1), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a2, neon_b2), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a3, neon_b3), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a4, neon_b4), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a5, neon_b5), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a6, neon_b6), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a7, neon_b7), neon_c); } vst1q_u16(C + i, neon_c); } }
template <> SIMD_INLINE uint16x8_t ReduceColTail<false>(const uint8_t *src) { const uint8x8x2_t t01 = vld2_u8(src - 1); const uint8x8x2_t t23 = Deinterleave(LoadAfterLast<1>(LoadAfterLast<1>(vld1q_u8(src - 1)))); return vaddq_u16(vaddl_u8(t01.val[0], t23.val[1]), vmulq_u16(vaddl_u8(t01.val[1], t23.val[0]), K16_0003)); }
SIMD_INLINE uint16x8_t ReduceColBody(const uint8_t *src) { const uint8x8x2_t t01 = vld2_u8(src - 1); const uint8x8x2_t t23 = vld2_u8(src + 1); return vaddq_u16(vaddl_u8(t01.val[0], t23.val[1]), vmulq_u16(vaddl_u8(t01.val[1], t23.val[0]), K16_0003)); }
SIMD_INLINE uint16x8_t ReduceColNose(const uint8_t * src) { const uint8x8x2_t t01 = Deinterleave(LoadBeforeFirst<1>(vld1q_u8(src))); const uint8x8x2_t t23 = vld2_u8(src + 1); return vaddq_u16(vaddl_u8(t01.val[0], t23.val[1]), vmulq_u16(vaddl_u8(t01.val[1], t23.val[0]), K16_0003)); }
/* u16x8 mm mul */ void mw_neon_mm_mul_u16x8(unsigned short * A, int Row, int T, unsigned short * B, int Col, unsigned short * C) { int i, k, j; uint16x8_t neon_b, neon_c; uint16x8_t neon_a0, neon_a1, neon_a2, neon_a3, neon_a4, neon_a5, neon_a6, neon_a7; uint16x8_t neon_b0, neon_b1, neon_b2, neon_b3, neon_b4, neon_b5, neon_b6, neon_b7; for (i = 0; i < Row; i+=8) { for (k = 0; k < Col; k+=1) { neon_c = vmovq_n_u16(0); for (j = 0; j < T; j+=8) { int j_T = j * T + i; int k_Row = k * Row; neon_a0 = vld1q_u16(A + j_T); j_T+=Row; neon_a1 = vld1q_u16(A + j_T); j_T+=Row; neon_a2 = vld1q_u16(A + j_T); j_T+=Row; neon_a3 = vld1q_u16(A + j_T); j_T+=Row; neon_a4 = vld1q_u16(A + j_T); j_T+=Row; neon_a5 = vld1q_u16(A + j_T); j_T+=Row; neon_a6 = vld1q_u16(A + j_T); j_T+=Row; neon_a7 = vld1q_u16(A + j_T); neon_b = vld1q_u16(B + k_Row + j); neon_b0 = vdupq_n_u16(vgetq_lane_u16(neon_b, 0)); neon_b1 = vdupq_n_u16(vgetq_lane_u16(neon_b, 1)); neon_b2 = vdupq_n_u16(vgetq_lane_u16(neon_b, 2)); neon_b3 = vdupq_n_u16(vgetq_lane_u16(neon_b, 3)); neon_b4 = vdupq_n_u16(vgetq_lane_u16(neon_b, 4)); neon_b5 = vdupq_n_u16(vgetq_lane_u16(neon_b, 5)); neon_b6 = vdupq_n_u16(vgetq_lane_u16(neon_b, 6)); neon_b7 = vdupq_n_u16(vgetq_lane_u16(neon_b, 7)); neon_c = vaddq_u16(vmulq_u16(neon_a0, neon_b0), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a1, neon_b1), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a2, neon_b2), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a3, neon_b3), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a4, neon_b4), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a5, neon_b5), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a6, neon_b6), neon_c); neon_c = vaddq_u16(vmulq_u16(neon_a7, neon_b7), neon_c); vst1q_lane_u16(C + k_Row + i, neon_c, 0); vst1q_lane_u16(C + k_Row + i + 1, neon_c, 1); vst1q_lane_u16(C + k_Row + i + 2, neon_c, 2); vst1q_lane_u16(C + k_Row + i + 3, neon_c, 3); vst1q_lane_u16(C + k_Row + i + 4, neon_c, 4); vst1q_lane_u16(C + k_Row + i + 5, neon_c, 5); vst1q_lane_u16(C + k_Row + i + 6, neon_c, 6); vst1q_lane_u16(C + k_Row + i + 7, neon_c, 7); } } } }
int neon_new(DATA32* _p0, DATA32* _p1, DATA32* _p2, DATA32* _p3, DATA32* _ax, DATA32 _ay, DATA32* result, int len) { int ay = _ay; int i; DATA32* pbuf = result; uint16x4_t ay_16x4; uint16x4_t p0_16x4; uint16x4_t p2_16x4; uint16x8_t ax_16x8; uint16x8_t p0_p2_16x8; uint16x8_t p1_p3_16x8; uint16x8_t x255_16x8; uint32x2_t p0_p2_32x2; uint32x2_t p1_p3_32x2; uint32x2_t res_32x2; uint8x8_t p0_p2_8x8; uint8x8_t p1_p3_8x8; uint8x8_t p2_8x8; uint16x4_t temp_16x4; ay_16x4 = vdup_n_u16(ay); x255_16x8 = vdupq_n_u16(0xff); for(i = 0; i < len; i++) { DATA32 p0 = *_p0++; DATA32 p1 = *_p1++; DATA32 p2 = *_p2++; DATA32 p3 = *_p3++; int ax = *_ax++; if (p0 | p1 | p2 | p3) { ax_16x8 = vdupq_n_u16(ax); p0_p2_32x2 = vset_lane_u32(p0, p0_p2_32x2, 0); p0_p2_32x2 = vset_lane_u32(p2, p0_p2_32x2, 1); p1_p3_32x2 = vset_lane_u32(p1, p1_p3_32x2, 0); p1_p3_32x2 = vset_lane_u32(p3, p1_p3_32x2, 1); p0_p2_8x8 = vreinterpret_u8_u32(p0_p2_32x2); p1_p3_8x8 = vreinterpret_u8_u32(p1_p3_32x2); p1_p3_16x8 = vmovl_u8(p1_p3_8x8); p0_p2_16x8 = vmovl_u8(p0_p2_8x8); p1_p3_16x8 = vsubq_u16(p1_p3_16x8, p0_p2_16x8); p1_p3_16x8 = vmulq_u16(p1_p3_16x8, ax_16x8); p1_p3_16x8 = vshrq_n_u16(p1_p3_16x8, 8); p1_p3_16x8 = vaddq_u16(p1_p3_16x8, p0_p2_16x8); p1_p3_16x8 = vandq_u16(p1_p3_16x8, x255_16x8); p0_16x4 = vget_low_u16(p1_p3_16x8); p2_16x4 = vget_high_u16(p1_p3_16x8); p2_16x4 = vsub_u16(p2_16x4, p0_16x4); p2_16x4 = vmul_u16(p2_16x4, ay_16x4); p2_16x4 = vshr_n_u16(p2_16x4, 8); p2_16x4 = vadd_u16(p2_16x4, p0_16x4); p1_p3_16x8 = vcombine_u16(temp_16x4, p2_16x4); p2_8x8 = vmovn_u16(p1_p3_16x8); res_32x2 = vreinterpret_u32_u8(p2_8x8); vst1_lane_u32(pbuf++, res_32x2, 1); } else *pbuf++ = p0; } return 0; }