/* s32x4 mm mul */
/*
 * Integer matrix product C = A * B, computed four output rows at a time.
 *
 * Layout, as implied by the addressing below (column-major):
 *   A : Row x T,   column j starts at A + j * Row
 *   B : T   x Col, column k starts at B + k * T
 *   C : Row x Col, column k starts at C + k * Row
 *
 * Row and T are stepped by 4 with full 4-lane loads/stores and there is no
 * tail handling, so both must be multiples of 4.  32-bit products and sums
 * wrap on overflow (plain vmulq_s32 / vaddq_s32).
 *
 * Every 4-element group of C is written once, after its dot products are
 * fully accumulated; when T <= 0 the group is therefore written as zeros.
 */
void mw_neon_mm_mul_s32x4(int * A, int Row, int T, int * B, int Col, int * C)
{
    int i, k, j;

    for (i = 0; i < Row; i += 4) {
        for (k = 0; k < Col; k += 1) {
            /* Column k of B holds T entries, so its stride is T, not Row. */
            const int *b_col = B + k * T;
            int32x4_t neon_c = vmovq_n_s32(0);

            for (j = 0; j < T; j += 4) {
                /* Column j of A starts at j * Row; consecutive columns are
                 * Row elements apart. */
                const int *a_col = A + j * Row + i;

                int32x4_t neon_a0 = vld1q_s32(a_col);
                int32x4_t neon_a1 = vld1q_s32(a_col + Row);
                int32x4_t neon_a2 = vld1q_s32(a_col + 2 * Row);
                int32x4_t neon_a3 = vld1q_s32(a_col + 3 * Row);

                /* Broadcast each of the four B entries across a vector. */
                int32x4_t neon_b = vld1q_s32(b_col + j);
                int32x4_t neon_b0 = vdupq_n_s32(vgetq_lane_s32(neon_b, 0));
                int32x4_t neon_b1 = vdupq_n_s32(vgetq_lane_s32(neon_b, 1));
                int32x4_t neon_b2 = vdupq_n_s32(vgetq_lane_s32(neon_b, 2));
                int32x4_t neon_b3 = vdupq_n_s32(vgetq_lane_s32(neon_b, 3));

                neon_c = vaddq_s32(vmulq_s32(neon_a0, neon_b0), neon_c);
                neon_c = vaddq_s32(vmulq_s32(neon_a1, neon_b1), neon_c);
                neon_c = vaddq_s32(vmulq_s32(neon_a2, neon_b2), neon_c);
                neon_c = vaddq_s32(vmulq_s32(neon_a3, neon_b3), neon_c);
            }

            /* Rows i..i+3 of column k of C. */
            vst1q_s32(C + k * Row + i, neon_c);
        }
    }
}
/* Invokes vmulq_s32 once on two int32x4_t locals and assigns the product to a third local. Neither operand is given a value before the call and the result is never read afterwards. NOTE(review): this looks like a compile-only check that the intrinsic is accepted, not something meant to be executed for its values -- confirm before initialising the operands or restructuring. */
void test_vmulQs32 (void) { int32x4_t out_int32x4_t; int32x4_t arg0_int32x4_t; int32x4_t arg1_int32x4_t; out_int32x4_t = vmulq_s32 (arg0_int32x4_t, arg1_int32x4_t); }
/* s32x4 mv mul */
/*
 * Integer matrix-vector product C = A * B, four output rows at a time.
 *
 * Layout, as implied by the addressing below:
 *   A : Row x T, column-major, column k starts at A + k * Row
 *   B : vector of T entries
 *   C : vector of Row entries
 *
 * Row and T are stepped by 4 with full 4-lane loads/stores and there is no
 * tail handling, so both must be multiples of 4.  32-bit products and sums
 * wrap on overflow (plain vmulq_s32 / vaddq_s32).
 */
void mw_neon_mv_mul_s32x4(int * A, int Row, int T, int * B, int * C)
{
    int i;
    int k;

    for (i = 0; i < Row; i += 4) {
        int32x4_t neon_c = vmovq_n_s32(0);

        for (k = 0; k < T; k += 4) {
            /* Column k of A starts at k * Row; consecutive columns are
             * Row elements apart. */
            const int *a_col = A + k * Row + i;

            int32x4_t neon_a0 = vld1q_s32(a_col);
            int32x4_t neon_a1 = vld1q_s32(a_col + Row);
            int32x4_t neon_a2 = vld1q_s32(a_col + 2 * Row);
            int32x4_t neon_a3 = vld1q_s32(a_col + 3 * Row);

            /* Broadcast each of the four vector entries across a register. */
            int32x4_t neon_b = vld1q_s32(B + k);
            int32x4_t neon_b0 = vdupq_n_s32(vgetq_lane_s32(neon_b, 0));
            int32x4_t neon_b1 = vdupq_n_s32(vgetq_lane_s32(neon_b, 1));
            int32x4_t neon_b2 = vdupq_n_s32(vgetq_lane_s32(neon_b, 2));
            int32x4_t neon_b3 = vdupq_n_s32(vgetq_lane_s32(neon_b, 3));

            neon_c = vaddq_s32(vmulq_s32(neon_a0, neon_b0), neon_c);
            neon_c = vaddq_s32(vmulq_s32(neon_a1, neon_b1), neon_c);
            neon_c = vaddq_s32(vmulq_s32(neon_a2, neon_b2), neon_c);
            neon_c = vaddq_s32(vmulq_s32(neon_a3, neon_b3), neon_c);
        }

        /* Rows i..i+3 of the result. */
        vst1q_s32(C + i, neon_c);
    }
}
/*
 * One in-place pass over the eight 16-bit lanes in *q8s16 and *q9s16.
 *
 * Inputs (all read before anything is written back):
 *   *d3s16, *d4s16, *d5s16 : 16-bit multiplier vectors
 *   *q3s16                 : only the low half is used, as a multiplier
 *   *q8s16, *q9s16         : data; low/high halves are the four input rows
 * NOTE(review): by the name this is presumably the 1-D stage of a 4x4
 * inverse ADST with the multipliers holding sinpi constants -- confirm
 * against the caller.
 *
 * Products are formed in 32 bits and each result is narrowed back to 16
 * bits with a saturating, rounding right shift by 14.
 */
static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16,
                               int16x4_t *d5s16, int16x8_t *q3s16,
                               int16x8_t *q8s16, int16x8_t *q9s16) {
  /* Multipliers. */
  const int16x4_t k3 = *d3s16;
  const int16x4_t k4 = *d4s16;
  const int16x4_t k5 = *d5s16;
  const int16x4_t k6 = vget_low_s16(*q3s16);

  /* The four 4-lane input rows. */
  const int16x4_t x0 = vget_low_s16(*q8s16);
  const int16x4_t x1 = vget_high_s16(*q8s16);
  const int16x4_t x2 = vget_low_s16(*q9s16);
  const int16x4_t x3 = vget_high_s16(*q9s16);

  /* sum_a = k3*x0 + k5*x2 + k4*x3 */
  const int32x4_t sum_a =
      vaddq_s32(vaddq_s32(vmull_s16(k3, x0), vmull_s16(k5, x2)),
                vmull_s16(k4, x3));

  /* sum_b = k4*x0 - k3*x2 - k5*x3 */
  const int32x4_t sum_b =
      vsubq_s32(vsubq_s32(vmull_s16(k4, x0), vmull_s16(k3, x2)),
                vmull_s16(k5, x3));

  /* mid = k6*x1 */
  const int32x4_t mid = vmull_s16(k6, x1);

  /* scaled = (x0 + x3 - x2) * sinpi_3_9, widened to 32 bits first */
  const int32x4_t widened = vsubw_s16(vaddw_s16(vmovl_s16(x0), x3), x2);
  const int32x4_t scaled = vmulq_s32(widened, vdupq_n_s32(sinpi_3_9));

  const int32x4_t row0 = vaddq_s32(sum_a, mid);
  const int32x4_t row1 = vaddq_s32(sum_b, mid);
  const int32x4_t row3 = vsubq_s32(vaddq_s32(sum_a, sum_b), mid);

  *q8s16 = vcombine_s16(vqrshrn_n_s32(row0, 14), vqrshrn_n_s32(row1, 14));
  *q9s16 = vcombine_s16(vqrshrn_n_s32(scaled, 14), vqrshrn_n_s32(row3, 14));
}