void matMult(int16_t mat1[], int16_t mat2[], int32_t prod[matrix_size][matrix_size]) { int output_size = 2 * matrix_size; int l,k; int16x4_t data1; int32x4_t mac_output[output_size/4]; int32x4_t MAC_addvalue[output_size/4]; int16x4_t constant_value; unsigned int index_input = 0; unsigned int transfer_index = 0 ; int32_t *pres_ver; /* Allocate output */ pres_ver = malloc(output_size * output_size * sizeof(int32_t)); for(l = 0 ; l < matrix_size/4; l++) { MAC_addvalue[l] = vmovq_n_s32(0); } /* Perform the multiplication */ for(l = 0; l < matrix_size*matrix_size; l++) { constant_value = vmov_n_s16 (mat1[l]); for(k = 0 ; k < matrix_size/4 ; k++) { data1 = vld1_s16 (&mat2[index_input]); MAC4 (&MAC_addvalue[k], &constant_value, &data1,&mac_output[k]); MAC_addvalue[k] = mac_output[k]; index_input +=4; } index_input+=output_size-matrix_size; if ((l + 1) % matrix_size == 0 ) { index_input = 0; for(k = 0 ; k < matrix_size/4 ; k++) { vst1q_s32(&pres_ver[transfer_index],MAC_addvalue[k]); transfer_index +=4; } transfer_index += output_size-matrix_size; for(k = 0 ; k < matrix_size/4; k++) { MAC_addvalue[k] = vmovq_n_s32(0); } } } }
void test_vmovQ_ns32 (void) { int32x4_t out_int32x4_t; int32_t arg0_int32_t; out_int32x4_t = vmovq_n_s32 (arg0_int32_t); }
/* s32x4 mm mul */ void mw_neon_mm_mul_s32x4(int * A, int Row, int T, int * B, int Col, int * C) { int i, k, j; int32x4_t neon_b, neon_c; int32x4_t neon_a0, neon_a1, neon_a2, neon_a3; int32x4_t neon_b0, neon_b1, neon_b2, neon_b3; for (i = 0; i < Row; i+=4) { for (k = 0; k < Col; k+=1) { neon_c = vmovq_n_s32(0); for (j = 0; j < T; j+=4) { int j_T = j * T + i; int k_Row = k * Row; neon_a0 = vld1q_s32(A + j_T); j_T+=Row; neon_a1 = vld1q_s32(A + j_T); j_T+=Row; neon_a2 = vld1q_s32(A + j_T); j_T+=Row; neon_a3 = vld1q_s32(A + j_T); neon_b = vld1q_s32(B + k_Row + j); neon_b0 = vdupq_n_s32(vgetq_lane_s32(neon_b, 0)); neon_b1 = vdupq_n_s32(vgetq_lane_s32(neon_b, 1)); neon_b2 = vdupq_n_s32(vgetq_lane_s32(neon_b, 2)); neon_b3 = vdupq_n_s32(vgetq_lane_s32(neon_b, 3)); neon_c = vaddq_s32(vmulq_s32(neon_a0, neon_b0), neon_c); neon_c = vaddq_s32(vmulq_s32(neon_a1, neon_b1), neon_c); neon_c = vaddq_s32(vmulq_s32(neon_a2, neon_b2), neon_c); neon_c = vaddq_s32(vmulq_s32(neon_a3, neon_b3), neon_c); vst1q_lane_s32(C + k_Row + i, neon_c, 0); vst1q_lane_s32(C + k_Row + i + 1, neon_c, 1); vst1q_lane_s32(C + k_Row + i + 2, neon_c, 2); vst1q_lane_s32(C + k_Row + i + 3, neon_c, 3); } } } }
/* s32x4 mv mul */ void mw_neon_mv_mul_s32x4(int * A, int Row, int T, int * B, int * C) { int i = 0; int k = 0; int32x4_t neon_b, neon_c; int32x4_t neon_a0, neon_a1, neon_a2, neon_a3; int32x4_t neon_b0, neon_b1, neon_b2, neon_b3; for (i = 0; i < Row; i+=4) { neon_c = vmovq_n_s32(0); for (k = 0; k < T; k+=4) { int j = k * T + i; neon_a0 = vld1q_s32(A + j); j+=Row; neon_a1 = vld1q_s32(A + j); j+=Row; neon_a2 = vld1q_s32(A + j); j+=Row; neon_a3 = vld1q_s32(A + j); neon_b = vld1q_s32(B + k); neon_b0 = vdupq_n_s32(vgetq_lane_s32(neon_b, 0)); neon_b1 = vdupq_n_s32(vgetq_lane_s32(neon_b, 1)); neon_b2 = vdupq_n_s32(vgetq_lane_s32(neon_b, 2)); neon_b3 = vdupq_n_s32(vgetq_lane_s32(neon_b, 3)); neon_c = vaddq_s32(vmulq_s32(neon_a0, neon_b0), neon_c); neon_c = vaddq_s32(vmulq_s32(neon_a1, neon_b1), neon_c); neon_c = vaddq_s32(vmulq_s32(neon_a2, neon_b2), neon_c); neon_c = vaddq_s32(vmulq_s32(neon_a3, neon_b3), neon_c); } vst1q_s32(C + i, neon_c); } }
int32x4_t test_vmovq_n_s32(int32_t v1) { // CHECK: test_vmovq_n_s32 return vmovq_n_s32(v1); // CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}} }
static void PCorr2Q32(const int16_t *in, int32_t *logcorQ8) { int16_t scaling,n,k; int32_t ysum32,csum32, lys, lcs; int32_t oneQ8; const int16_t *x, *inptr; oneQ8 = WEBRTC_SPL_LSHIFT_W32((int32_t)1, 8); // 1.00 in Q8 x = in + PITCH_MAX_LAG/2 + 2; scaling = WebRtcSpl_GetScalingSquare ((int16_t *) in, PITCH_CORR_LEN2, PITCH_CORR_LEN2); ysum32 = 1; csum32 = 0; x = in + PITCH_MAX_LAG/2 + 2; for (n = 0; n < PITCH_CORR_LEN2; n++) { ysum32 += WEBRTC_SPL_MUL_16_16_RSFT( (int16_t) in[n],(int16_t) in[n], scaling); // Q0 csum32 += WEBRTC_SPL_MUL_16_16_RSFT((int16_t) x[n],(int16_t) in[n], scaling); // Q0 } logcorQ8 += PITCH_LAG_SPAN2 - 1; lys=Log2Q8((uint32_t) ysum32); // Q8 lys=WEBRTC_SPL_RSHIFT_W32(lys, 1); //sqrt(ysum); if (csum32>0) { lcs=Log2Q8((uint32_t) csum32); // 2log(csum) in Q8 if (lcs>(lys + oneQ8) ){ // csum/sqrt(ysum) > 2 in Q8 *logcorQ8 = lcs - lys; // log2(csum/sqrt(ysum)) } else { *logcorQ8 = oneQ8; // 1.00 } } else { *logcorQ8 = 0; } for (k = 1; k < PITCH_LAG_SPAN2; k++) { inptr = &in[k]; ysum32 -= WEBRTC_SPL_MUL_16_16_RSFT( (int16_t) in[k-1],(int16_t) in[k-1], scaling); ysum32 += WEBRTC_SPL_MUL_16_16_RSFT( (int16_t) in[PITCH_CORR_LEN2 + k - 1],(int16_t) in[PITCH_CORR_LEN2 + k - 1], scaling); #ifdef WEBRTC_ARCH_ARM_NEON { int32_t vbuff[4]; int32x4_t int_32x4_sum = vmovq_n_s32(0); // Can't shift a Neon register to right with a non-constant shift value. int32x4_t int_32x4_scale = vdupq_n_s32(-scaling); // Assert a codition used in loop unrolling at compile-time. COMPILE_ASSERT(PITCH_CORR_LEN2 %4 == 0); for (n = 0; n < PITCH_CORR_LEN2; n += 4) { int16x4_t int_16x4_x = vld1_s16(&x[n]); int16x4_t int_16x4_in = vld1_s16(&inptr[n]); int32x4_t int_32x4 = vmull_s16(int_16x4_x, int_16x4_in); int_32x4 = vshlq_s32(int_32x4, int_32x4_scale); int_32x4_sum = vaddq_s32(int_32x4_sum, int_32x4); } // Use vector store to avoid long stall from data trasferring // from vector to general register. vst1q_s32(vbuff, int_32x4_sum); csum32 = vbuff[0] + vbuff[1]; csum32 += vbuff[2]; csum32 += vbuff[3]; } #else csum32 = 0; if(scaling == 0) { for (n = 0; n < PITCH_CORR_LEN2; n++) { csum32 += x[n] * inptr[n]; } } else { for (n = 0; n < PITCH_CORR_LEN2; n++) { csum32 += (x[n] * inptr[n]) >> scaling; } } #endif logcorQ8--; lys=Log2Q8((uint32_t)ysum32); // Q8 lys=WEBRTC_SPL_RSHIFT_W32(lys, 1); //sqrt(ysum); if (csum32>0) { lcs=Log2Q8((uint32_t) csum32); // 2log(csum) in Q8 if (lcs>(lys + oneQ8) ){ // csum/sqrt(ysum) > 2 *logcorQ8 = lcs - lys; // log2(csum/sqrt(ysum)) } else { *logcorQ8 = oneQ8; // 1.00 } } else { *logcorQ8 = 0; } } }
f64 dotProduct(const Size2D &_size, const s8 * src0Base, ptrdiff_t src0Stride, const s8 * src1Base, ptrdiff_t src1Stride) { internal::assertSupportedConfiguration(); #ifdef CAROTENE_NEON Size2D size(_size); if (src0Stride == src1Stride && src0Stride == (ptrdiff_t)(size.width)) { size.width *= size.height; size.height = 1; } // It is possible to accumulate up to 131071 schar multiplication results in sint32 without overflow // We process 16 elements and accumulate two new elements per step. So we could handle 131071/2*16 elements #define DOT_INT_BLOCKSIZE 131070*8 f64 result = 0.0; for (size_t row = 0; row < size.height; ++row) { const s8 * src0 = internal::getRowPtr(src0Base, src0Stride, row); const s8 * src1 = internal::getRowPtr(src1Base, src1Stride, row); size_t i = 0; int64x2_t ws = vmovq_n_s64(0); while(i + 16 <= size.width) { size_t lim = std::min(i + DOT_UINT_BLOCKSIZE, size.width) - 16; int32x4_t s1 = vmovq_n_s32(0); int32x4_t s2 = vmovq_n_s32(0); for (; i <= lim; i += 16) { internal::prefetch(src0 + i); internal::prefetch(src1 + i); int8x16_t vs1 = vld1q_s8(src0 + i); int8x16_t vs2 = vld1q_s8(src1 + i); int16x8_t vdot1 = vmull_s8(vget_low_s8(vs1), vget_low_s8(vs2)); int16x8_t vdot2 = vmull_s8(vget_high_s8(vs1), vget_high_s8(vs2)); s1 = vpadalq_s16(s1, vdot1); s2 = vpadalq_s16(s2, vdot2); } ws = vpadalq_s32(ws, s1); ws = vpadalq_s32(ws, s2); } if(i + 8 <= size.width) { int8x8_t vs1 = vld1_s8(src0 + i); int8x8_t vs2 = vld1_s8(src1 + i); ws = vpadalq_s32(ws, vpaddlq_s16(vmull_s8(vs1, vs2))); i += 8; } result += (double)vget_lane_s64(vadd_s64(vget_low_s64(ws), vget_high_s64(ws)), 0); for (; i < size.width; ++i) result += s32(src0[i]) * s32(src1[i]); } return result; #else (void)_size; (void)src0Base; (void)src0Stride; (void)src1Base; (void)src1Stride; return 0; #endif }