/* Multiply two matrix_size x matrix_size int16_t matrices, producing
   32-bit results in prod. MAC4 is a helper defined outside this
   listing that multiply-accumulates four lanes at a time (a sketch
   follows the function). */
void matMult(int16_t mat1[], int16_t mat2[], int32_t prod[matrix_size][matrix_size])
{
	int l, k;
	int16x4_t data1;
	int32x4_t mac_output[matrix_size/4];
	int32x4_t MAC_addvalue[matrix_size/4];
	int16x4_t constant_value;
	unsigned int index_input = 0;
	unsigned int transfer_index = 0;
	int32_t *pres_ver = &prod[0][0];	/* write results directly into prod */

	/* Clear the row accumulators. */
	for (l = 0; l < matrix_size/4; l++)
	{
	    MAC_addvalue[l] = vmovq_n_s32(0);
	}

	/* Perform the multiplication: each element of mat1 scales one full
	   row of mat2 and is accumulated into the current output row. */
	for (l = 0; l < matrix_size*matrix_size; l++)
	{
	    constant_value = vmov_n_s16(mat1[l]);
	    for (k = 0; k < matrix_size/4; k++)
	    {
	        data1 = vld1_s16(&mat2[index_input]);
	        MAC4(&MAC_addvalue[k], &constant_value, &data1, &mac_output[k]);
	        MAC_addvalue[k] = mac_output[k];
	        index_input += 4;
	    }

	    /* A full row of mat1 has been consumed: flush the finished
	       output row and reset the accumulators. */
	    if ((l + 1) % matrix_size == 0)
	    {
	        index_input = 0;

	        for (k = 0; k < matrix_size/4; k++)
	        {
	            vst1q_s32(&pres_ver[transfer_index], MAC_addvalue[k]);
	            transfer_index += 4;
	        }

	        for (k = 0; k < matrix_size/4; k++)
	        {
	            MAC_addvalue[k] = vmovq_n_s32(0);
	        }
	    }
	}
}
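
MAC4 is not defined in this listing; the following is a minimal sketch of what it could look like, assuming it performs a widening 16x16-to-32-bit multiply-accumulate across four lanes. The pointer-based signature is inferred from the call site above, not taken from a published source.

/* Hypothetical MAC4: *out = *acc + (*constant * *data), widened to
   32 bits, four lanes at a time. */
static inline void MAC4(const int32x4_t *acc, const int16x4_t *constant,
                        const int16x4_t *data, int32x4_t *out)
{
    *out = vmlal_s16(*acc, *constant, *data);
}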
Example 2
/* Compile-only intrinsic test: the argument is deliberately left
   uninitialized, since only code generation for vmovq_n_s32 is being
   exercised. */
void test_vmovQ_ns32 (void)
{
  int32x4_t out_int32x4_t;
  int32_t arg0_int32_t;

  out_int32x4_t = vmovq_n_s32 (arg0_int32_t);
}
Example 3
/* s32x4 mm mul: C = A * B for column-major matrices. A is Row x T,
   B is T x Col, C is Row x Col; Row and T are assumed to be
   multiples of 4. */
void mw_neon_mm_mul_s32x4(int * A, int Row, int T, int * B, int Col, int * C)
{
	int i, k, j;

	int32x4_t neon_b, neon_c;
	int32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	int32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i += 4)
	{
		for (k = 0; k < Col; k += 1)
		{
			neon_c = vmovq_n_s32(0);

			for (j = 0; j < T; j += 4)
			{
				/* Column-major: element (i, j) of A lives at A[j*Row + i]. */
				int j_Row = j * Row + i;

				neon_a0 = vld1q_s32(A + j_Row);
				j_Row += Row;
				neon_a1 = vld1q_s32(A + j_Row);
				j_Row += Row;
				neon_a2 = vld1q_s32(A + j_Row);
				j_Row += Row;
				neon_a3 = vld1q_s32(A + j_Row);

				/* Element (j, k) of B lives at B[k*T + j]; broadcast each
				   of the four loaded B values across its own vector. */
				neon_b = vld1q_s32(B + k * T + j);
				neon_b0 = vdupq_n_s32(vgetq_lane_s32(neon_b, 0));
				neon_b1 = vdupq_n_s32(vgetq_lane_s32(neon_b, 1));
				neon_b2 = vdupq_n_s32(vgetq_lane_s32(neon_b, 2));
				neon_b3 = vdupq_n_s32(vgetq_lane_s32(neon_b, 3));

				neon_c = vaddq_s32(vmulq_s32(neon_a0, neon_b0), neon_c);
				neon_c = vaddq_s32(vmulq_s32(neon_a1, neon_b1), neon_c);
				neon_c = vaddq_s32(vmulq_s32(neon_a2, neon_b2), neon_c);
				neon_c = vaddq_s32(vmulq_s32(neon_a3, neon_b3), neon_c);
			}

			/* Store the finished four-row segment of column k of C once,
			   after the reduction over T completes. */
			vst1q_s32(C + k * Row + i, neon_c);
		}
	}
}
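
A minimal usage sketch under the assumptions noted above (column-major storage, dimensions that are multiples of 4); the identity-matrix check is illustrative, not taken from the original source:

#include <stdio.h>

void mw_neon_mm_mul_s32x4(int * A, int Row, int T, int * B, int Col, int * C);

int main(void)
{
    /* Column-major 4x4: A is the identity, so C must come back equal to B. */
    int A[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
    int B[16] = {1,2,3,4, 5,6,7,8, 9,10,11,12, 13,14,15,16};
    int C[16] = {0};

    mw_neon_mm_mul_s32x4(A, 4, 4, B, 4, C);

    for (int n = 0; n < 16; n++)
        printf("%d%c", C[n], (n % 4 == 3) ? '\n' : ' ');
    return 0;
}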
Example 4
/* s32x4 mv mul: C = A * B for a column-major Row x T matrix A and a
   vector B of length T; Row and T are assumed to be multiples of 4. */
void mw_neon_mv_mul_s32x4(int * A, int Row, int T, int * B, int * C)
{
	int i = 0;
	int k = 0;

	int32x4_t neon_b, neon_c;
	int32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	int32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i += 4)
	{
		neon_c = vmovq_n_s32(0);

		for (k = 0; k < T; k += 4)
		{
			/* Column-major: element (i, k) of A lives at A[k*Row + i]. */
			int j = k * Row + i;

			neon_a0 = vld1q_s32(A + j);
			j += Row;
			neon_a1 = vld1q_s32(A + j);
			j += Row;
			neon_a2 = vld1q_s32(A + j);
			j += Row;
			neon_a3 = vld1q_s32(A + j);

			/* Broadcast four consecutive elements of the vector B. */
			neon_b = vld1q_s32(B + k);
			neon_b0 = vdupq_n_s32(vgetq_lane_s32(neon_b, 0));
			neon_b1 = vdupq_n_s32(vgetq_lane_s32(neon_b, 1));
			neon_b2 = vdupq_n_s32(vgetq_lane_s32(neon_b, 2));
			neon_b3 = vdupq_n_s32(vgetq_lane_s32(neon_b, 3));

			neon_c = vaddq_s32(vmulq_s32(neon_a0, neon_b0), neon_c);
			neon_c = vaddq_s32(vmulq_s32(neon_a1, neon_b1), neon_c);
			neon_c = vaddq_s32(vmulq_s32(neon_a2, neon_b2), neon_c);
			neon_c = vaddq_s32(vmulq_s32(neon_a3, neon_b3), neon_c);
		}

		vst1q_s32(C + i, neon_c);
	}
}
Example 5
int32x4_t test_vmovq_n_s32(int32_t v1) {
  // CHECK: test_vmovq_n_s32
  return vmovq_n_s32(v1);
  // CHECK: dup {{v[0-9]+}}.4s, {{w[0-9]+}}
}
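
As the CHECK line above expects, vmovq_n_s32 lowers to a single dup instruction: it broadcasts one 32-bit scalar into all four lanes and is interchangeable with vdupq_n_s32. A standalone sketch of the per-lane semantics:

#include <arm_neon.h>
#include <assert.h>

int main(void)
{
    int32x4_t v = vmovq_n_s32(42);     /* all four lanes become 42 */
    assert(vgetq_lane_s32(v, 0) == 42);
    assert(vgetq_lane_s32(v, 3) == 42);
    return 0;
}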
Example 6
static void PCorr2Q32(const int16_t *in, int32_t *logcorQ8)
{
  int16_t scaling,n,k;
  int32_t ysum32,csum32, lys, lcs;
  int32_t oneQ8;


  const int16_t *x, *inptr;

  oneQ8 = WEBRTC_SPL_LSHIFT_W32((int32_t)1, 8);  // 1.00 in Q8

  scaling = WebRtcSpl_GetScalingSquare((int16_t *) in, PITCH_CORR_LEN2, PITCH_CORR_LEN2);
  ysum32 = 1;
  csum32 = 0;
  x = in + PITCH_MAX_LAG/2 + 2;
  for (n = 0; n < PITCH_CORR_LEN2; n++) {
    ysum32 += WEBRTC_SPL_MUL_16_16_RSFT( (int16_t) in[n],(int16_t) in[n], scaling);  // Q0
    csum32 += WEBRTC_SPL_MUL_16_16_RSFT((int16_t) x[n],(int16_t) in[n], scaling); // Q0
  }

  logcorQ8 += PITCH_LAG_SPAN2 - 1;

  lys=Log2Q8((uint32_t) ysum32); // Q8
  lys=WEBRTC_SPL_RSHIFT_W32(lys, 1); //sqrt(ysum);

  if (csum32>0) {

    lcs=Log2Q8((uint32_t) csum32);   // 2log(csum) in Q8

    if (lcs>(lys + oneQ8) ){ // csum/sqrt(ysum) > 2 in Q8
      *logcorQ8 = lcs - lys;  // log2(csum/sqrt(ysum))
    } else {
      *logcorQ8 = oneQ8;  // 1.00
    }

  } else {
    *logcorQ8 = 0;
  }


  for (k = 1; k < PITCH_LAG_SPAN2; k++) {
    inptr = &in[k];
    ysum32 -= WEBRTC_SPL_MUL_16_16_RSFT( (int16_t) in[k-1],(int16_t) in[k-1], scaling);
    ysum32 += WEBRTC_SPL_MUL_16_16_RSFT( (int16_t) in[PITCH_CORR_LEN2 + k - 1],(int16_t) in[PITCH_CORR_LEN2 + k - 1], scaling);

#ifdef WEBRTC_ARCH_ARM_NEON
    {
      int32_t vbuff[4];
      int32x4_t int_32x4_sum = vmovq_n_s32(0);
      // The NEON right-shift intrinsic takes only a compile-time constant
      // shift, so shift left by a negative (run-time) amount instead.
      int32x4_t int_32x4_scale = vdupq_n_s32(-scaling);
      // Assert a condition relied on by the loop unrolling, at compile time.
      COMPILE_ASSERT(PITCH_CORR_LEN2 % 4 == 0);

      for (n = 0; n < PITCH_CORR_LEN2; n += 4) {
        int16x4_t int_16x4_x = vld1_s16(&x[n]);
        int16x4_t int_16x4_in = vld1_s16(&inptr[n]);
        int32x4_t int_32x4 = vmull_s16(int_16x4_x, int_16x4_in);
        int_32x4 = vshlq_s32(int_32x4, int_32x4_scale);
        int_32x4_sum = vaddq_s32(int_32x4_sum, int_32x4);
      }

      // Use a vector store to avoid the long stall from transferring
      // data from a NEON register to a general-purpose register.
      vst1q_s32(vbuff, int_32x4_sum);
      csum32 = vbuff[0] + vbuff[1];
      csum32 += vbuff[2];
      csum32 += vbuff[3];
    }
#else
    csum32 = 0;
    if(scaling == 0) {
      for (n = 0; n < PITCH_CORR_LEN2; n++) {
        csum32 += x[n] * inptr[n];
      }
    } else {
      for (n = 0; n < PITCH_CORR_LEN2; n++) {
        csum32 += (x[n] * inptr[n]) >> scaling;
      }
    }
#endif

    logcorQ8--;

    lys=Log2Q8((uint32_t)ysum32); // Q8
    lys=WEBRTC_SPL_RSHIFT_W32(lys, 1); //sqrt(ysum);

    if (csum32>0) {

      lcs=Log2Q8((uint32_t) csum32);   // 2log(csum) in Q8

      if (lcs>(lys + oneQ8) ){ // csum/sqrt(ysum) > 2
        *logcorQ8 = lcs - lys;  // log2(csum/sqrt(ysum))
      } else {
        *logcorQ8 = oneQ8;  // 1.00
      }

    } else {
      *logcorQ8 = 0;
    }
  }
}
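
The NEON block in PCorr2Q32 depends on vshlq_s32 shifting right when the per-lane count is negative, which is why -scaling is broadcast into int_32x4_scale. A standalone sketch of that behavior:

#include <arm_neon.h>
#include <assert.h>

int main(void)
{
    int scaling = 3;
    int32x4_t v = vmovq_n_s32(64);
    /* vshlq shifts left by the signed per-lane count, so a negative
       count performs an arithmetic shift right: 64 >> 3 == 8. */
    int32x4_t shifted = vshlq_s32(v, vdupq_n_s32(-scaling));
    assert(vgetq_lane_s32(shifted, 0) == 8);
    return 0;
}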
Example 7
f64 dotProduct(const Size2D &_size,
               const s8 * src0Base, ptrdiff_t src0Stride,
               const s8 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width))
    {
        size.width *= size.height;
        size.height = 1;
    }

// Up to 131071 schar*schar products can be accumulated in an int32 without
// overflow. We process 16 elements per step, and each 32-bit lane absorbs two
// new products per step, so a block of 131070/2*16 elements is safe.
#define DOT_INT_BLOCKSIZE (131070*8)
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const s8 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const s8 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        int64x2_t ws = vmovq_n_s64(0);

        while(i + 16 <= size.width)
        {
            size_t lim = std::min(i + DOT_INT_BLOCKSIZE, size.width) - 16;

            int32x4_t s1 = vmovq_n_s32(0);
            int32x4_t s2 = vmovq_n_s32(0);

            for (; i <= lim; i += 16)
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);

                int8x16_t vs1 = vld1q_s8(src0 + i);
                int8x16_t vs2 = vld1q_s8(src1 + i);

                int16x8_t vdot1 = vmull_s8(vget_low_s8(vs1), vget_low_s8(vs2));
                int16x8_t vdot2 = vmull_s8(vget_high_s8(vs1), vget_high_s8(vs2));

                s1 = vpadalq_s16(s1, vdot1);
                s2 = vpadalq_s16(s2, vdot2);
            }

            ws = vpadalq_s32(ws, s1);
            ws = vpadalq_s32(ws, s2);
        }

        if(i + 8 <= size.width)
        {
            int8x8_t vs1 = vld1_s8(src0 + i);
            int8x8_t vs2 = vld1_s8(src1 + i);

            ws = vpadalq_s32(ws, vpaddlq_s16(vmull_s8(vs1, vs2)));
            i += 8;
        }

        result += (double)vget_lane_s64(vadd_s64(vget_low_s64(ws), vget_high_s64(ws)), 0);

        for (; i < size.width; ++i)
            result += s32(src0[i]) * s32(src1[i]);
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
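
dotProduct avoids 32-bit overflow by chaining widening pairwise accumulations: vpadalq_s16 folds pairs of 16-bit products into 32-bit lanes, vpadalq_s32 folds those into the 64-bit running sum, and DOT_INT_BLOCKSIZE caps how much the 32-bit stage absorbs between drains. A standalone sketch of the s16-to-s32 step:

#include <arm_neon.h>
#include <assert.h>

int main(void)
{
    /* Eight 16-bit values; adjacent pairs are summed into four
       32-bit lanes and added to the accumulator. */
    int16_t vals[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    int32x4_t acc = vmovq_n_s32(100);
    acc = vpadalq_s16(acc, vld1q_s16(vals));
    assert(vgetq_lane_s32(acc, 0) == 100 + 1 + 2);   /* 103 */
    assert(vgetq_lane_s32(acc, 3) == 100 + 7 + 8);   /* 115 */
    return 0;
}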