void AudioBufferAddWithScale_NEON(const float* aInput, float aScale,
                                  float* aOutput, uint32_t aSize) {
  ASSERT_ALIGNED(aInput);
  ASSERT_ALIGNED(aOutput);

  float32x4_t vin0, vin1, vin2, vin3;
  float32x4_t vout0, vout1, vout2, vout3;
  float32x4_t vscale = vmovq_n_f32(aScale);

  uint32_t dif = aSize % 16;
  aSize -= dif;
  unsigned i = 0;
  for (; i < aSize; i += 16) {
    vin0 = vld1q_f32(ADDRESS_OF(aInput, i));
    vin1 = vld1q_f32(ADDRESS_OF(aInput, i + 4));
    vin2 = vld1q_f32(ADDRESS_OF(aInput, i + 8));
    vin3 = vld1q_f32(ADDRESS_OF(aInput, i + 12));

    vout0 = vld1q_f32(ADDRESS_OF(aOutput, i));
    vout1 = vld1q_f32(ADDRESS_OF(aOutput, i + 4));
    vout2 = vld1q_f32(ADDRESS_OF(aOutput, i + 8));
    vout3 = vld1q_f32(ADDRESS_OF(aOutput, i + 12));

    vout0 = vmlaq_f32(vout0, vin0, vscale);
    vout1 = vmlaq_f32(vout1, vin1, vscale);
    vout2 = vmlaq_f32(vout2, vin2, vscale);
    vout3 = vmlaq_f32(vout3, vin3, vscale);

    vst1q_f32(ADDRESS_OF(aOutput, i), vout0);
    vst1q_f32(ADDRESS_OF(aOutput, i + 4), vout1);
    vst1q_f32(ADDRESS_OF(aOutput, i + 8), vout2);
    vst1q_f32(ADDRESS_OF(aOutput, i + 12), vout3);
  }

  for (unsigned j = 0; j < dif; ++i, ++j) {
    aOutput[i] += aInput[i] * aScale;
  }
}
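A plain scalar version is handy as a reference when validating the vector path; this minimal sketch (the _Scalar name is ours, not from the original source) computes the same result without the alignment requirements:

// Hypothetical scalar reference for the NEON routine above.
static void AudioBufferAddWithScale_Scalar(const float* aInput, float aScale,
                                           float* aOutput, uint32_t aSize) {
  for (uint32_t i = 0; i < aSize; ++i) {
    aOutput[i] += aInput[i] * aScale;
  }
}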
void
AudioBufferInPlaceScale_NEON(float* aBlock,
                             uint32_t aChannelCount,
                             float aScale,
                             uint32_t aSize)
{
  ASSERT_ALIGNED(aBlock);

  float32x4_t vin0, vin1, vin2, vin3;
  float32x4_t vout0, vout1, vout2, vout3;
  float32x4_t vscale = vmovq_n_f32(aScale);

  uint32_t totalSize = aSize * aChannelCount;
  uint32_t dif = totalSize % 16;
  totalSize -= dif;
  uint32_t i = 0;
  for (; i < totalSize; i+=16) {
    vin0 = vld1q_f32(ADDRESS_OF(aBlock, i));
    vin1 = vld1q_f32(ADDRESS_OF(aBlock, i+4));
    vin2 = vld1q_f32(ADDRESS_OF(aBlock, i+8));
    vin3 = vld1q_f32(ADDRESS_OF(aBlock, i+12));

    vout0 = vmulq_f32(vin0, vscale);
    vout1 = vmulq_f32(vin1, vscale);
    vout2 = vmulq_f32(vin2, vscale);
    vout3 = vmulq_f32(vin3, vscale);

    vst1q_f32(ADDRESS_OF(aBlock, i), vout0);
    vst1q_f32(ADDRESS_OF(aBlock, i+4), vout1);
    vst1q_f32(ADDRESS_OF(aBlock, i+8), vout2);
    vst1q_f32(ADDRESS_OF(aBlock, i+12), vout3);
  }

  for (unsigned j = 0; j < dif; ++i, ++j) {
    aBlock[i] *= aScale;
  }
}
void AudioBufferInPlaceScale_NEON(float* aBlock, float* aScale,
                                  uint32_t aSize) {
  ASSERT_ALIGNED(aBlock);

  float32x4_t vin0, vin1, vin2, vin3;
  float32x4_t vout0, vout1, vout2, vout3;
  float32x4_t vscale0, vscale1, vscale2, vscale3;

  uint32_t dif = aSize % 16;
  uint32_t vectorSize = aSize - dif;
  uint32_t i = 0;
  for (; i < vectorSize; i += 16) {
    vin0 = vld1q_f32(ADDRESS_OF(aBlock, i));
    vin1 = vld1q_f32(ADDRESS_OF(aBlock, i + 4));
    vin2 = vld1q_f32(ADDRESS_OF(aBlock, i + 8));
    vin3 = vld1q_f32(ADDRESS_OF(aBlock, i + 12));

    vscale0 = vld1q_f32(ADDRESS_OF(aScale, i));
    vscale1 = vld1q_f32(ADDRESS_OF(aScale, i + 4));
    vscale2 = vld1q_f32(ADDRESS_OF(aScale, i + 8));
    vscale3 = vld1q_f32(ADDRESS_OF(aScale, i + 12));

    vout0 = vmulq_f32(vin0, vscale0);
    vout1 = vmulq_f32(vin1, vscale1);
    vout2 = vmulq_f32(vin2, vscale2);
    vout3 = vmulq_f32(vin3, vscale3);

    vst1q_f32(ADDRESS_OF(aBlock, i), vout0);
    vst1q_f32(ADDRESS_OF(aBlock, i + 4), vout1);
    vst1q_f32(ADDRESS_OF(aBlock, i + 8), vout2);
    vst1q_f32(ADDRESS_OF(aBlock, i + 12), vout3);
  }

  for (unsigned j = 0; j < dif; ++i, ++j) {
    aBlock[i] *= aScale[i];
  }
}
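A quick self-check for the scalar-gain overload; a minimal sketch assuming ADDRESS_OF(p, i) expands to (p) + (i) and that a 16-byte-aligned buffer satisfies ASSERT_ALIGNED. A size of 19 exercises both the 16-wide vector loop and the scalar tail.

#include <assert.h>
#include <stdint.h>

static void CheckInPlaceScale(void) {
  float block[19] __attribute__((aligned(16)));  // GCC/Clang alignment syntax
  for (uint32_t i = 0; i < 19; ++i) block[i] = (float)i;

  AudioBufferInPlaceScale_NEON(block, /* aChannelCount */ 1, 0.5f, 19);

  for (uint32_t i = 0; i < 19; ++i) {
    assert(block[i] == (float)i * 0.5f);  // scaling by 0.5f is exact here
  }
}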
void dotProd_i16_neon(const float *dataf, const float *weightsf, float *vals, const int n, const int len, const float *istd) {
    const int16_t *data = (const int16_t *)dataf;
    const int16_t *weights = (const int16_t *)weightsf;
    weightsf += n * len / 2; // skip the n*len int16 weights, measured in floats (sizeof(float) / sizeof(int16_t) == 2)

    for (int i = 0; i < n; i += 4) {
        int32x4_t accum0 = { 0, 0, 0, 0 };
        int32x4_t accum1 = accum0;
        int32x4_t accum2 = accum0;
        int32x4_t accum3 = accum0;

        for (int j = 0; j < len; j += 8) {
            int16x4x2_t d0 = vld2_s16(data + j);

            int16x4x2_t w0 = vld2_s16(weights);
            int16x4x2_t w1 = vld2_s16(weights + 8);
            int16x4x2_t w2 = vld2_s16(weights + 16);
            int16x4x2_t w3 = vld2_s16(weights + 24);

            accum0 = vmlal_s16(accum0, d0.val[0], w0.val[0]);
            accum0 = vmlal_s16(accum0, d0.val[1], w0.val[1]);

            accum1 = vmlal_s16(accum1, d0.val[0], w1.val[0]);
            accum1 = vmlal_s16(accum1, d0.val[1], w1.val[1]);

            accum2 = vmlal_s16(accum2, d0.val[0], w2.val[0]);
            accum2 = vmlal_s16(accum2, d0.val[1], w2.val[1]);

            accum3 = vmlal_s16(accum3, d0.val[0], w3.val[0]);
            accum3 = vmlal_s16(accum3, d0.val[1], w3.val[1]);

            weights += 32;
        }

        int32x2_t sum0 = vpadd_s32(vget_low_s32(accum0), vget_high_s32(accum0));
        int32x2_t sum1 = vpadd_s32(vget_low_s32(accum1), vget_high_s32(accum1));
        int32x2_t sum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
        int32x2_t sum3 = vpadd_s32(vget_low_s32(accum3), vget_high_s32(accum3));
        sum0 = vpadd_s32(sum0, sum1);
        sum1 = vpadd_s32(sum2, sum3);
        int32x4_t sum = vcombine_s32(sum0, sum1);

        float32x4_t val = vcvtq_f32_s32(sum);
        val = vmulq_f32(val, vld1q_f32(weightsf + i*2));
        val = vmulq_n_f32(val, istd[0]);
        val = vaddq_f32(val, vld1q_f32(weightsf + i*2 + 4));
        vst1q_f32(vals + i, val);
    }
}
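A scalar reference helps decode the packed layout the vld2-based loop above consumes. The sketch below reflects our reading of that layout (weights stored in 32-element groups holding 8 weights for each of 4 consecutive neurons, then per-neuron scale/offset floats in alternating groups of four) and assumes, as the NEON loop does, that len is a multiple of 8 and n a multiple of 4:

#include <stdint.h>

// Hypothetical scalar reference for dotProd_i16_neon.
static void dotProd_i16_scalar(const float *dataf, const float *weightsf,
                               float *vals, const int n, const int len,
                               const float *istd) {
    const int16_t *data = (const int16_t *)dataf;
    const int16_t *weights = (const int16_t *)weightsf;
    const float *scale_ofs = weightsf + n * len / 2; // past the int16 weights

    for (int i = 0; i < n; i++) {
        int32_t accum = 0;
        for (int j = 0; j < len; j++) {
            // Position of neuron i's weight for data[j] in the packed buffer.
            int idx = ((i / 4) * (len / 8) + j / 8) * 32 + (i % 4) * 8 + (j % 8);
            accum += (int32_t)data[j] * weights[idx];
        }
        float scale  = scale_ofs[(i / 4) * 8 + (i % 4)];
        float offset = scale_ofs[(i / 4) * 8 + 4 + (i % 4)];
        vals[i] = (float)accum * scale * istd[0] + offset;
    }
}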
Example 5
/**
 * @brief   vector scale & accumulate: A[] = alpha * B[] + beta * A[].
 *
 * @param   dst[out]    the accumulating matrix A.
 *          src[in]     the input matrix B.
 *          alpha[in]   scale of B.
 *          beta[in]    scale of A.
 *          elemCnt[in] number of elements to process.
 *
 * @return  void.
 */
void neon_axpby(float *dst,
                const float *src,
                const float alpha,
                const float beta,
                const int elemCnt)
{
    int i;
    for (i = 0; i <= elemCnt - 16; i += 16)
    {
        float32x4_t q0 = vld1q_f32(src + i);
        float32x4_t q1 = vld1q_f32(src + i + 4);
        float32x4_t q2 = vld1q_f32(src + i + 8);
        float32x4_t q3 = vld1q_f32(src + i + 12);
        float32x4_t q4 = vld1q_f32(dst + i);
        float32x4_t q5 = vld1q_f32(dst + i + 4);
        float32x4_t q6 = vld1q_f32(dst + i + 8);
        float32x4_t q7 = vld1q_f32(dst + i + 12);
        q0 = vmulq_n_f32(q0, alpha);
        q1 = vmulq_n_f32(q1, alpha);
        q2 = vmulq_n_f32(q2, alpha);
        q3 = vmulq_n_f32(q3, alpha);
        q0 = vmlaq_n_f32(q0, q4, beta);
        q1 = vmlaq_n_f32(q1, q5, beta);
        q2 = vmlaq_n_f32(q2, q6, beta);
        q3 = vmlaq_n_f32(q3, q7, beta);
        vst1q_f32(dst + i,      q0);
        vst1q_f32(dst + i + 4,  q1);
        vst1q_f32(dst + i + 8,  q2);
        vst1q_f32(dst + i + 12, q3);
    }
    for (; i < elemCnt; i++)
    {
        float a = src[i] * alpha + dst[i] * beta;
        dst[i] = a;
    }
}
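For example, with elemCnt = 100 the vector loop covers elements 0..95 and the tail loop the last four; a hypothetical in-place call computing y = 0.9*x + 0.1*y:

float x[100], y[100];
/* ... fill x and y ... */
neon_axpby(y, x, 0.9f, 0.1f, 100);   /* y[i] = 0.9f*x[i] + 0.1f*y[i] */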
Example 6
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) 
{
    unsigned int m = n >> 2;
    unsigned int k = n & 3, j;
    unsigned int l = n & (~3);

    for (j = 0; j < m; j++) {
        v4sf src = vld1q_f32(a + 4 * j);
        v4sf tem = simd_ln4f(src);
        vst1q_f32(y + 4 * j, tem);
    }

    for (j = 0; j < k; j++) {
        y[j + l] = logf(a[j + l]);
    }
}
Example 7
int Bias_arm::forward_inplace(Mat& bottom_top_blob) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    const float* bias_ptr = bias_data;
    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        float* ptr = bottom_top_blob.channel(q);

        float bias = bias_ptr[q];

#if __ARM_NEON
        int nn = size >> 2;
        int remain = size - (nn << 2);
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
        float32x4_t _bias = vdupq_n_f32(bias);
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            float32x4_t _outp = vaddq_f32(_p, _bias);
            vst1q_f32(ptr, _outp);

            ptr += 4;
        }
#endif // __ARM_NEON

        for (; remain>0; remain--)
        {
            *ptr = *ptr + bias;

            ptr++;
        }
    }

    return 0;
}
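Stripped of the NEON unrolling and the OpenMP channel loop, each channel's update reduces to a per-element add; a minimal scalar sketch (name ours) of the loop body:

// Hypothetical scalar equivalent of one channel's bias add.
static void bias_channel_scalar(float* ptr, float bias, int size)
{
    for (int i = 0; i < size; i++)
        ptr[i] += bias;
}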
static void neon_vector_mul(const std::vector<float>& vec_a, const std::vector<float>& vec_b, std::vector<float>& vec_result)
{
	assert(vec_a.size() == vec_b.size());
	assert(vec_a.size() == vec_result.size());
	int i = 0;
	//neon process
	for (; i < (int)vec_result.size() - 3 ; i+=4)
	{
		const auto data_a = vld1q_f32(&vec_a[i]);
		const auto data_b = vld1q_f32(&vec_b[i]);
		float* dst_ptr = &vec_result[i];
		const auto data_res = vmulq_f32(data_a, data_b);
		vst1q_f32(dst_ptr, data_res);
	}
	//normal process
	for (; i < (int)vec_result.size(); i++)
	{
		vec_result[i] = vec_a[i] * vec_b[i];
	}
}
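A hypothetical call: with six elements, the first four go through the NEON loop and the last two through the scalar tail.

std::vector<float> a{1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
std::vector<float> b(6, 2.f);
std::vector<float> r(6);
neon_vector_mul(a, b, r);   // r == {2, 4, 6, 8, 10, 12}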
//Kernel function: saxpy
void saxpy_vector(KernelArgs* args) {

    //Setup
    const float32x4_t MASK_FALSE = vdupq_n_f32(0.f);
    const float32x4_t MASK_TRUE = vcvtq_f32_u32(vceqq_f32(MASK_FALSE, MASK_FALSE));
    
    //Uniforms
    
    //Fuses
    
    //Literals
    
    //Stack variables
    float32x4_t scale, x, y, result, var060, var061;
    
    //Loop over input
    uint64_t index;
    for(index = 0; index < args->N; index += 4) {
    
        //Inputs
        scale = vld1q_f32(&args->scale[index]);
        x = vld1q_f32(&args->x[index]);
        y = vld1q_f32(&args->y[index]);
        
        //Begin kernel logic
        {
        
            //>>> result = scale * x + y
            var061 = vmulq_f32(scale, x);
            var060 = vaddq_f32(var061, y);
            result = vbslq_f32(vcvtq_u32_f32(MASK_TRUE), var060, result); // all-true mask: selects var060
        
        }
        //End kernel logic
        
        //Outputs
        vst1q_f32(&args->result[index], result);
        
    }
}
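KernelArgs is not shown in this listing; a plausible layout consistent with the field accesses above (a guess at the generator's type, not its actual definition) is:

#include <stdint.h>

// Hypothetical KernelArgs matching saxpy_vector's accesses. N is assumed
// to be a multiple of 4, since the loop above has no scalar tail.
typedef struct {
    float*   scale;
    float*   x;
    float*   y;
    float*   result;
    uint64_t N;
} KernelArgs;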
Example 10
void dotProd_neon(const float *data, const float *weights, float *vals, const int n, const int len, const float *istd) {
    for (int i = 0; i < n; i += 4) {
        float32x4_t accum0 = { 0.0f, 0.0f, 0.0f, 0.0f };
        float32x4_t accum1 = accum0;
        float32x4_t accum2 = accum0;
        float32x4_t accum3 = accum0;

        for (int j = 0; j < len; j += 4) {
            float32x4_t d0 = vld1q_f32(data + j);
            float32x4_t d1 = d0;
            float32x4_t d2 = d0;
            float32x4_t d3 = d0;

            float32x4_t w0 = vld1q_f32(weights);
            float32x4_t w1 = vld1q_f32(weights + 4);
            float32x4_t w2 = vld1q_f32(weights + 8);
            float32x4_t w3 = vld1q_f32(weights + 12);

            accum0 = vaddq_f32(accum0, vmulq_f32(d0, w0));
            accum1 = vaddq_f32(accum1, vmulq_f32(d1, w1));
            accum2 = vaddq_f32(accum2, vmulq_f32(d2, w2));
            accum3 = vaddq_f32(accum3, vmulq_f32(d3, w3));

            weights += 16;
        }

        float32x2_t sum0 = vpadd_f32(vget_low_f32(accum0), vget_high_f32(accum0));
        float32x2_t sum1 = vpadd_f32(vget_low_f32(accum1), vget_high_f32(accum1));
        float32x2_t sum2 = vpadd_f32(vget_low_f32(accum2), vget_high_f32(accum2));
        float32x2_t sum3 = vpadd_f32(vget_low_f32(accum3), vget_high_f32(accum3));
        sum0 = vpadd_f32(sum0, sum1);
        sum1 = vpadd_f32(sum2, sum3);
        float32x4_t sum = vcombine_f32(sum0, sum1);
        
        sum = vmulq_n_f32(sum, istd[0]);
        sum = vaddq_f32(sum, vld1q_f32(weights + n*len + i));
        vst1q_f32(vals + i, sum);
    }
}
Example 11
/* f32x4 add */
void mw_neon_mm_add_f32x4(float * A, int Row, int Col, float * B, float * C)
{
	float32x4_t neon_a, neon_b, neon_c;
	int size = Row * Col;
	int i = 0;
	int k = 0;

	for (i = 4; i <= size ; i+=4)
	{
		k = i - 4;
		neon_a = vld1q_f32(A + k);
		neon_b = vld1q_f32(B + k);
		neon_c = vaddq_f32(neon_a, neon_b);
		vst1q_f32(C + k, neon_c);
	}

	/* After the loop, i sits one 4-step past the last full group, so
	 * k = i - 4 is the first unprocessed index; e.g. for size = 10 the
	 * vector loop covers elements 0..7, leaving k = 8 and size % 4 = 2
	 * tail elements. */
	k = i - 4;
    for (i = 0; i < size % 4; i++)
	{
		C[k + i] = A[k + i] + B[k + i];
	}
}
Example 12
/* f32x4 mv mul */
void mw_neon_mv_mul_f32x4(float * A, int Row, int T, float * B, float * C)
{
	int i = 0;
	int k = 0;

	float32x4_t neon_b, neon_c;
	float32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	float32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i+=4)
	{
		neon_c = vmovq_n_f32(0);

		for (k = 0; k < T; k+=4)
		{
			int j = k * T + i;

			neon_a0 = vld1q_f32(A + j);
			neon_a1 = vld1q_f32(A + j + Row);
			neon_a2 = vld1q_f32(A + j + 2 * Row);
			neon_a3 = vld1q_f32(A + j + 3 * Row);

			neon_b = vld1q_f32(B + k);
			neon_b0 = vdupq_n_f32(vgetq_lane_f32(neon_b, 0));
			neon_b1 = vdupq_n_f32(vgetq_lane_f32(neon_b, 1));
			neon_b2 = vdupq_n_f32(vgetq_lane_f32(neon_b, 2));
			neon_b3 = vdupq_n_f32(vgetq_lane_f32(neon_b, 3));

			neon_c = vaddq_f32(vmulq_f32(neon_a0, neon_b0), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a1, neon_b1), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a2, neon_b2), neon_c);
			neon_c = vaddq_f32(vmulq_f32(neon_a3, neon_b3), neon_c);

		}

		vst1q_f32(C + i, neon_c);
	}
}
Example 13
static void FilterAdaptationNEON(
    int num_partitions,
    int x_fft_buf_block_pos,
    float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1],
    float e_fft[2][PART_LEN1],
    float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1]) {
  float fft[PART_LEN2];
  int i;
  for (i = 0; i < num_partitions; i++) {
    int xPos = (i + x_fft_buf_block_pos) * PART_LEN1;
    int pos = i * PART_LEN1;
    int j;
    // Check for wrap
    if (i + x_fft_buf_block_pos >= num_partitions) {
      xPos -= num_partitions * PART_LEN1;
    }

    // Process the whole array...
    for (j = 0; j < PART_LEN; j += 4) {
      // Load x_fft_buf and e_fft.
      const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]);
      const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]);
      const float32x4_t e_fft_re = vld1q_f32(&e_fft[0][j]);
      const float32x4_t e_fft_im = vld1q_f32(&e_fft[1][j]);
      // Calculate the product of conjugate(x_fft_buf) by e_fft.
      //   re(conjugate(a) * b) = aRe * bRe + aIm * bIm
      //   im(conjugate(a) * b) = aRe * bIm - aIm * bRe
      const float32x4_t a = vmulq_f32(x_fft_buf_re, e_fft_re);
      const float32x4_t e = vmlaq_f32(a, x_fft_buf_im, e_fft_im);
      const float32x4_t c = vmulq_f32(x_fft_buf_re, e_fft_im);
      const float32x4_t f = vmlsq_f32(c, x_fft_buf_im, e_fft_re);
      // Interleave real and imaginary parts.
      const float32x4x2_t g_n_h = vzipq_f32(e, f);
      // Store
      vst1q_f32(&fft[2 * j + 0], g_n_h.val[0]);
      vst1q_f32(&fft[2 * j + 4], g_n_h.val[1]);
    }
    // ... and fixup the first imaginary entry.
    fft[1] = MulRe(x_fft_buf[0][xPos + PART_LEN],
                   -x_fft_buf[1][xPos + PART_LEN],
                   e_fft[0][PART_LEN],
                   e_fft[1][PART_LEN]);

    aec_rdft_inverse_128(fft);
    memset(fft + PART_LEN, 0, sizeof(float) * PART_LEN);

    // fft scaling
    {
      const float scale = 2.0f / PART_LEN2;
      const float32x4_t scale_ps = vmovq_n_f32(scale);
      for (j = 0; j < PART_LEN; j += 4) {
        const float32x4_t fft_ps = vld1q_f32(&fft[j]);
        const float32x4_t fft_scale = vmulq_f32(fft_ps, scale_ps);
        vst1q_f32(&fft[j], fft_scale);
      }
    }
    aec_rdft_forward_128(fft);

    {
      const float wt1 = h_fft_buf[1][pos];
      h_fft_buf[0][pos + PART_LEN] += fft[1];
      for (j = 0; j < PART_LEN; j += 4) {
        float32x4_t wtBuf_re = vld1q_f32(&h_fft_buf[0][pos + j]);
        float32x4_t wtBuf_im = vld1q_f32(&h_fft_buf[1][pos + j]);
        const float32x4_t fft0 = vld1q_f32(&fft[2 * j + 0]);
        const float32x4_t fft4 = vld1q_f32(&fft[2 * j + 4]);
        const float32x4x2_t fft_re_im = vuzpq_f32(fft0, fft4);
        wtBuf_re = vaddq_f32(wtBuf_re, fft_re_im.val[0]);
        wtBuf_im = vaddq_f32(wtBuf_im, fft_re_im.val[1]);

        vst1q_f32(&h_fft_buf[0][pos + j], wtBuf_re);
        vst1q_f32(&h_fft_buf[1][pos + j], wtBuf_im);
      }
      h_fft_buf[1][pos] = wt1;
    }
  }
}
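MulRe is defined elsewhere in the WebRTC AEC sources; consistent with the conjugate-product comment in the loop above (the caller negates the imaginary part to get the conjugate), a sketch of the scalar helper looks like:

// Scalar complex-product helper (sketch matching the usage above).
static float MulRe(float aRe, float aIm, float bRe, float bIm) {
  return aRe * bRe - aIm * bIm;
}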
void
AudioBlockPanStereoToStereo_NEON(const float aInputL[WEBAUDIO_BLOCK_SIZE],
                                 const float aInputR[WEBAUDIO_BLOCK_SIZE],
                                 float aGainL, float aGainR, bool aIsOnTheLeft,
                                 float aOutputL[WEBAUDIO_BLOCK_SIZE],
                                 float aOutputR[WEBAUDIO_BLOCK_SIZE])
{
  ASSERT_ALIGNED(aInputL);
  ASSERT_ALIGNED(aInputR);
  ASSERT_ALIGNED(aOutputL);
  ASSERT_ALIGNED(aOutputR);

  float32x4_t vinL0, vinL1;
  float32x4_t vinR0, vinR1;
  float32x4_t voutL0, voutL1;
  float32x4_t voutR0, voutR1;
  float32x4_t vscaleL = vmovq_n_f32(aGainL);
  float32x4_t vscaleR = vmovq_n_f32(aGainR);

  if (aIsOnTheLeft) {
    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i+=8) {
      vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
      vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i+4));

      vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
      vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i+4));

      voutL0 = vmlaq_f32(vinL0, vinR0, vscaleL);
      voutL1 = vmlaq_f32(vinL1, vinR1, vscaleL);

      vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
      vst1q_f32(ADDRESS_OF(aOutputL, i+4), voutL1);

      voutR0 = vmulq_f32(vinR0, vscaleR);
      voutR1 = vmulq_f32(vinR1, vscaleR);

      vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
      vst1q_f32(ADDRESS_OF(aOutputR, i+4), voutR1);
    }
  } else {
    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i+=8) {
      vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
      vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i+4));

      vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
      vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i+4));

      voutL0 = vmulq_f32(vinL0, vscaleL);
      voutL1 = vmulq_f32(vinL1, vscaleL);

      vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
      vst1q_f32(ADDRESS_OF(aOutputL, i+4), voutL1);

      voutR0 = vmlaq_f32(vinR0, vinL0, vscaleR);
      voutR1 = vmlaq_f32(vinR1, vinL1, vscaleR);

      vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
      vst1q_f32(ADDRESS_OF(aOutputR, i+4), voutR1);
    }
  }
}
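A per-element scalar model of the two branches above (a sketch of the same math): when the source is panned to the left the right input bleeds into the left output, otherwise the left input bleeds into the right output.

// Scalar model of AudioBlockPanStereoToStereo_NEON (sketch).
for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i++) {
  if (aIsOnTheLeft) {
    aOutputL[i] = aInputL[i] + aInputR[i] * aGainL;
    aOutputR[i] = aInputR[i] * aGainR;
  } else {
    aOutputL[i] = aInputL[i] * aGainL;
    aOutputR[i] = aInputR[i] + aInputL[i] * aGainR;
  }
}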
Example 15
static void OverdriveAndSuppressNEON(AecCore* aec,
                                     float hNl[PART_LEN1],
                                     const float hNlFb,
                                     float efw[2][PART_LEN1]) {
  int i;
  const float32x4_t vec_hNlFb = vmovq_n_f32(hNlFb);
  const float32x4_t vec_one = vdupq_n_f32(1.0f);
  const float32x4_t vec_minus_one = vdupq_n_f32(-1.0f);
  const float32x4_t vec_overDriveSm = vmovq_n_f32(aec->overDriveSm);

  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    // Weight subbands
    float32x4_t vec_hNl = vld1q_f32(&hNl[i]);
    const float32x4_t vec_weightCurve = vld1q_f32(&WebRtcAec_weightCurve[i]);
    const uint32x4_t bigger = vcgtq_f32(vec_hNl, vec_hNlFb);
    const float32x4_t vec_weightCurve_hNlFb = vmulq_f32(vec_weightCurve,
                                                        vec_hNlFb);
    const float32x4_t vec_one_weightCurve = vsubq_f32(vec_one, vec_weightCurve);
    const float32x4_t vec_one_weightCurve_hNl = vmulq_f32(vec_one_weightCurve,
                                                          vec_hNl);
    const uint32x4_t vec_if0 = vandq_u32(vmvnq_u32(bigger),
                                         vreinterpretq_u32_f32(vec_hNl));
    const float32x4_t vec_one_weightCurve_add =
        vaddq_f32(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl);
    const uint32x4_t vec_if1 =
        vandq_u32(bigger, vreinterpretq_u32_f32(vec_one_weightCurve_add));

    vec_hNl = vreinterpretq_f32_u32(vorrq_u32(vec_if0, vec_if1));

    {
      const float32x4_t vec_overDriveCurve =
          vld1q_f32(&WebRtcAec_overDriveCurve[i]);
      const float32x4_t vec_overDriveSm_overDriveCurve =
          vmulq_f32(vec_overDriveSm, vec_overDriveCurve);
      vec_hNl = vpowq_f32(vec_hNl, vec_overDriveSm_overDriveCurve);
      vst1q_f32(&hNl[i], vec_hNl);
    }

    // Suppress error signal
    {
      float32x4_t vec_efw_re = vld1q_f32(&efw[0][i]);
      float32x4_t vec_efw_im = vld1q_f32(&efw[1][i]);
      vec_efw_re = vmulq_f32(vec_efw_re, vec_hNl);
      vec_efw_im = vmulq_f32(vec_efw_im, vec_hNl);

      // Ooura fft returns incorrect sign on imaginary component. It matters
      // here because we are making an additive change with comfort noise.
      vec_efw_im = vmulq_f32(vec_efw_im, vec_minus_one);
      vst1q_f32(&efw[0][i], vec_efw_re);
      vst1q_f32(&efw[1][i], vec_efw_im);
    }
  }

  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    // Weight subbands
    if (hNl[i] > hNlFb) {
      hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
               (1 - WebRtcAec_weightCurve[i]) * hNl[i];
    }

    hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);

    // Suppress error signal
    efw[0][i] *= hNl[i];
    efw[1][i] *= hNl[i];

    // Ooura fft returns incorrect sign on imaginary component. It matters
    // here because we are making an additive change with comfort noise.
    efw[1][i] *= -1;
  }
}
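vpowq_f32 is not a base NEON intrinsic: the WebRTC sources define it locally as a vectorized powf approximation. A naive lane-wise stand-in (for testing only, much slower than the real polynomial version) could be:

#include <arm_neon.h>
#include <math.h>

// Reference-only replacement for WebRTC's vpowq_f32: powf applied per lane.
static float32x4_t vpowq_f32_ref(float32x4_t a, float32x4_t b) {
  float av[4], bv[4];
  vst1q_f32(av, a);
  vst1q_f32(bv, b);
  for (int k = 0; k < 4; ++k) {
    av[k] = powf(av[k], bv[k]);
  }
  return vld1q_f32(av);
}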
Example 16
static void rftbsub_128_neon(float* a) {
  const float* c = rdft_w + 32;
  int j1, j2;
  const float32x4_t mm_half = vdupq_n_f32(0.5f);

  a[1] = -a[1];
  // Vectorized code (four at once).
  //    Note: commented numbers are indexes for the first iteration of the loop.
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
    const float32x4_t c_j1 = vld1q_f32(&c[j1]);         //  1,  2,  3,  4,
    const float32x4_t c_k1 = vld1q_f32(&c[29 - j1]);    // 28, 29, 30, 31,
    const float32x4_t wkrt = vsubq_f32(mm_half, c_k1);  // 28, 29, 30, 31,
    const float32x4_t wkr_ = reverse_order_f32x4(wkrt); // 31, 30, 29, 28,
    const float32x4_t wki_ = c_j1;                      //  1,  2,  3,  4,
    // Load and shuffle 'a'.
    //   2,   4,   6,   8,   3,   5,   7,   9
    float32x4x2_t a_j2_p = vld2q_f32(&a[0 + j2]);
    // 120, 122, 124, 126, 121, 123, 125, 127,
    const float32x4x2_t k2_0_4 = vld2q_f32(&a[122 - j2]);
    // 126, 124, 122, 120
    const float32x4_t a_k2_p0 = reverse_order_f32x4(k2_0_4.val[0]);
    // 127, 125, 123, 121
    const float32x4_t a_k2_p1 = reverse_order_f32x4(k2_0_4.val[1]);
    // Calculate 'x'.
    const float32x4_t xr_ = vsubq_f32(a_j2_p.val[0], a_k2_p0);
    // 2-126, 4-124, 6-122, 8-120,
    const float32x4_t xi_ = vaddq_f32(a_j2_p.val[1], a_k2_p1);
    // 3-127, 5-125, 7-123, 9-121,
    // Calculate product into 'y'.
    //    yr = wkr * xr + wki * xi;
    //    yi = wkr * xi - wki * xr;
    const float32x4_t a_ = vmulq_f32(wkr_, xr_);
    const float32x4_t b_ = vmulq_f32(wki_, xi_);
    const float32x4_t c_ = vmulq_f32(wkr_, xi_);
    const float32x4_t d_ = vmulq_f32(wki_, xr_);
    const float32x4_t yr_ = vaddq_f32(a_, b_);  // 2-126, 4-124, 6-122, 8-120,
    const float32x4_t yi_ = vsubq_f32(c_, d_);  // 3-127, 5-125, 7-123, 9-121,
    // Update 'a'.
    //    a[j2 + 0] -= yr;
    //    a[j2 + 1] = yi - a[j2 + 1];
    //    a[k2 + 0] += yr;
    //    a[k2 + 1] = yi - a[k2 + 1];
    // 126, 124, 122, 120,
    const float32x4_t a_k2_p0n = vaddq_f32(a_k2_p0, yr_);
    // 127, 125, 123, 121,
    const float32x4_t a_k2_p1n = vsubq_f32(yi_, a_k2_p1);
    // Shuffle in right order and store.
    //   2,   3,   4,   5,   6,   7,   8,   9,
    const float32x4_t a_k2_p0nr = vrev64q_f32(a_k2_p0n);
    const float32x4_t a_k2_p1nr = vrev64q_f32(a_k2_p1n);
    // 124, 125, 126, 127, 120, 121, 122, 123
    const float32x4x2_t a_k2_n = vzipq_f32(a_k2_p0nr, a_k2_p1nr);
    //   2,   4,   6,   8,
    a_j2_p.val[0] = vsubq_f32(a_j2_p.val[0], yr_);
    //   3,   5,   7,   9,
    a_j2_p.val[1] = vsubq_f32(yi_, a_j2_p.val[1]);
    //   2,   3,   4,   5,   6,   7,   8,   9,
    vst2q_f32(&a[0 + j2], a_j2_p);

    vst1q_f32(&a[122 - j2], a_k2_n.val[1]);
    vst1q_f32(&a[126 - j2], a_k2_n.val[0]);
  }

  // Scalar code for the remaining items.
  for (; j2 < 64; j1 += 1, j2 += 2) {
    const int k2 = 128 - j2;
    const int k1 = 32 - j1;
    const float wkr = 0.5f - c[k1];
    const float wki = c[j1];
    const float xr = a[j2 + 0] - a[k2 + 0];
    const float xi = a[j2 + 1] + a[k2 + 1];
    const float yr = wkr * xr + wki * xi;
    const float yi = wkr * xi - wki * xr;
    a[j2 + 0] = a[j2 + 0] - yr;
    a[j2 + 1] = yi - a[j2 + 1];
    a[k2 + 0] = yr + a[k2 + 0];
    a[k2 + 1] = yi - a[k2 + 1];
  }
  a[65] = -a[65];
}
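reverse_order_f32x4 is a local helper in the WebRTC rdft sources; it reverses the four lanes by swapping the two halves and then reversing each 64-bit pair, as in this sketch:

// Lane reversal: A B C D -> D C B A.
static float32x4_t reverse_order_f32x4(float32x4_t in) {
  // A B C D -> C D A B
  const float32x4_t rev = vcombine_f32(vget_high_f32(in), vget_low_f32(in));
  // C D A B -> D C B A
  return vrev64q_f32(rev);
}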
Example 17
static void
thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
{
    int i, j;
    Size roi = _src.size();
    roi.width *= _src.channels();
    const float* src = _src.ptr<float>();
    float* dst = _dst.ptr<float>();
    size_t src_step = _src.step/sizeof(src[0]);
    size_t dst_step = _dst.step/sizeof(dst[0]);

#if CV_SSE2
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE);
#endif

    if( _src.isContinuous() && _dst.isContinuous() )
    {
        roi.width *= roi.height;
        roi.height = 1;
    }

#ifdef HAVE_TEGRA_OPTIMIZATION
    if (tegra::thresh_32f(_src, _dst, roi.width, roi.height, thresh, maxval, type))
        return;
#endif

#if defined(HAVE_IPP)
    CV_IPP_CHECK()
    {
        IppiSize sz = { roi.width, roi.height };
        switch( type )
        {
        case THRESH_TRUNC:
            if (0 <= ippiThreshold_GT_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh))
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO:
            if (0 <= ippiThreshold_LTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+FLT_EPSILON, 0))
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        case THRESH_TOZERO_INV:
            if (0 <= ippiThreshold_GTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0))
            {
                CV_IMPL_ADD(CV_IMPL_IPP);
                return;
            }
            setIppErrorStatus();
            break;
        }
    }
#endif

    switch( type )
    {
        case THRESH_BINARY:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_cmpgt_ps( v0, thresh4 );
                        v1 = _mm_cmpgt_ps( v1, thresh4 );
                        v0 = _mm_and_ps( v0, maxval4 );
                        v1 = _mm_and_ps( v1, maxval4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#elif CV_NEON
                float32x4_t v_thresh = vdupq_n_f32(thresh);
                uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval));

                for( ; j <= roi.width - 4; j += 4 )
                {
                    float32x4_t v_src = vld1q_f32(src + j);
                    uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh), v_maxval);
                    vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
                }
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = src[j] > thresh ? maxval : 0;
            }
            break;

        case THRESH_BINARY_INV:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_cmple_ps( v0, thresh4 );
                        v1 = _mm_cmple_ps( v1, thresh4 );
                        v0 = _mm_and_ps( v0, maxval4 );
                        v1 = _mm_and_ps( v1, maxval4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#elif CV_NEON
                float32x4_t v_thresh = vdupq_n_f32(thresh);
                uint32x4_t v_maxval = vreinterpretq_u32_f32(vdupq_n_f32(maxval));

                for( ; j <= roi.width - 4; j += 4 )
                {
                    float32x4_t v_src = vld1q_f32(src + j);
                    uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh), v_maxval);
                    vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
                }
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = src[j] <= thresh ? maxval : 0;
            }
            break;

        case THRESH_TRUNC:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_min_ps( v0, thresh4 );
                        v1 = _mm_min_ps( v1, thresh4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#elif CV_NEON
                float32x4_t v_thresh = vdupq_n_f32(thresh);

                for( ; j <= roi.width - 4; j += 4 )
                    vst1q_f32(dst + j, vminq_f32(vld1q_f32(src + j), v_thresh));
#endif

                for( ; j < roi.width; j++ )
                    dst[j] = std::min(src[j], thresh);
            }
            break;

        case THRESH_TOZERO:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_and_ps(v0, _mm_cmpgt_ps(v0, thresh4));
                        v1 = _mm_and_ps(v1, _mm_cmpgt_ps(v1, thresh4));
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#elif CV_NEON
                float32x4_t v_thresh = vdupq_n_f32(thresh);

                for( ; j <= roi.width - 4; j += 4 )
                {
                    float32x4_t v_src = vld1q_f32(src + j);
                    uint32x4_t v_dst = vandq_u32(vcgtq_f32(v_src, v_thresh),
                                                 vreinterpretq_u32_f32(v_src));
                    vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
                }
#endif

                for( ; j < roi.width; j++ )
                {
                    float v = src[j];
                    dst[j] = v > thresh ? v : 0;
                }
            }
            break;

        case THRESH_TOZERO_INV:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
            {
                j = 0;
#if CV_SSE2
                if( useSIMD )
                {
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                    {
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_and_ps(v0, _mm_cmple_ps(v0, thresh4));
                        v1 = _mm_and_ps(v1, _mm_cmple_ps(v1, thresh4));
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                    }
                }
#elif CV_NEON
                float32x4_t v_thresh = vdupq_n_f32(thresh);

                for( ; j <= roi.width - 4; j += 4 )
                {
                    float32x4_t v_src = vld1q_f32(src + j);
                    uint32x4_t v_dst = vandq_u32(vcleq_f32(v_src, v_thresh),
                                                 vreinterpretq_u32_f32(v_src));
                    vst1q_f32(dst + j, vreinterpretq_f32_u32(v_dst));
                }
#endif
                for( ; j < roi.width; j++ )
                {
                    float v = src[j];
                    dst[j] = v <= thresh ? v : 0;
                }
            }
            break;
        default:
            return CV_Error( CV_StsBadArg, "" );
    }
}
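For reference, the five operations implemented above, per element v = src[j]:

// THRESH_BINARY      dst[j] = v >  thresh ? maxval : 0
// THRESH_BINARY_INV  dst[j] = v <= thresh ? maxval : 0
// THRESH_TRUNC       dst[j] = min(v, thresh)
// THRESH_TOZERO      dst[j] = v >  thresh ? v : 0
// THRESH_TOZERO_INV  dst[j] = v <= thresh ? v : 0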
Example 18
inline void vst1q(f32 * ptr, const float32x4_t & v) { return vst1q_f32(ptr, v); }
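Wrappers like this usually come in load/store pairs; the matching load overload, a sketch under the same f32 typedef used above, would be:

inline float32x4_t vld1q(const f32 * ptr) { return vld1q_f32(ptr); }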
Example 19
// use ARM Neon extensions (unrolled loop)
// NOTE: unrolling doesn't show any appreciable performance difference
void dotprod_cccf_execute_neon4(dotprod_cccf    _q,
                                float complex * _x,
                                float complex * _y)
{
    // type cast input as floating point array
    float * x = (float*) _x;

    // double effective length
    unsigned int n = 2*_q->n;

    // first cut: ...
    float32x4_t v0,  v1,  v2,  v3;   // input vectors
    float32x4_t hi0, hi1, hi2, hi3;  // coefficients vectors (real)
    float32x4_t hq0, hq1, hq2, hq3;  // coefficients vectors (imag)
    float32x4_t ci0, ci1, ci2, ci3;  // output multiplications (v * hi)
    float32x4_t cq0, cq1, cq2, cq3;  // output multiplications (v * hq)

    // load zeros into sum registers
    float zeros[4] = {0,0,0,0};
    float32x4_t sumi = vld1q_f32(zeros);
    float32x4_t sumq = vld1q_f32(zeros);

    // r = 4*floor(n/16)
    unsigned int r = (n >> 4) << 2;

    //
    unsigned int i;
    for (i=0; i<r; i+=4) {
        // load inputs into register (unaligned)
        v0 = vld1q_f32(&x[4*i+0]);
        v1 = vld1q_f32(&x[4*i+4]);
        v2 = vld1q_f32(&x[4*i+8]);
        v3 = vld1q_f32(&x[4*i+12]);

        // load real coefficients into registers (aligned)
        hi0 = vld1q_f32(&_q->hi[4*i+0]);
        hi1 = vld1q_f32(&_q->hi[4*i+4]);
        hi2 = vld1q_f32(&_q->hi[4*i+8]);
        hi3 = vld1q_f32(&_q->hi[4*i+12]);

        // load imaginary coefficients into registers (aligned)
        hq0 = vld1q_f32(&_q->hq[4*i+0]);
        hq1 = vld1q_f32(&_q->hq[4*i+4]);
        hq2 = vld1q_f32(&_q->hq[4*i+8]);
        hq3 = vld1q_f32(&_q->hq[4*i+12]);
        
        // compute parallel multiplications (real)
        ci0 = vmulq_f32(v0, hi0);
        ci1 = vmulq_f32(v1, hi1);
        ci2 = vmulq_f32(v2, hi2);
        ci3 = vmulq_f32(v3, hi3);

        // compute parallel multiplications (imag)
        cq0 = vmulq_f32(v0, hq0);
        cq1 = vmulq_f32(v1, hq1);
        cq2 = vmulq_f32(v2, hq2);
        cq3 = vmulq_f32(v3, hq3);

        // accumulate
        sumi = vaddq_f32(sumi, ci0);    sumq = vaddq_f32(sumq, cq0);
        sumi = vaddq_f32(sumi, ci1);    sumq = vaddq_f32(sumq, cq1);
        sumi = vaddq_f32(sumi, ci2);    sumq = vaddq_f32(sumq, cq2);
        sumi = vaddq_f32(sumi, ci3);    sumq = vaddq_f32(sumq, cq3);
    }

    // unload
    float wi[4];
    float wq[4];
    vst1q_f32(wi, sumi);
    vst1q_f32(wq, sumq);

    // fold down (add/sub)
    float complex total = 
        ((wi[0] - wq[1]) + (wi[2] - wq[3])) +
        ((wi[1] + wq[0]) + (wi[3] + wq[2])) * _Complex_I;

    // cleanup (note: n _must_ be even)
    // TODO : clean this method up
    for (i=2*r; i<_q->n; i++) {
        total += _x[i] * ( _q->hi[2*i] + _q->hq[2*i]*_Complex_I );
    }

    // set return value
    *_y = total;
}
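The fold-down follows from the storage scheme the cleanup loop already spells out: _q->hi[2*i] holds the real part of coefficient i and _q->hq[2*i] its imaginary part (each value duplicated so vector lanes line up with the interleaved input). A scalar reference over the whole input:

// Scalar reference for dotprod_cccf (same layout as the cleanup loop above).
float complex total = 0;
for (unsigned int i = 0; i < _q->n; i++)
    total += _x[i] * (_q->hi[2*i] + _q->hq[2*i]*_Complex_I);
*_y = total;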
Example 20
// Updates the following smoothed  Power Spectral Densities (PSD):
//  - sd  : near-end
//  - se  : residual echo
//  - sx  : far-end
//  - sde : cross-PSD of near-end and residual echo
//  - sxd : cross-PSD of near-end and far-end
//
// In addition to updating the PSDs, the filter divergence state is
// determined, upon which actions are taken.
static void SmoothedPSD(AecCore* aec,
                        float efw[2][PART_LEN1],
                        float dfw[2][PART_LEN1],
                        float xfw[2][PART_LEN1],
                        int* extreme_filter_divergence) {
  // Power estimate smoothing coefficients.
  const float* ptrGCoh = aec->extended_filter_enabled
      ? WebRtcAec_kExtendedSmoothingCoefficients[aec->mult - 1]
      : WebRtcAec_kNormalSmoothingCoefficients[aec->mult - 1];
  int i;
  float sdSum = 0, seSum = 0;
  const float32x4_t vec_15 =  vdupq_n_f32(WebRtcAec_kMinFarendPSD);
  float32x4_t vec_sdSum = vdupq_n_f32(0.0f);
  float32x4_t vec_seSum = vdupq_n_f32(0.0f);

  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const float32x4_t vec_dfw0 = vld1q_f32(&dfw[0][i]);
    const float32x4_t vec_dfw1 = vld1q_f32(&dfw[1][i]);
    const float32x4_t vec_efw0 = vld1q_f32(&efw[0][i]);
    const float32x4_t vec_efw1 = vld1q_f32(&efw[1][i]);
    const float32x4_t vec_xfw0 = vld1q_f32(&xfw[0][i]);
    const float32x4_t vec_xfw1 = vld1q_f32(&xfw[1][i]);
    float32x4_t vec_sd = vmulq_n_f32(vld1q_f32(&aec->sd[i]), ptrGCoh[0]);
    float32x4_t vec_se = vmulq_n_f32(vld1q_f32(&aec->se[i]), ptrGCoh[0]);
    float32x4_t vec_sx = vmulq_n_f32(vld1q_f32(&aec->sx[i]), ptrGCoh[0]);
    float32x4_t vec_dfw_sumsq = vmulq_f32(vec_dfw0, vec_dfw0);
    float32x4_t vec_efw_sumsq = vmulq_f32(vec_efw0, vec_efw0);
    float32x4_t vec_xfw_sumsq = vmulq_f32(vec_xfw0, vec_xfw0);

    vec_dfw_sumsq = vmlaq_f32(vec_dfw_sumsq, vec_dfw1, vec_dfw1);
    vec_efw_sumsq = vmlaq_f32(vec_efw_sumsq, vec_efw1, vec_efw1);
    vec_xfw_sumsq = vmlaq_f32(vec_xfw_sumsq, vec_xfw1, vec_xfw1);
    vec_xfw_sumsq = vmaxq_f32(vec_xfw_sumsq, vec_15);
    vec_sd = vmlaq_n_f32(vec_sd, vec_dfw_sumsq, ptrGCoh[1]);
    vec_se = vmlaq_n_f32(vec_se, vec_efw_sumsq, ptrGCoh[1]);
    vec_sx = vmlaq_n_f32(vec_sx, vec_xfw_sumsq, ptrGCoh[1]);

    vst1q_f32(&aec->sd[i], vec_sd);
    vst1q_f32(&aec->se[i], vec_se);
    vst1q_f32(&aec->sx[i], vec_sx);

    {
      float32x4x2_t vec_sde = vld2q_f32(&aec->sde[i][0]);
      float32x4_t vec_dfwefw0011 = vmulq_f32(vec_dfw0, vec_efw0);
      float32x4_t vec_dfwefw0110 = vmulq_f32(vec_dfw0, vec_efw1);
      vec_sde.val[0] = vmulq_n_f32(vec_sde.val[0], ptrGCoh[0]);
      vec_sde.val[1] = vmulq_n_f32(vec_sde.val[1], ptrGCoh[0]);
      vec_dfwefw0011 = vmlaq_f32(vec_dfwefw0011, vec_dfw1, vec_efw1);
      vec_dfwefw0110 = vmlsq_f32(vec_dfwefw0110, vec_dfw1, vec_efw0);
      vec_sde.val[0] = vmlaq_n_f32(vec_sde.val[0], vec_dfwefw0011, ptrGCoh[1]);
      vec_sde.val[1] = vmlaq_n_f32(vec_sde.val[1], vec_dfwefw0110, ptrGCoh[1]);
      vst2q_f32(&aec->sde[i][0], vec_sde);
    }

    {
      float32x4x2_t vec_sxd = vld2q_f32(&aec->sxd[i][0]);
      float32x4_t vec_dfwxfw0011 = vmulq_f32(vec_dfw0, vec_xfw0);
      float32x4_t vec_dfwxfw0110 = vmulq_f32(vec_dfw0, vec_xfw1);
      vec_sxd.val[0] = vmulq_n_f32(vec_sxd.val[0], ptrGCoh[0]);
      vec_sxd.val[1] = vmulq_n_f32(vec_sxd.val[1], ptrGCoh[0]);
      vec_dfwxfw0011 = vmlaq_f32(vec_dfwxfw0011, vec_dfw1, vec_xfw1);
      vec_dfwxfw0110 = vmlsq_f32(vec_dfwxfw0110, vec_dfw1, vec_xfw0);
      vec_sxd.val[0] = vmlaq_n_f32(vec_sxd.val[0], vec_dfwxfw0011, ptrGCoh[1]);
      vec_sxd.val[1] = vmlaq_n_f32(vec_sxd.val[1], vec_dfwxfw0110, ptrGCoh[1]);
      vst2q_f32(&aec->sxd[i][0], vec_sxd);
    }

    vec_sdSum = vaddq_f32(vec_sdSum, vec_sd);
    vec_seSum = vaddq_f32(vec_seSum, vec_se);
  }
  {
    float32x2_t vec_sdSum_total;
    float32x2_t vec_seSum_total;
    // A B C D
    vec_sdSum_total = vpadd_f32(vget_low_f32(vec_sdSum),
                                vget_high_f32(vec_sdSum));
    vec_seSum_total = vpadd_f32(vget_low_f32(vec_seSum),
                                vget_high_f32(vec_seSum));
    // A+B C+D
    vec_sdSum_total = vpadd_f32(vec_sdSum_total, vec_sdSum_total);
    vec_seSum_total = vpadd_f32(vec_seSum_total, vec_seSum_total);
    // A+B+C+D A+B+C+D
    sdSum = vget_lane_f32(vec_sdSum_total, 0);
    seSum = vget_lane_f32(vec_seSum_total, 0);
  }

  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    aec->sd[i] = ptrGCoh[0] * aec->sd[i] +
                 ptrGCoh[1] * (dfw[0][i] * dfw[0][i] + dfw[1][i] * dfw[1][i]);
    aec->se[i] = ptrGCoh[0] * aec->se[i] +
                 ptrGCoh[1] * (efw[0][i] * efw[0][i] + efw[1][i] * efw[1][i]);
    // We threshold here to protect against the ill-effects of a zero farend.
    // The threshold is not arbitrarily chosen, but balances protection and
    // adverse interaction with the algorithm's tuning.
    // TODO(bjornv): investigate further why this is so sensitive.
    aec->sx[i] =
        ptrGCoh[0] * aec->sx[i] +
        ptrGCoh[1] * WEBRTC_SPL_MAX(
            xfw[0][i] * xfw[0][i] + xfw[1][i] * xfw[1][i],
            WebRtcAec_kMinFarendPSD);

    aec->sde[i][0] =
        ptrGCoh[0] * aec->sde[i][0] +
        ptrGCoh[1] * (dfw[0][i] * efw[0][i] + dfw[1][i] * efw[1][i]);
    aec->sde[i][1] =
        ptrGCoh[0] * aec->sde[i][1] +
        ptrGCoh[1] * (dfw[0][i] * efw[1][i] - dfw[1][i] * efw[0][i]);

    aec->sxd[i][0] =
        ptrGCoh[0] * aec->sxd[i][0] +
        ptrGCoh[1] * (dfw[0][i] * xfw[0][i] + dfw[1][i] * xfw[1][i]);
    aec->sxd[i][1] =
        ptrGCoh[0] * aec->sxd[i][1] +
        ptrGCoh[1] * (dfw[0][i] * xfw[1][i] - dfw[1][i] * xfw[0][i]);

    sdSum += aec->sd[i];
    seSum += aec->se[i];
  }

  // Divergent filter safeguard update.
  aec->divergeState = (aec->divergeState ? 1.05f : 1.0f) * seSum > sdSum;

  // Signal extreme filter divergence if the error is significantly larger
  // than the nearend (13 dB, i.e. 10^(13/10) ~= 19.95).
  *extreme_filter_divergence = (seSum > (19.95f * sdSum));
}
Example 21
void nnp_conv1x1_upto_4x4__neon(
	uint32_t input_channels_subblock_size,
	uint32_t output_channels_subblock_size,
	size_t input_channels,
	size_t image_size,
	const float* input,
	const float* kernel,
	float* output)
{
	const float*restrict input0 = input;
	const float*restrict input1 = input_channels_subblock_size > 1 ? input0 + image_size : input0;
	const float*restrict input2 = input_channels_subblock_size > 2 ? input1 + image_size : input1;
	const float*restrict input3 = input_channels_subblock_size > 3 ? input2 + image_size : input2;

	const float*restrict kernel0 = kernel;
	const float*restrict kernel1 = output_channels_subblock_size > 1 ? kernel0 + input_channels : kernel0;
	const float*restrict kernel2 = output_channels_subblock_size > 2 ? kernel1 + input_channels : kernel1;
	const float*restrict kernel3 = output_channels_subblock_size > 3 ? kernel2 + input_channels : kernel2;

	float32x4_t vkernel0x = vld1q_dup_f32(kernel0);
	float32x4_t vkernel1x = vld1q_dup_f32(kernel1);
	float32x4_t vkernel2x = vld1q_dup_f32(kernel2);
	float32x4_t vkernel3x = vld1q_dup_f32(kernel3);
	if (input_channels_subblock_size > 1) {
		vkernel0x = vld1q_lane_f32(kernel0 + 1, vkernel0x, 1);
		vkernel1x = vld1q_lane_f32(kernel1 + 1, vkernel1x, 1);
		vkernel2x = vld1q_lane_f32(kernel2 + 1, vkernel2x, 1);
		vkernel3x = vld1q_lane_f32(kernel3 + 1, vkernel3x, 1);
		if (input_channels_subblock_size > 2) {
			vkernel0x = vld1q_lane_f32(kernel0 + 2, vkernel0x, 2);
			vkernel1x = vld1q_lane_f32(kernel1 + 2, vkernel1x, 2);
			vkernel2x = vld1q_lane_f32(kernel2 + 2, vkernel2x, 2);
			vkernel3x = vld1q_lane_f32(kernel3 + 2, vkernel3x, 2);
			if (input_channels_subblock_size > 3) {
				vkernel0x = vld1q_lane_f32(kernel0 + 3, vkernel0x, 3);
				vkernel1x = vld1q_lane_f32(kernel1 + 3, vkernel1x, 3);
				vkernel2x = vld1q_lane_f32(kernel2 + 3, vkernel2x, 3);
				vkernel3x = vld1q_lane_f32(kernel3 + 3, vkernel3x, 3);
			}
		}
	}

	float*restrict output0 = output;
	float*restrict output1 = output_channels_subblock_size > 1 ? output0 + image_size : output0;
	float*restrict output2 = output_channels_subblock_size > 2 ? output1 + image_size : output1;
	float*restrict output3 = output_channels_subblock_size > 3 ? output2 + image_size : output2;
	while (image_size >= 4) {
		float32x4_t voutput0 = vld1q_f32(output0);
		float32x4_t voutput1 = vld1q_f32(output1);
		float32x4_t voutput2 = vld1q_f32(output2);
		float32x4_t voutput3 = vld1q_f32(output3);

		const float32x4_t vinput0 = vld1q_f32(input0); input0 += 4;
		voutput0 = vmuladdq_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladdq_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladdq_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladdq_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

		if (input_channels_subblock_size > 1) {
			const float32x4_t vinput1 = vld1q_f32(input1); input1 += 4;
			voutput0 = vmuladdq_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
			voutput1 = vmuladdq_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
			voutput2 = vmuladdq_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
			voutput3 = vmuladdq_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

			if (input_channels_subblock_size > 2) {
				const float32x4_t vinput2 = vld1q_f32(input2); input2 += 4;
				voutput0 = vmuladdq_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
				voutput1 = vmuladdq_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
				voutput2 = vmuladdq_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
				voutput3 = vmuladdq_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

				if (input_channels_subblock_size > 3) {
					const float32x4_t vinput3 = vld1q_f32(input3); input3 += 4;
					voutput0 = vmuladdq_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
					voutput1 = vmuladdq_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
					voutput2 = vmuladdq_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
					voutput3 = vmuladdq_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));
				}
			}
		}

		vst1q_f32(output0, voutput0); output0 += 4;
		if (output_channels_subblock_size > 1) {
			vst1q_f32(output1, voutput1); output1 += 4;
			if (output_channels_subblock_size > 2) {
				vst1q_f32(output2, voutput2); output2 += 4;
				if (output_channels_subblock_size > 3) {
					vst1q_f32(output3, voutput3); output3 += 4;
				}
			}
		}

		image_size -= 4;
	}
	if (image_size >= 2) {
		float32x2_t voutput0 = vld1_f32(output0);
		float32x2_t voutput1 = vld1_f32(output1);
		float32x2_t voutput2 = vld1_f32(output2);
		float32x2_t voutput3 = vld1_f32(output3);

		const float32x2_t vinput0 = vld1_f32(input0); input0 += 2;
		voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

		if (input_channels_subblock_size > 1) {
			const float32x2_t vinput1 = vld1_f32(input1); input1 += 2;
			voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
			voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
			voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
			voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

			if (input_channels_subblock_size > 2) {
				const float32x2_t vinput2 = vld1_f32(input2); input2 += 2;
				voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
				voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
				voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
				voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

				if (input_channels_subblock_size > 3) {
					const float32x2_t vinput3 = vld1_f32(input3); input3 += 2;
					voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
					voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
					voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
					voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));
				}
			}
		}

		vst1_f32(output0, voutput0); output0 += 2;
		if (output_channels_subblock_size > 1) {
			vst1_f32(output1, voutput1); output1 += 2;
			if (output_channels_subblock_size > 2) {
				vst1_f32(output2, voutput2); output2 += 2;
				if (output_channels_subblock_size > 3) {
					vst1_f32(output3, voutput3); output3 += 2;
				}
			}
		}

		image_size -= 2;
	}
	if (image_size != 0) {
		float32x2_t voutput0 = vld1_dup_f32(output0);
		float32x2_t voutput1 = vld1_dup_f32(output1);
		float32x2_t voutput2 = vld1_dup_f32(output2);
		float32x2_t voutput3 = vld1_dup_f32(output3);

		const float32x2_t vinput0 = vld1_dup_f32(input0);
		voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

		if (input_channels_subblock_size > 1) {
			const float32x2_t vinput1 = vld1_dup_f32(input1);
			voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
			voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
			voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
			voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

			if (input_channels_subblock_size > 2) {
				const float32x2_t vinput2 = vld1_dup_f32(input2);
				voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
				voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
				voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
				voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

				if (input_channels_subblock_size > 3) {
					const float32x2_t vinput3 = vld1_dup_f32(input3);
					voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
					voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
					voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
					voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));
				}
			}
		}

		vst1_lane_f32(output0, voutput0, 0);
		if (output_channels_subblock_size > 1) {
			vst1_lane_f32(output1, voutput1, 0);
			if (output_channels_subblock_size > 2) {
				vst1_lane_f32(output2, voutput2, 0);
				if (output_channels_subblock_size > 3) {
					vst1_lane_f32(output3, voutput3, 0);
				}
			}
		}
	}
}
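vmuladdq_lane0_f32 and friends are NNPACK helpers rather than standard intrinsics; plausible definitions in terms of the standard lane multiply-accumulate (a sketch; the real headers may dispatch to vfmaq_lane_f32 on FMA-capable targets) are:

#include <arm_neon.h>

static inline float32x4_t vmuladdq_lane0_f32(float32x4_t acc, float32x4_t a, float32x2_t b) {
	return vmlaq_lane_f32(acc, a, b, 0);
}
static inline float32x4_t vmuladdq_lane1_f32(float32x4_t acc, float32x4_t a, float32x2_t b) {
	return vmlaq_lane_f32(acc, a, b, 1);
}
static inline float32x2_t vmuladd_lane0_f32(float32x2_t acc, float32x2_t a, float32x2_t b) {
	return vmla_lane_f32(acc, a, b, 0);
}
static inline float32x2_t vmuladd_lane1_f32(float32x2_t acc, float32x2_t a, float32x2_t b) {
	return vmla_lane_f32(acc, a, b, 1);
}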
Example 22
void sEnv_process(HvBase *_c, SignalEnvelope *o, hv_bInf_t bIn,
		void (*sendMessage)(HvBase *, int, const HvMessage *)) {
#if HV_SIMD_AVX
  _mm256_stream_ps(o->buffer+o->numSamplesInBuffer, _mm256_mul_ps(bIn,bIn)); // store bIn^2, no need to cache block
  o->numSamplesInBuffer += HV_N_SIMD;

  if (o->numSamplesInBuffer >= o->windowSize) {
    int n4 = o->windowSize & ~HV_N_SIMD_MASK;
    __m256 sum = _mm256_setzero_ps();
    while (n4) {
      __m256 x = _mm256_load_ps(o->buffer + n4 - HV_N_SIMD);
      __m256 h = _mm256_load_ps(o->hanningWeights + n4 - HV_N_SIMD);
      x = _mm256_mul_ps(x, h);
      sum = _mm256_add_ps(sum, x);
      n4 -= HV_N_SIMD;
    }
    sum = _mm256_hadd_ps(sum,sum); // horizontal sum
    sum = _mm256_hadd_ps(sum,sum);
    sEnv_sendMessage(_c, o, sum[0]+sum[4], sendMessage); // updates numSamplesInBuffer
  }
#elif HV_SIMD_SSE
  _mm_stream_ps(o->buffer+o->numSamplesInBuffer, _mm_mul_ps(bIn,bIn)); // store bIn^2, no need to cache block
  o->numSamplesInBuffer += HV_N_SIMD;

  if (o->numSamplesInBuffer >= o->windowSize) {
    int n4 = o->windowSize & ~HV_N_SIMD_MASK;
    __m128 sum = _mm_setzero_ps();
    while (n4) {
      __m128 x = _mm_load_ps(o->buffer + n4 - HV_N_SIMD);
      __m128 h = _mm_load_ps(o->hanningWeights + n4 - HV_N_SIMD);
      x = _mm_mul_ps(x, h);
      sum = _mm_add_ps(sum, x);
      n4 -= HV_N_SIMD;
    }
    sum = _mm_hadd_ps(sum,sum); // horizontal sum
    sum = _mm_hadd_ps(sum,sum);
    sEnv_sendMessage(_c, o, sum[0], sendMessage);
  }
#elif HV_SIMD_NEON
  vst1q_f32(o->buffer+o->numSamplesInBuffer, vmulq_f32(bIn,bIn)); // store bIn^2, no need to cache block
  o->numSamplesInBuffer += HV_N_SIMD;

  if (o->numSamplesInBuffer >= o->windowSize) {
    int n4 = o->windowSize & ~HV_N_SIMD_MASK;
    float32x4_t sum = vdupq_n_f32(0.0f);
    while (n4) {
      float32x4_t x = vld1q_f32(o->buffer + n4 - HV_N_SIMD);
      float32x4_t h = vld1q_f32(o->hanningWeights + n4 - HV_N_SIMD);
      x = vmulq_f32(x, h);
      sum = vaddq_f32(sum, x);
      n4 -= HV_N_SIMD;
    }
    sEnv_sendMessage(_c, o, sum[0]+sum[1]+sum[2]+sum[3], sendMessage);
  }
#else // HV_SIMD_NONE
  o->buffer[o->numSamplesInBuffer] = (bIn*bIn);
  o->numSamplesInBuffer += HV_N_SIMD;

  if (o->numSamplesInBuffer >= o->windowSize) {
    float sum = 0.0f;
    for (int i = 0; i < o->windowSize; ++i) {
      sum += (o->hanningWeights[i] * o->buffer[i]);
    }
    sEnv_sendMessage(_c, o, sum, sendMessage);
  }
#endif
}
Example 23
void meanStdDev(const Size2D &size,
                const u16 * srcBase, ptrdiff_t srcStride,
                f32 * pMean, f32 * pStdDev)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    size_t blockSize0 = 1 << 10, roiw4 = size.width & ~3;
    f64 fsum = 0.0f, fsqsum = 0.0f;

    f32 arsum[8];
    uint32x4_t v_zero = vdupq_n_u32(0u), v_sum;
    float32x4_t v_zero_f = vdupq_n_f32(0.0f), v_sqsum;

    for (size_t i = 0; i < size.height; ++i)
    {
        const u16 * src = internal::getRowPtr(srcBase, srcStride, i);
        size_t j = 0u;

        while (j < roiw4)
        {
            size_t blockSize = std::min(roiw4 - j, blockSize0) + j;
            v_sum = v_zero;
            v_sqsum = v_zero_f;

            for ( ; j + 16 < blockSize ; j += 16)
            {
                internal::prefetch(src + j);
                uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8);

                // 0
                uint32x4_t v_srclo = vmovl_u16(vget_low_u16(v_src0));
                uint32x4_t v_srchi = vmovl_u16(vget_high_u16(v_src0));
                v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
                float32x4_t v_srclo_f = vcvtq_f32_u32(v_srclo);
                float32x4_t v_srchi_f = vcvtq_f32_u32(v_srchi);
                v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
                v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);

                // 1
                v_srclo = vmovl_u16(vget_low_u16(v_src1));
                v_srchi = vmovl_u16(vget_high_u16(v_src1));
                v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi));
                v_srclo_f = vcvtq_f32_u32(v_srclo);
                v_srchi_f = vcvtq_f32_u32(v_srchi);
                v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f);
                v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f);
            }

            for ( ; j < blockSize; j += 4)
            {
                uint32x4_t v_src = vmovl_u16(vld1_u16(src + j));
                float32x4_t v_src_f = vcvtq_f32_u32(v_src);
                v_sum = vaddq_u32(v_sum, v_src);
                v_sqsum = vmlaq_f32(v_sqsum, v_src_f, v_src_f);
            }

            vst1q_f32(arsum, vcvtq_f32_u32(v_sum));
            vst1q_f32(arsum + 4, v_sqsum);

            fsum += (f64)arsum[0] + arsum[1] + arsum[2] + arsum[3];
            fsqsum += (f64)arsum[4] + arsum[5] + arsum[6] + arsum[7];
        }

        // collect a few last elements in the current row
        for ( ; j < size.width; ++j)
        {
            f32 srcval = src[j];
            fsum += srcval;
            fsqsum += srcval * srcval;
        }
    }

    // calc mean and stddev
    f64 itotal = 1.0 / size.total();
    f64 mean = fsum * itotal;
    f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0));

    if (pMean)
        *pMean = (f32)mean;
    if (pStdDev)
        *pStdDev = (f32)stddev;
#else
    (void)size;
    (void)srcBase;
    (void)srcStride;
    (void)pMean;
    (void)pStdDev;
#endif
}
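For reference, the NEON path above evaluates the single-pass identity stddev = sqrt(max(E[x^2] - mean^2, 0)), accumulating block sums in vector registers before flushing to f64. A scalar sketch of the same computation (meanStdDevRef is a hypothetical name, not part of Carotene):

static void meanStdDevRef(const u16 * src, size_t n, f32 * pMean, f32 * pStdDev)
{
    f64 sum = 0.0, sqsum = 0.0;
    for (size_t i = 0; i < n; ++i)
    {
        f64 v = src[i];
        sum += v;
        sqsum += v * v;
    }
    f64 mean = sum / n;
    f64 stddev = sqrt(std::max(sqsum / n - mean * mean, 0.0));
    if (pMean)
        *pMean = (f32)mean;
    if (pStdDev)
        *pStdDev = (f32)stddev;
}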
void AudioBlockPanStereoToStereo_NEON(
    const float aInputL[WEBAUDIO_BLOCK_SIZE],
    const float aInputR[WEBAUDIO_BLOCK_SIZE], float aGainL[WEBAUDIO_BLOCK_SIZE],
    float aGainR[WEBAUDIO_BLOCK_SIZE],
    const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE],
    float aOutputL[WEBAUDIO_BLOCK_SIZE], float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
  ASSERT_ALIGNED(aInputL);
  ASSERT_ALIGNED(aInputR);
  ASSERT_ALIGNED(aGainL);
  ASSERT_ALIGNED(aGainR);
  ASSERT_ALIGNED(aIsOnTheLeft);
  ASSERT_ALIGNED(aOutputL);
  ASSERT_ALIGNED(aOutputR);

  float32x4_t vinL0, vinL1;
  float32x4_t vinR0, vinR1;
  float32x4_t voutL0, voutL1;
  float32x4_t voutR0, voutR1;
  float32x4_t vscaleL0, vscaleL1;
  float32x4_t vscaleR0, vscaleR1;
  float32x4_t onleft0, onleft1, notonleft0, notonleft1;

  float32x4_t zero = vmovq_n_f32(0);
  uint8x8_t isOnTheLeft;

  // MSVC warns that voutL0 and voutL1 may be used uninitialized. Every lane is
  // overwritten by vsetq_lane_f32 below, so the warning is spurious, but we
  // zero-initialize both registers to silence it.
  voutL0 = zero;
  voutL1 = zero;

  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE; i += 8) {
    vinL0 = vld1q_f32(ADDRESS_OF(aInputL, i));
    vinL1 = vld1q_f32(ADDRESS_OF(aInputL, i + 4));

    vinR0 = vld1q_f32(ADDRESS_OF(aInputR, i));
    vinR1 = vld1q_f32(ADDRESS_OF(aInputR, i + 4));

    vscaleL0 = vld1q_f32(ADDRESS_OF(aGainL, i));
    vscaleL1 = vld1q_f32(ADDRESS_OF(aGainL, i + 4));

    vscaleR0 = vld1q_f32(ADDRESS_OF(aGainR, i));
    vscaleR1 = vld1q_f32(ADDRESS_OF(aGainR, i + 4));

    // Load the boolean "on the left" flags into the output registers. This
    // assumes each bool is stored as a single byte.
    isOnTheLeft = vld1_u8((uint8_t*)&aIsOnTheLeft[i]);
    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 0), voutL0, 0);
    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 1), voutL0, 1);
    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 2), voutL0, 2);
    voutL0 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 3), voutL0, 3);
    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 4), voutL1, 0);
    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 5), voutL1, 1);
    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 6), voutL1, 2);
    voutL1 = vsetq_lane_f32(vget_lane_u8(isOnTheLeft, 7), voutL1, 3);

    // Convert the boolean values into masks by setting all bits to 1
    // if true.
    voutL0 = (float32x4_t)vcgtq_f32(voutL0, zero);
    voutL1 = (float32x4_t)vcgtq_f32(voutL1, zero);

    // The right output masks are the same as the left masks
    voutR0 = voutL0;
    voutR1 = voutL1;

    // Calculate left channel assuming isOnTheLeft
    onleft0 = vmlaq_f32(vinL0, vinR0, vscaleL0);
    onleft1 = vmlaq_f32(vinL1, vinR1, vscaleL1);

    // Calculate left channel assuming not isOnTheLeft
    notonleft0 = vmulq_f32(vinL0, vscaleL0);
    notonleft1 = vmulq_f32(vinL1, vscaleL1);

    // Write results using previously stored masks
    voutL0 = vbslq_f32((uint32x4_t)voutL0, onleft0, notonleft0);
    voutL1 = vbslq_f32((uint32x4_t)voutL1, onleft1, notonleft1);

    // Calculate right channel assuming isOnTheLeft
    onleft0 = vmulq_f32(vinR0, vscaleR0);
    onleft1 = vmulq_f32(vinR1, vscaleR1);

    // Calculate right channel assuming not isOnTheLeft
    notonleft0 = vmlaq_f32(vinR0, vinL0, vscaleR0);
    notonleft1 = vmlaq_f32(vinR1, vinL1, vscaleR1);

    // Write results using previously stored masks
    voutR0 = vbslq_f32((uint32x4_t)voutR0, onleft0, notonleft0);
    voutR1 = vbslq_f32((uint32x4_t)voutR1, onleft1, notonleft1);

    vst1q_f32(ADDRESS_OF(aOutputL, i), voutL0);
    vst1q_f32(ADDRESS_OF(aOutputL, i + 4), voutL1);
    vst1q_f32(ADDRESS_OF(aOutputR, i), voutR0);
    vst1q_f32(ADDRESS_OF(aOutputR, i + 4), voutR1);
  }
}
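The bool-to-mask trick above (widen each byte-sized bool into a float lane, compare against zero, then select with vbslq_f32) can also be expressed with integer widening. A sketch, assuming sizeof(bool) == 1 and that 8 bytes are readable at flags (select4_f32 is a hypothetical helper, not part of the original source):

static inline float32x4_t select4_f32(const bool* flags, float32x4_t a,
                                      float32x4_t b) {
  uint8x8_t f8 = vld1_u8((const uint8_t*)flags);         // 8 bool bytes, 4 used
  uint32x4_t f = vmovl_u16(vget_low_u16(vmovl_u8(f8)));  // widen bytes 0..3 to u32
  uint32x4_t mask = vtstq_u32(f, f);                     // lane != 0 -> all ones
  return vbslq_f32(mask, a, b);                          // per lane: flag ? a : b
}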
Example 25
// use ARM Neon extensions
//
// (a + jb)(c + jd) = (ac - bd) + j(ad + bc)
//
// mm_x  = { x[0].real, x[0].imag, x[1].real, x[1].imag }
// mm_hi = { h[0].real, h[0].real, h[1].real, h[1].real }
// mm_hq = { h[0].imag, h[0].imag, h[1].imag, h[1].imag }
//
// mm_y0 = mm_x * mm_hi
//       = { x[0].real * h[0].real,
//           x[0].imag * h[0].real,
//           x[1].real * h[1].real,
//           x[1].imag * h[1].real };
//
// mm_y1 = mm_x * mm_hq
//       = { x[0].real * h[0].imag,
//           x[0].imag * h[0].imag,
//           x[1].real * h[1].imag,
//           x[1].imag * h[1].imag };
//
void dotprod_cccf_execute_neon(dotprod_cccf    _q,
                               float complex * _x,
                               float complex * _y)
{
    // type cast input as floating point array
    float * x = (float*) _x;

    // double effective length
    unsigned int n = 2*_q->n;

    // temporary buffers
    float32x4_t v;   // input vector
    float32x4_t hi;  // coefficients vector (real)
    float32x4_t hq;  // coefficients vector (imag)
    float32x4_t ci;  // output multiplication (v * hi)
    float32x4_t cq;  // output multiplication (v * hq)

    // output accumulators
    float32x4_t sumi = vdupq_n_f32(0.0f);
    float32x4_t sumq = vdupq_n_f32(0.0f);

    // t = 4*floor(n/4) : number of floats handled by the vector loop
    unsigned int t = (n >> 2) << 2;

    unsigned int i;
    for (i=0; i<t; i+=4) {
        // load inputs into register (unaligned)
        // {x[0].real, x[0].imag, x[1].real, x[1].imag}
        v = vld1q_f32(&x[i]);

        // load coefficients into register (aligned)
        // {hi[0].real, hi[0].imag, hi[1].real, hi[1].imag}
        // {hq[0].real, hq[0].imag, hq[1].real, hq[1].imag}
        hi = vld1q_f32(&_q->hi[i]);
        hq = vld1q_f32(&_q->hq[i]);

        // compute parallel multiplications
        ci = vmulq_f32(v, hi);
        cq = vmulq_f32(v, hq);

        // parallel addition
        sumi = vaddq_f32(sumi, ci);
        sumq = vaddq_f32(sumq, cq);
    }

    // unload and combine
    float wi[4];
    float wq[4];
    vst1q_f32(wi, sumi);
    vst1q_f32(wq, sumq);

    // fold down (add/sub)
    float complex total = 
        ((wi[0] - wq[1]) + (wi[2] - wq[3])) +
        ((wi[1] + wq[0]) + (wi[3] + wq[2])) * _Complex_I;

    // cleanup: fold in any remaining complex samples
    // (t floats = t/2 complex samples already processed)
    for (i=t/2; i<_q->n; i++)
        total += _x[i] * ( _q->hi[2*i] + _q->hq[2*i]*_Complex_I );

    // set return value
    *_y = total;
}
Example 26
int LRN_arm::forward_inplace(Mat& bottom_top_blob) const
{
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    // squared values with local_size padding
    Mat square_blob;
    square_blob.create(w, h, channels);
    if (square_blob.empty())
        return -100;

    #pragma omp parallel for
    for (int q=0; q<channels; q++)
    {
        const float* ptr = bottom_top_blob.channel(q);
        float* outptr = square_blob.channel(q);

#if __ARM_NEON
        int nn = size >> 2;
        int remain = size - (nn << 2);
#else
        int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
        for (; nn>0; nn--)
        {
            float32x4_t _p = vld1q_f32(ptr);
            float32x4_t _outp = vmulq_f32(_p, _p);
            vst1q_f32(outptr, _outp);

            ptr += 4;
            outptr += 4;
        }
#endif // __ARM_NEON
        for (; remain>0; remain--)
        {
            *outptr = *ptr * *ptr;

            ptr++;
            outptr++;
        }
    }

    if (region_type == NormRegion_ACROSS_CHANNELS)
    {
        Mat square_sum;
        square_sum.create(w, h, channels);
        if (square_sum.empty())
            return -100;
        square_sum.fill(0.f);

        const float alpha_div_size = alpha / local_size;

        #pragma omp parallel for
        for (int q=0; q<channels; q++)
        {
            // square sum
            for (int p=q - local_size / 2; p<=q + local_size / 2; p++)
            {
                if (p < 0 || p >= channels)
                    continue;

                const float* sptr = square_blob.channel(p);
                float* ssptr = square_sum.channel(q);

#if __ARM_NEON
                int nn = size >> 2;
                int remain = size - (nn << 2);
#else
                int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
                for (; nn>0; nn--)
                {
                    float32x4_t _sp = vld1q_f32(sptr);
                    float32x4_t _ssp = vld1q_f32(ssptr);
                    _ssp = vaddq_f32(_ssp, _sp);
                    vst1q_f32(ssptr, _ssp);

                    sptr += 4;
                    ssptr += 4;
                }
#endif // __ARM_NEON
                for (; remain>0; remain--)
                {
                    *ssptr += *sptr;
                    sptr++;
                    ssptr++;
                }
            }

            float* ptr = bottom_top_blob.channel(q);
            float* ssptr = square_sum.channel(q);

#if __ARM_NEON
            int nn = size >> 2;
            int remain = size - (nn << 2);
#else
            int remain = size;
#endif // __ARM_NEON

#if __ARM_NEON
            float32x4_t _bias = vdupq_n_f32(bias);
            float32x4_t _ads = vdupq_n_f32(alpha_div_size);
            float32x4_t _mb = vdupq_n_f32(-beta);
            for (; nn>0; nn--)
            {
                float32x4_t _p = vld1q_f32(ptr);
                float32x4_t _ssp = vld1q_f32(ssptr);
                _ssp = vmulq_f32(_ssp, _ads);
                _ssp = vaddq_f32(_ssp, _bias);
                _ssp = pow_ps(_ssp, _mb);
                _p = vmulq_f32(_p, _ssp);
                vst1q_f32(ptr, _p);

                ssptr += 4;
                ptr += 4;
            }
#endif // __ARM_NEON
            for (; remain>0; remain--)
            {
                *ptr = *ptr * pow(bias + alpha_div_size * *ssptr, -beta);

                ssptr++;
                ptr++;
            }
        }
    }

    // Only the NormRegion_ACROSS_CHANNELS path appears in this excerpt; the
    // WITHIN_CHANNEL region type is handled by a separate branch in the full
    // source.
    return 0;
}
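Per element, the ACROSS_CHANNELS branch computes y[q] = x[q] * pow(bias + (alpha/local_size) * sum of x[p]^2, -beta), with the sum taken over the local_size channels centred on q. A per-pixel scalar sketch (x and y index one spatial position across channels; lrn_across_channels_ref is a hypothetical name):

static void lrn_across_channels_ref(const float* x, float* y, int channels,
                                    int local_size, float bias,
                                    float alpha_div_size, float beta)
{
    for (int q = 0; q < channels; q++)
    {
        float square_sum = 0.f;
        for (int p = q - local_size / 2; p <= q + local_size / 2; p++)
        {
            if (p < 0 || p >= channels)
                continue;
            square_sum += x[p] * x[p];
        }
        y[q] = x[q] * powf(bias + alpha_div_size * square_sum, -beta);
    }
}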
Example 27
// Converts a 48-pixel window (4 rows of 12 bytes, with rows 2*pitch apart)
// from u8 to f32.
void byte2float48_neon(const uint8_t *t, const int pitch, float *p) {
    uint16x8_t m0, m1, m2, m3, m4, m5;
    // Zero-initialize: loading a single lane of an uninitialized vector with
    // vld1_lane_u32 is technically undefined and trips compiler warnings.
    uint32x2_t temp1 = vdup_n_u32(0), temp4 = vdup_n_u32(0);

    m0 = vmovl_u8(vld1_u8(t));
    temp1 = vld1_lane_u32((const uint32_t *)(t + 8), temp1, 0);
    temp1 = vld1_lane_u32((const uint32_t *)(t + pitch * 2), temp1, 1);
    m1 = vmovl_u8(vreinterpret_u8_u32(temp1));
    m2 = vmovl_u8(vld1_u8(t + pitch * 2 + 4));

    t += pitch * 4;

    m3 = vmovl_u8(vld1_u8(t));
    temp4 = vld1_lane_u32((const uint32_t *)(t + 8), temp4, 0);
    temp4 = vld1_lane_u32((const uint32_t *)(t + pitch * 2), temp4, 1);
    m4 = vmovl_u8(vreinterpret_u8_u32(temp4));
    m5 = vmovl_u8(vld1_u8(t + pitch * 2 + 4));

    vst1q_f32(p, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m0))));
    vst1q_f32(p + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m0))));
    vst1q_f32(p + 8, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m1))));
    vst1q_f32(p + 12, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m1))));
    vst1q_f32(p + 16, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m2))));
    vst1q_f32(p + 20, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m2))));
    vst1q_f32(p + 24, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m3))));
    vst1q_f32(p + 28, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m3))));
    vst1q_f32(p + 32, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m4))));
    vst1q_f32(p + 36, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m4))));
    vst1q_f32(p + 40, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m5))));
    vst1q_f32(p + 44, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m5))));
}
Example 28
void phase(const Size2D &size,
           const f32 * src0Base, ptrdiff_t src0Stride,
           const f32 * src1Base, ptrdiff_t src1Stride,
           f32 * dstBase, ptrdiff_t dstStride,
           f32 scale)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    FASTATAN2CONST(scale)
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    for (size_t i = 0; i < size.height; ++i)
    {
        const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        f32 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        for (; j < roiw8; j += 8)
        {
            internal::prefetch(src0 + j);
            internal::prefetch(src1 + j);

            float32x4_t v_src00 = vld1q_f32(src0 + j), v_src01 = vld1q_f32(src0 + j + 4);
            float32x4_t v_src10 = vld1q_f32(src1 + j), v_src11 = vld1q_f32(src1 + j + 4);

            float32x4_t v_dst32f;
            // 0
            FASTATAN2VECTOR(v_src10, v_src00, v_dst32f)
            vst1q_f32(dst + j,     v_dst32f);
            // 1
            FASTATAN2VECTOR(v_src11, v_src01, v_dst32f)
            vst1q_f32(dst + j + 4, v_dst32f);
        }
        if(j + 4 <= size.width)
        {
            float32x4_t v_src0 = vld1q_f32(src0 + j);
            float32x4_t v_src1 = vld1q_f32(src1 + j);

            float32x4_t v_dst32f;
            FASTATAN2VECTOR(v_src1, v_src0, v_dst32f)
            vst1q_f32(dst + j, v_dst32f);
            j += 4;
        }

        for (; j < size.width; j++)
        {
            f32 a;
            FASTATAN2SCALAR(src1[j], src0[j], a)
            dst[j] = a;
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
    (void)scale;
#endif
}
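The row loop illustrates the tail-handling pattern used throughout these kernels: an 8-wide main loop bounded by roiw8 = width - 7, at most one extra 4-wide step, then a scalar tail. The same skeleton with a trivial copy kernel (copyRow is a hypothetical example, not part of Carotene):

static void copyRow(const f32 * src, f32 * dst, size_t width)
{
    size_t roiw8 = width >= 7 ? width - 7 : 0;
    size_t j = 0;
    for (; j < roiw8; j += 8)       // main loop: 8 elements per iteration
    {
        vst1q_f32(dst + j,     vld1q_f32(src + j));
        vst1q_f32(dst + j + 4, vld1q_f32(src + j + 4));
    }
    if (j + 4 <= width)             // at most one 4-wide step
    {
        vst1q_f32(dst + j, vld1q_f32(src + j));
        j += 4;
    }
    for (; j < width; j++)          // scalar tail
        dst[j] = src[j];
}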
Example 29
void nnp_conv1x1_only_4x4__neon(
	size_t input_channels,
	size_t image_size,
	const float* input,
	const float* kernel,
	float* output)
{
	const float* input0 = input;
	const float* input1 = input0 + image_size;
	const float* input2 = input1 + image_size;
	const float* input3 = input2 + image_size;

	const float32x4_t vkernel0x = vld1q_f32(kernel);
	kernel += input_channels;
	const float32x4_t vkernel1x = vld1q_f32(kernel);
	kernel += input_channels;
	const float32x4_t vkernel2x = vld1q_f32(kernel);
	kernel += input_channels;
	const float32x4_t vkernel3x = vld1q_f32(kernel);

	float* output0 = output;
	float* output1 = output0 + image_size;
	float* output2 = output1 + image_size;
	float* output3 = output2 + image_size;
	while (image_size >= 4) {
		float32x4_t voutput0 = vld1q_f32(output0);
		float32x4_t voutput1 = vld1q_f32(output1);
		float32x4_t voutput2 = vld1q_f32(output2);
		float32x4_t voutput3 = vld1q_f32(output3);

		const float32x4_t vinput0 = vld1q_f32(input0); input0 += 4;
		voutput0 = vmuladdq_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladdq_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladdq_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladdq_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

		const float32x4_t vinput1 = vld1q_f32(input1); input1 += 4;
		voutput0 = vmuladdq_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
		voutput1 = vmuladdq_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
		voutput2 = vmuladdq_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
		voutput3 = vmuladdq_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

		const float32x4_t vinput2 = vld1q_f32(input2); input2 += 4;
		voutput0 = vmuladdq_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
		voutput1 = vmuladdq_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
		voutput2 = vmuladdq_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
		voutput3 = vmuladdq_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

		const float32x4_t vinput3 = vld1q_f32(input3); input3 += 4;
		voutput0 = vmuladdq_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
		voutput1 = vmuladdq_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
		voutput2 = vmuladdq_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
		voutput3 = vmuladdq_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));

		vst1q_f32(output0, voutput0); output0 += 4;
		vst1q_f32(output1, voutput1); output1 += 4;
		vst1q_f32(output2, voutput2); output2 += 4;
		vst1q_f32(output3, voutput3); output3 += 4;

		image_size -= 4;
	}
	if (image_size >= 2) {
		float32x2_t voutput0 = vld1_f32(output0);
		float32x2_t voutput1 = vld1_f32(output1);
		float32x2_t voutput2 = vld1_f32(output2);
		float32x2_t voutput3 = vld1_f32(output3);

		const float32x2_t vinput0 = vld1_f32(input0); input0 += 2;
		voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

		const float32x2_t vinput1 = vld1_f32(input1); input1 += 2;
		voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

		const float32x2_t vinput2 = vld1_f32(input2); input2 += 2;
		voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

		const float32x2_t vinput3 = vld1_f32(input3); input3 += 2;
		voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
		voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
		voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
		voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));

		vst1_f32(output0, voutput0); output0 += 2;
		vst1_f32(output1, voutput1); output1 += 2;
		vst1_f32(output2, voutput2); output2 += 2;
		vst1_f32(output3, voutput3); output3 += 2;

		image_size -= 2;
	}
	if (image_size != 0) {
		float32x2_t voutput0 = vld1_dup_f32(output0);
		float32x2_t voutput1 = vld1_dup_f32(output1);
		float32x2_t voutput2 = vld1_dup_f32(output2);
		float32x2_t voutput3 = vld1_dup_f32(output3);

		const float32x2_t vinput0 = vld1_dup_f32(input0);
		voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

		const float32x2_t vinput1 = vld1_dup_f32(input1);
		voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

		const float32x2_t vinput2 = vld1_dup_f32(input2);
		voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

		const float32x2_t vinput3 = vld1_dup_f32(input3);
		voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
		voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
		voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
		voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));

		vst1_lane_f32(output0, voutput0, 0);
		vst1_lane_f32(output1, voutput1, 0);
		vst1_lane_f32(output2, voutput2, 0);
		vst1_lane_f32(output3, voutput3, 0);
	}
}
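The kernel rows are loaded with stride input_channels, so vkernelNx holds the four input-channel weights for output channel N, and each vmuladdq_laneK_f32 broadcasts one channel's weight across the vector. A scalar reference for the same accumulation (conv1x1_4x4_ref is a hypothetical name; like the NEON version, it accumulates into output):

static void conv1x1_4x4_ref(
	size_t input_channels,
	size_t image_size,
	const float* input,
	const float* kernel,
	float* output)
{
	for (size_t o = 0; o < 4; o++) {
		for (size_t c = 0; c < 4; c++) {
			const float w = kernel[o * input_channels + c];
			for (size_t p = 0; p < image_size; p++) {
				// read-modify-write, matching the vld1q/vmuladd/vst1q sequence
				output[o * image_size + p] += w * input[c * image_size + p];
			}
		}
	}
}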
 static forcedinline void storeU (Type* dest, ParallelType a) noexcept           { vst1q_f32 (dest, a); }