Beispiel #1
0
f64 dotProduct(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    // When both sources are dense (row stride equals one row of f32),
    // collapse the 2D region into a single long row so the inner loops
    // run exactly once.
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width * sizeof(f32)))
    {
        size.width *= size.height;
        size.height = 1;
    }

// Fold the f32 SIMD accumulator into the f64 sum at least every 2^13
// lanes to bound single-precision rounding error.
#define DOT_FLOAT_BLOCKSIZE (1 << 13)
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t col = 0;
        // Main vector loop: four products per iteration, accumulated in f32
        // block-wise, then reduced into the f64 result.
        while (col + 4 <= size.width)
        {
            size_t blockLast = std::min(col + DOT_FLOAT_BLOCKSIZE, size.width) - 4;
            float32x4_t v_sum = vdupq_n_f32(0.0f);

            for ( ; col <= blockLast; col += 4)
            {
                internal::prefetch(src0 + col);
                internal::prefetch(src1 + col);
                v_sum = vmlaq_f32(v_sum, vld1q_f32(src0 + col), vld1q_f32(src1 + col));
            }

            // Horizontal add of the four accumulator lanes.
            float32x2_t v_pair = vpadd_f32(vget_low_f32(v_sum), vget_high_f32(v_sum));
            result += vget_lane_f32(v_pair, 0) + vget_lane_f32(v_pair, 1);
        }

        // Two-element tail, if present.
        if (col + 2 <= size.width)
        {
            float32x2_t v_pair = vmul_f32(vld1_f32(src0 + col), vld1_f32(src1 + col));
            result += vget_lane_f32(v_pair, 0) + vget_lane_f32(v_pair, 1);
            col += 2;
        }

        // Scalar tail: at most one element remains per row.
        for ( ; col < size.width; ++col)
            result += src0[col] * src1[col];
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
/*
 * Scale FFT data in place by 2 / |length|. |length| must be a power of two.
 * NOTE(review): the header comment previously said 1/|length|, but the code
 * applies 2.0f / length — documented here as the code behaves.
 */
static inline void ScaleRFFTData(OMX_F32* fftData, unsigned length) {
  float32_t* data = (float32_t*)fftData;
  const float32_t scale = 2.0f / length;

  if (length >= 4) {
    /*
     * Do 4 float elements at a time because |length| is always a
     * multiple of 4 when |length| >= 4.
     *
     * TODO(rtoy): Figure out how to process 8 elements at a time
     * using intrinsics or replace this with inline assembly.
     */
    do {
      float32x4_t x = vld1q_f32(data);

      length -= 4;
      x = vmulq_n_f32(x, scale);
      vst1q_f32(data, x);
      data += 4;
    } while (length > 0);
  } else if (length == 2) {
    float32x2_t x = vld1_f32(data);
    x = vmul_n_f32(x, scale);
    vst1_f32(data, x);
  } else {
    /* length == 1: scale the single element directly. */
    fftData[0] *= scale;
  }
}
Beispiel #3
0
/* Add one block of interleaved stereo samples (src_leftright) into two
 * planar output channel buffers: outputs[oofs] (left) and
 * outputs[oofs + 1] (right). Processes two frames per iteration. */
static inline void mix_block_into(cbox_sample_t **outputs, int oofs, float *src_leftright)
{
    float *out_l = outputs[oofs];
    float *out_r = outputs[oofs + 1];
    for (size_t pos = 0; pos < CBOX_BLOCK_SIZE; pos += 2)
    {
        /* Load two L/R frames and de-interleave them with a 2x2 transpose:
         * val[0] = {L0, L1}, val[1] = {R0, R1}. */
        float32x2_t frame0 = vld1_f32(src_leftright + 2 * pos);
        float32x2_t frame1 = vld1_f32(src_leftright + 2 * pos + 2);
        float32x2x2_t planar = vtrn_f32(frame0, frame1);

        /* Accumulate into the existing channel contents. */
        vst1_f32(out_l + pos, vadd_f32(vld1_f32(out_l + pos), planar.val[0]));
        vst1_f32(out_r + pos, vadd_f32(vld1_f32(out_r + pos), planar.val[1]));
    }
}
Beispiel #4
0
/* Checks lane extraction (via the wrap_vdups_lane_f32_* helpers) against the
   source array: each extracted scalar must equal the corresponding element
   of c. Returns 0 on success, 1 on mismatch.
   Fix: the definition had no return type (implicit int is invalid in
   C99/C++); it returns 0/1, so the type is int. */
int
test_vdups_lane_f32 ()
{
  float32x2_t a;
  float32_t b;
  float32_t c[2] = { 0.0, 1.0 };

  a = vld1_f32 (c);
  /* NOTE(review): wrap_vdups_lane_f32_0 takes an extra vector argument
     (presumably to perturb register allocation) — confirm its prototype. */
  b = wrap_vdups_lane_f32_0 (a, a);
  if (c[0] != b)
    return 1;
  b = wrap_vdups_lane_f32_1 (a);
  if (c[1] != b)
    return 1;
  return 0;
}
Beispiel #5
0
/* Checks vsqrt_f32 on a two-lane vector: sqrt({4, 9}) must be exactly
   {2, 3}; aborts on any mismatch. */
void
test_square_root_v2sf ()
{
  const float32_t pool[] = {4.0f, 9.0f};

  float32x2_t res = vsqrt_f32 (vld1_f32 (pool));

  if (vget_lane_f32 (res, 0) != 2.0f
      || vget_lane_f32 (res, 1) != 3.0f)
    abort ();
}
Beispiel #6
0
/* Verifies test_vmulx_lane0_f32 / test_vmulx_lane1_f32 on inputs v1, v2
   against the expected vectors e1 and e2; aborts on any mismatch. */
void
test_case (float32_t v1[2], float32_t v2[2], float32_t e1[2], float32_t e2[2])
{
  float32_t got[2];
  int lane;

  float32x2_t in1 = vld1_f32 (v1);
  float32x2_t in2 = vld1_f32 (v2);

  /* Lane-0 multiply-extend. */
  vst1_f32 (got, test_vmulx_lane0_f32 (in1, in2));
  for (lane = 0; lane < 2; ++lane)
    if (got[lane] != e1[lane])
      abort ();

  /* Lane-1 multiply-extend. */
  vst1_f32 (got, test_vmulx_lane1_f32 (in1, in2));
  for (lane = 0; lane < 2; ++lane)
    if (got[lane] != e2[lane])
      abort ();
}
Beispiel #7
0
/* Reinterprets two f32 values whose combined bit pattern is
   0x400921FB54442D18 (pi as IEEE-754 double) into one f64 and compares it
   with PI_F64. Returns 0 on success, 1 on mismatch.
   Fixes: added the missing return type (implicit int is invalid in
   C99/C++), removed the unused local `i`, and dropped the stray `;`
   after the function body. */
int
test_vreinterpret_f64_f32 ()
{
  float32x2_t a;
  float64x1_t b;
  /* Values { 0x54442D18, 0x400921FB } reinterpreted as f32.  */
  float32_t c[2] = { 3.3702805504E12, 2.1426990032196044921875E0 };
  float64_t d[1] = { PI_F64 };
  float64_t e[1];

  a = vld1_f32 (c);
  b = wrap_vreinterpret_f64_f32 (a);
  vst1_f64 (e, b);
  if (!DOUBLE_EQUALS (d[0], e[0], __DBL_EPSILON__))
    return 1;
  return 0;
}
// Applies the virtualizer effect to sizeInFrames interleaved stereo int16
// frames in |buffer|.  State is exchanged with the helper routines through
// file-scope globals (effectsTemp, equalizerSamples, effectsGainClip, ...).
void processVirtualizerNeon(int16_t* buffer, uint32_t sizeInFrames) {
	// Count down frames toward the gain-recovery point (the counter is
	// presumably consumed by footerNeon() — confirm against its definition).
	effectsFramesBeforeRecoveringGain -= sizeInFrames;

	// NOTE(review): gainClip, maxAbsSample, samples and inLR appear unused in
	// this body; they are presumably referenced by virtualizerNeon() /
	// floatToShortNeon() if those are macros rather than functions — confirm
	// against their definitions before removing anything.
	float32x2_t gainClip = vld1_f32(effectsGainClip);
	float32x2_t maxAbsSample = vdup_n_f32(0.0f);

	while ((sizeInFrames--)) {
		float *samples = equalizerSamples;

		// Widen the stereo int16 frame to int32 in effectsTemp, then
		// convert both channels to f32 in one vector op.
		effectsTemp[0] = (int32_t)buffer[0];
		effectsTemp[1] = (int32_t)buffer[1];
		//inLR = { L, R }
		float32x2_t inLR = vcvt_f32_s32(*((int32x2_t*)effectsTemp));

		virtualizerNeon();

		floatToShortNeon();
	}

	footerNeon();
}
Beispiel #9
0
/* Checks vdupq_lane_f32 (via the wrap_* helpers): broadcasting lane 0 or
   lane 1 of {0.0, 3.14} into a 4-lane vector must replicate that lane into
   every element.  Returns 0 on success, 1 on mismatch.
   Fix: added the missing return type (implicit int is invalid in
   C99/C++); the function returns 0/1, so the type is int. */
int
test_vdupq_lane_f32 ()
{
  float32x2_t a;
  float32x4_t b;
  int i;
  float32_t c[2] = { 0.0 , 3.14 };
  float32_t d[4];

  a = vld1_f32 (c);
  b = wrap_vdupq_lane_f32_0 (a);
  vst1q_f32 (d, b);
  for (i = 0; i < 4; i++)
    if (c[0] != d[i])
      return 1;

  b = wrap_vdupq_lane_f32_1 (a);
  vst1q_f32 (d, b);
  for (i = 0; i < 4; i++)
    if (c[1] != d[i])
      return 1;
  return 0;
}
Beispiel #10
0
/*
 * 1x1-convolution NEON micro-kernel for a full 4x4 channel block:
 * accumulates 4 input channel planes into 4 output channel planes, each
 * plane holding image_size floats.  kernel rows are input_channels apart;
 * only the first 4 entries of each row are used here.
 * Output is read-modify-write: results are added to existing output values.
 */
void nnp_conv1x1_only_4x4__neon(
	size_t input_channels,
	size_t image_size,
	const float* input,
	const float* kernel,
	float* output)
{
	/* Four consecutive input channel planes. */
	const float* input0 = input;
	const float* input1 = input0 + image_size;
	const float* input2 = input1 + image_size;
	const float* input3 = input2 + image_size;

	/* One kernel row per output channel; lanes 0..3 of each vector hold the
	 * weights for input channels 0..3. */
	const float32x4_t vkernel0x = vld1q_f32(kernel);
	kernel += input_channels;
	const float32x4_t vkernel1x = vld1q_f32(kernel);
	kernel += input_channels;
	const float32x4_t vkernel2x = vld1q_f32(kernel);
	kernel += input_channels;
	const float32x4_t vkernel3x = vld1q_f32(kernel);

	/* Four consecutive output channel planes. */
	float* output0 = output;
	float* output1 = output0 + image_size;
	float* output2 = output1 + image_size;
	float* output3 = output2 + image_size;
	/* Main loop: 4 pixels per iteration.  For each input channel plane,
	 * multiply-accumulate the corresponding kernel lane into all four
	 * output accumulators (vmuladdq_lane*_f32 are project helpers). */
	while (image_size >= 4) {
		float32x4_t voutput0 = vld1q_f32(output0);
		float32x4_t voutput1 = vld1q_f32(output1);
		float32x4_t voutput2 = vld1q_f32(output2);
		float32x4_t voutput3 = vld1q_f32(output3);

		const float32x4_t vinput0 = vld1q_f32(input0); input0 += 4;
		voutput0 = vmuladdq_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladdq_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladdq_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladdq_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

		const float32x4_t vinput1 = vld1q_f32(input1); input1 += 4;
		voutput0 = vmuladdq_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
		voutput1 = vmuladdq_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
		voutput2 = vmuladdq_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
		voutput3 = vmuladdq_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

		const float32x4_t vinput2 = vld1q_f32(input2); input2 += 4;
		voutput0 = vmuladdq_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
		voutput1 = vmuladdq_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
		voutput2 = vmuladdq_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
		voutput3 = vmuladdq_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

		const float32x4_t vinput3 = vld1q_f32(input3); input3 += 4;
		voutput0 = vmuladdq_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
		voutput1 = vmuladdq_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
		voutput2 = vmuladdq_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
		voutput3 = vmuladdq_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));

		vst1q_f32(output0, voutput0); output0 += 4;
		vst1q_f32(output1, voutput1); output1 += 4;
		vst1q_f32(output2, voutput2); output2 += 4;
		vst1q_f32(output3, voutput3); output3 += 4;

		image_size -= 4;
	}
	/* 2-pixel tail: same pattern with 64-bit (2-lane) vectors. */
	if (image_size >= 2) {
		float32x2_t voutput0 = vld1_f32(output0);
		float32x2_t voutput1 = vld1_f32(output1);
		float32x2_t voutput2 = vld1_f32(output2);
		float32x2_t voutput3 = vld1_f32(output3);

		const float32x2_t vinput0 = vld1_f32(input0); input0 += 2;
		voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

		const float32x2_t vinput1 = vld1_f32(input1); input1 += 2;
		voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

		const float32x2_t vinput2 = vld1_f32(input2); input2 += 2;
		voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

		const float32x2_t vinput3 = vld1_f32(input3); input3 += 2;
		voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
		voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
		voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
		voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));

		vst1_f32(output0, voutput0); output0 += 2;
		vst1_f32(output1, voutput1); output1 += 2;
		vst1_f32(output2, voutput2); output2 += 2;
		vst1_f32(output3, voutput3); output3 += 2;

		image_size -= 2;
	}
	/* Single-pixel tail: scalar values loaded/stored through lane 0. */
	if (image_size != 0) {
		float32x2_t voutput0 = vld1_dup_f32(output0);
		float32x2_t voutput1 = vld1_dup_f32(output1);
		float32x2_t voutput2 = vld1_dup_f32(output2);
		float32x2_t voutput3 = vld1_dup_f32(output3);

		const float32x2_t vinput0 = vld1_dup_f32(input0);
		voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

		const float32x2_t vinput1 = vld1_dup_f32(input1);
		voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

		const float32x2_t vinput2 = vld1_dup_f32(input2);
		voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

		const float32x2_t vinput3 = vld1_dup_f32(input3);
		voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
		voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
		voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
		voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));

		vst1_lane_f32(output0, voutput0, 0);
		vst1_lane_f32(output1, voutput1, 0);
		vst1_lane_f32(output2, voutput2, 0);
		vst1_lane_f32(output3, voutput3, 0);
	}
}
Beispiel #11
0
/*
 * 1x1-convolution NEON micro-kernel, edge variant: handles a partial block
 * of up to 4 input channels x up to 4 output channels (subblock sizes in
 * [1, 4]).  Missing channels alias the previous plane's pointer, and the
 * nested guards below skip the corresponding loads/computes/stores.
 * Output is read-modify-write: results are added to existing output values.
 */
void nnp_conv1x1_upto_4x4__neon(
	uint32_t input_channels_subblock_size,
	uint32_t output_channels_subblock_size,
	size_t input_channels,
	size_t image_size,
	const float* input,
	const float* kernel,
	float* output)
{
	/* Input channel planes; absent channels point at the previous plane
	 * (safe: their contributions are never computed). */
	const float*restrict input0 = input;
	const float*restrict input1 = input_channels_subblock_size > 1 ? input0 + image_size : input0;
	const float*restrict input2 = input_channels_subblock_size > 2 ? input1 + image_size : input1;
	const float*restrict input3 = input_channels_subblock_size > 3 ? input2 + image_size : input2;

	/* Kernel rows, one per (present) output channel. */
	const float*restrict kernel0 = kernel;
	const float*restrict kernel1 = output_channels_subblock_size > 1 ? kernel0 + input_channels : kernel0;
	const float*restrict kernel2 = output_channels_subblock_size > 2 ? kernel1 + input_channels : kernel1;
	const float*restrict kernel3 = output_channels_subblock_size > 3 ? kernel2 + input_channels : kernel2;

	/* Load each kernel row lane by lane so only the entries for present
	 * input channels are dereferenced (lane 0 is always valid). */
	float32x4_t vkernel0x = vld1q_dup_f32(kernel0);
	float32x4_t vkernel1x = vld1q_dup_f32(kernel1);
	float32x4_t vkernel2x = vld1q_dup_f32(kernel2);
	float32x4_t vkernel3x = vld1q_dup_f32(kernel3);
	if (input_channels_subblock_size > 1) {
		vkernel0x = vld1q_lane_f32(kernel0 + 1, vkernel0x, 1);
		vkernel1x = vld1q_lane_f32(kernel1 + 1, vkernel1x, 1);
		vkernel2x = vld1q_lane_f32(kernel2 + 1, vkernel2x, 1);
		vkernel3x = vld1q_lane_f32(kernel3 + 1, vkernel3x, 1);
		if (input_channels_subblock_size > 2) {
			vkernel0x = vld1q_lane_f32(kernel0 + 2, vkernel0x, 2);
			vkernel1x = vld1q_lane_f32(kernel1 + 2, vkernel1x, 2);
			vkernel2x = vld1q_lane_f32(kernel2 + 2, vkernel2x, 2);
			vkernel3x = vld1q_lane_f32(kernel3 + 2, vkernel3x, 2);
			if (input_channels_subblock_size > 3) {
				vkernel0x = vld1q_lane_f32(kernel0 + 3, vkernel0x, 3);
				vkernel1x = vld1q_lane_f32(kernel1 + 3, vkernel1x, 3);
				vkernel2x = vld1q_lane_f32(kernel2 + 3, vkernel2x, 3);
				vkernel3x = vld1q_lane_f32(kernel3 + 3, vkernel3x, 3);
			}
		}
	}

	/* Output channel planes; absent channels alias the previous plane
	 * (never stored to, thanks to the guarded stores below). */
	float*restrict output0 = output;
	float*restrict output1 = output_channels_subblock_size > 1 ? output0 + image_size : output0;
	float*restrict output2 = output_channels_subblock_size > 2 ? output1 + image_size : output1;
	float*restrict output3 = output_channels_subblock_size > 3 ? output2 + image_size : output2;
	/* Main loop: 4 pixels per iteration; input-channel contributions are
	 * accumulated only for channels that exist in this subblock. */
	while (image_size >= 4) {
		float32x4_t voutput0 = vld1q_f32(output0);
		float32x4_t voutput1 = vld1q_f32(output1);
		float32x4_t voutput2 = vld1q_f32(output2);
		float32x4_t voutput3 = vld1q_f32(output3);

		const float32x4_t vinput0 = vld1q_f32(input0); input0 += 4;
		voutput0 = vmuladdq_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladdq_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladdq_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladdq_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

		if (input_channels_subblock_size > 1) {
			const float32x4_t vinput1 = vld1q_f32(input1); input1 += 4;
			voutput0 = vmuladdq_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
			voutput1 = vmuladdq_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
			voutput2 = vmuladdq_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
			voutput3 = vmuladdq_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

			if (input_channels_subblock_size > 2) {
				const float32x4_t vinput2 = vld1q_f32(input2); input2 += 4;
				voutput0 = vmuladdq_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
				voutput1 = vmuladdq_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
				voutput2 = vmuladdq_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
				voutput3 = vmuladdq_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

				if (input_channels_subblock_size > 3) {
					const float32x4_t vinput3 = vld1q_f32(input3); input3 += 4;
					voutput0 = vmuladdq_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
					voutput1 = vmuladdq_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
					voutput2 = vmuladdq_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
					voutput3 = vmuladdq_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));
				}
			}
		}

		/* Store only the output channels present in this subblock. */
		vst1q_f32(output0, voutput0); output0 += 4;
		if (output_channels_subblock_size > 1) {
			vst1q_f32(output1, voutput1); output1 += 4;
			if (output_channels_subblock_size > 2) {
				vst1q_f32(output2, voutput2); output2 += 4;
				if (output_channels_subblock_size > 3) {
					vst1q_f32(output3, voutput3); output3 += 4;
				}
			}
		}

		image_size -= 4;
	}
	/* 2-pixel tail: same structure with 64-bit (2-lane) vectors. */
	if (image_size >= 2) {
		float32x2_t voutput0 = vld1_f32(output0);
		float32x2_t voutput1 = vld1_f32(output1);
		float32x2_t voutput2 = vld1_f32(output2);
		float32x2_t voutput3 = vld1_f32(output3);

		const float32x2_t vinput0 = vld1_f32(input0); input0 += 2;
		voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

		if (input_channels_subblock_size > 1) {
			const float32x2_t vinput1 = vld1_f32(input1); input1 += 2;
			voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
			voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
			voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
			voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

			if (input_channels_subblock_size > 2) {
				const float32x2_t vinput2 = vld1_f32(input2); input2 += 2;
				voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
				voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
				voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
				voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

				if (input_channels_subblock_size > 3) {
					const float32x2_t vinput3 = vld1_f32(input3); input3 += 2;
					voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
					voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
					voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
					voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));
				}
			}
		}

		vst1_f32(output0, voutput0); output0 += 2;
		if (output_channels_subblock_size > 1) {
			vst1_f32(output1, voutput1); output1 += 2;
			if (output_channels_subblock_size > 2) {
				vst1_f32(output2, voutput2); output2 += 2;
				if (output_channels_subblock_size > 3) {
					vst1_f32(output3, voutput3); output3 += 2;
				}
			}
		}

		image_size -= 2;
	}
	/* Single-pixel tail: scalar values loaded/stored through lane 0. */
	if (image_size != 0) {
		float32x2_t voutput0 = vld1_dup_f32(output0);
		float32x2_t voutput1 = vld1_dup_f32(output1);
		float32x2_t voutput2 = vld1_dup_f32(output2);
		float32x2_t voutput3 = vld1_dup_f32(output3);

		const float32x2_t vinput0 = vld1_dup_f32(input0);
		voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

		if (input_channels_subblock_size > 1) {
			const float32x2_t vinput1 = vld1_dup_f32(input1);
			voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
			voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
			voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
			voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

			if (input_channels_subblock_size > 2) {
				const float32x2_t vinput2 = vld1_dup_f32(input2);
				voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
				voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
				voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
				voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

				if (input_channels_subblock_size > 3) {
					const float32x2_t vinput3 = vld1_dup_f32(input3);
					voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
					voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
					voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
					voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));
				}
			}
		}

		vst1_lane_f32(output0, voutput0, 0);
		if (output_channels_subblock_size > 1) {
			vst1_lane_f32(output1, voutput1, 0);
			if (output_channels_subblock_size > 2) {
				vst1_lane_f32(output2, voutput2, 0);
				if (output_channels_subblock_size > 3) {
					vst1_lane_f32(output3, voutput3, 0);
				}
			}
		}
	}
}
Beispiel #12
0
/* Compile-time smoke test: vld1_f32 must accept a literal zero pointer
   argument.  The loaded value is intentionally unused. */
void test_vld1f32 (void)
{
  float32x2_t out_float32x2_t = vld1_f32 (0);
}
Beispiel #13
0
/*
 * NEON implementation of the cftmdl middle-stage FFT butterflies,
 * specialized for a 128-point transform.  |a| holds interleaved
 * real/imaginary pairs; each loop iteration processes two complex
 * butterflies at once via 128-bit vectors.  The twiddle tables
 * (k_swap_sign, cftmdl_wk1r, rdft_wk1r/wk1i/wk2r/wk2i/wk3r/wk3i) are
 * defined elsewhere in this file's original source — not visible here.
 */
static void cftmdl_128_neon(float* a) {
  int j;
  const int l = 8;
  const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
  float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r);

  /* First group (k == 0): radix-4 butterflies over a[j], a[j+8], a[j+16],
   * ..., a[j+56], two j-values (complex pairs) per iteration. */
  for (j = 0; j < l; j += 2) {
    const float32x2_t a_00 = vld1_f32(&a[j + 0]);
    const float32x2_t a_08 = vld1_f32(&a[j + 8]);
    const float32x2_t a_32 = vld1_f32(&a[j + 32]);
    const float32x2_t a_40 = vld1_f32(&a[j + 40]);
    const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
    const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
    const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
    const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
    const float32x2_t a_16 = vld1_f32(&a[j + 16]);
    const float32x2_t a_24 = vld1_f32(&a[j + 24]);
    const float32x2_t a_48 = vld1_f32(&a[j + 48]);
    const float32x2_t a_56 = vld1_f32(&a[j + 56]);
    const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
    const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
    const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
    const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
    const float32x4_t xx0 = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
    const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
    /* Multiply by +/-i: swap re/im (vrev64q) and flip one sign via
     * vec_swap_sign in the fused multiply-add/subtract below. */
    const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
    const float32x4_t x1_x3_add =
        vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
    const float32x4_t x1_x3_sub =
        vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
    const float32x2_t yy0_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 0);
    const float32x2_t yy0_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 0);
    const float32x4_t yy0_as = vcombine_f32(yy0_a, yy0_s);
    const float32x2_t yy1_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 1);
    const float32x2_t yy1_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 1);
    const float32x4_t yy1_as = vcombine_f32(yy1_a, yy1_s);
    const float32x4_t yy0 = vmlaq_f32(yy0_as, vec_swap_sign, yy1_as);
    const float32x4_t yy4 = vmulq_f32(wk1rv, yy0);
    const float32x4_t xx1_rev = vrev64q_f32(xx1);
    const float32x4_t yy4_rev = vrev64q_f32(yy4);

    vst1_f32(&a[j + 0], vget_low_f32(xx0));
    vst1_f32(&a[j + 32], vget_high_f32(xx0));
    vst1_f32(&a[j + 16], vget_low_f32(xx1));
    vst1_f32(&a[j + 48], vget_high_f32(xx1_rev));

    /* Scalar fix-up: negate the imaginary-swapped component that the
     * vector store above left with the wrong sign. */
    a[j + 48] = -a[j + 48];

    vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add));
    vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub));
    vst1_f32(&a[j + 40], vget_low_f32(yy4));
    vst1_f32(&a[j + 56], vget_high_f32(yy4_rev));
  }

  /* Second group (k == 64): same butterfly structure, but each result is
   * rotated by the wk1/wk2/wk3 twiddle factors loaded below. */
  {
    const int k = 64;
    const int k1 = 2;
    const int k2 = 2 * k1;
    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2 + 0]);
    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2 + 0]);
    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2 + 0]);
    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2 + 0]);
    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2 + 0]);
    wk1rv = vld1q_f32(&rdft_wk1r[k2 + 0]);
    for (j = k; j < l + k; j += 2) {
      const float32x2_t a_00 = vld1_f32(&a[j + 0]);
      const float32x2_t a_08 = vld1_f32(&a[j + 8]);
      const float32x2_t a_32 = vld1_f32(&a[j + 32]);
      const float32x2_t a_40 = vld1_f32(&a[j + 40]);
      const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
      const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
      const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
      const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
      const float32x2_t a_16 = vld1_f32(&a[j + 16]);
      const float32x2_t a_24 = vld1_f32(&a[j + 24]);
      const float32x2_t a_48 = vld1_f32(&a[j + 48]);
      const float32x2_t a_56 = vld1_f32(&a[j + 56]);
      const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
      const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
      const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
      const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
      const float32x4_t xx = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
      const float32x4_t x1_x3_add =
          vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
      const float32x4_t x1_x3_sub =
          vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
      /* Complex multiply by the twiddles: real part via vmulq, imaginary
       * contribution folded in from the re/im-swapped vector. */
      float32x4_t xx4 = vmulq_f32(wk2rv, xx1);
      float32x4_t xx12 = vmulq_f32(wk1rv, x1_x3_add);
      float32x4_t xx22 = vmulq_f32(wk3rv, x1_x3_sub);
      xx4 = vmlaq_f32(xx4, wk2iv, vrev64q_f32(xx1));
      xx12 = vmlaq_f32(xx12, wk1iv, vrev64q_f32(x1_x3_add));
      xx22 = vmlaq_f32(xx22, wk3iv, vrev64q_f32(x1_x3_sub));

      vst1_f32(&a[j + 0], vget_low_f32(xx));
      vst1_f32(&a[j + 32], vget_high_f32(xx));
      vst1_f32(&a[j + 16], vget_low_f32(xx4));
      vst1_f32(&a[j + 48], vget_high_f32(xx4));
      vst1_f32(&a[j + 8], vget_low_f32(xx12));
      vst1_f32(&a[j + 40], vget_high_f32(xx12));
      vst1_f32(&a[j + 24], vget_low_f32(xx22));
      vst1_f32(&a[j + 56], vget_high_f32(xx22));
    }
  }
}
Beispiel #14
0
// Overload-dispatch helper: loads two consecutive f32 values from ptr into
// a 64-bit NEON register.
inline float32x2_t vld1(const f32 * ptr)
{
    return vld1_f32(ptr);
}