Example #1
0
/* Exercise the two vdup_lane_f32 wrappers on a two-lane float vector.

   The vector is loaded from C; the result of wrap_vdup_lane_f32_0 must
   hold c[0] in every lane and the result of wrap_vdup_lane_f32_1 must
   hold c[1] in every lane.  Returns 0 when both checks pass and 1 on the
   first mismatch.  */
int
test_vdup_lane_f32 (void)
{
  float32x2_t a;
  float32x2_t b;
  int i;
  float32_t c[2] = { 0.0 , 3.14 };
  float32_t d[2];

  a = vld1_f32 (c);

  /* Lane 0: every stored element is compared against c[0].  */
  b = wrap_vdup_lane_f32_0 (a);
  vst1_f32 (d, b);
  for (i = 0; i < 2; i++)
    if (c[0] != d[i])
      return 1;

  /* Lane 1: every stored element is compared against c[1].  */
  b = wrap_vdup_lane_f32_1 (a);
  vst1_f32 (d, b);
  for (i = 0; i < 2; i++)
    if (c[1] != d[i])
      return 1;
  return 0;
}
Example #2
0
/* Load V1 and V2 as two-lane vectors, run them through
   test_vmulx_lane0_f32 and then test_vmulx_lane1_f32, and abort unless
   the first result equals E1 and the second equals E2 element by
   element.  */
void
test_case (float32_t v1[2], float32_t v2[2], float32_t e1[2], float32_t e2[2])
{
  float32x2_t lhs = vld1_f32 (v1);
  float32x2_t rhs = vld1_f32 (v2);
  float32_t got[2];
  int idx;

  /* First wrapper: checked against e1 before the second one is run.  */
  vst1_f32 (got, test_vmulx_lane0_f32 (lhs, rhs));
  idx = 0;
  while (idx < 2)
    {
      if (got[idx] != e1[idx])
	abort ();
      ++idx;
    }

  /* Second wrapper: checked against e2.  */
  vst1_f32 (got, test_vmulx_lane1_f32 (lhs, rhs));
  idx = 0;
  while (idx < 2)
    {
      if (got[idx] != e2[idx])
	abort ();
      ++idx;
    }
}
Example #3
0
/* Check wrap_vreinterpret_f32_f64: a one-lane f64 vector holding PI_F64
   is passed through the wrapper and the two resulting f32 lanes are
   compared against the expected values in D, using DOUBLE_EQUALS with a
   tolerance of __FLT_EPSILON__.  Returns 0 on success and 1 on the first
   mismatch.  */
int
test_vreinterpret_f32_f64 (void)
{
  float64x1_t a;
  float32x2_t b;
  float64_t c[1] = { PI_F64 };
  /* Values { 0x54442D18, 0x400921FB } reinterpreted as f32.  */
  float32_t d[2] = { 3.3702805504E12, 2.1426990032196044921875E0 };
  float32_t e[2];
  int i;

  a = vld1_f64 (c);
  b = wrap_vreinterpret_f32_f64 (a);
  vst1_f32 (e, b);
  for (i = 0; i < 2; i++)
    if (!DOUBLE_EQUALS (d[i], e[i], __FLT_EPSILON__))
      return 1;
  return 0;
}
Example #4
0
// Compute one dot product per vertex and write it to `out`.
//
//   verts  - input vertices, advanced by 3 floats per vertex
//   params - coefficients the vertices are multiplied with
//   out    - destination, advanced by 2 floats per vertex
//   count  - number of vertices; values <= 0 do nothing
//
// NOTE(review): the two paths are not strictly equivalent. The scalar path
// writes only out[0] per vertex, while the NEON path stores a two-lane
// vector (out[0] and out[1]). vld3_f32 also loads six floats from `params`
// and from `verts` on each iteration although `verts` only advances by
// three - confirm the buffers are padded accordingly.
void dot_loop(const GLfloat *verts, const GLfloat *params, GLfloat *out, GLint count) {
#ifdef __ARM_NEON__
    float32x2_t acc;
    float32x2x3_t vert;
    float32x2x3_t param = vld3_f32((const float32_t *)params);
    // `count > 0` rather than `count != 0`: a negative count must be a
    // no-op here exactly as it is in the scalar loop below, instead of
    // walking through memory until the counter wraps.
    for (; count > 0; count -= 1) {
        vert = vld3_f32((const float32_t *)verts);
        acc = vmul_f32(vert.val[0], param.val[0]);
        acc = vmla_f32(acc, vert.val[1], param.val[1]);
        acc = vmla_f32(acc, vert.val[2], param.val[2]);
        vst1_f32((float32_t *)out, acc);

        out += 2;
        verts += 3;
    }
#else
    for (int i = 0; i < count; i++) {
        out[0] = dot(verts, params);
        out += 2;
        verts += 3;
    }
#endif
}
Example #5
0
/*
 * Accumulate a 4-input x 4-output block of a 1x1 convolution into `output`.
 *
 * input          - four consecutive planes of `image_size` floats each
 * kernel         - four rows spaced `input_channels` floats apart; the first
 *                  four floats of each row are loaded
 * output         - four consecutive planes of `image_size` floats each; they
 *                  are read, updated and written back in place
 * input_channels - stride, in floats, between consecutive kernel rows
 * image_size     - number of floats per plane
 *
 * NOTE(review): vmuladd{,q}_lane{0,1}_f32 are helpers defined elsewhere;
 * presumably they compute acc + vec * selected lane of the third argument -
 * confirm against their definition.
 */
void nnp_conv1x1_only_4x4__neon(
	size_t input_channels,
	size_t image_size,
	const float* input,
	const float* kernel,
	float* output)
{
	const float* input0 = input;
	const float* input1 = input0 + image_size;
	const float* input2 = input1 + image_size;
	const float* input3 = input2 + image_size;

	/* One 4-float vector per kernel row (one row per output plane). */
	const float32x4_t vkernel0x = vld1q_f32(kernel);
	kernel += input_channels;
	const float32x4_t vkernel1x = vld1q_f32(kernel);
	kernel += input_channels;
	const float32x4_t vkernel2x = vld1q_f32(kernel);
	kernel += input_channels;
	const float32x4_t vkernel3x = vld1q_f32(kernel);

	float* output0 = output;
	float* output1 = output0 + image_size;
	float* output2 = output1 + image_size;
	float* output3 = output2 + image_size;
	/* Main loop: four elements of every plane per iteration. */
	while (image_size >= 4) {
		float32x4_t voutput0 = vld1q_f32(output0);
		float32x4_t voutput1 = vld1q_f32(output1);
		float32x4_t voutput2 = vld1q_f32(output2);
		float32x4_t voutput3 = vld1q_f32(output3);

		/* Input plane 0 pairs with lane 0 of the low kernel half. */
		const float32x4_t vinput0 = vld1q_f32(input0); input0 += 4;
		voutput0 = vmuladdq_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladdq_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladdq_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladdq_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

		/* Input plane 1 pairs with lane 1 of the low kernel half. */
		const float32x4_t vinput1 = vld1q_f32(input1); input1 += 4;
		voutput0 = vmuladdq_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
		voutput1 = vmuladdq_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
		voutput2 = vmuladdq_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
		voutput3 = vmuladdq_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

		/* Input plane 2 pairs with lane 0 of the high kernel half. */
		const float32x4_t vinput2 = vld1q_f32(input2); input2 += 4;
		voutput0 = vmuladdq_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
		voutput1 = vmuladdq_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
		voutput2 = vmuladdq_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
		voutput3 = vmuladdq_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

		/* Input plane 3 pairs with lane 1 of the high kernel half. */
		const float32x4_t vinput3 = vld1q_f32(input3); input3 += 4;
		voutput0 = vmuladdq_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
		voutput1 = vmuladdq_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
		voutput2 = vmuladdq_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
		voutput3 = vmuladdq_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));

		vst1q_f32(output0, voutput0); output0 += 4;
		vst1q_f32(output1, voutput1); output1 += 4;
		vst1q_f32(output2, voutput2); output2 += 4;
		vst1q_f32(output3, voutput3); output3 += 4;

		image_size -= 4;
	}
	/* Tail of two elements, handled with 64-bit vectors. */
	if (image_size >= 2) {
		float32x2_t voutput0 = vld1_f32(output0);
		float32x2_t voutput1 = vld1_f32(output1);
		float32x2_t voutput2 = vld1_f32(output2);
		float32x2_t voutput3 = vld1_f32(output3);

		const float32x2_t vinput0 = vld1_f32(input0); input0 += 2;
		voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

		const float32x2_t vinput1 = vld1_f32(input1); input1 += 2;
		voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

		const float32x2_t vinput2 = vld1_f32(input2); input2 += 2;
		voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

		const float32x2_t vinput3 = vld1_f32(input3); input3 += 2;
		voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
		voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
		voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
		voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));

		vst1_f32(output0, voutput0); output0 += 2;
		vst1_f32(output1, voutput1); output1 += 2;
		vst1_f32(output2, voutput2); output2 += 2;
		vst1_f32(output3, voutput3); output3 += 2;

		image_size -= 2;
	}
	/*
	 * Final single element: each value is loaded duplicated into both lanes
	 * and only lane 0 is stored, so nothing past the last element is touched.
	 */
	if (image_size != 0) {
		float32x2_t voutput0 = vld1_dup_f32(output0);
		float32x2_t voutput1 = vld1_dup_f32(output1);
		float32x2_t voutput2 = vld1_dup_f32(output2);
		float32x2_t voutput3 = vld1_dup_f32(output3);

		const float32x2_t vinput0 = vld1_dup_f32(input0);
		voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

		const float32x2_t vinput1 = vld1_dup_f32(input1);
		voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

		const float32x2_t vinput2 = vld1_dup_f32(input2);
		voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

		const float32x2_t vinput3 = vld1_dup_f32(input3);
		voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
		voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
		voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
		voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));

		vst1_lane_f32(output0, voutput0, 0);
		vst1_lane_f32(output1, voutput1, 0);
		vst1_lane_f32(output2, voutput2, 0);
		vst1_lane_f32(output3, voutput3, 0);
	}
}
Example #6
0
/*
 * Accumulate an up-to-4-input x up-to-4-output block of a 1x1 convolution
 * into `output`.
 *
 * input_channels_subblock_size  - number of input planes actually present
 *                                 (compared against 1, 2 and 3 below)
 * output_channels_subblock_size - number of output planes actually present
 *                                 (compared against 1, 2 and 3 below)
 * input_channels - stride, in floats, between consecutive kernel rows
 * image_size     - number of floats per plane
 * input          - input planes, `image_size` floats apart
 * kernel         - kernel rows, `input_channels` floats apart
 * output         - output planes, `image_size` floats apart; read, updated
 *                  and written back in place
 *
 * Missing planes/rows are aliased to the previous valid pointer so that all
 * loads stay in bounds; the matching multiply-adds and stores are skipped by
 * the subblock-size checks.
 *
 * NOTE(review): vmuladd{,q}_lane{0,1}_f32 are helpers defined elsewhere;
 * presumably they compute acc + vec * selected lane of the third argument -
 * confirm against their definition.
 */
void nnp_conv1x1_upto_4x4__neon(
	uint32_t input_channels_subblock_size,
	uint32_t output_channels_subblock_size,
	size_t input_channels,
	size_t image_size,
	const float* input,
	const float* kernel,
	float* output)
{
	const float*restrict input0 = input;
	const float*restrict input1 = input_channels_subblock_size > 1 ? input0 + image_size : input0;
	const float*restrict input2 = input_channels_subblock_size > 2 ? input1 + image_size : input1;
	const float*restrict input3 = input_channels_subblock_size > 3 ? input2 + image_size : input2;

	const float*restrict kernel0 = kernel;
	const float*restrict kernel1 = output_channels_subblock_size > 1 ? kernel0 + input_channels : kernel0;
	const float*restrict kernel2 = output_channels_subblock_size > 2 ? kernel1 + input_channels : kernel1;
	const float*restrict kernel3 = output_channels_subblock_size > 3 ? kernel2 + input_channels : kernel2;

	/*
	 * Start with element 0 of each kernel row in every lane, then overwrite
	 * lanes 1..3 only for input channels that exist, so no kernel element
	 * beyond the subblock is read.
	 */
	float32x4_t vkernel0x = vld1q_dup_f32(kernel0);
	float32x4_t vkernel1x = vld1q_dup_f32(kernel1);
	float32x4_t vkernel2x = vld1q_dup_f32(kernel2);
	float32x4_t vkernel3x = vld1q_dup_f32(kernel3);
	if (input_channels_subblock_size > 1) {
		vkernel0x = vld1q_lane_f32(kernel0 + 1, vkernel0x, 1);
		vkernel1x = vld1q_lane_f32(kernel1 + 1, vkernel1x, 1);
		vkernel2x = vld1q_lane_f32(kernel2 + 1, vkernel2x, 1);
		vkernel3x = vld1q_lane_f32(kernel3 + 1, vkernel3x, 1);
		if (input_channels_subblock_size > 2) {
			vkernel0x = vld1q_lane_f32(kernel0 + 2, vkernel0x, 2);
			vkernel1x = vld1q_lane_f32(kernel1 + 2, vkernel1x, 2);
			vkernel2x = vld1q_lane_f32(kernel2 + 2, vkernel2x, 2);
			vkernel3x = vld1q_lane_f32(kernel3 + 2, vkernel3x, 2);
			if (input_channels_subblock_size > 3) {
				vkernel0x = vld1q_lane_f32(kernel0 + 3, vkernel0x, 3);
				vkernel1x = vld1q_lane_f32(kernel1 + 3, vkernel1x, 3);
				vkernel2x = vld1q_lane_f32(kernel2 + 3, vkernel2x, 3);
				vkernel3x = vld1q_lane_f32(kernel3 + 3, vkernel3x, 3);
			}
		}
	}

	/*
	 * Deliberately NOT restrict-qualified: when fewer than four output
	 * planes exist these pointers alias each other, and memory stored
	 * through output0 is subsequently loaded through the aliased pointers.
	 * Declaring them restrict would make that undefined behaviour.
	 */
	float* output0 = output;
	float* output1 = output_channels_subblock_size > 1 ? output0 + image_size : output0;
	float* output2 = output_channels_subblock_size > 2 ? output1 + image_size : output1;
	float* output3 = output_channels_subblock_size > 3 ? output2 + image_size : output2;
	/* Main loop: four elements of every plane per iteration. */
	while (image_size >= 4) {
		float32x4_t voutput0 = vld1q_f32(output0);
		float32x4_t voutput1 = vld1q_f32(output1);
		float32x4_t voutput2 = vld1q_f32(output2);
		float32x4_t voutput3 = vld1q_f32(output3);

		const float32x4_t vinput0 = vld1q_f32(input0); input0 += 4;
		voutput0 = vmuladdq_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladdq_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladdq_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladdq_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

		if (input_channels_subblock_size > 1) {
			const float32x4_t vinput1 = vld1q_f32(input1); input1 += 4;
			voutput0 = vmuladdq_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
			voutput1 = vmuladdq_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
			voutput2 = vmuladdq_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
			voutput3 = vmuladdq_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

			if (input_channels_subblock_size > 2) {
				const float32x4_t vinput2 = vld1q_f32(input2); input2 += 4;
				voutput0 = vmuladdq_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
				voutput1 = vmuladdq_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
				voutput2 = vmuladdq_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
				voutput3 = vmuladdq_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

				if (input_channels_subblock_size > 3) {
					const float32x4_t vinput3 = vld1q_f32(input3); input3 += 4;
					voutput0 = vmuladdq_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
					voutput1 = vmuladdq_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
					voutput2 = vmuladdq_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
					voutput3 = vmuladdq_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));
				}
			}
		}

		/* Only planes that exist are written back and advanced. */
		vst1q_f32(output0, voutput0); output0 += 4;
		if (output_channels_subblock_size > 1) {
			vst1q_f32(output1, voutput1); output1 += 4;
			if (output_channels_subblock_size > 2) {
				vst1q_f32(output2, voutput2); output2 += 4;
				if (output_channels_subblock_size > 3) {
					vst1q_f32(output3, voutput3); output3 += 4;
				}
			}
		}

		image_size -= 4;
	}
	/* Tail of two elements, handled with 64-bit vectors. */
	if (image_size >= 2) {
		float32x2_t voutput0 = vld1_f32(output0);
		float32x2_t voutput1 = vld1_f32(output1);
		float32x2_t voutput2 = vld1_f32(output2);
		float32x2_t voutput3 = vld1_f32(output3);

		const float32x2_t vinput0 = vld1_f32(input0); input0 += 2;
		voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

		if (input_channels_subblock_size > 1) {
			const float32x2_t vinput1 = vld1_f32(input1); input1 += 2;
			voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
			voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
			voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
			voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

			if (input_channels_subblock_size > 2) {
				const float32x2_t vinput2 = vld1_f32(input2); input2 += 2;
				voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
				voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
				voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
				voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

				if (input_channels_subblock_size > 3) {
					const float32x2_t vinput3 = vld1_f32(input3); input3 += 2;
					voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
					voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
					voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
					voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));
				}
			}
		}

		vst1_f32(output0, voutput0); output0 += 2;
		if (output_channels_subblock_size > 1) {
			vst1_f32(output1, voutput1); output1 += 2;
			if (output_channels_subblock_size > 2) {
				vst1_f32(output2, voutput2); output2 += 2;
				if (output_channels_subblock_size > 3) {
					vst1_f32(output3, voutput3); output3 += 2;
				}
			}
		}

		image_size -= 2;
	}
	/*
	 * Final single element: each value is loaded duplicated into both lanes
	 * and only lane 0 is stored, so nothing past the last element is touched.
	 */
	if (image_size != 0) {
		float32x2_t voutput0 = vld1_dup_f32(output0);
		float32x2_t voutput1 = vld1_dup_f32(output1);
		float32x2_t voutput2 = vld1_dup_f32(output2);
		float32x2_t voutput3 = vld1_dup_f32(output3);

		const float32x2_t vinput0 = vld1_dup_f32(input0);
		voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

		if (input_channels_subblock_size > 1) {
			const float32x2_t vinput1 = vld1_dup_f32(input1);
			voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
			voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
			voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
			voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

			if (input_channels_subblock_size > 2) {
				const float32x2_t vinput2 = vld1_dup_f32(input2);
				voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
				voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
				voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
				voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

				if (input_channels_subblock_size > 3) {
					const float32x2_t vinput3 = vld1_dup_f32(input3);
					voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
					voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
					voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
					voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));
				}
			}
		}

		vst1_lane_f32(output0, voutput0, 0);
		if (output_channels_subblock_size > 1) {
			vst1_lane_f32(output1, voutput1, 0);
			if (output_channels_subblock_size > 2) {
				vst1_lane_f32(output2, voutput2, 0);
				if (output_channels_subblock_size > 3) {
					vst1_lane_f32(output3, voutput3, 0);
				}
			}
		}
	}
}
Example #7
0
// In-place NEON transform pass over a[0..127].
//
// The first loop covers a[0..63] and the second covers a[64..127]. Each
// iteration reads two adjacent floats at eight offsets spaced 8 floats apart
// (j + 0, 8, ..., 56), combines them with vector adds/subtracts and
// multiply-accumulates, and stores the results back to the same offsets.
// The first loop scales with the cftmdl_wk1r vector; the second uses the
// rdft_wk1r/wk1i/wk2r/wk2i/wk3r/wk3i tables starting at index 4.
// NOTE(review): presumably one stage of a 128-point FFT, judging by the
// name and the twiddle tables - confirm against the scalar reference.
static void cftmdl_128_neon(float* a) {
  int j;
  const int l = 8;
  const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
  float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r);

  // First half: a[0..63].
  for (j = 0; j < l; j += 2) {
    // Pack the pairs at offsets +0/+32 and +8/+40 into 4-lane vectors.
    const float32x2_t a_00 = vld1_f32(&a[j + 0]);
    const float32x2_t a_08 = vld1_f32(&a[j + 8]);
    const float32x2_t a_32 = vld1_f32(&a[j + 32]);
    const float32x2_t a_40 = vld1_f32(&a[j + 40]);
    const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
    const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
    const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
    const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
    // Same for the pairs at offsets +16/+48 and +24/+56.
    const float32x2_t a_16 = vld1_f32(&a[j + 16]);
    const float32x2_t a_24 = vld1_f32(&a[j + 24]);
    const float32x2_t a_48 = vld1_f32(&a[j + 48]);
    const float32x2_t a_56 = vld1_f32(&a[j + 56]);
    const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
    const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
    const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
    const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
    const float32x4_t xx0 = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
    const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
    // vrev64q swaps the two floats inside each 64-bit half.
    const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
    const float32x4_t x1_x3_add =
        vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
    const float32x4_t x1_x3_sub =
        vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
    // Broadcast each lane of the high halves and recombine them before the
    // multiply by the wk1r vector.
    const float32x2_t yy0_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 0);
    const float32x2_t yy0_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 0);
    const float32x4_t yy0_as = vcombine_f32(yy0_a, yy0_s);
    const float32x2_t yy1_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 1);
    const float32x2_t yy1_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 1);
    const float32x4_t yy1_as = vcombine_f32(yy1_a, yy1_s);
    const float32x4_t yy0 = vmlaq_f32(yy0_as, vec_swap_sign, yy1_as);
    const float32x4_t yy4 = vmulq_f32(wk1rv, yy0);
    const float32x4_t xx1_rev = vrev64q_f32(xx1);
    const float32x4_t yy4_rev = vrev64q_f32(yy4);

    vst1_f32(&a[j + 0], vget_low_f32(xx0));
    vst1_f32(&a[j + 32], vget_high_f32(xx0));
    vst1_f32(&a[j + 16], vget_low_f32(xx1));
    vst1_f32(&a[j + 48], vget_high_f32(xx1_rev));

    // Flip the sign of the first float of the pair just stored at +48.
    a[j + 48] = -a[j + 48];

    vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add));
    vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub));
    vst1_f32(&a[j + 40], vget_low_f32(yy4));
    vst1_f32(&a[j + 56], vget_high_f32(yy4_rev));
  }

  // Second half: a[64..127], with per-output twiddle vectors loaded from the
  // rdft_wk* tables at index k2 (= 4).
  {
    const int k = 64;
    const int k1 = 2;
    const int k2 = 2 * k1;
    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2 + 0]);
    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2 + 0]);
    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2 + 0]);
    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2 + 0]);
    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2 + 0]);
    wk1rv = vld1q_f32(&rdft_wk1r[k2 + 0]);
    for (j = k; j < l + k; j += 2) {
      const float32x2_t a_00 = vld1_f32(&a[j + 0]);
      const float32x2_t a_08 = vld1_f32(&a[j + 8]);
      const float32x2_t a_32 = vld1_f32(&a[j + 32]);
      const float32x2_t a_40 = vld1_f32(&a[j + 40]);
      const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
      const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
      const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
      const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
      const float32x2_t a_16 = vld1_f32(&a[j + 16]);
      const float32x2_t a_24 = vld1_f32(&a[j + 24]);
      const float32x2_t a_48 = vld1_f32(&a[j + 48]);
      const float32x2_t a_56 = vld1_f32(&a[j + 56]);
      const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
      const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
      const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
      const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
      const float32x4_t xx = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
      const float32x4_t x1_x3_add =
          vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
      const float32x4_t x1_x3_sub =
          vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
      // Each output is value * wkNr + swapped(value) * wkNi.
      float32x4_t xx4 = vmulq_f32(wk2rv, xx1);
      float32x4_t xx12 = vmulq_f32(wk1rv, x1_x3_add);
      float32x4_t xx22 = vmulq_f32(wk3rv, x1_x3_sub);
      xx4 = vmlaq_f32(xx4, wk2iv, vrev64q_f32(xx1));
      xx12 = vmlaq_f32(xx12, wk1iv, vrev64q_f32(x1_x3_add));
      xx22 = vmlaq_f32(xx22, wk3iv, vrev64q_f32(x1_x3_sub));

      vst1_f32(&a[j + 0], vget_low_f32(xx));
      vst1_f32(&a[j + 32], vget_high_f32(xx));
      vst1_f32(&a[j + 16], vget_low_f32(xx4));
      vst1_f32(&a[j + 48], vget_high_f32(xx4));
      vst1_f32(&a[j + 8], vget_low_f32(xx12));
      vst1_f32(&a[j + 40], vget_high_f32(xx12));
      vst1_f32(&a[j + 24], vget_low_f32(xx22));
      vst1_f32(&a[j + 56], vget_high_f32(xx22));
    }
  }
}
Example #8
0
// Overload that forwards a two-lane f32 vector store to vst1_f32.
inline void vst1(f32 * ptr, const float32x2_t & v)
{
    vst1_f32(ptr, v);
}