/* Checks that wrap_vdup_lane_f32_0 and wrap_vdup_lane_f32_1 broadcast
   lane 0 and lane 1, respectively, of a two-lane float vector into both
   lanes of the result.

   Returns 0 when every lane of both results matches the selected source
   element, 1 on the first mismatch.  */
int
test_vdup_lane_f32 (void)
{
  float32x2_t a;
  float32x2_t b;
  int i;
  float32_t c[2] = { 0.0 , 3.14 };
  float32_t d[2];

  a = vld1_f32 (c);

  /* Lane 0: both stored elements must equal c[0].  */
  b = wrap_vdup_lane_f32_0 (a);
  vst1_f32 (d, b);
  for (i = 0; i < 2; i++)
    {
      if (c[0] != d[i])
        return 1;
    }

  /* Lane 1: both stored elements must equal c[1].  */
  b = wrap_vdup_lane_f32_1 (a);
  vst1_f32 (d, b);
  for (i = 0; i < 2; i++)
    {
      if (c[1] != d[i])
        return 1;
    }

  return 0;
}
/* Runs test_vmulx_lane0_f32 and then test_vmulx_lane1_f32 on the vectors
   loaded from V1 and V2, and compares the two result lanes of each call
   against E1 and E2 respectively.  Calls abort () on the first element
   that differs; the lane-1 variant is only invoked once the lane-0
   results have been accepted.  */
void
test_case (float32_t v1[2], float32_t v2[2], float32_t e1[2], float32_t e2[2])
{
  const float32x2_t lhs = vld1_f32 (v1);
  const float32x2_t rhs = vld1_f32 (v2);
  float32_t got[2];
  int lane;

  /* Lane-0 variant, checked against e1.  */
  vst1_f32 (got, test_vmulx_lane0_f32 (lhs, rhs));
  lane = 0;
  while (lane < 2)
    {
      if (got[lane] != e1[lane])
        abort ();
      lane += 1;
    }

  /* Lane-1 variant, checked against e2.  */
  vst1_f32 (got, test_vmulx_lane1_f32 (lhs, rhs));
  lane = 0;
  while (lane < 2)
    {
      if (got[lane] != e2[lane])
        abort ();
      lane += 1;
    }
}
/* Checks wrap_vreinterpret_f32_f64: loads the single double PI_F64,
   reinterprets it as two floats, and compares both lanes with the
   expected values using DOUBLE_EQUALS with a tolerance of
   __FLT_EPSILON__.

   Returns 0 when both lanes match, 1 on the first mismatch.  */
int
test_vreinterpret_f32_f64 (void)
{
  float64x1_t a;
  float32x2_t b;
  float64_t c[1] = { PI_F64 };
  /* Values { 0x54442D18, 0x400921FB } reinterpreted as f32.  */
  float32_t d[2] = { 3.3702805504E12, 2.1426990032196044921875E0 };
  float32_t e[2];
  int i;

  a = vld1_f64 (c);
  b = wrap_vreinterpret_f32_f64 (a);
  vst1_f32 (e, b);

  for (i = 0; i < 2; i++)
    {
      if (!DOUBLE_EQUALS (d[i], e[i], __FLT_EPSILON__))
        return 1;
    }

  return 0;
}
/*
 * Walks `count` vertices of three floats each, writing results with a stride
 * of two floats in `out`.
 *
 *   verts  - input vertices, advanced by 3 floats per iteration
 *   params - coefficients for the dot product
 *   out    - destination, advanced by 2 floats per iteration
 *   count  - number of iterations; zero or negative values do nothing
 *
 * Scalar path: out[0] = dot(verts, params) per iteration; out[1] is untouched.
 *
 * NEON path: vld3_f32 de-interleaves six consecutive floats, so each
 * iteration reads verts[0..5] (and params[0..5] once, up front) and writes
 * both out[0] and out[1]:
 *   out[0] = verts[0]*params[0] + verts[1]*params[1] + verts[2]*params[2]
 *   out[1] = verts[3]*params[3] + verts[4]*params[4] + verts[5]*params[5]
 *
 * NOTE(review): the NEON path therefore also writes out[1] and reads three
 * floats past the current vertex (and params[3..5]), which the scalar path
 * does not. Whether callers rely on that is not visible here - confirm before
 * changing it.
 */
void dot_loop(const GLfloat *verts, const GLfloat *params, GLfloat *out, GLint count) {
#ifdef __ARM_NEON__
    float32x2_t acc;
    float32x2x3_t vert;
    float32x2x3_t param = vld3_f32((const float32_t *)params);
    /* `count > 0` (not `count != 0`) so that a negative count is a no-op,
       exactly like the scalar loop below, instead of never terminating. */
    for (; count > 0; count -= 1) {
        vert = vld3_f32((const float32_t *)verts);
        acc = vmul_f32(vert.val[0], param.val[0]);
        acc = vmla_f32(acc, vert.val[1], param.val[1]);
        acc = vmla_f32(acc, vert.val[2], param.val[2]);
        vst1_f32((float32_t *)out, acc);
        out += 2;
        verts += 3;
    }
#else
    for (int i = 0; i < count; i++) {
        out[0] = dot(verts, params);
        out += 2;
        verts += 3;
    }
#endif
}
void nnp_conv1x1_only_4x4__neon( size_t input_channels, size_t image_size, const float* input, const float* kernel, float* output) { const float* input0 = input; const float* input1 = input0 + image_size; const float* input2 = input1 + image_size; const float* input3 = input2 + image_size; const float32x4_t vkernel0x = vld1q_f32(kernel); kernel += input_channels; const float32x4_t vkernel1x = vld1q_f32(kernel); kernel += input_channels; const float32x4_t vkernel2x = vld1q_f32(kernel); kernel += input_channels; const float32x4_t vkernel3x = vld1q_f32(kernel); float* output0 = output; float* output1 = output0 + image_size; float* output2 = output1 + image_size; float* output3 = output2 + image_size; while (image_size >= 4) { float32x4_t voutput0 = vld1q_f32(output0); float32x4_t voutput1 = vld1q_f32(output1); float32x4_t voutput2 = vld1q_f32(output2); float32x4_t voutput3 = vld1q_f32(output3); const float32x4_t vinput0 = vld1q_f32(input0); input0 += 4; voutput0 = vmuladdq_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x)); voutput1 = vmuladdq_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x)); voutput2 = vmuladdq_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x)); voutput3 = vmuladdq_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x)); const float32x4_t vinput1 = vld1q_f32(input1); input1 += 4; voutput0 = vmuladdq_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x)); voutput1 = vmuladdq_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x)); voutput2 = vmuladdq_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x)); voutput3 = vmuladdq_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x)); const float32x4_t vinput2 = vld1q_f32(input2); input2 += 4; voutput0 = vmuladdq_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x)); voutput1 = vmuladdq_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x)); voutput2 = vmuladdq_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x)); voutput3 = vmuladdq_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x)); const 
float32x4_t vinput3 = vld1q_f32(input3); input3 += 4; voutput0 = vmuladdq_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x)); voutput1 = vmuladdq_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x)); voutput2 = vmuladdq_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x)); voutput3 = vmuladdq_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x)); vst1q_f32(output0, voutput0); output0 += 4; vst1q_f32(output1, voutput1); output1 += 4; vst1q_f32(output2, voutput2); output2 += 4; vst1q_f32(output3, voutput3); output3 += 4; image_size -= 4; } if (image_size >= 2) { float32x2_t voutput0 = vld1_f32(output0); float32x2_t voutput1 = vld1_f32(output1); float32x2_t voutput2 = vld1_f32(output2); float32x2_t voutput3 = vld1_f32(output3); const float32x2_t vinput0 = vld1_f32(input0); input0 += 2; voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x)); voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x)); voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x)); voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x)); const float32x2_t vinput1 = vld1_f32(input1); input1 += 2; voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x)); voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x)); voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x)); voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x)); const float32x2_t vinput2 = vld1_f32(input2); input2 += 2; voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x)); voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x)); voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x)); voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x)); const float32x2_t vinput3 = vld1_f32(input3); input3 += 2; voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x)); voutput1 = vmuladd_lane1_f32(voutput1, vinput3, 
vget_high_f32(vkernel1x)); voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x)); voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x)); vst1_f32(output0, voutput0); output0 += 2; vst1_f32(output1, voutput1); output1 += 2; vst1_f32(output2, voutput2); output2 += 2; vst1_f32(output3, voutput3); output3 += 2; image_size -= 2; } if (image_size != 0) { float32x2_t voutput0 = vld1_dup_f32(output0); float32x2_t voutput1 = vld1_dup_f32(output1); float32x2_t voutput2 = vld1_dup_f32(output2); float32x2_t voutput3 = vld1_dup_f32(output3); const float32x2_t vinput0 = vld1_dup_f32(input0); voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x)); voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x)); voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x)); voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x)); const float32x2_t vinput1 = vld1_dup_f32(input1); voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x)); voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x)); voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x)); voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x)); const float32x2_t vinput2 = vld1_dup_f32(input2); voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x)); voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x)); voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x)); voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x)); const float32x2_t vinput3 = vld1_dup_f32(input3); voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x)); voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x)); voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x)); voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x)); vst1_lane_f32(output0, voutput0, 
0); vst1_lane_f32(output1, voutput1, 0); vst1_lane_f32(output2, voutput2, 0); vst1_lane_f32(output3, voutput3, 0); } }
void nnp_conv1x1_upto_4x4__neon( uint32_t input_channels_subblock_size, uint32_t output_channels_subblock_size, size_t input_channels, size_t image_size, const float* input, const float* kernel, float* output) { const float*restrict input0 = input; const float*restrict input1 = input_channels_subblock_size > 1 ? input0 + image_size : input0; const float*restrict input2 = input_channels_subblock_size > 2 ? input1 + image_size : input1; const float*restrict input3 = input_channels_subblock_size > 3 ? input2 + image_size : input2; const float*restrict kernel0 = kernel; const float*restrict kernel1 = output_channels_subblock_size > 1 ? kernel0 + input_channels : kernel0; const float*restrict kernel2 = output_channels_subblock_size > 2 ? kernel1 + input_channels : kernel1; const float*restrict kernel3 = output_channels_subblock_size > 3 ? kernel2 + input_channels : kernel2; float32x4_t vkernel0x = vld1q_dup_f32(kernel0); float32x4_t vkernel1x = vld1q_dup_f32(kernel1); float32x4_t vkernel2x = vld1q_dup_f32(kernel2); float32x4_t vkernel3x = vld1q_dup_f32(kernel3); if (input_channels_subblock_size > 1) { vkernel0x = vld1q_lane_f32(kernel0 + 1, vkernel0x, 1); vkernel1x = vld1q_lane_f32(kernel1 + 1, vkernel1x, 1); vkernel2x = vld1q_lane_f32(kernel2 + 1, vkernel2x, 1); vkernel3x = vld1q_lane_f32(kernel3 + 1, vkernel3x, 1); if (input_channels_subblock_size > 2) { vkernel0x = vld1q_lane_f32(kernel0 + 2, vkernel0x, 2); vkernel1x = vld1q_lane_f32(kernel1 + 2, vkernel1x, 2); vkernel2x = vld1q_lane_f32(kernel2 + 2, vkernel2x, 2); vkernel3x = vld1q_lane_f32(kernel3 + 2, vkernel3x, 2); if (input_channels_subblock_size > 3) { vkernel0x = vld1q_lane_f32(kernel0 + 3, vkernel0x, 3); vkernel1x = vld1q_lane_f32(kernel1 + 3, vkernel1x, 3); vkernel2x = vld1q_lane_f32(kernel2 + 3, vkernel2x, 3); vkernel3x = vld1q_lane_f32(kernel3 + 3, vkernel3x, 3); } } } float*restrict output0 = output; float*restrict output1 = output_channels_subblock_size > 1 ? 
output0 + image_size : output0; float*restrict output2 = output_channels_subblock_size > 2 ? output1 + image_size : output1; float*restrict output3 = output_channels_subblock_size > 3 ? output2 + image_size : output2; while (image_size >= 4) { float32x4_t voutput0 = vld1q_f32(output0); float32x4_t voutput1 = vld1q_f32(output1); float32x4_t voutput2 = vld1q_f32(output2); float32x4_t voutput3 = vld1q_f32(output3); const float32x4_t vinput0 = vld1q_f32(input0); input0 += 4; voutput0 = vmuladdq_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x)); voutput1 = vmuladdq_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x)); voutput2 = vmuladdq_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x)); voutput3 = vmuladdq_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x)); if (input_channels_subblock_size > 1) { const float32x4_t vinput1 = vld1q_f32(input1); input1 += 4; voutput0 = vmuladdq_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x)); voutput1 = vmuladdq_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x)); voutput2 = vmuladdq_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x)); voutput3 = vmuladdq_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x)); if (input_channels_subblock_size > 2) { const float32x4_t vinput2 = vld1q_f32(input2); input2 += 4; voutput0 = vmuladdq_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x)); voutput1 = vmuladdq_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x)); voutput2 = vmuladdq_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x)); voutput3 = vmuladdq_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x)); if (input_channels_subblock_size > 3) { const float32x4_t vinput3 = vld1q_f32(input3); input3 += 4; voutput0 = vmuladdq_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x)); voutput1 = vmuladdq_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x)); voutput2 = vmuladdq_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x)); voutput3 = vmuladdq_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x)); } } } 
vst1q_f32(output0, voutput0); output0 += 4; if (output_channels_subblock_size > 1) { vst1q_f32(output1, voutput1); output1 += 4; if (output_channels_subblock_size > 2) { vst1q_f32(output2, voutput2); output2 += 4; if (output_channels_subblock_size > 3) { vst1q_f32(output3, voutput3); output3 += 4; } } } image_size -= 4; } if (image_size >= 2) { float32x2_t voutput0 = vld1_f32(output0); float32x2_t voutput1 = vld1_f32(output1); float32x2_t voutput2 = vld1_f32(output2); float32x2_t voutput3 = vld1_f32(output3); const float32x2_t vinput0 = vld1_f32(input0); input0 += 2; voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x)); voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x)); voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x)); voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x)); if (input_channels_subblock_size > 1) { const float32x2_t vinput1 = vld1_f32(input1); input1 += 2; voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x)); voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x)); voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x)); voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x)); if (input_channels_subblock_size > 2) { const float32x2_t vinput2 = vld1_f32(input2); input2 += 2; voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x)); voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x)); voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x)); voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x)); if (input_channels_subblock_size > 3) { const float32x2_t vinput3 = vld1_f32(input3); input3 += 2; voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x)); voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x)); voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x)); voutput3 = 
vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x)); } } } vst1_f32(output0, voutput0); output0 += 2; if (output_channels_subblock_size > 1) { vst1_f32(output1, voutput1); output1 += 2; if (output_channels_subblock_size > 2) { vst1_f32(output2, voutput2); output2 += 2; if (output_channels_subblock_size > 3) { vst1_f32(output3, voutput3); output3 += 2; } } } image_size -= 2; } if (image_size != 0) { float32x2_t voutput0 = vld1_dup_f32(output0); float32x2_t voutput1 = vld1_dup_f32(output1); float32x2_t voutput2 = vld1_dup_f32(output2); float32x2_t voutput3 = vld1_dup_f32(output3); const float32x2_t vinput0 = vld1_dup_f32(input0); voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x)); voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x)); voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x)); voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x)); if (input_channels_subblock_size > 1) { const float32x2_t vinput1 = vld1_dup_f32(input1); voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x)); voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x)); voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x)); voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x)); if (input_channels_subblock_size > 2) { const float32x2_t vinput2 = vld1_dup_f32(input2); voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x)); voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x)); voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x)); voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x)); if (input_channels_subblock_size > 3) { const float32x2_t vinput3 = vld1_dup_f32(input3); voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x)); voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x)); voutput2 = vmuladd_lane1_f32(voutput2, 
vinput3, vget_high_f32(vkernel2x)); voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x)); } } } vst1_lane_f32(output0, voutput0, 0); if (output_channels_subblock_size > 1) { vst1_lane_f32(output1, voutput1, 0); if (output_channels_subblock_size > 2) { vst1_lane_f32(output2, voutput2, 0); if (output_channels_subblock_size > 3) { vst1_lane_f32(output3, voutput3, 0); } } } } }
// In-place NEON butterfly pass over the 128-float array `a`.
//
// The first loop (j = 0, 2, 4, 6) reads and rewrites a[0..63]; the second
// loop (j = 64, 66, 68, 70) reads and rewrites a[64..127]. Every iteration
// loads eight 2-float pairs at offsets j + {0, 8, 16, 24, 32, 40, 48, 56},
// combines them with adds/subtracts and table multiplies, and stores eight
// pairs back to the same offsets.
//
// Reads the tables k_swap_sign, cftmdl_wk1r and rdft_wk{1,2,3}{r,i}, which are
// defined outside this block.
// NOTE(review): the variable names (x0r0, x0i1, ...) suggest each pair is an
// interleaved (real, imag) complex value and that k_swap_sign holds +/-1
// factors - confirm against the table definitions.
static void cftmdl_128_neon(float* a) {
  int j;
  const int l = 8;
  const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
  float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r);

  // First half: a[0..63], using the cftmdl_wk1r factors only.
  for (j = 0; j < l; j += 2) {
    // Low half of each combined vector comes from the first 32 floats of the
    // block, high half from the element 32 floats further on.
    const float32x2_t a_00 = vld1_f32(&a[j + 0]);
    const float32x2_t a_08 = vld1_f32(&a[j + 8]);
    const float32x2_t a_32 = vld1_f32(&a[j + 32]);
    const float32x2_t a_40 = vld1_f32(&a[j + 40]);
    const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
    const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
    const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
    const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
    const float32x2_t a_16 = vld1_f32(&a[j + 16]);
    const float32x2_t a_24 = vld1_f32(&a[j + 24]);
    const float32x2_t a_48 = vld1_f32(&a[j + 48]);
    const float32x2_t a_56 = vld1_f32(&a[j + 56]);
    const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
    const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
    const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
    const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
    const float32x4_t xx0 = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
    const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
    // Swap the two elements of each pair, then add/subtract the swapped x3
    // scaled element-wise by vec_swap_sign.
    const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
    const float32x4_t x1_x3_add =
        vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
    const float32x4_t x1_x3_sub =
        vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
    // Broadcast each element of the high halves of add/sub, pair them up as
    // (add, add, sub, sub), and combine into yy0 before scaling by wk1rv.
    const float32x2_t yy0_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 0);
    const float32x2_t yy0_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 0);
    const float32x4_t yy0_as = vcombine_f32(yy0_a, yy0_s);
    const float32x2_t yy1_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 1);
    const float32x2_t yy1_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 1);
    const float32x4_t yy1_as = vcombine_f32(yy1_a, yy1_s);
    const float32x4_t yy0 = vmlaq_f32(yy0_as, vec_swap_sign, yy1_as);
    const float32x4_t yy4 = vmulq_f32(wk1rv, yy0);
    const float32x4_t xx1_rev = vrev64q_f32(xx1);
    const float32x4_t yy4_rev = vrev64q_f32(yy4);

    vst1_f32(&a[j + 0], vget_low_f32(xx0));
    vst1_f32(&a[j + 32], vget_high_f32(xx0));
    vst1_f32(&a[j + 16], vget_low_f32(xx1));
    vst1_f32(&a[j + 48], vget_high_f32(xx1_rev));

    // Scalar fix-up: negate the first element of the pair just stored at
    // j + 48. Must stay after that store.
    a[j + 48] = -a[j + 48];

    vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add));
    vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub));
    vst1_f32(&a[j + 40], vget_low_f32(yy4));
    vst1_f32(&a[j + 56], vget_high_f32(yy4_rev));
  }

  // Second half: a[64..127], using the rdft_wk* tables at index k2 = 4.
  {
    const int k = 64;
    const int k1 = 2;
    const int k2 = 2 * k1;
    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2 + 0]);
    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2 + 0]);
    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2 + 0]);
    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2 + 0]);
    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2 + 0]);
    // wk1rv is reused: it now holds rdft_wk1r[4..7] instead of cftmdl_wk1r.
    wk1rv = vld1q_f32(&rdft_wk1r[k2 + 0]);
    for (j = k; j < l + k; j += 2) {
      // Same load/add/subtract pattern as the first loop.
      const float32x2_t a_00 = vld1_f32(&a[j + 0]);
      const float32x2_t a_08 = vld1_f32(&a[j + 8]);
      const float32x2_t a_32 = vld1_f32(&a[j + 32]);
      const float32x2_t a_40 = vld1_f32(&a[j + 40]);
      const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
      const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
      const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
      const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
      const float32x2_t a_16 = vld1_f32(&a[j + 16]);
      const float32x2_t a_24 = vld1_f32(&a[j + 24]);
      const float32x2_t a_48 = vld1_f32(&a[j + 48]);
      const float32x2_t a_56 = vld1_f32(&a[j + 56]);
      const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
      const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
      const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
      const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
      const float32x4_t xx = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
      const float32x4_t x1_x3_add =
          vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
      const float32x4_t x1_x3_sub =
          vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
      // Each result is value * wkNr + swapped(value) * wkNi, element-wise.
      float32x4_t xx4 = vmulq_f32(wk2rv, xx1);
      float32x4_t xx12 = vmulq_f32(wk1rv, x1_x3_add);
      float32x4_t xx22 = vmulq_f32(wk3rv, x1_x3_sub);
      xx4 = vmlaq_f32(xx4, wk2iv, vrev64q_f32(xx1));
      xx12 = vmlaq_f32(xx12, wk1iv, vrev64q_f32(x1_x3_add));
      xx22 = vmlaq_f32(xx22, wk3iv, vrev64q_f32(x1_x3_sub));

      vst1_f32(&a[j + 0], vget_low_f32(xx));
      vst1_f32(&a[j + 32], vget_high_f32(xx));
      vst1_f32(&a[j + 16], vget_low_f32(xx4));
      vst1_f32(&a[j + 48], vget_high_f32(xx4));
      vst1_f32(&a[j + 8], vget_low_f32(xx12));
      vst1_f32(&a[j + 40], vget_high_f32(xx12));
      vst1_f32(&a[j + 24], vget_low_f32(xx22));
      vst1_f32(&a[j + 56], vget_high_f32(xx22));
    }
  }
}
// Overload-style wrapper: stores the two-lane float vector `v` to `ptr`
// by forwarding to the vst1_f32 intrinsic.
inline void vst1(f32 * ptr, const float32x2_t & v)
{
    vst1_f32(ptr, v);
}