/*
 * Dot product of two f32 matrices, accumulated in f64.
 *
 * src0Base/src1Base with their row strides describe the two operands;
 * _size gives width (elements) and height (rows). Returns
 * sum over all elements of src0[r][c] * src1[r][c].
 */
f64 dotProduct(const Size2D &_size, const f32 * src0Base, ptrdiff_t src0Stride, const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    /* If both operands are dense (stride == row width in bytes), treat the
     * whole matrix as one long row so the vector loop runs uninterrupted. */
    if (src0Stride == src1Stride && src0Stride == (ptrdiff_t)(size.width * sizeof(f32)))
    {
        size.width *= size.height;
        size.height = 1;
    }
/* Max elements accumulated per f32 vector accumulator before flushing to the
 * f64 total; bounds single-precision rounding error on long rows.
 * NOTE(review): macro is never #undef'd, so it leaks past this function. */
#define DOT_FLOAT_BLOCKSIZE (1 << 13)
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, row);
        size_t i = 0;
        /* Blocked NEON loop: 4 products per iteration into a f32x4 accumulator,
         * flushed to the f64 result every DOT_FLOAT_BLOCKSIZE elements. */
        while(i + 4 <= size.width)
        {
            size_t lim = std::min(i + DOT_FLOAT_BLOCKSIZE, size.width) - 4;
            float32x4_t v_sum = vdupq_n_f32(0.0f);
            for( ; i <= lim; i += 4 )
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);
                v_sum = vmlaq_f32(v_sum, vld1q_f32(src0 + i), vld1q_f32(src1 + i));
            }
            /* Horizontal reduce the 4 accumulator lanes into the f64 total. */
            float32x2_t vres = vpadd_f32(vget_low_f32(v_sum),vget_high_f32(v_sum));
            result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);
        }
        /* Tail: one 2-wide vector step, then scalar remainder. */
        if(i + 2 <= size.width)
        {
            float32x2_t vres = vmul_f32(vld1_f32(src0 + i), vld1_f32(src1 + i));
            result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);
            i += 2;
        }
        for (; i < size.width; ++i)
            result += src0[i] * src1[i];
    }
    return result;
#else
    /* Non-NEON build: intentionally a stub (assertSupportedConfiguration
     * above is expected to reject this path). */
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    return 0;
#endif
}
/*
 * Scale RFFT data in place by 2/|length|. |length| must be a power of two.
 * (The header previously claimed 1/|length|; the code has always applied
 * 2.0f / length, so the comment is corrected to match the implementation.)
 *
 * Fix: the function was declared without a return type ("static inline
 * ScaleRFFTData(...)"); implicit int is invalid in C++ and C99, so the
 * missing `void` is added. No behavioral change.
 */
static inline void ScaleRFFTData(OMX_F32* fftData, unsigned length) {
  float32_t* data = (float32_t*)fftData;
  float32_t scale = 2.0f / length;

  if (length >= 4) {
    /*
     * Do 4 float elements at a time because |length| is always a
     * multiple of 4 when |length| >= 4.
     *
     * TODO(rtoy): Figure out how to process 8 elements at a time
     * using intrinsics or replace this with inline assembly.
     */
    do {
      float32x4_t x = vld1q_f32(data);
      length -= 4;
      x = vmulq_n_f32(x, scale);
      vst1q_f32(data, x);
      data += 4;
    } while (length > 0);
  } else if (length == 2) {
    float32x2_t x = vld1_f32(data);
    x = vmul_n_f32(x, scale);
    vst1_f32(data, x);
  } else {
    /* length == 1 (power of two below 2). */
    fftData[0] *= scale;
  }
}
/*
 * Deinterleave one block of stereo samples and mix (add) it into the
 * left/right output channels at slot `oofs` and `oofs + 1`.
 * `src_leftright` holds CBOX_BLOCK_SIZE interleaved {L, R} frames.
 */
static inline void mix_block_into(cbox_sample_t **outputs, int oofs, float *src_leftright)
{
    float *out_l = outputs[oofs];
    float *out_r = outputs[oofs + 1];
    for (size_t n = 0; n < CBOX_BLOCK_SIZE; n += 2)
    {
        /* Two interleaved frames: {L0,R0} and {L1,R1}. */
        float32x2_t frame0 = vld1_f32(src_leftright + 2 * n);
        float32x2_t frame1 = vld1_f32(src_leftright + 2 * n + 2);
        /* Transpose to planar: val[0] = {L0,L1}, val[1] = {R0,R1}. */
        float32x2x2_t planar = vtrn_f32(frame0, frame1);
        /* Accumulate into both destination channels. */
        float32x2_t acc_l = vadd_f32(vld1_f32(out_l + n), planar.val[0]);
        float32x2_t acc_r = vadd_f32(vld1_f32(out_r + n), planar.val[1]);
        vst1_f32(out_l + n, acc_l);
        vst1_f32(out_r + n, acc_r);
    }
}
/* Check lane-to-scalar duplication from a float32x2_t: extracting lane 0
   and lane 1 through the wrap_* helpers must reproduce the original array
   elements. Returns 0 on success, 1 on mismatch.
   NOTE(review): wrap_vdups_lane_f32_0 takes the vector twice while the
   lane-1 wrapper takes it once — presumably to exercise a different
   argument-passing path; confirm against the wrapper definitions. */
test_vdups_lane_f32 ()
{
  float32x2_t a;
  float32_t b;
  /* Expected per-lane values. */
  float32_t c[2] = { 0.0, 1.0 };

  a = vld1_f32 (c);

  b = wrap_vdups_lane_f32_0 (a, a);
  if (c[0] != b)
    return 1;

  b = wrap_vdups_lane_f32_1 (a);
  if (c[1] != b)
    return 1;
  return 0;
}
/* Verify vsqrt_f32 on a 2-lane vector: sqrt({4, 9}) must be exactly
   {2, 3}; aborts on any mismatch. */
void test_square_root_v2sf ()
{
  const float32_t inputs[] = {4.0f, 9.0f};
  float32x2_t roots = vsqrt_f32 (vld1_f32 (inputs));

  if (vget_lane_f32 (roots, 0) != 2.0f)
    abort ();
  if (vget_lane_f32 (roots, 1) != 3.0f)
    abort ();
}
/* Run both vmulx-by-lane kernels on (v1, v2) and compare every result
   element against the expected vectors: lane-0 result against e1,
   lane-1 result against e2. Aborts on any mismatch. */
void test_case (float32_t v1[2], float32_t v2[2], float32_t e1[2], float32_t e2[2])
{
  float32x2_t lhs = vld1_f32 (v1);
  float32x2_t rhs = vld1_f32 (v2);
  float32_t got[2];
  int lane;

  /* Multiply by lane 0: must equal e1 element-wise. */
  vst1_f32 (got, test_vmulx_lane0_f32 (lhs, rhs));
  for (lane = 0; lane < 2; ++lane)
    if (got[lane] != e1[lane])
      abort ();

  /* Multiply by lane 1: must equal e2 element-wise. */
  vst1_f32 (got, test_vmulx_lane1_f32 (lhs, rhs));
  for (lane = 0; lane < 2; ++lane)
    if (got[lane] != e2[lane])
      abort ();
}
/* Check vreinterpret from float32x2_t to float64x1_t: the two f32 bit
   patterns below are the halves of the IEEE-754 double for pi, so the
   reinterpreted value must compare equal to PI_F64 within __DBL_EPSILON__.
   Returns 0 on success, 1 on mismatch.
   NOTE(review): local `i` is unused, and the closing brace carries a
   stray semicolon — harmless, left untouched here. */
test_vreinterpret_f64_f32 ()
{
  float32x2_t a;
  float64x1_t b;
  /* Values { 0x54442D18, 0x400921FB } reinterpreted as f32. */
  float32_t c[2] = { 3.3702805504E12, 2.1426990032196044921875E0 };
  float64_t d[1] = { PI_F64 };
  float64_t e[1];
  int i;

  a = vld1_f32 (c);
  b = wrap_vreinterpret_f64_f32 (a);
  vst1_f64 (e, b);
  if (!DOUBLE_EQUALS (d[0], e[0], __DBL_EPSILON__))
    return 1;
  return 0;
};
/* NEON path of the virtualizer effect over `sizeInFrames` stereo frames
   of interleaved 16-bit samples.
   NOTE(review): gainClip, maxAbsSample, samples and inLR look unused in
   this body, and `buffer` is never advanced here — presumably
   virtualizerNeon() / floatToShortNeon() / footerNeon() are macros that
   reference these locals and advance the pointer; confirm against their
   definitions before touching any of these "dead" variables. */
void processVirtualizerNeon(int16_t* buffer, uint32_t sizeInFrames) {
	effectsFramesBeforeRecoveringGain -= sizeInFrames;

	/* Current gain-clip state and running peak tracker (consumed by the
	   effect macros below — see NOTE above). */
	float32x2_t gainClip = vld1_f32(effectsGainClip);
	float32x2_t maxAbsSample = vdup_n_f32(0.0f);

	while ((sizeInFrames--)) {
		float *samples = equalizerSamples;

		/* Widen the current L/R int16 pair to int32, then to float. */
		effectsTemp[0] = (int32_t)buffer[0];
		effectsTemp[1] = (int32_t)buffer[1];
		//inLR = { L, R }
		float32x2_t inLR = vcvt_f32_s32(*((int32x2_t*)effectsTemp));

		virtualizerNeon();

		floatToShortNeon();
	}
	footerNeon();
}
/* Check lane broadcast from float32x2_t to float32x4_t: duplicating
   lane 0 (then lane 1) through the wrap_* helpers must fill all four
   output lanes with that element. Returns 0 on success, 1 on mismatch. */
test_vdupq_lane_f32 ()
{
  float32x2_t a;
  float32x4_t b;
  int i;
  /* Source lanes; d receives each broadcast result. */
  float32_t c[2] = { 0.0 , 3.14 };
  float32_t d[4];

  a = vld1_f32 (c);

  b = wrap_vdupq_lane_f32_0 (a);
  vst1q_f32 (d, b);
  for (i = 0; i < 4; i++)
    if (c[0] != d[i])
      return 1;

  b = wrap_vdupq_lane_f32_1 (a);
  vst1q_f32 (d, b);
  for (i = 0; i < 4; i++)
    if (c[1] != d[i])
      return 1;
  return 0;
}
/*
 * 1x1-convolution micro-kernel for a full 4x4 sub-block
 * (4 input channels x 4 output channels) over one image plane.
 * Accumulates into output (which must be pre-initialized):
 *   output[o][p] += sum_{i=0..3} input[i][p] * kernel[o][i]
 * for every pixel p — one lane-multiply-accumulate per (input, output)
 * channel pair, as laid out below.
 *
 * input:  4 consecutive planes of image_size floats.
 * kernel: rows strided by input_channels; the first 4 floats of each
 *         of the 4 rows are used.
 * output: 4 consecutive planes of image_size floats.
 *
 * NOTE(review): vmuladdq_lane[01]_f32 / vmuladd_lane[01]_f32 are project
 * macros — presumably multiply-accumulate by one lane of the given
 * half-vector; confirm in the project headers.
 */
void nnp_conv1x1_only_4x4__neon(
	size_t input_channels,
	size_t image_size,
	const float* input,
	const float* kernel,
	float* output)
{
	/* Four consecutive input-channel planes. */
	const float* input0 = input;
	const float* input1 = input0 + image_size;
	const float* input2 = input1 + image_size;
	const float* input3 = input2 + image_size;

	/* One 4-lane kernel vector per output channel; kernel rows are
	 * input_channels floats apart. */
	const float32x4_t vkernel0x = vld1q_f32(kernel);
	kernel += input_channels;
	const float32x4_t vkernel1x = vld1q_f32(kernel);
	kernel += input_channels;
	const float32x4_t vkernel2x = vld1q_f32(kernel);
	kernel += input_channels;
	const float32x4_t vkernel3x = vld1q_f32(kernel);

	/* Four consecutive output-channel planes. */
	float* output0 = output;
	float* output1 = output0 + image_size;
	float* output2 = output1 + image_size;
	float* output3 = output2 + image_size;

	/* Main loop: 4 pixels per iteration using 128-bit vectors. */
	while (image_size >= 4) {
		float32x4_t voutput0 = vld1q_f32(output0);
		float32x4_t voutput1 = vld1q_f32(output1);
		float32x4_t voutput2 = vld1q_f32(output2);
		float32x4_t voutput3 = vld1q_f32(output3);

		/* Input channel 0 -> kernel lane 0 (low half). */
		const float32x4_t vinput0 = vld1q_f32(input0);
		input0 += 4;
		voutput0 = vmuladdq_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladdq_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladdq_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladdq_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

		/* Input channel 1 -> kernel lane 1 (low half). */
		const float32x4_t vinput1 = vld1q_f32(input1);
		input1 += 4;
		voutput0 = vmuladdq_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
		voutput1 = vmuladdq_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
		voutput2 = vmuladdq_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
		voutput3 = vmuladdq_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

		/* Input channel 2 -> kernel lane 2 (lane 0 of high half). */
		const float32x4_t vinput2 = vld1q_f32(input2);
		input2 += 4;
		voutput0 = vmuladdq_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
		voutput1 = vmuladdq_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
		voutput2 = vmuladdq_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
		voutput3 = vmuladdq_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

		/* Input channel 3 -> kernel lane 3 (lane 1 of high half). */
		const float32x4_t vinput3 = vld1q_f32(input3);
		input3 += 4;
		voutput0 = vmuladdq_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
		voutput1 = vmuladdq_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
		voutput2 = vmuladdq_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
		voutput3 = vmuladdq_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));

		vst1q_f32(output0, voutput0);
		output0 += 4;
		vst1q_f32(output1, voutput1);
		output1 += 4;
		vst1q_f32(output2, voutput2);
		output2 += 4;
		vst1q_f32(output3, voutput3);
		output3 += 4;

		image_size -= 4;
	}
	/* Tail: 2 pixels with 64-bit vectors. */
	if (image_size >= 2) {
		float32x2_t voutput0 = vld1_f32(output0);
		float32x2_t voutput1 = vld1_f32(output1);
		float32x2_t voutput2 = vld1_f32(output2);
		float32x2_t voutput3 = vld1_f32(output3);

		const float32x2_t vinput0 = vld1_f32(input0);
		input0 += 2;
		voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

		const float32x2_t vinput1 = vld1_f32(input1);
		input1 += 2;
		voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

		const float32x2_t vinput2 = vld1_f32(input2);
		input2 += 2;
		voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

		const float32x2_t vinput3 = vld1_f32(input3);
		input3 += 2;
		voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
		voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
		voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
		voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));

		vst1_f32(output0, voutput0);
		output0 += 2;
		vst1_f32(output1, voutput1);
		output1 += 2;
		vst1_f32(output2, voutput2);
		output2 += 2;
		vst1_f32(output3, voutput3);
		output3 += 2;

		image_size -= 2;
	}
	/* Tail: final single pixel via dup-load and lane-0 store. */
	if (image_size != 0) {
		float32x2_t voutput0 = vld1_dup_f32(output0);
		float32x2_t voutput1 = vld1_dup_f32(output1);
		float32x2_t voutput2 = vld1_dup_f32(output2);
		float32x2_t voutput3 = vld1_dup_f32(output3);

		const float32x2_t vinput0 = vld1_dup_f32(input0);
		voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));

		const float32x2_t vinput1 = vld1_dup_f32(input1);
		voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));

		const float32x2_t vinput2 = vld1_dup_f32(input2);
		voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));

		const float32x2_t vinput3 = vld1_dup_f32(input3);
		voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
		voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
		voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
		voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));

		vst1_lane_f32(output0, voutput0, 0);
		vst1_lane_f32(output1, voutput1, 0);
		vst1_lane_f32(output2, voutput2, 0);
		vst1_lane_f32(output3, voutput3, 0);
	}
}
/*
 * Edge-case variant of the 4x4 1x1-convolution micro-kernel: handles a
 * partial sub-block of up to 4 input channels and up to 4 output
 * channels. Absent channels alias the previous pointer, and the nested
 * if-ladders skip the extra multiply-adds and stores so only valid
 * planes are ever read or written. Accumulates into output, which must
 * be pre-initialized.
 *
 * NOTE(review): vmuladdq_lane[01]_f32 / vmuladd_lane[01]_f32 are project
 * macros — presumably multiply-accumulate by one lane of the given
 * half-vector; confirm in the project headers.
 */
void nnp_conv1x1_upto_4x4__neon(
	uint32_t input_channels_subblock_size,
	uint32_t output_channels_subblock_size,
	size_t input_channels,
	size_t image_size,
	const float* input,
	const float* kernel,
	float* output)
{
	/* Input planes; absent channels alias the previous plane. */
	const float*restrict input0 = input;
	const float*restrict input1 = input_channels_subblock_size > 1 ? input0 + image_size : input0;
	const float*restrict input2 = input_channels_subblock_size > 2 ? input1 + image_size : input1;
	const float*restrict input3 = input_channels_subblock_size > 3 ? input2 + image_size : input2;
	/* Kernel rows; absent output channels alias the previous row. */
	const float*restrict kernel0 = kernel;
	const float*restrict kernel1 = output_channels_subblock_size > 1 ? kernel0 + input_channels : kernel0;
	const float*restrict kernel2 = output_channels_subblock_size > 2 ? kernel1 + input_channels : kernel1;
	const float*restrict kernel3 = output_channels_subblock_size > 3 ? kernel2 + input_channels : kernel2;

	/* Broadcast element 0 of each row, then overwrite lanes 1..3 only
	 * for input channels that exist — avoids reading past a short row. */
	float32x4_t vkernel0x = vld1q_dup_f32(kernel0);
	float32x4_t vkernel1x = vld1q_dup_f32(kernel1);
	float32x4_t vkernel2x = vld1q_dup_f32(kernel2);
	float32x4_t vkernel3x = vld1q_dup_f32(kernel3);
	if (input_channels_subblock_size > 1) {
		vkernel0x = vld1q_lane_f32(kernel0 + 1, vkernel0x, 1);
		vkernel1x = vld1q_lane_f32(kernel1 + 1, vkernel1x, 1);
		vkernel2x = vld1q_lane_f32(kernel2 + 1, vkernel2x, 1);
		vkernel3x = vld1q_lane_f32(kernel3 + 1, vkernel3x, 1);
		if (input_channels_subblock_size > 2) {
			vkernel0x = vld1q_lane_f32(kernel0 + 2, vkernel0x, 2);
			vkernel1x = vld1q_lane_f32(kernel1 + 2, vkernel1x, 2);
			vkernel2x = vld1q_lane_f32(kernel2 + 2, vkernel2x, 2);
			vkernel3x = vld1q_lane_f32(kernel3 + 2, vkernel3x, 2);
			if (input_channels_subblock_size > 3) {
				vkernel0x = vld1q_lane_f32(kernel0 + 3, vkernel0x, 3);
				vkernel1x = vld1q_lane_f32(kernel1 + 3, vkernel1x, 3);
				vkernel2x = vld1q_lane_f32(kernel2 + 3, vkernel2x, 3);
				vkernel3x = vld1q_lane_f32(kernel3 + 3, vkernel3x, 3);
			}
		}
	}
	/* Output planes; absent channels alias the previous plane. */
	float*restrict output0 = output;
	float*restrict output1 = output_channels_subblock_size > 1 ? output0 + image_size : output0;
	float*restrict output2 = output_channels_subblock_size > 2 ? output1 + image_size : output1;
	float*restrict output3 = output_channels_subblock_size > 3 ? output2 + image_size : output2;

	/* Main loop: 4 pixels per iteration using 128-bit vectors. */
	while (image_size >= 4) {
		float32x4_t voutput0 = vld1q_f32(output0);
		float32x4_t voutput1 = vld1q_f32(output1);
		float32x4_t voutput2 = vld1q_f32(output2);
		float32x4_t voutput3 = vld1q_f32(output3);

		/* Input channel 0 is always present: kernel lane 0 (low half). */
		const float32x4_t vinput0 = vld1q_f32(input0);
		input0 += 4;
		voutput0 = vmuladdq_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladdq_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladdq_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladdq_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));
		if (input_channels_subblock_size > 1) {
			const float32x4_t vinput1 = vld1q_f32(input1);
			input1 += 4;
			voutput0 = vmuladdq_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
			voutput1 = vmuladdq_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
			voutput2 = vmuladdq_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
			voutput3 = vmuladdq_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));
			if (input_channels_subblock_size > 2) {
				const float32x4_t vinput2 = vld1q_f32(input2);
				input2 += 4;
				voutput0 = vmuladdq_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
				voutput1 = vmuladdq_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
				voutput2 = vmuladdq_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
				voutput3 = vmuladdq_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));
				if (input_channels_subblock_size > 3) {
					const float32x4_t vinput3 = vld1q_f32(input3);
					input3 += 4;
					voutput0 = vmuladdq_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
					voutput1 = vmuladdq_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
					voutput2 = vmuladdq_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
					voutput3 = vmuladdq_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));
				}
			}
		}
		/* Store only the output planes that exist. */
		vst1q_f32(output0, voutput0);
		output0 += 4;
		if (output_channels_subblock_size > 1) {
			vst1q_f32(output1, voutput1);
			output1 += 4;
			if (output_channels_subblock_size > 2) {
				vst1q_f32(output2, voutput2);
				output2 += 4;
				if (output_channels_subblock_size > 3) {
					vst1q_f32(output3, voutput3);
					output3 += 4;
				}
			}
		}
		image_size -= 4;
	}
	/* Tail: 2 pixels with 64-bit vectors. */
	if (image_size >= 2) {
		float32x2_t voutput0 = vld1_f32(output0);
		float32x2_t voutput1 = vld1_f32(output1);
		float32x2_t voutput2 = vld1_f32(output2);
		float32x2_t voutput3 = vld1_f32(output3);

		const float32x2_t vinput0 = vld1_f32(input0);
		input0 += 2;
		voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));
		if (input_channels_subblock_size > 1) {
			const float32x2_t vinput1 = vld1_f32(input1);
			input1 += 2;
			voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
			voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
			voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
			voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));
			if (input_channels_subblock_size > 2) {
				const float32x2_t vinput2 = vld1_f32(input2);
				input2 += 2;
				voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
				voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
				voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
				voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));
				if (input_channels_subblock_size > 3) {
					const float32x2_t vinput3 = vld1_f32(input3);
					input3 += 2;
					voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
					voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
					voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
					voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));
				}
			}
		}
		vst1_f32(output0, voutput0);
		output0 += 2;
		if (output_channels_subblock_size > 1) {
			vst1_f32(output1, voutput1);
			output1 += 2;
			if (output_channels_subblock_size > 2) {
				vst1_f32(output2, voutput2);
				output2 += 2;
				if (output_channels_subblock_size > 3) {
					vst1_f32(output3, voutput3);
					output3 += 2;
				}
			}
		}
		image_size -= 2;
	}
	/* Tail: final single pixel via dup-load and lane-0 store. */
	if (image_size != 0) {
		float32x2_t voutput0 = vld1_dup_f32(output0);
		float32x2_t voutput1 = vld1_dup_f32(output1);
		float32x2_t voutput2 = vld1_dup_f32(output2);
		float32x2_t voutput3 = vld1_dup_f32(output3);

		const float32x2_t vinput0 = vld1_dup_f32(input0);
		voutput0 = vmuladd_lane0_f32(voutput0, vinput0, vget_low_f32(vkernel0x));
		voutput1 = vmuladd_lane0_f32(voutput1, vinput0, vget_low_f32(vkernel1x));
		voutput2 = vmuladd_lane0_f32(voutput2, vinput0, vget_low_f32(vkernel2x));
		voutput3 = vmuladd_lane0_f32(voutput3, vinput0, vget_low_f32(vkernel3x));
		if (input_channels_subblock_size > 1) {
			const float32x2_t vinput1 = vld1_dup_f32(input1);
			voutput0 = vmuladd_lane1_f32(voutput0, vinput1, vget_low_f32(vkernel0x));
			voutput1 = vmuladd_lane1_f32(voutput1, vinput1, vget_low_f32(vkernel1x));
			voutput2 = vmuladd_lane1_f32(voutput2, vinput1, vget_low_f32(vkernel2x));
			voutput3 = vmuladd_lane1_f32(voutput3, vinput1, vget_low_f32(vkernel3x));
			if (input_channels_subblock_size > 2) {
				const float32x2_t vinput2 = vld1_dup_f32(input2);
				voutput0 = vmuladd_lane0_f32(voutput0, vinput2, vget_high_f32(vkernel0x));
				voutput1 = vmuladd_lane0_f32(voutput1, vinput2, vget_high_f32(vkernel1x));
				voutput2 = vmuladd_lane0_f32(voutput2, vinput2, vget_high_f32(vkernel2x));
				voutput3 = vmuladd_lane0_f32(voutput3, vinput2, vget_high_f32(vkernel3x));
				if (input_channels_subblock_size > 3) {
					const float32x2_t vinput3 = vld1_dup_f32(input3);
					voutput0 = vmuladd_lane1_f32(voutput0, vinput3, vget_high_f32(vkernel0x));
					voutput1 = vmuladd_lane1_f32(voutput1, vinput3, vget_high_f32(vkernel1x));
					voutput2 = vmuladd_lane1_f32(voutput2, vinput3, vget_high_f32(vkernel2x));
					voutput3 = vmuladd_lane1_f32(voutput3, vinput3, vget_high_f32(vkernel3x));
				}
			}
		}
		vst1_lane_f32(output0, voutput0, 0);
		if (output_channels_subblock_size > 1) {
			vst1_lane_f32(output1, voutput1, 0);
			if (output_channels_subblock_size > 2) {
				vst1_lane_f32(output2, voutput2, 0);
				if (output_channels_subblock_size > 3) {
					vst1_lane_f32(output3, voutput3, 0);
				}
			}
		}
	}
}
/* Compile-time smoke test for vld1_f32: checks that the intrinsic accepts
   the argument and yields a float32x2_t. The literal 0 is intentionally a
   null pointer — presumably this test is compiled but never executed;
   running it would be undefined behavior. */
void test_vld1f32 (void)
{
  float32x2_t out_float32x2_t;
  out_float32x2_t = vld1_f32 (0);
}
/*
 * NEON butterfly stage over a 128-element buffer of interleaved
 * {re, im} float pairs, operating in place on `a`.
 * Two passes: the first 8-element strip uses the wk1 twiddle vector
 * only; the second strip (starting at offset 64) additionally applies
 * the wk2/wk3 real and imaginary twiddle tables.
 * NOTE(review): presumably the "cftmdl" middle stage of an Ooura-style
 * 128-point real FFT (as in WebRTC's aec rdft) — confirm against the
 * scalar reference implementation. Statement order is load/store
 * sensitive; do not reorder.
 */
static void cftmdl_128_neon(float* a) {
  int j;
  const int l = 8;
  /* Sign mask used to swap add/sub between real and imaginary lanes. */
  const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign);
  float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r);

  /* First strip: pairs j and j+1 of the 8-wide butterflies. */
  for (j = 0; j < l; j += 2) {
    /* Gather the four complex inputs of two butterflies into q-regs. */
    const float32x2_t a_00 = vld1_f32(&a[j + 0]);
    const float32x2_t a_08 = vld1_f32(&a[j + 8]);
    const float32x2_t a_32 = vld1_f32(&a[j + 32]);
    const float32x2_t a_40 = vld1_f32(&a[j + 40]);
    const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
    const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
    /* x0 = a0 + a1, x1 = a0 - a1 (per complex element). */
    const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
    const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
    const float32x2_t a_16 = vld1_f32(&a[j + 16]);
    const float32x2_t a_24 = vld1_f32(&a[j + 24]);
    const float32x2_t a_48 = vld1_f32(&a[j + 48]);
    const float32x2_t a_56 = vld1_f32(&a[j + 56]);
    const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
    const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
    /* x2 = a2 + a3, x3 = a2 - a3. */
    const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
    const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
    const float32x4_t xx0 = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
    const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
    /* rev64 swaps re/im so the +/-i rotation can be done with one
     * signed multiply-accumulate against vec_swap_sign. */
    const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
    const float32x4_t x1_x3_add =
        vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
    const float32x4_t x1_x3_sub =
        vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
    /* Build {add.re, add.re, sub.re, sub.re} / lane-1 variant, then
     * combine with the sign mask and scale by wk1r. */
    const float32x2_t yy0_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 0);
    const float32x2_t yy0_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 0);
    const float32x4_t yy0_as = vcombine_f32(yy0_a, yy0_s);
    const float32x2_t yy1_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 1);
    const float32x2_t yy1_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 1);
    const float32x4_t yy1_as = vcombine_f32(yy1_a, yy1_s);
    const float32x4_t yy0 = vmlaq_f32(yy0_as, vec_swap_sign, yy1_as);
    const float32x4_t yy4 = vmulq_f32(wk1rv, yy0);
    const float32x4_t xx1_rev = vrev64q_f32(xx1);
    const float32x4_t yy4_rev = vrev64q_f32(yy4);

    /* Scatter results back; a[j+48] gets its real part negated after
     * the reversed store. */
    vst1_f32(&a[j + 0], vget_low_f32(xx0));
    vst1_f32(&a[j + 32], vget_high_f32(xx0));
    vst1_f32(&a[j + 16], vget_low_f32(xx1));
    vst1_f32(&a[j + 48], vget_high_f32(xx1_rev));
    a[j + 48] = -a[j + 48];
    vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add));
    vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub));
    vst1_f32(&a[j + 40], vget_low_f32(yy4));
    vst1_f32(&a[j + 56], vget_high_f32(yy4_rev));
  }

  {
    /* Second strip at offset 64, with the k1=2 twiddle tables. */
    const int k = 64;
    const int k1 = 2;
    const int k2 = 2 * k1;
    const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2 + 0]);
    const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2 + 0]);
    const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2 + 0]);
    const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2 + 0]);
    const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2 + 0]);
    wk1rv = vld1q_f32(&rdft_wk1r[k2 + 0]);
    for (j = k; j < l + k; j += 2) {
      /* Same gather and radix-4 combine as the first strip. */
      const float32x2_t a_00 = vld1_f32(&a[j + 0]);
      const float32x2_t a_08 = vld1_f32(&a[j + 8]);
      const float32x2_t a_32 = vld1_f32(&a[j + 32]);
      const float32x2_t a_40 = vld1_f32(&a[j + 40]);
      const float32x4_t a_00_32 = vcombine_f32(a_00, a_32);
      const float32x4_t a_08_40 = vcombine_f32(a_08, a_40);
      const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40);
      const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40);
      const float32x2_t a_16 = vld1_f32(&a[j + 16]);
      const float32x2_t a_24 = vld1_f32(&a[j + 24]);
      const float32x2_t a_48 = vld1_f32(&a[j + 48]);
      const float32x2_t a_56 = vld1_f32(&a[j + 56]);
      const float32x4_t a_16_48 = vcombine_f32(a_16, a_48);
      const float32x4_t a_24_56 = vcombine_f32(a_24, a_56);
      const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56);
      const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56);
      const float32x4_t xx = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1);
      const float32x4_t x1_x3_add =
          vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
      const float32x4_t x1_x3_sub =
          vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1);
      /* Complex multiply by the wk1/wk2/wk3 twiddles:
       * real part via vmulq, imaginary contribution via vmlaq on the
       * re/im-swapped operand. */
      float32x4_t xx4 = vmulq_f32(wk2rv, xx1);
      float32x4_t xx12 = vmulq_f32(wk1rv, x1_x3_add);
      float32x4_t xx22 = vmulq_f32(wk3rv, x1_x3_sub);
      xx4 = vmlaq_f32(xx4, wk2iv, vrev64q_f32(xx1));
      xx12 = vmlaq_f32(xx12, wk1iv, vrev64q_f32(x1_x3_add));
      xx22 = vmlaq_f32(xx22, wk3iv, vrev64q_f32(x1_x3_sub));

      vst1_f32(&a[j + 0], vget_low_f32(xx));
      vst1_f32(&a[j + 32], vget_high_f32(xx));
      vst1_f32(&a[j + 16], vget_low_f32(xx4));
      vst1_f32(&a[j + 48], vget_high_f32(xx4));
      vst1_f32(&a[j + 8], vget_low_f32(xx12));
      vst1_f32(&a[j + 40], vget_high_f32(xx12));
      vst1_f32(&a[j + 24], vget_low_f32(xx22));
      vst1_f32(&a[j + 56], vget_high_f32(xx22));
    }
  }
}
/* Type-overloaded load helper: reads two consecutive f32 values from
 * `ptr` into a 64-bit NEON register (thin wrapper over vld1_f32 so
 * generic code can call vld1() uniformly across element types). */
inline float32x2_t vld1(const f32 * ptr)
{
    return vld1_f32(ptr);
}