template <bool align> SIMD_INLINE void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum) { assert(size >= F); if (align) assert(Aligned(a) && Aligned(b)); size_t partialAlignedSize = AlignLo(size, F); size_t fullAlignedSize = AlignLo(size, DF); size_t i = 0; float32x4_t sums[2] = { vdupq_n_f32(0), vdupq_n_f32(0) }; if (fullAlignedSize) { for (; i < fullAlignedSize; i += DF) { SquaredDifferenceSum16f<align>(a, b, i + F * 0, sums[0]); SquaredDifferenceSum16f<align>(a, b, i + F * 1, sums[1]); } sums[0] = vaddq_f32(sums[0], sums[1]); } for (; i < partialAlignedSize; i += F) SquaredDifferenceSum16f<align>(a, b, i, sums[0]); if (partialAlignedSize != size) { float32x4_t tailMask = RightNotZero(size - partialAlignedSize); float32x4_t _a = vcvt_f32_f16((float16x4_t)LoadHalf<align>(a + size - F)); float32x4_t _b = vcvt_f32_f16((float16x4_t)LoadHalf<align>(b + size - F)); float32x4_t _d = And(vsubq_f32(_a, _b), tailMask); sums[0] = vaddq_f32(sums[0], vmulq_f32(_d, _d)); } *sum = ExtractSum32f(sums[0]); }
static void ScaleErrorSignalNEON(int extended_filter_enabled, float normal_mu, float normal_error_threshold, float x_pow[PART_LEN1], float ef[2][PART_LEN1]) { const float mu = extended_filter_enabled ? kExtendedMu : normal_mu; const float error_threshold = extended_filter_enabled ? kExtendedErrorThreshold : normal_error_threshold; const float32x4_t k1e_10f = vdupq_n_f32(1e-10f); const float32x4_t kMu = vmovq_n_f32(mu); const float32x4_t kThresh = vmovq_n_f32(error_threshold); int i; // vectorized code (four at once) for (i = 0; i + 3 < PART_LEN1; i += 4) { const float32x4_t x_pow_local = vld1q_f32(&x_pow[i]); const float32x4_t ef_re_base = vld1q_f32(&ef[0][i]); const float32x4_t ef_im_base = vld1q_f32(&ef[1][i]); const float32x4_t xPowPlus = vaddq_f32(x_pow_local, k1e_10f); float32x4_t ef_re = vdivq_f32(ef_re_base, xPowPlus); float32x4_t ef_im = vdivq_f32(ef_im_base, xPowPlus); const float32x4_t ef_re2 = vmulq_f32(ef_re, ef_re); const float32x4_t ef_sum2 = vmlaq_f32(ef_re2, ef_im, ef_im); const float32x4_t absEf = vsqrtq_f32(ef_sum2); const uint32x4_t bigger = vcgtq_f32(absEf, kThresh); const float32x4_t absEfPlus = vaddq_f32(absEf, k1e_10f); const float32x4_t absEfInv = vdivq_f32(kThresh, absEfPlus); uint32x4_t ef_re_if = vreinterpretq_u32_f32(vmulq_f32(ef_re, absEfInv)); uint32x4_t ef_im_if = vreinterpretq_u32_f32(vmulq_f32(ef_im, absEfInv)); uint32x4_t ef_re_u32 = vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_re)); uint32x4_t ef_im_u32 = vandq_u32(vmvnq_u32(bigger), vreinterpretq_u32_f32(ef_im)); ef_re_if = vandq_u32(bigger, ef_re_if); ef_im_if = vandq_u32(bigger, ef_im_if); ef_re_u32 = vorrq_u32(ef_re_u32, ef_re_if); ef_im_u32 = vorrq_u32(ef_im_u32, ef_im_if); ef_re = vmulq_f32(vreinterpretq_f32_u32(ef_re_u32), kMu); ef_im = vmulq_f32(vreinterpretq_f32_u32(ef_im_u32), kMu); vst1q_f32(&ef[0][i], ef_re); vst1q_f32(&ef[1][i], ef_im); } // scalar code for the remaining items. for (; i < PART_LEN1; i++) { float abs_ef; ef[0][i] /= (x_pow[i] + 1e-10f); ef[1][i] /= (x_pow[i] + 1e-10f); abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]); if (abs_ef > error_threshold) { abs_ef = error_threshold / (abs_ef + 1e-10f); ef[0][i] *= abs_ef; ef[1][i] *= abs_ef; } // Stepsize factor ef[0][i] *= mu; ef[1][i] *= mu; } }
void qcms_transform_data_rgba_out_lut_neon(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length) { size_t i; unsigned char alpha; float32_t (*mat)[4] = transform->matrix; const float32_t *igtbl_r = (float32_t*)transform->input_gamma_table_r; const float32_t *igtbl_g = (float32_t*)transform->input_gamma_table_g; const float32_t *igtbl_b = (float32_t*)transform->input_gamma_table_b; const uint8_t *otdata_r = &transform->output_table_r->data[0]; const uint8_t *otdata_g = &transform->output_table_g->data[0]; const uint8_t *otdata_b = &transform->output_table_b->data[0]; const float32x4_t mat0 = vld1q_f32(mat[0]); const float32x4_t mat1 = vld1q_f32(mat[1]); const float32x4_t mat2 = vld1q_f32(mat[2]); const float32x4_t max = vld1q_dup_f32(&clampMaxValue); const float32x4_t min = vld1q_dup_f32(&zero); const float32x4_t scale = vld1q_dup_f32(&floatScale); float32x4_t vec_r, vec_g, vec_b; int32x4_t result; /* CYA */ if (!length) return; for (i = 0; i < length; i++) { /* setup for transforming the pixel */ vec_r = vld1q_dup_f32(&igtbl_r[*src++]); vec_g = vld1q_dup_f32(&igtbl_g[*src++]); vec_b = vld1q_dup_f32(&igtbl_b[*src++]); alpha = *src++; /* gamma * matrix */ vec_r = vmulq_f32(vec_r, mat0); vec_g = vmulq_f32(vec_g, mat1); vec_b = vmulq_f32(vec_b, mat2); /* crunch, crunch, crunch */ vec_r = vaddq_f32(vec_r, vaddq_f32(vec_g, vec_b)); vec_r = vmaxq_f32(min, vec_r); vec_r = vminq_f32(max, vec_r); result = vcvtq_s32_f32(vmulq_f32(vec_r, scale)); /* use calc'd indices to output RGB values */ *dest++ = otdata_r[vgetq_lane_s32(result, 0)]; *dest++ = otdata_g[vgetq_lane_s32(result, 1)]; *dest++ = otdata_b[vgetq_lane_s32(result, 2)]; *dest++ = alpha; } }
static inline void neon_make_rgb(float32x4_t macropixel, float32x4_t *rgba0p, float32x4_t *rgba1p) { const float32x4_t u_coeff = {0.0, -0.34455, 1.7790, 0.0 }; const float32x4_t v_coeff = {1.4075, -0.7169, 0.0, 0.0 }; float32x4_t y0_vec, y1_vec, u_vec, v_vec, uv_vec; float32x2_t y0_u, y1_v; const float32_t alpha = 255.0; /* macropixel is [Y0, U, Y1, V]. */ /* since vdupq_lane_f32 will only take two element vectors we */ /* need to pick macropixel apart to build vectors of the components. */ /* so make y0_u be the first half of macropixel [Y0, U] and */ /* y1_v be the second half [Y1, V]. */ y0_u = vget_low_f32(macropixel); y1_v = vget_high_f32(macropixel); /* now copy Y0 to all elements of y0_vec, then overwrite element 3 */ /* with alpha. */ y0_vec = vdupq_lane_f32(y0_u, 0); y0_vec = vsetq_lane_f32(alpha, y0_vec, 3); /* make u_vec be [U, U, U, U]. we'll do that using */ /* vdupq_lane_f32 and selecting U (element 1) from y0_u */ u_vec = vdupq_lane_f32(y0_u, 1); /* now copy Y1 to all elements of y1_vec, then overwrite element 3 */ /* with alpha. */ y1_vec = vdupq_lane_f32(y1_v, 0); y1_vec = vsetq_lane_f32(alpha, y1_vec, 3); /* make v_vec be [V, V, V, V]. we'll do that using */ /* vdupq_lane_f32 and selecting V (element 1) from y1_v */ v_vec = vdupq_lane_f32(y1_v, 1); /* now multiply u_vec * u_coeff and v_vec by v_coeff. */ u_vec = vmulq_f32(u_vec, u_coeff); v_vec = vmulq_f32(v_vec, v_coeff); /* add u_vec and v_vec to form uv_vec. use that to build */ /* rgba0 and rgba1 by adding y0_vec, y1_vec*/ uv_vec = vaddq_f32(u_vec, v_vec); *rgba0p = vaddq_f32(y0_vec, uv_vec); *rgba1p = vaddq_f32(y1_vec, uv_vec); }
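/* Added reference sketch (not part of the original source): scalar version of the per-macropixel
 * math above, using the same coefficients. It assumes U and V already have their 128 bias removed
 * upstream, since the vector code applies none. Alpha stays 255 because the U/V coefficients in
 * the alpha lane are zero, exactly as in the vector version. */
static void make_rgb_scalar(float y0, float u, float y1, float v,
                            float rgba0[4], float rgba1[4])
{
    const float uv_r =  0.00000f * u + 1.4075f * v;
    const float uv_g = -0.34455f * u - 0.7169f * v;
    const float uv_b =  1.77900f * u + 0.0000f * v;

    rgba0[0] = y0 + uv_r; rgba0[1] = y0 + uv_g; rgba0[2] = y0 + uv_b; rgba0[3] = 255.0f;
    rgba1[0] = y1 + uv_r; rgba1[1] = y1 + uv_g; rgba1[2] = y1 + uv_b; rgba1[3] = 255.0f;
}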
/* f32x4 mm mul */ void mw_neon_mm_mul_f32x4(float * A, int Row, int T, float * B, int Col, float * C) { int i, k, j; float32x4_t neon_b, neon_c; float32x4_t neon_a0, neon_a1, neon_a2, neon_a3; float32x4_t neon_b0, neon_b1, neon_b2, neon_b3; for (i = 0; i < Row; i+=4) { for (k = 0; k < Col; k+=1) { neon_c = vmovq_n_f32(0); for (j = 0; j < T; j+=4) { int j_T = j * T + i; int k_Row = k * Row; neon_a0 = vld1q_f32(A + j_T); j_T+=Row; neon_a1 = vld1q_f32(A + j_T); j_T+=Row; neon_a2 = vld1q_f32(A + j_T); j_T+=Row; neon_a3 = vld1q_f32(A + j_T); neon_b = vld1q_f32(B + k_Row + j); neon_b0 = vdupq_n_f32(vgetq_lane_f32(neon_b, 0)); neon_b1 = vdupq_n_f32(vgetq_lane_f32(neon_b, 1)); neon_b2 = vdupq_n_f32(vgetq_lane_f32(neon_b, 2)); neon_b3 = vdupq_n_f32(vgetq_lane_f32(neon_b, 3)); neon_c = vaddq_f32(vmulq_f32(neon_a0, neon_b0), neon_c); neon_c = vaddq_f32(vmulq_f32(neon_a1, neon_b1), neon_c); neon_c = vaddq_f32(vmulq_f32(neon_a2, neon_b2), neon_c); neon_c = vaddq_f32(vmulq_f32(neon_a3, neon_b3), neon_c); vst1q_lane_f32(C + k_Row + i, neon_c, 0); vst1q_lane_f32(C + k_Row + i + 1, neon_c, 1); vst1q_lane_f32(C + k_Row + i + 2, neon_c, 2); vst1q_lane_f32(C + k_Row + i + 3, neon_c, 3); } } } }
static void cft1st_128_neon(float* a) { const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign); int j, k2; for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) { float32x4_t a00v = vld1q_f32(&a[j + 0]); float32x4_t a04v = vld1q_f32(&a[j + 4]); float32x4_t a08v = vld1q_f32(&a[j + 8]); float32x4_t a12v = vld1q_f32(&a[j + 12]); float32x4_t a01v = vcombine_f32(vget_low_f32(a00v), vget_low_f32(a08v)); float32x4_t a23v = vcombine_f32(vget_high_f32(a00v), vget_high_f32(a08v)); float32x4_t a45v = vcombine_f32(vget_low_f32(a04v), vget_low_f32(a12v)); float32x4_t a67v = vcombine_f32(vget_high_f32(a04v), vget_high_f32(a12v)); const float32x4_t wk1rv = vld1q_f32(&rdft_wk1r[k2]); const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2]); const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2]); const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2]); const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2]); const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2]); float32x4_t x0v = vaddq_f32(a01v, a23v); const float32x4_t x1v = vsubq_f32(a01v, a23v); const float32x4_t x2v = vaddq_f32(a45v, a67v); const float32x4_t x3v = vsubq_f32(a45v, a67v); const float32x4_t x3w = vrev64q_f32(x3v); float32x4_t x0w; a01v = vaddq_f32(x0v, x2v); x0v = vsubq_f32(x0v, x2v); x0w = vrev64q_f32(x0v); a45v = vmulq_f32(wk2rv, x0v); a45v = vmlaq_f32(a45v, wk2iv, x0w); x0v = vmlaq_f32(x1v, x3w, vec_swap_sign); x0w = vrev64q_f32(x0v); a23v = vmulq_f32(wk1rv, x0v); a23v = vmlaq_f32(a23v, wk1iv, x0w); x0v = vmlsq_f32(x1v, x3w, vec_swap_sign); x0w = vrev64q_f32(x0v); a67v = vmulq_f32(wk3rv, x0v); a67v = vmlaq_f32(a67v, wk3iv, x0w); a00v = vcombine_f32(vget_low_f32(a01v), vget_low_f32(a23v)); a04v = vcombine_f32(vget_low_f32(a45v), vget_low_f32(a67v)); a08v = vcombine_f32(vget_high_f32(a01v), vget_high_f32(a23v)); a12v = vcombine_f32(vget_high_f32(a45v), vget_high_f32(a67v)); vst1q_f32(&a[j + 0], a00v); vst1q_f32(&a[j + 4], a04v); vst1q_f32(&a[j + 8], a08v); vst1q_f32(&a[j + 12], a12v); } }
static void test_fma() { for(int i=0; i<1020 * 4; i++) { data_f[i] = i; } float32x4_t c0_02 = vdupq_n_f32(0.02f); float32x4_t c0_04 = vdupq_n_f32(0.04f); float32x4_t c0_05 = vdupq_n_f32(0.05f); float32x4_t c0_10 = vdupq_n_f32(0.1f); float32x4_t c0_20 = vdupq_n_f32(0.2f); float32x4_t c1_00 = vdupq_n_f32(1.0f); startTime(); // Do ~1 billion ops for (int ct=0; ct < (1000 * (1000 / 80)); ct++) { for (int i=0; i < 1000; i++) { float32x4_t t; t = vmulq_f32(vld1q_f32((float32_t *)&data_f[i]), c0_02); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+4]), c0_04); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+8]), c0_05); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+12]), c0_10); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+16]), c0_20); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+20]), c0_20); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+24]), c0_10); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+28]), c0_05); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+32]), c0_04); t = vmlaq_f32(t, vld1q_f32((float32_t *)&data_f[i+36]), c0_02); t = vaddq_f32(t, c1_00); vst1q_f32((float32_t *)&data_f[i], t); } } endTime("neon fma", 1e9); }
static inline float32x4_t floor_neon(float32x4_t a) { #if __ARM_ARCH >= 8 return vrndmq_f32(a); #else const float32x4_t round32 = vdupq_n_f32(12582912.0f); const float32x4_t vhalf = vdupq_n_f32(0.5f); float32x4_t rounded = vsubq_f32(vaddq_f32(a, round32), round32); uint32x4_t mask = vceqq_f32(a, rounded); float32x4_t floored = vsubq_f32(vaddq_f32(vsubq_f32(a, vhalf), round32), round32); return vreinterpretq_f32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_f32(a), mask), vbicq_u32(vreinterpretq_u32_f32(floored), mask))); #endif }
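/* Added reference sketch (not from the original source): scalar version of the magic-constant
 * trick used in the #else branch above. It assumes IEEE-754 binary32 with round-to-nearest:
 * adding and subtracting 1.5 * 2^23 pushes the fraction bits out of the mantissa, leaving the
 * value rounded to the nearest integer; floor() is then recovered by rounding (a - 0.5), unless
 * a is already integral, which is what the vceqq/mask selection above does branchlessly. */
static float floor_scalar_ref(float a)
{
    const float round32 = 12582912.0f;          /* 1.5 * 2^23 */
    float rounded = (a + round32) - round32;    /* a rounded to nearest integer */
    if (rounded == a)
        return a;                               /* already an integer */
    return ((a - 0.5f) + round32) - round32;    /* round(a - 0.5) == floor(a) for non-integers */
}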
//----------------------------------------------------------------------------------- void MathlibNEON::SinCos4( ArrayReal x, ArrayReal &outSin, ArrayReal &outCos ) { // TODO: Improve accuracy by mapping to the range [-pi/4, pi/4] and swap // between cos & sin depending on which quadrant it fell: // Quadrant | sin | cos // n = 0 -> sin( x ), cos( x ) // n = 1 -> cos( x ), -sin( x ) // n = 2 -> -sin( x ), -cos( x ) // n = 3 -> -cos( x ), sin( x ) // See ARGUMENT REDUCTION FOR HUGE ARGUMENTS: // Good to the Last Bit // K. C. Ng and the members of the FP group of SunPro // http://www.derekroconnor.net/Software/Ng--ArgReduction.pdf // -- Perhaps we can leave this to GSoC students? -- // Map arbitrary angle x to the range [-pi; +pi] without using division. // Code taken from MSDN's HLSL trick. Architectures with fused mad (i.e. NEON) // can replace the add, the sub, & the two muls for two mad ArrayReal integralPart; x = vaddq_f32( vmulq_f32( x, ONE_DIV_2PI ), HALF ); x = Modf4( x, integralPart ); x = vsubq_f32( vmulq_f32( x, TWO_PI ), PI ); sincos_ps( x, &outSin, &outCos ); }
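/* Added scalar sketch of the range reduction used above (assumption: Modf4() returns the
 * fractional part, as its use above implies). An arbitrary angle is mapped to [-pi, +pi)
 * without a division: scale by 1/(2*pi), shift by 0.5 so the wrap point lands on integers,
 * drop the integral part, then scale back and re-center. */
#include <math.h>

static float wrap_to_pi_scalar(float x)
{
    const float two_pi = 6.2831853f;
    const float pi     = 3.1415927f;
    float t = x * (1.0f / two_pi) + 0.5f;
    t -= floorf(t);                 /* fractional part in [0, 1) */
    return t * two_pi - pi;         /* back to [-pi, +pi) */
}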
inline v_int32x4 v_round(const v_float32x4& a) { static const int32x4_t v_sign = vdupq_n_s32(1 << 31), v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f)); int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val))); return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition)))); }
static void FilterFarNEON( int num_partitions, int x_fft_buf_block_pos, float x_fft_buf[2][kExtendedNumPartitions * PART_LEN1], float h_fft_buf[2][kExtendedNumPartitions * PART_LEN1], float y_fft[2][PART_LEN1]) { int i; for (i = 0; i < num_partitions; i++) { int j; int xPos = (i + x_fft_buf_block_pos) * PART_LEN1; int pos = i * PART_LEN1; // Check for wrap if (i + x_fft_buf_block_pos >= num_partitions) { xPos -= num_partitions * PART_LEN1; } // vectorized code (four at once) for (j = 0; j + 3 < PART_LEN1; j += 4) { const float32x4_t x_fft_buf_re = vld1q_f32(&x_fft_buf[0][xPos + j]); const float32x4_t x_fft_buf_im = vld1q_f32(&x_fft_buf[1][xPos + j]); const float32x4_t h_fft_buf_re = vld1q_f32(&h_fft_buf[0][pos + j]); const float32x4_t h_fft_buf_im = vld1q_f32(&h_fft_buf[1][pos + j]); const float32x4_t y_fft_re = vld1q_f32(&y_fft[0][j]); const float32x4_t y_fft_im = vld1q_f32(&y_fft[1][j]); const float32x4_t a = vmulq_f32(x_fft_buf_re, h_fft_buf_re); const float32x4_t e = vmlsq_f32(a, x_fft_buf_im, h_fft_buf_im); const float32x4_t c = vmulq_f32(x_fft_buf_re, h_fft_buf_im); const float32x4_t f = vmlaq_f32(c, x_fft_buf_im, h_fft_buf_re); const float32x4_t g = vaddq_f32(y_fft_re, e); const float32x4_t h = vaddq_f32(y_fft_im, f); vst1q_f32(&y_fft[0][j], g); vst1q_f32(&y_fft[1][j], h); } // scalar code for the remaining items. for (; j < PART_LEN1; j++) { y_fft[0][j] += MulRe(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j], h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]); y_fft[1][j] += MulIm(x_fft_buf[0][xPos + j], x_fft_buf[1][xPos + j], h_fft_buf[0][pos + j], h_fft_buf[1][pos + j]); } } }
inline int32x4_t cv_vrndq_s32_f32(float32x4_t v) { static int32x4_t v_sign = vdupq_n_s32(1 << 31), v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f)); int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(v))); return vcvtq_s32_f32(vaddq_f32(v, vreinterpretq_f32_s32(v_addition))); }
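/* Added scalar sketch of the rounding trick in v_round/cv_vrndq_s32_f32 above: the sign bit of
 * the input is copied into 0.5f, the signed half is added, and the float->int conversion then
 * truncates toward zero, which overall rounds halfway cases away from zero. */
#include <stdint.h>
#include <string.h>

static int32_t round_half_away_scalar(float v)
{
    uint32_t bits, half = 0x3f000000u;            /* bit pattern of 0.5f */
    memcpy(&bits, &v, sizeof bits);
    half |= bits & 0x80000000u;                   /* +/-0.5f with the sign of v */
    float addition;
    memcpy(&addition, &half, sizeof addition);
    return (int32_t)(v + addition);               /* truncation toward zero, like vcvtq_s32_f32 */
}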
/* Performs one rotation/translation */ static void neon_coord_4( float32x4_t a_4, float32x4_t b_4, float32x4_t x_4, float32x4_t y_4, float32x4_t pos_4f, float32x4_t point5_4, int * result) { float32x4_t tmp1 = vmulq_f32(a_4, x_4); float32x4_t tmp2 = vmulq_f32(b_4, y_4); tmp2 = vaddq_f32(tmp1, tmp2); tmp2 = vaddq_f32(tmp2, pos_4f); tmp2 = vaddq_f32(tmp2, point5_4); int32x4_t c_4 = vcvtq_s32_f32(tmp2); vst1q_s32(result, c_4); }
template <bool align> SIMD_INLINE void HogDirectionHistograms(const float32x4_t & dx, const float32x4_t & dy, Buffer & buffer, size_t col) { float32x4_t bestDot = vdupq_n_f32(0); int32x4_t bestIndex = vdupq_n_s32(0); for(int i = 0; i < buffer.size; ++i) { float32x4_t dot = vaddq_f32(vmulq_f32(dx, buffer.cos[i]), vmulq_f32(dy, buffer.sin[i])); uint32x4_t mask = vcgtq_f32(dot, bestDot); bestDot = vmaxq_f32(dot, bestDot); bestIndex = vbslq_s32(mask, buffer.pos[i], bestIndex); dot = vnegq_f32(dot); mask = vcgtq_f32(dot, bestDot); bestDot = vmaxq_f32(dot, bestDot); bestIndex = vbslq_s32(mask, buffer.neg[i], bestIndex); } Store<align>(buffer.index + col, bestIndex); Store<align>(buffer.value + col, Sqrt<SIMD_NEON_RCP_ITER>(vaddq_f32(vmulq_f32(dx, dx), vmulq_f32(dy, dy)))); }
template <bool align> SIMD_INLINE void SquaredDifferenceKahanSum32f(const float * a, const float * b, size_t offset, float32x4_t & sum, float32x4_t & correction) { float32x4_t _a = Load<align>(a + offset); float32x4_t _b = Load<align>(b + offset); float32x4_t _d = vsubq_f32(_a, _b); float32x4_t term = vsubq_f32(vmulq_f32(_d, _d), correction); float32x4_t temp = vaddq_f32(sum, term); correction = vsubq_f32(vsubq_f32(temp, sum), term); sum = temp; }
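/* Added scalar reference for the compensated (Kahan) update mirrored above: "correction" holds
 * the low-order bits lost by the previous addition and is re-injected into the next term.
 * Sketch only; fast-math reassociation would defeat the compensation. */
#include <stddef.h>

static float squared_difference_kahan_ref(const float * a, const float * b, size_t n)
{
    float sum = 0.0f, correction = 0.0f;
    for (size_t i = 0; i < n; ++i)
    {
        float d = a[i] - b[i];
        float term = d * d - correction;
        float temp = sum + term;
        correction = (temp - sum) - term;   /* what the rounded addition actually lost */
        sum = temp;
    }
    return sum;
}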
static void FilterFarNEON(AecCore* aec, float yf[2][PART_LEN1]) { int i; const int num_partitions = aec->num_partitions; for (i = 0; i < num_partitions; i++) { int j; int xPos = (i + aec->xfBufBlockPos) * PART_LEN1; int pos = i * PART_LEN1; // Check for wrap if (i + aec->xfBufBlockPos >= num_partitions) { xPos -= num_partitions * PART_LEN1; } // vectorized code (four at once) for (j = 0; j + 3 < PART_LEN1; j += 4) { const float32x4_t xfBuf_re = vld1q_f32(&aec->xfBuf[0][xPos + j]); const float32x4_t xfBuf_im = vld1q_f32(&aec->xfBuf[1][xPos + j]); const float32x4_t wfBuf_re = vld1q_f32(&aec->wfBuf[0][pos + j]); const float32x4_t wfBuf_im = vld1q_f32(&aec->wfBuf[1][pos + j]); const float32x4_t yf_re = vld1q_f32(&yf[0][j]); const float32x4_t yf_im = vld1q_f32(&yf[1][j]); const float32x4_t a = vmulq_f32(xfBuf_re, wfBuf_re); const float32x4_t e = vmlsq_f32(a, xfBuf_im, wfBuf_im); const float32x4_t c = vmulq_f32(xfBuf_re, wfBuf_im); const float32x4_t f = vmlaq_f32(c, xfBuf_im, wfBuf_re); const float32x4_t g = vaddq_f32(yf_re, e); const float32x4_t h = vaddq_f32(yf_im, f); vst1q_f32(&yf[0][j], g); vst1q_f32(&yf[1][j], h); } // scalar code for the remaining items. for (; j < PART_LEN1; j++) { yf[0][j] += MulRe(aec->xfBuf[0][xPos + j], aec->xfBuf[1][xPos + j], aec->wfBuf[0][pos + j], aec->wfBuf[1][pos + j]); yf[1][j] += MulIm(aec->xfBuf[0][xPos + j], aec->xfBuf[1][xPos + j], aec->wfBuf[0][pos + j], aec->wfBuf[1][pos + j]); } } }
//----------------------------------------------------------------------------------- ArrayReal MathlibNEON::Cos4( ArrayReal x ) { // Map arbitrary angle x to the range [-pi; +pi] without using division. // Code taken from MSDN's HLSL trick. Architectures with fused mad (i.e. NEON) // can replace the add, the sub, & the two muls for two mad ArrayReal integralPart; x = vaddq_f32( vmulq_f32( x, ONE_DIV_2PI ), HALF ); x = Modf4( x, integralPart ); x = vsubq_f32( vmulq_f32( x, TWO_PI ), PI ); return cos_ps( x ); }
void dotProd_neon(const float *data, const float *weights, float *vals, const int n, const int len, const float *istd) { for (int i = 0; i < n; i += 4) { float32x4_t accum0 = { 0.0f, 0.0f, 0.0f, 0.0f }; float32x4_t accum1 = accum0; float32x4_t accum2 = accum0; float32x4_t accum3 = accum0; for (int j = 0; j < len; j += 4) { float32x4_t d0 = vld1q_f32(data + j); float32x4_t d1 = d0; float32x4_t d2 = d0; float32x4_t d3 = d0; float32x4_t w0 = vld1q_f32(weights); float32x4_t w1 = vld1q_f32(weights + 4); float32x4_t w2 = vld1q_f32(weights + 8); float32x4_t w3 = vld1q_f32(weights + 12); accum0 = vaddq_f32(accum0, vmulq_f32(d0, w0)); accum1 = vaddq_f32(accum1, vmulq_f32(d1, w1)); accum2 = vaddq_f32(accum2, vmulq_f32(d2, w2)); accum3 = vaddq_f32(accum3, vmulq_f32(d3, w3)); weights += 16; } float32x2_t sum0 = vpadd_f32(vget_low_f32(accum0), vget_high_f32(accum0)); float32x2_t sum1 = vpadd_f32(vget_low_f32(accum1), vget_high_f32(accum1)); float32x2_t sum2 = vpadd_f32(vget_low_f32(accum2), vget_high_f32(accum2)); float32x2_t sum3 = vpadd_f32(vget_low_f32(accum3), vget_high_f32(accum3)); sum0 = vpadd_f32(sum0, sum1); sum1 = vpadd_f32(sum2, sum3); float32x4_t sum = vcombine_f32(sum0, sum1); sum = vmulq_n_f32(sum, istd[0]); sum = vaddq_f32(sum, vld1q_f32(weights + n*len + i)); vst1q_f32(vals + i, sum); } }
int Bias_arm::forward(const Mat& bottom_blob, Mat& top_blob) const { int w = bottom_blob.w; int h = bottom_blob.h; int channels = bottom_blob.c; int size = w * h; top_blob.create(w, h, channels); if (top_blob.empty()) return -100; const float* bias_ptr = bias_data; #pragma omp parallel for for (int q=0; q<channels; q++) { const float* ptr = bottom_blob.channel(q); float* outptr = top_blob.channel(q); float bias = bias_ptr[q]; #if __ARM_NEON int nn = size >> 2; int remain = size - (nn << 2); #else int remain = size; #endif // __ARM_NEON #if __ARM_NEON float32x4_t _bias = vdupq_n_f32(bias); for (; nn>0; nn--) { float32x4_t _p = vld1q_f32(ptr); float32x4_t _outp = vaddq_f32(_p, _bias); vst1q_f32(outptr, _outp); ptr += 4; outptr += 4; } #endif // __ARM_NEON for (; remain>0; remain--) { *outptr = *ptr + bias; ptr++; outptr++; } } return 0; }
/* f32x4 mv mul */ void mw_neon_mv_mul_f32x4(float * A, int Row, int T, float * B, float * C) { int i = 0; int k = 0; float32x4_t neon_b, neon_c; float32x4_t neon_a0, neon_a1, neon_a2, neon_a3; float32x4_t neon_b0, neon_b1, neon_b2, neon_b3; for (i = 0; i < Row; i+=4) { neon_c = vmovq_n_f32(0); for (k = 0; k < T; k+=4) { int j = k * T + i; neon_a0 = vld1q_f32(A + j); neon_a1 = vld1q_f32(A + j + Row); neon_a2 = vld1q_f32(A + j + 2 * Row); neon_a3 = vld1q_f32(A + j + 3 * Row); neon_b = vld1q_f32(B + k); neon_b0 = vdupq_n_f32(vgetq_lane_f32(neon_b, 0)); neon_b1 = vdupq_n_f32(vgetq_lane_f32(neon_b, 1)); neon_b2 = vdupq_n_f32(vgetq_lane_f32(neon_b, 2)); neon_b3 = vdupq_n_f32(vgetq_lane_f32(neon_b, 3)); neon_c = vaddq_f32(vmulq_f32(neon_a0, neon_b0), neon_c); neon_c = vaddq_f32(vmulq_f32(neon_a1, neon_b1), neon_c); neon_c = vaddq_f32(vmulq_f32(neon_a2, neon_b2), neon_c); neon_c = vaddq_f32(vmulq_f32(neon_a3, neon_b3), neon_c); } vst1q_f32(C + i, neon_c); } }
void dotProd_i16_neon(const float *dataf, const float *weightsf, float *vals, const int n, const int len, const float *istd) { const int16_t *data = (const int16_t *)dataf; const int16_t *weights = (const int16_t *)weightsf; weightsf += n * len / 2; // sizeof(float) / sizeof(int16_t) for (int i = 0; i < n; i += 4) { int32x4_t accum0 = { 0, 0, 0, 0 }; int32x4_t accum1 = accum0; int32x4_t accum2 = accum0; int32x4_t accum3 = accum0; for (int j = 0; j < len; j += 8) { int16x4x2_t d0 = vld2_s16(data + j); int16x4x2_t w0 = vld2_s16(weights); int16x4x2_t w1 = vld2_s16(weights + 8); int16x4x2_t w2 = vld2_s16(weights + 16); int16x4x2_t w3 = vld2_s16(weights + 24); accum0 = vmlal_s16(accum0, d0.val[0], w0.val[0]); accum0 = vmlal_s16(accum0, d0.val[1], w0.val[1]); accum1 = vmlal_s16(accum1, d0.val[0], w1.val[0]); accum1 = vmlal_s16(accum1, d0.val[1], w1.val[1]); accum2 = vmlal_s16(accum2, d0.val[0], w2.val[0]); accum2 = vmlal_s16(accum2, d0.val[1], w2.val[1]); accum3 = vmlal_s16(accum3, d0.val[0], w3.val[0]); accum3 = vmlal_s16(accum3, d0.val[1], w3.val[1]); weights += 32; } int32x2_t sum0 = vpadd_s32(vget_low_s32(accum0), vget_high_s32(accum0)); int32x2_t sum1 = vpadd_s32(vget_low_s32(accum1), vget_high_s32(accum1)); int32x2_t sum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2)); int32x2_t sum3 = vpadd_s32(vget_low_s32(accum3), vget_high_s32(accum3)); sum0 = vpadd_s32(sum0, sum1); sum1 = vpadd_s32(sum2, sum3); int32x4_t sum = vcombine_s32(sum0, sum1); float32x4_t val = vcvtq_f32_s32(sum); val = vmulq_f32(val, vld1q_f32(weightsf + i*2)); val = vmulq_n_f32(val, istd[0]); val = vaddq_f32(val, vld1q_f32(weightsf + i*2 + 4)); vst1q_f32(vals + i, val); } }
//Kernel function: saxpy void saxpy_vector(KernelArgs* args) { //Setup const float32x4_t MASK_FALSE = vdupq_n_f32(0.f); const float32x4_t MASK_TRUE = vcvtq_f32_u32(vceqq_f32(MASK_FALSE, MASK_FALSE)); //Uniforms //Fuses //Literals //Stack variables float32x4_t scale, x, y, result, var060, var061; //Loop over input uint64_t index; for(index = 0; index < args->N; index += 4) { //Inputs scale = vld1q_f32(&args->scale[index]); x = vld1q_f32(&args->x[index]); y = vld1q_f32(&args->y[index]); //Begin kernel logic { //>>> result = scale * x + y var061 = vmulq_f32(scale, x); var060 = vaddq_f32(var061, y); result = vbslq_f32(vcvtq_u32_f32(MASK_TRUE), var060, result); } //End kernel logic //Outputs vst1q_f32(&args->result[index], result); } }
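/* Added scalar equivalent of the generated kernel above (sketch): MASK_TRUE is all ones, so the
 * blend always selects the computed value and the loop reduces to an element-wise multiply-add.
 * Like the vector loop above, it assumes N is a multiple of 4 (here, of 1). */
#include <stdint.h>

static void saxpy_scalar(const float * scale, const float * x, const float * y,
                         float * result, uint64_t n)
{
    for (uint64_t i = 0; i < n; i++)
        result[i] = scale[i] * x[i] + y[i];
}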
/* f32x4 add */ void mw_neon_mm_add_f32x4(float * A, int Row, int Col, float * B, float * C) { float32x4_t neon_a, neon_b, neon_c; int size = Row * Col; int i = 0; int k = 0; for (i = 4; i <= size ; i+=4) { k = i - 4; neon_a = vld1q_f32(A + k); neon_b = vld1q_f32(B + k); neon_c = vaddq_f32(neon_a, neon_b); vst1q_f32(C + k, neon_c); } k = i - 4; for (i = 0; i < size % 4; i++) { C[k + i] = A[k + i] + B[k + i]; } }
// __INLINE void arm_cmplx_mult_cmplx_f32_dot( float32_t * pSrcA, float32_t * pSrcB, float32_t * pDst, uint32_t numSamples) { float32_t a, b, c, d; /* Temporary variables to store real and imaginary values */ float32x4_t A1, A2; /* Temporary variables to store real and imaginary values of source buffer A */ float32x4_t B1, B2; /* Temporary variables to store real and imaginary values of source buffer B */ float32x4_t C1, C2, C3, C4; /* Temporary variables to store multiplication output */ float32x4x2_t out1, out2, out3, out4; /* Temporary variables to stroe output result */ float32x4x2_t acc1, acc2, acc3, acc4; /* Accumulators */ float sum_real, sum_img; /* */ uint32_t blkCnt; /* loop counters */ /* Clear accumulators VDUP.32 q0,r0 Vector Duplicate duplicates a scalar into every element of the destination vector. */ acc1.val[0] = vdupq_n_f32(0.0f); acc1.val[1] = vdupq_n_f32(0.0f); acc2.val[0] = vdupq_n_f32(0.0f); acc2.val[1] = vdupq_n_f32(0.0f); acc3.val[0] = vdupq_n_f32(0.0f); acc3.val[1] = vdupq_n_f32(0.0f); acc4.val[0] = vdupq_n_f32(0.0f); acc4.val[1] = vdupq_n_f32(0.0f); /* Loop over blockSize number of values */ blkCnt = numSamples >> 4u; while(blkCnt > 0u) { /* A1, A2, B1, B2 each has two complex data. */ /******************************************************/ /* Step 1: Load data A1, A2, B1, B2 for group a:*/ /* read 2 complex values at a time from source A buffer float32x4_t vld1q_f32(__transfersize(4) float32_t const * ptr); VLD1.32 {d0, d1}, [r0] */ A1 = vld1q_f32(pSrcA); /* increment source A buffer by 4 */ pSrcA += 4u; /* read 2 complex values at a time from source A buffer */ A2 = vld1q_f32(pSrcA); /* increment source A buffer by 4 */ pSrcA += 4u; /* read 2 complex values at a time from source B buffer */ B1 = vld1q_f32(pSrcB); /* increment source B buffer by 4 */ pSrcB += 4u; /* read 2 complex values at a time from source B buffer */ B2 = vld1q_f32(pSrcB); /* increment source B buffer by 4 */ pSrcB += 4u; /******************************************************/ /* Step 2: Unzip data Out1, Out2 for group a:*/ /* unzip real and imag values A1: reala0, imga0, reala1, imga1 A2: realb0, imgb0, realb1, imgb1 out1.val0: reala0, reala1, realb0, realb1; out1.val1: imga0, imga1, imgb0, imgb1 vuzpq_f32: float32x4x2_t vuzpq_f32 (float32x4_t, float32x4_t) Form of expected instruction(s): vuzp.32 q0, q1 Vector Unzip de-interleaves the elements of two vectors. */ out1 = vuzpq_f32(A1, A2); out2 = vuzpq_f32(B1, B2); /******************************************************/ /* Step 1: Load data A1, A2, B1, B2 for group b:*/ /* read 2 complex values at a time from source A buffer */ A1 = vld1q_f32(pSrcA); /* increment source A buffer by 4 */ pSrcA += 4u; /* read 2 complex values at a time from source A buffer */ A2 = vld1q_f32(pSrcA); /* increment source A buffer by 4 */ pSrcA += 4u; /* read 2 complex values at a time from source B buffer */ B1 = vld1q_f32(pSrcB); /* increment source B buffer by 4 */ pSrcB += 4u; /* read 2 complex values at a time from source B buffer */ B2 = vld1q_f32(pSrcB); /* increment source B buffer by 4 */ pSrcB += 4u; /******************************************************/ /* Step 3: Compute data C1,C2,C3,C4 for group a:*/ /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */ /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. 
*/ /* vmulq_f32: VMUL.F32 q0,q0,q0 val[0]: real val[1]: img C1 = a.real*b.real; C2 = a.img*b.img C3 = a.img*b.real; C4 = a.real*b.img */ /* multiply 4 samples at a time from A1 real input with B1 real input */ C1 = vmulq_f32(out1.val[0], out2.val[0]); /* multiply 4 samples at a time from A1 imaginary input with B1 imaginary input */ C2 = vmulq_f32(out1.val[1], out2.val[1]); /* multiply 4 samples at a time from A1 imaginary input with B1 real input */ C3 = vmulq_f32(out1.val[1], out2.val[0]); /* multiply 4 samples at a time from A1 real input with B1 imaginary input */ C4 = vmulq_f32(out1.val[0], out2.val[1]); /* real: c1-c2; img: c3+c4 */ /******************************************************/ /* Step 2: Unzip data Out2, Out3 for group b:*/ out2 = vuzpq_f32(A1, A2); out3 = vuzpq_f32(B1, B2); /******************************************************/ /* Step 1: Load data A1, A2 for group c:*/ /* read 2 complex values at a time from source A buffer */ A1 = vld1q_f32(pSrcA); /* increment source A buffer by 4 */ pSrcA += 4u; /* read 2 complex values at a time from source A buffer */ A2 = vld1q_f32(pSrcA); /* increment source A buffer by 4 */ pSrcA += 4u; /******************************************************/ /* Step 4: Output or accumlate data for group a:*/ /* (a+bi)*(c+di) = (ac-bd)+(ad+bc)i*/ /* real: c1-c2; img: c3+c4 */ /* subtract 4 samples at time from real result to imaginary result, got four real part */ /* C1 = a.real*b.real; C2 = a.img*b.img C3 = a.img*b.real; C4 = a.real*b.img vaddq_f32: VADD.F32 q0,q0,q0 */ out1.val[0] = vsubq_f32(C1, C2); acc1.val[0] = vaddq_f32(out1.val[0], acc1.val[0]); /* add by Hank */ /* add real*imaginary result with imaginary*real result 4 at a time */ out1.val[1] = vaddq_f32(C3, C4); acc1.val[1] = vaddq_f32(out1.val[1], acc1.val[1]); /* add by Hank */ /* out1 is four complex product. 
*/ /******************************************************/ /* Step 1: Load data B1, B2 for group c:*/ /* read 2 complex values at a time from source B buffer */ B1 = vld1q_f32(pSrcB); /* increment source B buffer by 4 */ pSrcB += 4u; /* read 2 complex values at a time from source B buffer */ B2 = vld1q_f32(pSrcB); /* increment source B buffer by 4 */ pSrcB += 4u; /******************************************************/ /* Step 3: Compute data C1,C2 for group b:*/ /* multiply 4 samples at a time from A1 real input with B1 real input */ C1 = vmulq_f32(out2.val[0], out3.val[0]); /* multiply 4 samples at a time from A1 imaginary input with B1 imaginary input */ C2 = vmulq_f32(out2.val[1], out3.val[1]); /******************************************************/ /* Step 5: Store data for group a:*/ /* Store 4 complex samples to destination buffer VST2.32 {d0, d2}, [r0] */ //vst2q_f32(pDst, out1); /* increment destination buffer by 8 */ //pDst += 8u; /******************************************************/ /* Step 3: Compute data C3,C4 for group b:*/ /* multiply 4 samples at a time from A1 imaginary input with B1 real input */ C3 = vmulq_f32(out2.val[1], out3.val[0]); /* multiply 4 samples at a time from A1 real input with B1 imaginary input */ C4 = vmulq_f32(out2.val[0], out3.val[1]); /******************************************************/ /* Step 2: Unzip data Out1, Out2 for group C:*/ out3 = vuzpq_f32(A1, A2); out4 = vuzpq_f32(B1, B2); /******************************************************/ /* Step 1: Load data A1, A2, B1, B2 for group d:*/ /* read 4 complex values from source A buffer */ A1 = vld1q_f32(pSrcA); pSrcA += 4u; A2 = vld1q_f32(pSrcA); pSrcA += 4u; /* read 4 complex values from source B buffer */ B1 = vld1q_f32(pSrcB); pSrcB += 4u; B2 = vld1q_f32(pSrcB); pSrcB += 4u; /******************************************************/ /* Step 4: Output or accumlate data for group b:*/ /* subtract 4 samples at time from real result to imaginary result */ out2.val[0] = vsubq_f32(C1, C2); /* add real*imaginary result with imaginary*real result 4 at a time */ out2.val[1] = vaddq_f32(C3, C4); acc2.val[0] = vaddq_f32(out2.val[0], acc2.val[0]); /* add by Hank */ acc2.val[1] = vaddq_f32(out2.val[1], acc2.val[1]); /* add by Hank */ /******************************************************/ /* Step 3: Compute data C1,C2,C3,C4 for group c:*/ /* multiply 4 samples at a time from A3 real input with B3 real input */ C1 = vmulq_f32(out3.val[0], out4.val[0]); /* multiply 4 samples at a time from A3 imaginary input with B3 imaginary input */ C2 = vmulq_f32(out3.val[1], out4.val[1]); /* multiply 4 samples at a time from A3 imaginary input with B3 real input */ C3 = vmulq_f32(out3.val[1], out4.val[0]); /* multiply 4 samples at a time from A3 real input with B3 imaginary input */ C4 = vmulq_f32(out3.val[0], out4.val[1]); /******************************************************/ /* Step 2: Unzip data Out1, Out2 for group D:*/ out1 = vuzpq_f32(A1, A2); out4 = vuzpq_f32(B1, B2); /******************************************************/ /* Step 5: Store data for group b:*/ /* Store 4 complex samples to destination buffer */ //vst2q_f32(pDst, out2); /* increment destination buffer by 8 */ //pDst += 8u; /******************************************************/ /* Step 4: Output or accumlate data for group c:*/ /* subtract 4 samples at time from real result to imaginary result */ out3.val[0] = vsubq_f32(C1, C2); /* add real*imaginary result with imaginary*real result 4 at a time */ out3.val[1] = vaddq_f32(C3, C4); 
acc3.val[0] = vaddq_f32(out3.val[0], acc3.val[0]); /* add by Hank */ acc3.val[1] = vaddq_f32(out3.val[1], acc3.val[1]); /* add by Hank */ /******************************************************/ /* Step 3: Compute data C1,C2,C3,C4 for group d:*/ /* multiply 4 samples at a time from A1 real input with B1 real input */ C1 = vmulq_f32(out1.val[0], out4.val[0]); /* multiply 4 samples at a time from A1 imaginary input with B1 imaginary input */ C2 = vmulq_f32(out1.val[1], out4.val[1]); /* multiply 4 samples at a time from A1 imaginary input with B1 real input */ C3 = vmulq_f32(out1.val[1], out4.val[0]); /* multiply 4 samples at a time from A1 real input with B1 imaginary input */ C4 = vmulq_f32(out1.val[0], out4.val[1]); /******************************************************/ /* Step 5: Store data for group c:*/ /* Store 4 complex samples to destination buffer */ //vst2q_f32(pDst, out3); /******************************************************/ /* Step 4: Output or accumlate data for group d:*/ /* subtract 4 samples at time from real result to imaginary result */ out4.val[0] = vsubq_f32(C1, C2); /* increment destination buffer by 8 */ //pDst += 8u; /******************************************************/ /* Step 4: Output or accumlate data for group d:*/ /* add real*imaginary result with imaginary*real result 4 at a time */ out4.val[1] = vaddq_f32(C3, C4); acc4.val[0] = vaddq_f32(out4.val[0], acc4.val[0]); /* add by Hank */ acc4.val[1] = vaddq_f32(out4.val[1], acc4.val[1]); /* add by Hank */ /* zip real and imag values */ //out4 = vzipq_f32(out4.val[0], out4.val[1]); /******************************************************/ /* Step 5: Store data for group d:*/ /* Store 4 complex samples to destination buffer */ //vst1q_f32(pDst, out4.val[0]); //pDst += 4u; //vst1q_f32(pDst, out4.val[1]); //pDst += 4u; /* Decrement the numSamples loop counter */ blkCnt--; } blkCnt = numSamples & 15u; blkCnt = blkCnt >> 2u; /* If the blockSize is not a multiple of 16, compute remaining output samples. ** Compute multiple of 4 samples at a time in second loop. ** and remaining 1 to 3 samples in third loop. */ while(blkCnt > 0u) { /* Step 1: Load data A1, A2, B1, B2 */ /* read 4 complex values at a time from source A buffer */ A1 = vld1q_f32(pSrcA); /* increment source A buffer by 8 */ pSrcA += 4u; A2 = vld1q_f32(pSrcA); pSrcA += 4u; /* read 4 complex values at a time from source B buffer */ B1 = vld1q_f32(pSrcB); /* increment source B buffer by 8 */ pSrcB += 4u; B2 = vld1q_f32(pSrcB); pSrcB += 4u; /* Step 2: Unzip data Out1, Out2 */ /* Unzip data */ out1 = vuzpq_f32(A1, A2); out2 = vuzpq_f32(B1, B2); /* Step 3: Compute data C1,C2,C3,C4 */ /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */ /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. 
*/ /* multiply 4 samples at a time from A1 real input with B1 real input */ C1 = vmulq_f32(out1.val[0], out2.val[0]); /* multiply 4 samples at a time from A1 imaginary input with B1 imaginary input */ C2 = vmulq_f32(out1.val[1], out2.val[1]); /* multiply 4 samples at a time from A1 imaginary input with B1 real input */ C3 = vmulq_f32(out1.val[1], out2.val[0]); /* multiply 4 samples at a time from A1 real input with B1 imaginary input */ C4 = vmulq_f32(out1.val[0], out2.val[1]); /* Step 4: Output or accumlate data for group d:*/ /* subtract 4 samples at time from real result to imaginary result */ out1.val[0] = vsubq_f32(C1, C2); /* add real*imaginary result with imaginary*real result 4 at a time */ out1.val[1] = vaddq_f32(C3, C4); acc1.val[0] = vaddq_f32(out1.val[0], acc1.val[0]); /* add by Hank */ acc1.val[1] = vaddq_f32(out1.val[1], acc1.val[1]); /* add by Hank */ //out1 = vzipq_f32(out1.val[0], out1.val[1]); /* Step 5: Store data */ /* Store 4 complex samples to destination buffer */ //vst1q_f32(pDst, out1.val[0]); //pDst += 4u; //vst1q_f32(pDst, out1.val[1]); //pDst += 4u; /* Decrement the numSamples loop counter */ blkCnt--; } blkCnt = numSamples & 3u; /* If the blockSize is not a multiple of 4, compute any remaining output samples here. ** No intrinsics is used. */ sum_real =0; sum_img =0; while(blkCnt > 0u) { /* C[2 * i] = A[2 * i] * B[2 * i] - A[2 * i + 1] * B[2 * i + 1]. */ /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i]. */ a = *pSrcA++; b = *pSrcA++; c = *pSrcB++; d = *pSrcB++; /* store the result in the destination buffer. */ sum_real += ((a * c) - (b * d)); sum_img += ((a * d) + (b * c)); /* Decrement the numSamples loop counter */ blkCnt--; } /* add 4 accumulators */ acc1.val[0] = vaddq_f32(acc1.val[0], acc2.val[0]); acc1.val[1] = vaddq_f32(acc1.val[1], acc2.val[1]); acc2.val[0] = vaddq_f32(acc3.val[0], acc4.val[0]); acc2.val[1] = vaddq_f32(acc3.val[1], acc4.val[1]); acc1.val[0] = vaddq_f32(acc1.val[0], acc2.val[0]); acc1.val[1] = vaddq_f32(acc1.val[1], acc2.val[1]); sum_real += vgetq_lane_f32(acc1.val[0], 0) + vgetq_lane_f32(acc1.val[0], 1) + vgetq_lane_f32(acc1.val[0], 2) + vgetq_lane_f32(acc1.val[0], 3); sum_img += vgetq_lane_f32(acc1.val[1], 0) + vgetq_lane_f32(acc1.val[1], 1) + vgetq_lane_f32(acc1.val[1], 2) + vgetq_lane_f32(acc1.val[1], 3); *pDst++=sum_real; *pDst++=sum_img;
inline uint32x4_t cv_vrndq_u32_f32(float32x4_t v) { static float32x4_t v_05 = vdupq_n_f32(0.5f); return vcvtq_u32_f32(vaddq_f32(v, v_05)); }
// use ARM Neon extensions (unrolled loop) // NOTE: unrolling doesn't show any appreciable performance difference void dotprod_cccf_execute_neon4(dotprod_cccf _q, float complex * _x, float complex * _y) { // type cast input as floating point array float * x = (float*) _x; // double effective length unsigned int n = 2*_q->n; // first cut: ... float32x4_t v0, v1, v2, v3; // input vectors float32x4_t hi0, hi1, hi2, hi3; // coefficients vectors (real) float32x4_t hq0, hq1, hq2, hq3; // coefficients vectors (imag) float32x4_t ci0, ci1, ci2, ci3; // output multiplications (v * hi) float32x4_t cq0, cq1, cq2, cq3; // output multiplications (v * hq) // load zeros into sum registers float zeros[4] = {0,0,0,0}; float32x4_t sumi = vld1q_f32(zeros); float32x4_t sumq = vld1q_f32(zeros); // r = 4*floor(n/16) unsigned int r = (n >> 4) << 2; // unsigned int i; for (i=0; i<r; i+=4) { // load inputs into register (unaligned) v0 = vld1q_f32(&x[4*i+0]); v1 = vld1q_f32(&x[4*i+4]); v2 = vld1q_f32(&x[4*i+8]); v3 = vld1q_f32(&x[4*i+12]); // load real coefficients into registers (aligned) hi0 = vld1q_f32(&_q->hi[4*i+0]); hi1 = vld1q_f32(&_q->hi[4*i+4]); hi2 = vld1q_f32(&_q->hi[4*i+8]); hi3 = vld1q_f32(&_q->hi[4*i+12]); // load real coefficients into registers (aligned) hq0 = vld1q_f32(&_q->hq[4*i+0]); hq1 = vld1q_f32(&_q->hq[4*i+4]); hq2 = vld1q_f32(&_q->hq[4*i+8]); hq3 = vld1q_f32(&_q->hq[4*i+12]); // compute parallel multiplications (real) ci0 = vmulq_f32(v0, hi0); ci1 = vmulq_f32(v1, hi1); ci2 = vmulq_f32(v2, hi2); ci3 = vmulq_f32(v3, hi3); // compute parallel multiplications (imag) cq0 = vmulq_f32(v0, hq0); cq1 = vmulq_f32(v1, hq1); cq2 = vmulq_f32(v2, hq2); cq3 = vmulq_f32(v3, hq3); // accumulate sumi = vaddq_f32(sumi, ci0); sumq = vaddq_f32(sumq, cq0); sumi = vaddq_f32(sumi, ci1); sumq = vaddq_f32(sumq, cq1); sumi = vaddq_f32(sumi, ci2); sumq = vaddq_f32(sumq, cq2); sumi = vaddq_f32(sumi, ci3); sumq = vaddq_f32(sumq, cq3); } // unload float wi[4]; float wq[4]; vst1q_f32(wi, sumi); vst1q_f32(wq, sumq); // fold down (add/sub) float complex total = ((wi[0] - wq[1]) + (wi[2] - wq[3])) + ((wi[1] + wq[0]) + (wi[3] + wq[2])) * _Complex_I; // cleanup (note: n _must_ be even) // TODO : clean this method up for (i=2*r; i<_q->n; i++) { total += _x[i] * ( _q->hi[2*i] + _q->hq[2*i]*_Complex_I ); } // set return value *_y = total; }
// use ARM Neon extensions // // (a + jb)(c + jd) = (ac - bd) + j(ad + bc) // // mm_x = { x[0].real, x[0].imag, x[1].real, x[1].imag } // mm_hi = { h[0].real, h[0].real, h[1].real, h[1].real } // mm_hq = { h[0].imag, h[0].imag, h[1].imag, h[1].imag } // // mm_y0 = mm_x * mm_hi // = { x[0].real * h[0].real, // x[0].imag * h[0].real, // x[1].real * h[1].real, // x[1].imag * h[1].real }; // // mm_y1 = mm_x * mm_hq // = { x[0].real * h[0].imag, // x[0].imag * h[0].imag, // x[1].real * h[1].imag, // x[1].imag * h[1].imag }; // void dotprod_cccf_execute_neon(dotprod_cccf _q, float complex * _x, float complex * _y) { // type cast input as floating point array float * x = (float*) _x; // double effective length unsigned int n = 2*_q->n; // temporary buffers float32x4_t v; // input vector float32x4_t hi; // coefficients vector (real) float32x4_t hq; // coefficients vector (imag) float32x4_t ci; // output multiplication (v * hi) float32x4_t cq; // output multiplication (v * hq) // output accumulators float zeros[4] = {0,0,0,0}; float32x4_t sumi = vld1q_f32(zeros); float32x4_t sumq = vld1q_f32(zeros); // t = 4*(floor(_n/4)) unsigned int t = (n >> 2) << 2; // unsigned int i; for (i=0; i<t; i+=4) { // load inputs into register (unaligned) // {x[0].real, x[0].imag, x[1].real, x[1].imag} v = vld1q_f32(&x[i]); // load coefficients into register (aligned) // {hi[0].real, hi[0].imag, hi[1].real, hi[1].imag} // {hq[0].real, hq[0].imag, hq[1].real, hq[1].imag} hi = vld1q_f32(&_q->hi[i]); hq = vld1q_f32(&_q->hq[i]); // compute parallel multiplications ci = vmulq_f32(v, hi); cq = vmulq_f32(v, hq); // parallel addition sumi = vaddq_f32(sumi, ci); sumq = vaddq_f32(sumq, cq); } // unload and combine float wi[4]; float wq[4]; vst1q_f32(wi, sumi); vst1q_f32(wq, sumq); // fold down (add/sub) float complex total = ((wi[0] - wq[1]) + (wi[2] - wq[3])) + ((wi[1] + wq[0]) + (wi[3] + wq[2])) * _Complex_I; // cleanup for (i=t/2; i<_q->n; i++) total += _x[i] * ( _q->hi[2*i] + _q->hq[2*i]*_Complex_I ); // set return value *_y = total; }
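/* Added scalar reference for the complex dot product computed above, using the identity from the
 * comment: (a + jb)(c + jd) = (ac - bd) + j(ad + bc). The hi[]/hq[] coefficient splitting is only
 * a layout optimization; the result is the plain (non-conjugated) inner product below, matching
 * the scalar cleanup loops in both NEON variants. */
#include <complex.h>

static float complex dotprod_cccf_ref(const float complex * x,
                                      const float complex * h,
                                      unsigned int n)
{
    float complex total = 0.0f;
    for (unsigned int i = 0; i < n; i++)
        total += x[i] * h[i];
    return total;
}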
int main (int argc, char **argv) { int c = 0; int i = 0; int j = 0; uint num_loops = 0; bool interrupt_flag = false; uint number_samples = 0; uint decim_rate = 0; uint fft_size = 0; float threshold = 0.0; double gain = 0.0; int threshold_exceeded = 0; float threshold_exceeded_mag = 0.0; int threshold_exceeded_index = 0; uint32_t start_decision; uint32_t stop_decision; uint32_t start_sensing; uint32_t stop_sensing; uint32_t start_overhead; uint32_t stop_overhead; uint32_t start_dma; uint32_t stop_dma; float dma_time[30]; float sensing_time[30]; float decision_time[30]; float32x4_t floats_real; float32x4_t floats_imag; float32x4_t floats_real_sqr; float32x4_t floats_imag_sqr; float32x4_t floats_add; float32x4_t floats_sqroot; float32x4_t thresholds; uint32x4_t compares; uint32_t decisions[4096]; fftwf_complex *in1; fftwf_complex out[8192]; // Must be 2x max FFT size fftwf_plan p1; struct crash_plblock *usrp_intf_tx; struct crash_plblock *usrp_intf_rx; // Parse command line arguments while (1) { static struct option long_options[] = { /* These options don't set a flag. We distinguish them by their indices. */ {"interrupt", no_argument, 0, 'i'}, {"loop prog", no_argument, 0, 'l'}, {"decim", required_argument, 0, 'd'}, {"fft size", required_argument, 0, 'k'}, {"threshold", required_argument, 0, 't'}, {0, 0, 0, 0} }; /* getopt_long stores the option index here. */ int option_index = 0; // 'n' is the short option, ':' means it requires an argument c = getopt_long (argc, argv, "ild:k:t:", long_options, &option_index); /* Detect the end of the options. */ if (c == -1) break; switch (c) { case 'i': interrupt_flag = true; break; case 'l': loop_prog = 1; break; case 'd': decim_rate = atoi(optarg); break; case 'k': fft_size = (uint)ceil(log2((double)atoi(optarg))); break; case 't': threshold = atof(optarg); break; case '?': /* getopt_long already printed an error message. */ break; default: abort (); } } /* Print any remaining command line arguments (not options). 
*/ if (optind < argc) { printf ("Invalid options:\n"); while (optind < argc) { printf ("\t%s\n", argv[optind++]); } return -1; } if (decim_rate == 0) { printf("INFO: Decimation rate not specified, defaulting to 1\n"); decim_rate = 1; } if (decim_rate > 2047) { printf("ERROR: Decimation rate too high\n"); return -1; } if (fft_size == 0) { printf("INFO: FFT size not specified, defaulting to 256\n"); fft_size = 8; } // FFT size cannot be greater than 4096 or less than 64 if (fft_size > 13 || fft_size < 6) { printf("ERROR: FFT size cannot be greater than 4096 or less than 64\n"); return -1; } if (threshold == 0.0) { printf("INFO: Threshold not set, default to 1.0\n"); threshold = 1.0; } number_samples = (uint)pow(2.0,(double)fft_size); // Set Ctrl-C handler signal(SIGINT, ctrl_c); // Set this process to be real time //struct sched_param param; //param.sched_priority = 99; //if (sched_setscheduler(0, SCHED_FIFO, & param) != 0) { // perror("sched_setscheduler"); // exit(EXIT_FAILURE); //} usrp_intf_tx = crash_open(USRP_INTF_PLBLOCK_ID,WRITE); if (usrp_intf_tx == 0) { printf("ERROR: Failed to allocate usrp_intf_tx plblock\n"); return -1; } usrp_intf_rx = crash_open(USRP_INTF_PLBLOCK_ID,READ); if (usrp_intf_rx == 0) { crash_close(usrp_intf_rx); printf("ERROR: Failed to allocate usrp_intf_rx plblock\n"); return -1; } in1 = (fftw_complex *)(usrp_intf_rx->dma_buff); start_overhead = crash_read_reg(usrp_intf_tx->regs,DMA_DEBUG_CNT); stop_overhead = crash_read_reg(usrp_intf_tx->regs,DMA_DEBUG_CNT); printf("Overhead (us): %f\n",(1e6/150e6)*(stop_overhead - start_overhead)); do { // Set threshold for NEON instruction thresholds[0] = threshold; thresholds[1] = threshold; thresholds[2] = threshold; thresholds[3] = threshold; // Setup FFTW3 p1 = fftwf_plan_dft_1d(fft_size, in1, out, FFTW_FORWARD, FFTW_ESTIMATE); // Global Reset to get us to a clean slate crash_reset(usrp_intf_tx); if (interrupt_flag == true) { crash_set_bit(usrp_intf_tx->regs,DMA_MM2S_INTERRUPT); } // Wait for USRP DDR interface to finish calibrating (due to reset). This is necessary // as the next steps recalibrate the interface and are ignored if issued while it is // currently calibrating. 
while(!crash_get_bit(usrp_intf_tx->regs,USRP_RX_CAL_COMPLETE)); while(!crash_get_bit(usrp_intf_tx->regs,USRP_TX_CAL_COMPLETE)); // Set RX phase crash_write_reg(usrp_intf_tx->regs,USRP_RX_PHASE_INIT,RX_PHASE_CAL); crash_set_bit(usrp_intf_tx->regs,USRP_RX_RESET_CAL); //printf("RX PHASE INIT: %d\n",crash_read_reg(usrp_intf_tx->regs,USRP_RX_PHASE_INIT)); while(!crash_get_bit(usrp_intf_tx->regs,USRP_RX_CAL_COMPLETE)); // Set TX phase crash_write_reg(usrp_intf_tx->regs,USRP_TX_PHASE_INIT,TX_PHASE_CAL); crash_set_bit(usrp_intf_tx->regs,USRP_TX_RESET_CAL); //printf("TX PHASE INIT: %d\n",crash_read_reg(usrp_intf_tx->regs,USRP_TX_PHASE_INIT)); while(!crash_get_bit(usrp_intf_tx->regs,USRP_TX_CAL_COMPLETE)); // Set USRP TX / RX Modes while(crash_get_bit(usrp_intf_tx->regs,USRP_UART_BUSY)); crash_write_reg(usrp_intf_tx->regs,USRP_USRP_MODE_CTRL,CMD_TX_MODE + TX_DAC_RAW_MODE); while(crash_get_bit(usrp_intf_tx->regs,USRP_UART_BUSY)); while(crash_get_bit(usrp_intf_tx->regs,USRP_UART_BUSY)); crash_write_reg(usrp_intf_tx->regs,USRP_USRP_MODE_CTRL,CMD_RX_MODE + RX_ADC_DSP_MODE); while(crash_get_bit(usrp_intf_tx->regs,USRP_UART_BUSY)); // Setup RX path crash_set_bit(usrp_intf_tx->regs, USRP_RX_FIFO_BYPASS); // Bypass RX FIFO so stale data in the FIFO does not cause latency crash_write_reg(usrp_intf_tx->regs, USRP_AXIS_MASTER_TDEST, DMA_PLBLOCK_ID); // Set tdest to spec_sense crash_write_reg(usrp_intf_tx->regs, USRP_RX_PACKET_SIZE, number_samples); // Set packet size crash_clear_bit(usrp_intf_tx->regs, USRP_RX_FIX2FLOAT_BYPASS); // Do not bypass fix2float if (decim_rate == 1) { crash_set_bit(usrp_intf_tx->regs, USRP_RX_CIC_BYPASS); // Bypass CIC Filter crash_set_bit(usrp_intf_tx->regs, USRP_RX_HB_BYPASS); // Bypass HB Filter crash_write_reg(usrp_intf_tx->regs, USRP_RX_GAIN, 1); // Set gain = 1 } else if (decim_rate == 2) { crash_set_bit(usrp_intf_tx->regs, USRP_RX_CIC_BYPASS); // Bypass CIC Filter crash_clear_bit(usrp_intf_tx->regs, USRP_RX_HB_BYPASS); // Enable HB Filter crash_write_reg(usrp_intf_tx->regs, USRP_RX_GAIN, 1); // Set gain = 1 // Even, use both CIC and Halfband filters } else if ((decim_rate % 2) == 0) { crash_clear_bit(usrp_intf_tx->regs, USRP_RX_CIC_BYPASS); // Enable CIC Filter crash_write_reg(usrp_intf_tx->regs, USRP_RX_CIC_DECIM, decim_rate/2); // Set CIC decimation rate (div by 2 as we are using HB filter) crash_clear_bit(usrp_intf_tx->regs, USRP_RX_HB_BYPASS); // Enable HB Filter // Offset CIC bit growth. A 32-bit multiplier in the receive chain allows us // to scale the CIC output. gain = 26.0-3.0*log2(decim_rate/2); gain = (gain > 1.0) ? (ceil(pow(2.0,gain))) : (1.0); // Do not allow gain to be set to 0 crash_write_reg(usrp_intf_tx->regs, USRP_RX_GAIN, (uint32_t)gain); // Set gain // Odd, use only CIC filter } else { crash_clear_bit(usrp_intf_tx->regs, USRP_RX_CIC_BYPASS); // Enable CIC Filter crash_write_reg(usrp_intf_tx->regs, USRP_RX_CIC_DECIM, decim_rate); // Set CIC decimation rate crash_set_bit(usrp_intf_tx->regs, USRP_RX_HB_BYPASS); // Bypass HB Filter // gain = 26.0-3.0*log2(decim_rate); gain = (gain > 1.0) ? 
(ceil(pow(2.0,gain))) : (1.0); // Do not allow gain to be set to 0 crash_write_reg(usrp_intf_tx->regs, USRP_RX_GAIN, (uint32_t)gain); // Set gain } // Setup TX path crash_clear_bit(usrp_intf_tx->regs, USRP_TX_FIX2FLOAT_BYPASS); // Do not bypass fix2float crash_set_bit(usrp_intf_tx->regs, USRP_TX_CIC_BYPASS); // Bypass CIC Filter crash_set_bit(usrp_intf_tx->regs, USRP_TX_HB_BYPASS); // Bypass HB Filter crash_write_reg(usrp_intf_tx->regs, USRP_TX_GAIN, 1); // Set gain = 1 // Create a CW signal to transmit float *tx_sample = (float*)(usrp_intf_tx->dma_buff); for (i = 0; i < 4095; i++) { tx_sample[2*i+1] = 0; tx_sample[2*i] = 0.5; } tx_sample[2*4095+1] = 0; tx_sample[2*4095] = 0; // Load waveform into TX FIFO so it can immediately trigger crash_write(usrp_intf_tx, USRP_INTF_PLBLOCK_ID, number_samples); crash_set_bit(usrp_intf_tx->regs,USRP_RX_ENABLE); // Enable RX // First, loop until threshold is exceeded j = 0; while (threshold_exceeded == 0) { crash_read(usrp_intf_rx, USRP_INTF_PLBLOCK_ID, number_samples); // Run FFT fftwf_execute(p1); for (i = 0; i < number_samples/4; i++) { // Calculate sqrt(I^2 + Q^2) floats_real[0] = out[4*i][0]; floats_real[1] = out[4*i+1][0]; floats_real[2] = out[4*i+2][0]; floats_real[3] = out[4*i+3][0]; floats_real_sqr = vmulq_f32(floats_real, floats_real); floats_imag[0] = out[4*i][1]; floats_imag[1] = out[4*i+1][1]; floats_imag[2] = out[4*i+2][1]; floats_imag[3] = out[4*i+3][1]; floats_imag_sqr = vmulq_f32(floats_imag, floats_imag); floats_add = vaddq_f32(floats_real_sqr,floats_imag_sqr); floats_sqroot[0] = sqrt(floats_add[0]); floats_sqroot[1] = sqrt(floats_add[1]); floats_sqroot[2] = sqrt(floats_add[2]); floats_sqroot[3] = sqrt(floats_add[3]); compares = vcageq_f32(floats_sqroot,thresholds); if (compares[0] == -1) { // Do not break loop threshold_exceeded = 1; // Save threshold data threshold_exceeded_mag = floats_sqroot[0]; threshold_exceeded_index = 4*i; break; } else if (compares[1] == -1) { // Do not break loop threshold_exceeded = 1; // Save threshold data threshold_exceeded_mag = floats_sqroot[1]; threshold_exceeded_index = 4*i+1; break; } else if (compares[2] == -1) { // Do not break loop threshold_exceeded = 1; // Save threshold data threshold_exceeded_mag = floats_sqroot[2]; threshold_exceeded_index = 4*i+2; break; } else if (compares[3] == -1) { // Do not break loop threshold_exceeded = 1; // Save threshold data threshold_exceeded_mag = floats_sqroot[3]; threshold_exceeded_index = 4*i+3; break; } } if (j > 10) { printf("TIMEOUT: Threshold never exceeded\n"); goto cleanup; } j++; sleep(1); } // Second, perform specturm sensing and the spectrum decision while (threshold_exceeded == 1) { threshold_exceeded = 0; crash_read(usrp_intf_rx, USRP_INTF_PLBLOCK_ID, number_samples); // Run FFT fftwf_execute(p1); for (i = 0; i < number_samples/4; i++) { // Calculate sqrt(I^2 + Q^2) floats_real[0] = out[4*i][0]; floats_real[1] = out[4*i+1][0]; floats_real[2] = out[4*i+2][0]; floats_real[3] = out[4*i+3][0]; floats_real_sqr = vmulq_f32(floats_real, floats_real); floats_imag[0] = out[4*i][1]; floats_imag[1] = out[4*i+1][1]; floats_imag[2] = out[4*i+2][1]; floats_imag[3] = out[4*i+3][1]; floats_imag_sqr = vmulq_f32(floats_imag, floats_imag); floats_add = vaddq_f32(floats_real_sqr,floats_imag_sqr); floats_sqroot[0] = sqrt(floats_add[0]); floats_sqroot[1] = sqrt(floats_add[1]); floats_sqroot[2] = sqrt(floats_add[2]); floats_sqroot[3] = sqrt(floats_add[3]); compares = vcageq_f32(floats_sqroot,thresholds); // Was the threshold exceeded? 
if (compares[0] == -1 || compares[1] == -1 || compares[2] == -1 || compares[3] == -1) { // Do not break loop threshold_exceeded = 1; break; } } if (threshold_exceeded == 0) { // Enable TX crash_set_bit(usrp_intf_tx->regs,USRP_TX_ENABLE); } } // Calculate how long the DMA and the thresholding took by using a counter in the FPGA // running at 150 MHz. start_dma = crash_read_reg(usrp_intf_tx->regs,DMA_DEBUG_CNT); crash_read(usrp_intf_rx, USRP_INTF_PLBLOCK_ID, number_samples); stop_dma = crash_read_reg(usrp_intf_tx->regs,DMA_DEBUG_CNT); // Set a huge threshold so we have to examine every bin thresholds[0] = 1000000000.0; thresholds[1] = 1000000000.0; thresholds[2] = 1000000000.0; thresholds[3] = 1000000000.0; start_sensing = crash_read_reg(usrp_intf_tx->regs,DMA_DEBUG_CNT); fftwf_execute(p1); for (i = 0; i < number_samples/4; i++) { floats_real[0] = out[4*i][0]; floats_real[1] = out[4*i+1][0]; floats_real[2] = out[4*i+2][0]; floats_real[3] = out[4*i+3][0]; floats_real_sqr = vmulq_f32(floats_real, floats_real); floats_imag[0] = out[4*i][1]; floats_imag[1] = out[4*i+1][1]; floats_imag[2] = out[4*i+2][1]; floats_imag[3] = out[4*i+3][1]; floats_imag_sqr = vmulq_f32(floats_imag, floats_imag); floats_add = vaddq_f32(floats_real_sqr,floats_imag_sqr); floats_sqroot[0] = sqrt(floats_add[0]); floats_sqroot[1] = sqrt(floats_add[1]); floats_sqroot[2] = sqrt(floats_add[2]); floats_sqroot[3] = sqrt(floats_add[3]); compares = vcageq_f32(floats_sqroot,thresholds); decisions[4*i] = compares[0]; decisions[4*i+1] = compares[1]; decisions[4*i+2] = compares[2]; decisions[4*i+3] = compares[3]; } stop_sensing = crash_read_reg(usrp_intf_tx->regs,DMA_DEBUG_CNT); start_decision = crash_read_reg(usrp_intf_tx->regs,DMA_DEBUG_CNT); for (i = 0; i < number_samples; i++) { if (decisions[i] == -1) { printf("This shouldn't happen\n"); } } stop_decision = crash_read_reg(usrp_intf_tx->regs,DMA_DEBUG_CNT); // Print threshold information printf("Threshold:\t\t\t%f\n",threshold); printf("Threshold Exceeded Index:\t%d\n",threshold_exceeded_index); printf("Threshold Exceeded Mag:\t\t%f\n",threshold_exceeded_mag); printf("DMA Time (us): %f\n",(1e6/150e6)*(stop_dma - start_dma)); printf("Sensing Time (us): %f\n",(1e6/150e6)*(stop_sensing - start_sensing)); printf("Decision Time (us): %f\n",(1e6/150e6)*(stop_decision - start_decision)); // Keep track of times so we can report an average at the end if (num_loops < 30) { dma_time[num_loops] = (1e6/150e6)*(stop_dma - start_dma); sensing_time[num_loops] = (1e6/150e6)*(stop_sensing - start_sensing); decision_time[num_loops] = (1e6/150e6)*(stop_decision - start_decision); } num_loops++; if (loop_prog == 1) { printf("Ctrl-C to end program after this loop\n"); } // Force printf to flush since. We are at a real-time priority, so it cannot unless we force it. 
fflush(stdout); //if (nanosleep(&ask_sleep,&act_sleep) < 0) { // perror("nanosleep"); // exit(EXIT_FAILURE); //} cleanup: crash_clear_bit(usrp_intf_tx->regs,USRP_RX_ENABLE); // Disable RX crash_clear_bit(usrp_intf_tx->regs,USRP_TX_ENABLE); // Disable TX threshold_exceeded = 0; threshold_exceeded_mag = 0.0; threshold_exceeded_index = 0; fftwf_destroy_plan(p1); sleep(1); } while (loop_prog == 1); float dma_time_avg = 0.0; float sensing_time_avg = 0.0; float decision_time_avg = 0.0; if (num_loops > 30) { for (i = 0; i < 30; i++) { dma_time_avg += dma_time[i]; sensing_time_avg += sensing_time[i]; decision_time_avg += decision_time[i]; } dma_time_avg = dma_time_avg/30; sensing_time_avg = sensing_time_avg/30; decision_time_avg = decision_time_avg/30; } else { for (i = 0; i < num_loops; i++) { dma_time_avg += dma_time[i]; sensing_time_avg += sensing_time[i]; decision_time_avg += decision_time[i]; } dma_time_avg = dma_time_avg/num_loops; sensing_time_avg = sensing_time_avg/num_loops; decision_time_avg = decision_time_avg/num_loops; } printf("Number of loops: %d\n",num_loops); printf("Average DMA time (us): %f\n",dma_time_avg); printf("Average Sensing time (us): %f\n",sensing_time_avg); printf("Average Decision time (us): %f\n",decision_time_avg); crash_close(usrp_intf_tx); crash_close(usrp_intf_rx); return 0; }
void __hv_biquad_f_win32(SignalBiquad *o, hv_bInf_t *_bIn, hv_bInf_t *_bX0, hv_bInf_t *_bX1, hv_bInf_t *_bX2, hv_bInf_t *_bY1, hv_bInf_t *_bY2, hv_bOutf_t bOut) {
  hv_bInf_t bIn = *_bIn;
  hv_bInf_t bX0 = *_bX0;
  hv_bInf_t bX1 = *_bX1;
  hv_bInf_t bX2 = *_bX2;
  hv_bInf_t bY1 = *_bY1;
  hv_bInf_t bY2 = *_bY2;
#else
void __hv_biquad_f(SignalBiquad *o, hv_bInf_t bIn, hv_bInf_t bX0, hv_bInf_t bX1, hv_bInf_t bX2, hv_bInf_t bY1, hv_bInf_t bY2, hv_bOutf_t bOut) {
#endif
#if HV_SIMD_AVX
  __m256 a = _mm256_mul_ps(bIn, bX0);
  __m256 b = _mm256_mul_ps(o->xm1, bX1);
  __m256 c = _mm256_mul_ps(o->xm2, bX2);
  __m256 d = _mm256_add_ps(a, b);
  __m256 e = _mm256_add_ps(c, d); // bIn*bX0 + o->xm1*bX1 + o->xm2*bX2

  float y0 = e[0] - o->ym1*bY1[0] - o->ym2*bY2[0];
  float y1 = e[1] - y0*bY1[1] - o->ym1*bY2[1];
  float y2 = e[2] - y1*bY1[2] - y0*bY2[2];
  float y3 = e[3] - y2*bY1[3] - y1*bY2[3];
  float y4 = e[4] - y3*bY1[4] - y2*bY2[4];
  float y5 = e[5] - y4*bY1[5] - y3*bY2[5];
  float y6 = e[6] - y5*bY1[6] - y4*bY2[6];
  float y7 = e[7] - y6*bY1[7] - y5*bY2[7];

  o->xm2 = o->xm1;
  o->xm1 = bIn;
  o->ym1 = y7;
  o->ym2 = y6;

  *bOut = _mm256_set_ps(y7, y6, y5, y4, y3, y2, y1, y0);
#elif HV_SIMD_SSE
  __m128 a = _mm_mul_ps(bIn, bX0);
  __m128 b = _mm_mul_ps(o->xm1, bX1);
  __m128 c = _mm_mul_ps(o->xm2, bX2);
  __m128 d = _mm_add_ps(a, b);
  __m128 e = _mm_add_ps(c, d);

  const float *const bbe = (float *) &e;
  const float *const bbY1 = (float *) &bY1;
  const float *const bbY2 = (float *) &bY2;

  float y0 = bbe[0] - o->ym1*bbY1[0] - o->ym2*bbY2[0];
  float y1 = bbe[1] - y0*bbY1[1] - o->ym1*bbY2[1];
  float y2 = bbe[2] - y1*bbY1[2] - y0*bbY2[2];
  float y3 = bbe[3] - y2*bbY1[3] - y1*bbY2[3];

  o->xm2 = o->xm1;
  o->xm1 = bIn;
  o->ym1 = y3;
  o->ym2 = y2;

  *bOut = _mm_set_ps(y3, y2, y1, y0);
#elif HV_SIMD_NEON
  float32x4_t a = vmulq_f32(bIn, bX0);
  float32x4_t b = vmulq_f32(o->xm1, bX1);
  float32x4_t c = vmulq_f32(o->xm2, bX2);
  float32x4_t d = vaddq_f32(a, b);
  float32x4_t e = vaddq_f32(c, d);

  float y0 = e[0] - o->ym1*bY1[0] - o->ym2*bY2[0];
  float y1 = e[1] - y0*bY1[1] - o->ym1*bY2[1];
  float y2 = e[2] - y1*bY1[2] - y0*bY2[2];
  float y3 = e[3] - y2*bY1[3] - y1*bY2[3];

  o->xm2 = o->xm1;
  o->xm1 = bIn;
  o->ym1 = y3;
  o->ym2 = y2;

  *bOut = (float32x4_t) {y0, y1, y2, y3};
#else
  const float y = bIn*bX0 + o->xm1*bX1 + o->xm2*bX2 - o->ym1*bY1 - o->ym2*bY2;
  o->xm2 = o->xm1;
  o->xm1 = bIn;
  o->ym2 = o->ym1;
  o->ym1 = y;
  *bOut = y;
#endif
}
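/*
 * For reference, the recurrence the SIMD paths above unroll is the direct form I
 * biquad, y[n] = b0*x[n] + b1*x[n-1] + b2*x[n-2] - a1*y[n-1] - a2*y[n-2]. The
 * feed-forward part vectorizes freely, but every y[n] depends on y[n-1] and
 * y[n-2], which is why the feedback part is computed lane by lane. A plain
 * scalar sketch follows; the struct and function names here are hypothetical
 * illustrations, not part of the library API.
 */
#include <stddef.h>

typedef struct { float xm1, xm2, ym1, ym2; } BiquadState;

static void biquad_process(BiquadState *s, const float *x, float *y, size_t n,
                           float b0, float b1, float b2, float a1, float a2) {
  for (size_t i = 0; i < n; i++) {
    // Direct form I: two feed-forward taps plus two feedback taps.
    const float yn = b0*x[i] + b1*s->xm1 + b2*s->xm2 - a1*s->ym1 - a2*s->ym2;
    s->xm2 = s->xm1; s->xm1 = x[i];
    s->ym2 = s->ym1; s->ym1 = yn;
    y[i] = yn;
  }
}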
static void cftmdl_128_neon(float* a) { int j; const int l = 8; const float32x4_t vec_swap_sign = vld1q_f32((float32_t*)k_swap_sign); float32x4_t wk1rv = vld1q_f32(cftmdl_wk1r); for (j = 0; j < l; j += 2) { const float32x2_t a_00 = vld1_f32(&a[j + 0]); const float32x2_t a_08 = vld1_f32(&a[j + 8]); const float32x2_t a_32 = vld1_f32(&a[j + 32]); const float32x2_t a_40 = vld1_f32(&a[j + 40]); const float32x4_t a_00_32 = vcombine_f32(a_00, a_32); const float32x4_t a_08_40 = vcombine_f32(a_08, a_40); const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40); const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40); const float32x2_t a_16 = vld1_f32(&a[j + 16]); const float32x2_t a_24 = vld1_f32(&a[j + 24]); const float32x2_t a_48 = vld1_f32(&a[j + 48]); const float32x2_t a_56 = vld1_f32(&a[j + 56]); const float32x4_t a_16_48 = vcombine_f32(a_16, a_48); const float32x4_t a_24_56 = vcombine_f32(a_24, a_56); const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56); const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56); const float32x4_t xx0 = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1); const float32x4_t x1_x3_add = vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1); const float32x4_t x1_x3_sub = vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1); const float32x2_t yy0_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 0); const float32x2_t yy0_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 0); const float32x4_t yy0_as = vcombine_f32(yy0_a, yy0_s); const float32x2_t yy1_a = vdup_lane_f32(vget_high_f32(x1_x3_add), 1); const float32x2_t yy1_s = vdup_lane_f32(vget_high_f32(x1_x3_sub), 1); const float32x4_t yy1_as = vcombine_f32(yy1_a, yy1_s); const float32x4_t yy0 = vmlaq_f32(yy0_as, vec_swap_sign, yy1_as); const float32x4_t yy4 = vmulq_f32(wk1rv, yy0); const float32x4_t xx1_rev = vrev64q_f32(xx1); const float32x4_t yy4_rev = vrev64q_f32(yy4); vst1_f32(&a[j + 0], vget_low_f32(xx0)); vst1_f32(&a[j + 32], vget_high_f32(xx0)); vst1_f32(&a[j + 16], vget_low_f32(xx1)); vst1_f32(&a[j + 48], vget_high_f32(xx1_rev)); a[j + 48] = -a[j + 48]; vst1_f32(&a[j + 8], vget_low_f32(x1_x3_add)); vst1_f32(&a[j + 24], vget_low_f32(x1_x3_sub)); vst1_f32(&a[j + 40], vget_low_f32(yy4)); vst1_f32(&a[j + 56], vget_high_f32(yy4_rev)); } { const int k = 64; const int k1 = 2; const int k2 = 2 * k1; const float32x4_t wk2rv = vld1q_f32(&rdft_wk2r[k2 + 0]); const float32x4_t wk2iv = vld1q_f32(&rdft_wk2i[k2 + 0]); const float32x4_t wk1iv = vld1q_f32(&rdft_wk1i[k2 + 0]); const float32x4_t wk3rv = vld1q_f32(&rdft_wk3r[k2 + 0]); const float32x4_t wk3iv = vld1q_f32(&rdft_wk3i[k2 + 0]); wk1rv = vld1q_f32(&rdft_wk1r[k2 + 0]); for (j = k; j < l + k; j += 2) { const float32x2_t a_00 = vld1_f32(&a[j + 0]); const float32x2_t a_08 = vld1_f32(&a[j + 8]); const float32x2_t a_32 = vld1_f32(&a[j + 32]); const float32x2_t a_40 = vld1_f32(&a[j + 40]); const float32x4_t a_00_32 = vcombine_f32(a_00, a_32); const float32x4_t a_08_40 = vcombine_f32(a_08, a_40); const float32x4_t x0r0_0i0_0r1_x0i1 = vaddq_f32(a_00_32, a_08_40); const float32x4_t x1r0_1i0_1r1_x1i1 = vsubq_f32(a_00_32, a_08_40); const float32x2_t a_16 = vld1_f32(&a[j + 16]); const float32x2_t a_24 = vld1_f32(&a[j + 24]); const float32x2_t a_48 = vld1_f32(&a[j + 48]); const float32x2_t a_56 = vld1_f32(&a[j + 56]); const float32x4_t a_16_48 = vcombine_f32(a_16, a_48); const float32x4_t a_24_56 = 
vcombine_f32(a_24, a_56); const float32x4_t x2r0_2i0_2r1_x2i1 = vaddq_f32(a_16_48, a_24_56); const float32x4_t x3r0_3i0_3r1_x3i1 = vsubq_f32(a_16_48, a_24_56); const float32x4_t xx = vaddq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); const float32x4_t xx1 = vsubq_f32(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1); const float32x4_t x3i0_3r0_3i1_x3r1 = vrev64q_f32(x3r0_3i0_3r1_x3i1); const float32x4_t x1_x3_add = vmlaq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1); const float32x4_t x1_x3_sub = vmlsq_f32(x1r0_1i0_1r1_x1i1, vec_swap_sign, x3i0_3r0_3i1_x3r1); float32x4_t xx4 = vmulq_f32(wk2rv, xx1); float32x4_t xx12 = vmulq_f32(wk1rv, x1_x3_add); float32x4_t xx22 = vmulq_f32(wk3rv, x1_x3_sub); xx4 = vmlaq_f32(xx4, wk2iv, vrev64q_f32(xx1)); xx12 = vmlaq_f32(xx12, wk1iv, vrev64q_f32(x1_x3_add)); xx22 = vmlaq_f32(xx22, wk3iv, vrev64q_f32(x1_x3_sub)); vst1_f32(&a[j + 0], vget_low_f32(xx)); vst1_f32(&a[j + 32], vget_high_f32(xx)); vst1_f32(&a[j + 16], vget_low_f32(xx4)); vst1_f32(&a[j + 48], vget_high_f32(xx4)); vst1_f32(&a[j + 8], vget_low_f32(xx12)); vst1_f32(&a[j + 40], vget_high_f32(xx12)); vst1_f32(&a[j + 24], vget_low_f32(xx22)); vst1_f32(&a[j + 56], vget_high_f32(xx22)); } } }
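/*
 * The twiddle multiplies above (vmulq_f32 with a "r" table followed by
 * vmlaq_f32 with an "i" table and vrev64q_f32 of the data) are a common NEON
 * trick for interleaved complex data: if x = {a0, b0, a1, b1} holds (a+bi)
 * pairs and the twiddle tables are pre-expanded as wr = {c, c, ...} and
 * wi = {-d, d, ...}, then wr*x + wi*rev64(x) is exactly (a+bi)*(c+di) per pair.
 * A standalone sketch, assuming a single twiddle c+di applied to both pairs;
 * the function name is hypothetical (the real code reads rdft_wk1r/rdft_wk1i
 * and friends).
 */
#include <arm_neon.h>

static float32x4_t cmul_interleaved_neon(float32x4_t x, float c, float d) {
  const float32x4_t wr = vdupq_n_f32(c);   // {c, c, c, c}
  const float32x4_t wi = {-d, d, -d, d};   // sign pattern for real/imag lanes
  // wr*x        = { c*a0,  c*b0,  c*a1,  c*b1}
  // wi*rev64(x) = {-d*b0,  d*a0, -d*b1,  d*a1}
  // sum         = {a*c - b*d, b*c + a*d, ...} == (a+bi)*(c+di) per pair
  return vmlaq_f32(vmulq_f32(wr, x), wi, vrev64q_f32(x));
}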