// Processes four gradient samples (one per NEON lane) given as (dx, dy).
//
// For every direction i in [0, buffer.size) the projection
//     dx * buffer.cos[i] + dy * buffer.sin[i]
// and its negation are compared, lane by lane, against the largest projection
// seen so far. Whenever a candidate is strictly greater, the matching entry of
// buffer.pos[i] (for the projection) or buffer.neg[i] (for its negation) is
// recorded for that lane.
//
// Results written at offset `col`:
//   buffer.index + col : the recorded index per lane; stays 0 for lanes where no
//                        projection was strictly greater than 0.
//   buffer.value + col : Sqrt<SIMD_NEON_RCP_ITER>(dx * dx + dy * dy) per lane.
//
// NOTE(review): `align` is only forwarded to Store<>; presumably it selects an
// aligned versus unaligned store -- confirm against Store's definition.
template <bool align> SIMD_INLINE void HogDirectionHistograms(const float32x4_t & dx, const float32x4_t & dy, Buffer & buffer, size_t col)
{
    float32x4_t maxProjection = vdupq_n_f32(0);
    int32x4_t chosenIndex = vdupq_n_s32(0);

    for (int dir = 0; dir < buffer.size; ++dir)
    {
        // Projection of the gradient onto direction `dir`.
        const float32x4_t forward = vaddq_f32(vmulq_f32(dx, buffer.cos[dir]), vmulq_f32(dy, buffer.sin[dir]));
        const uint32x4_t forwardWins = vcgtq_f32(forward, maxProjection);
        maxProjection = vmaxq_f32(forward, maxProjection);
        chosenIndex = vbslq_s32(forwardWins, buffer.pos[dir], chosenIndex);

        // The negated projection is tested against the maximum that was just
        // updated above, so the order of these two halves matters.
        const float32x4_t backward = vnegq_f32(forward);
        const uint32x4_t backwardWins = vcgtq_f32(backward, maxProjection);
        maxProjection = vmaxq_f32(backward, maxProjection);
        chosenIndex = vbslq_s32(backwardWins, buffer.neg[dir], chosenIndex);
    }

    const float32x4_t squaredNorm = vaddq_f32(vmulq_f32(dx, dx), vmulq_f32(dy, dy));
    Store<align>(buffer.index + col, chosenIndex);
    Store<align>(buffer.value + col, Sqrt<SIMD_NEON_RCP_ITER>(squaredNorm));
}
// Select for 32-bit signed integer NEON vectors, implemented with the
// vbslq_s32 bitwise-select intrinsic: every result bit is taken from `x` where
// the corresponding bit of mask `m` is set, and from `y` where it is clear.
KFR_SINTRIN i32neon select(const maskfor<i32neon>& m, const i32neon& x, const i32neon& y)
{
    const auto selector = *m;
    const auto whenSet = *x;
    const auto whenClear = *y;
    return vbslq_s32(selector, whenSet, whenClear);
}