/**
 * Approximates the reciprocal square root of each lane of `val`.
 *
 * Starts from the vrsqrte_f32 estimate and refines it with two
 * vrsqrts_f32-based correction steps.
 *
 * @param val  Two-lane f32 input vector.
 * @return     Two-lane vector holding the refined estimate for each lane.
 */
inline float32x2_t cv_vrsqrt_f32(float32x2_t val)
{
    float32x2_t estimate = vrsqrte_f32(val);

    // Two identical refinement passes: scale the current estimate by the
    // correction factor obtained from (estimate^2, val).
    for (int step = 0; step < 2; ++step)
    {
        const float32x2_t squared = vmul_f32(estimate, estimate);
        const float32x2_t correction = vrsqrts_f32(squared, val);
        estimate = vmul_f32(correction, estimate);
    }

    return estimate;
}
/**
 * Approximates the reciprocal of each lane of `val`.
 *
 * Starts from the vrecpe_f32 estimate and refines it with two
 * vrecps_f32-based correction steps.
 *
 * @param val  Two-lane f32 input vector.
 * @return     Two-lane vector holding the refined estimate for each lane.
 */
inline float32x2_t cv_vrecp_f32(float32x2_t val)
{
    float32x2_t estimate = vrecpe_f32(val);

    // Two identical refinement passes: scale the current estimate by the
    // correction factor obtained from (val, estimate).
    for (int step = 0; step < 2; ++step)
    {
        const float32x2_t correction = vrecps_f32(val, estimate);
        estimate = vmul_f32(correction, estimate);
    }

    return estimate;
}
f64 dotProduct(const Size2D &_size, const f32 * src0Base, ptrdiff_t src0Stride, const f32 * src1Base, ptrdiff_t src1Stride) { internal::assertSupportedConfiguration(); #ifdef CAROTENE_NEON Size2D size(_size); if (src0Stride == src1Stride && src0Stride == (ptrdiff_t)(size.width * sizeof(f32))) { size.width *= size.height; size.height = 1; } #define DOT_FLOAT_BLOCKSIZE (1 << 13) f64 result = 0.0; for (size_t row = 0; row < size.height; ++row) { const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, row); const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, row); size_t i = 0; while(i + 4 <= size.width) { size_t lim = std::min(i + DOT_FLOAT_BLOCKSIZE, size.width) - 4; float32x4_t v_sum = vdupq_n_f32(0.0f); for( ; i <= lim; i += 4 ) { internal::prefetch(src0 + i); internal::prefetch(src1 + i); v_sum = vmlaq_f32(v_sum, vld1q_f32(src0 + i), vld1q_f32(src1 + i)); } float32x2_t vres = vpadd_f32(vget_low_f32(v_sum),vget_high_f32(v_sum)); result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1); } if(i + 2 <= size.width) { float32x2_t vres = vmul_f32(vld1_f32(src0 + i), vld1_f32(src1 + i)); result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1); i += 2; } for (; i < size.width; ++i) result += src0[i] * src1[i]; } return result; #else (void)_size; (void)src0Base; (void)src0Stride; (void)src1Base; (void)src1Stride; return 0; #endif }
/* Calls vmul_f32 on two float32x2_t locals and keeps the product in a third
   local; nothing is returned or stored outside the function.  */
void test_vmulf32 (void)
{
  /* NOTE(review): both operands are read without ever being assigned. This
     looks like a compile-only intrinsic test -- confirm before adding
     initializers.  */
  float32x2_t lhs;
  float32x2_t rhs;
  float32x2_t product;

  product = vmul_f32 (lhs, rhs);
}
/*
 * For each of `count` vertices, writes one value derived from the vertex and
 * `params` into `out`.
 *
 * verts  : packed vertices, three floats each (the pointer advances by 3 per vertex).
 * params : coefficients; the NEON path reads params[0..2].
 * out    : destination with a stride of two floats per vertex; only the first
 *          float of each pair is written, the second is left untouched.
 * count  : number of vertices; values <= 0 produce no work on either path.
 *
 * The NEON path computes verts[0]*params[0] + verts[1]*params[1] +
 * verts[2]*params[2] per vertex. The scalar path calls dot(verts, params),
 * which is presumably the same 3-component dot product -- confirm against
 * dot()'s definition.
 */
void dot_loop(const GLfloat *verts, const GLfloat *params, GLfloat *out, GLint count) {
#ifdef __ARM_NEON__
    /* Broadcast each coefficient into both lanes so one vector operation
       evaluates the same expression for two consecutive vertices. */
    const float32x2_t param_x = vld1_dup_f32((const float32_t *)params);
    const float32x2_t param_y = vld1_dup_f32((const float32_t *)params + 1);
    const float32x2_t param_z = vld1_dup_f32((const float32_t *)params + 2);

    /* Two vertices per iteration: vld3 de-interleaves x0 y0 z0 x1 y1 z1 into
       {x0,x1}, {y0,y1}, {z0,z1}, so exactly the six floats of the two
       vertices are read. */
    for (; count >= 2; count -= 2) {
        const float32x2x3_t vert = vld3_f32((const float32_t *)verts);
        float32x2_t acc = vmul_f32(vert.val[0], param_x);
        acc = vmla_f32(acc, vert.val[1], param_y);
        acc = vmla_f32(acc, vert.val[2], param_z);

        /* Store lane by lane: the floats at out[1] and out[3] are not part of
           this call's output and must not be overwritten. */
        vst1_lane_f32((float32_t *)out, acc, 0);
        vst1_lane_f32((float32_t *)out + 2, acc, 1);

        out += 4;
        verts += 6;
    }

    /* Odd trailing vertex: broadcast loads touch only its three floats. */
    if (count > 0) {
        float32x2_t acc = vmul_f32(vld1_dup_f32((const float32_t *)verts), param_x);
        acc = vmla_f32(acc, vld1_dup_f32((const float32_t *)verts + 1), param_y);
        acc = vmla_f32(acc, vld1_dup_f32((const float32_t *)verts + 2), param_z);
        vst1_lane_f32((float32_t *)out, acc, 0);
    }
#else
    for (int i = 0; i < count; i++) {
        out[0] = dot(verts, params);
        out += 2;
        verts += 3;
    }
#endif
}