/*
 * Exercises the vceqq_f32 intrinsic: compares two float32x4_t operands
 * and stores the uint32x4_t comparison result in a local.
 *
 * Takes no arguments and returns nothing. The operands are never
 * initialised in this function and the result is not used afterwards
 * (presumably a compile-only check of the intrinsic -- confirm against
 * the harness that builds this file).
 */
void test_vceqQf32 (void)
{
  float32x4_t lhs_float32x4_t;
  float32x4_t rhs_float32x4_t;
  uint32x4_t cmp_uint32x4_t;

  cmp_uint32x4_t = vceqq_f32 (lhs_float32x4_t, rhs_float32x4_t);
}
/*
 * Ad-hoc debugging aid.
 *
 * Evaluates the logarithm of one fixed sample value through log_ps (all
 * four lanes), cephes_logf and logf (lane 0 only), prints the results,
 * then prints the vceqq_f32 comparison of the input vector with itself.
 * Always terminates the process with exit status 1.
 */
void dumb() {
  V4SF input = {{ 0.0903333798051, 0.0903333798051, 0.0903333798051, 0.0903333798051 }};
  V4SF simd_log;
  float scalar_log;

  /* Compute both results before anything is printed. */
  simd_log.v = log_ps(input.v);
  scalar_log = cephes_logf(input.f[0]);

  printf("log_ps returned ");
  print4(simd_log.v);
  printf("\ncephes returned: %14.12g and logf(%g)=%14.12g\n",
         scalar_log, input.f[0], logf(input.f[0]));

  /* Self-comparison of the input vector. */
  print4i(vceqq_f32(input.v, input.v));
  printf("\n");

  exit(1);
}
/*
 * Per-lane floor of a float32x4_t.
 *
 * a:       input vector.
 * returns: vector whose lanes are the corresponding lanes of `a` rounded
 *          toward minus infinity.
 *
 * When __ARM_ARCH >= 8 the directed-rounding intrinsic is used. Otherwise
 * the result is built with the "magic constant" trick: adding and then
 * subtracting 1.5 * 2^23 makes float addition round the value to an
 * integer. That fallback relies on round-to-nearest float arithmetic and
 * is only exact while a + 12582912.0f stays in the range where floats have
 * a spacing of 1, i.e. for inputs of magnitude below roughly 2^22.
 */
static inline float32x4_t floor_neon(float32x4_t a)
{
#if __ARM_ARCH >= 8
  /* vrndmq_f32 rounds each lane toward minus infinity (floor). */
  return vrndmq_f32(a);
#else
  const float32x4_t round32 = vdupq_n_f32(12582912.0f); /* 1.5 * 2^23 */
  const float32x4_t vhalf = vdupq_n_f32(0.5f);

  /* a rounded to the nearest integer. */
  float32x4_t rounded = vsubq_f32(vaddq_f32(a, round32), round32);

  /* All-ones in lanes where a already holds an integer value. */
  uint32x4_t mask = vceqq_f32(a, rounded);

  /* Nearest integer to (a - 0.5); this is the floor for non-integer a. */
  float32x4_t floored = vsubq_f32(vaddq_f32(vsubq_f32(a, vhalf), round32), round32);

  /*
   * Bitwise select: keep a where it is already integral (a - 0.5 would be
   * a tie there), otherwise take the floored value.
   */
  return vreinterpretq_f32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_f32(a), mask),
                                         vbicq_u32(vreinterpretq_u32_f32(floored), mask)));
#endif
}
//Kernel function: saxpy void saxpy_vector(KernelArgs* args) { //Setup const float32x4_t MASK_FALSE = vdupq_n_f32(0.f); const float32x4_t MASK_TRUE = vcvtq_f32_u32(vceqq_f32(MASK_FALSE, MASK_FALSE)); //Uniforms //Fuses //Literals //Stack variables float32x4_t scale, x, y, result, var060, var061; //Loop over input uint64_t index; for(index = 0; index < args->N; index += 4) { //Inputs scale = vld1q_f32(&args->scale[index]); x = vld1q_f32(&args->x[index]); y = vld1q_f32(&args->y[index]); //Begin kernel logic { //>>> result = scale * x + y var061 = vmulq_f32(scale, x); var060 = vaddq_f32(var061, y); result = vbslq_f32(vcvtq_u32_f32(MASK_TRUE), var060, result); } //End kernel logic //Outputs vst1q_f32(&args->result[index], result); } }
// Suffix-free wrapper around vceqq_f32 for float32x4_t operands.
// Returns the uint32x4_t produced by comparing v0 and v1 for equality.
inline uint32x4_t vceqq(const float32x4_t & v0, const float32x4_t & v1)
{
    const uint32x4_t equal_mask = vceqq_f32(v0, v1);
    return equal_mask;
}