void test_vceqQf32 (void)
{
  uint32x4_t out_uint32x4_t;
  float32x4_t arg0_float32x4_t;
  float32x4_t arg1_float32x4_t;

  out_uint32x4_t = vceqq_f32 (arg0_float32x4_t, arg1_float32x4_t);
}
void dumb() {
  V4SF x = {{ 0.0903333798051, 0.0903333798051, 0.0903333798051, 0.0903333798051 }};
  V4SF w; w.v = log_ps(x.v);
  float z = cephes_logf(x.f[0]);
  printf("log_ps returned "); print4(w.v);
  printf("\ncephes returned: %14.12g and logf(%g)=%14.12g\n", z, x.f[0], logf(x.f[0]));

  print4i(vceqq_f32(x.v, x.v)); printf("\n");
  exit(1);
}
static inline float32x4_t floor_neon(float32x4_t a)
{
#if __ARM_ARCH >= 8
   return vrndqm_f32(a);
#else
   const float32x4_t round32 = vdupq_n_f32(12582912.0f);
   const float32x4_t vhalf = vdupq_n_f32(0.5f);

   float32x4_t rounded = vsubq_f32(vaddq_f32(a, round32), round32);
   uint32x4_t mask = vceqq_f32(a, rounded);

   float32x4_t floored = vsubq_f32(vaddq_f32(vsubq_f32(a, vhalf), round32), round32);
   return vreinterpretq_f32_u32(vorrq_u32(vandq_u32(vreinterpretq_u32_f32(a), mask),
            vbicq_u32(vreinterpretq_u32_f32(floored), mask)));
#endif
}
//Kernel function: saxpy
void saxpy_vector(KernelArgs* args) {

    //Setup
    const float32x4_t MASK_FALSE = vdupq_n_f32(0.f);
    const float32x4_t MASK_TRUE = vcvtq_f32_u32(vceqq_f32(MASK_FALSE, MASK_FALSE));
    
    //Uniforms
    
    //Fuses
    
    //Literals
    
    //Stack variables
    float32x4_t scale, x, y, result, var060, var061;
    
    //Loop over input
    uint64_t index;
    for(index = 0; index < args->N; index += 4) {
    
        //Inputs
        scale = vld1q_f32(&args->scale[index]);
        x = vld1q_f32(&args->x[index]);
        y = vld1q_f32(&args->y[index]);
        
        //Begin kernel logic
        {
        
            //>>> result = scale * x + y
            var061 = vmulq_f32(scale, x);
            var060 = vaddq_f32(var061, y);
            result = vbslq_f32(vcvtq_u32_f32(MASK_TRUE), var060, result);
        
        }
        //End kernel logic
        
        //Outputs
        vst1q_f32(&args->result[index], result);
        
    }
}
Exemple #5
0
inline  uint32x4_t vceqq(const float32x4_t & v0, const float32x4_t & v1) { return vceqq_f32(v0, v1); }