/* Smoke test for the vcvtq_u32_f32 intrinsic (float32x4_t -> uint32x4_t
   conversion).  Only compilation/instruction selection is being exercised:
   the input vector is deliberately left uninitialized and the output is
   never inspected. */
void test_vcvtQu32_f32 (void)
{
  float32x4_t arg0_float32x4_t;
  uint32x4_t out_uint32x4_t;

  out_uint32x4_t = vcvtq_u32_f32 (arg0_float32x4_t);
}
//Kernel function: saxpy — result[i] = scale[i] * x[i] + y[i], 4 lanes per step.
void saxpy_vector(KernelArgs* args) {
    //Setup
    const float32x4_t MASK_FALSE = vdupq_n_f32(0.f);
    // All-lanes-true predicate kept in integer form. The original round-tripped
    // it through float (vcvtq_f32_u32 at setup, vcvtq_u32_f32 at each use): that
    // only reproduces 0xFFFFFFFF thanks to the saturating u32 conversion and
    // costs an extra conversion every iteration. vceqq_f32 already yields the
    // uint32x4_t mask vbslq_f32 wants.
    const uint32x4_t MASK_TRUE = vceqq_f32(MASK_FALSE, MASK_FALSE);
    //Uniforms
    //Fuses
    //Literals
    //Stack variables
    float32x4_t scale, x, y, var060, var061;
    // Must be initialized: vbslq_f32 reads `result` for false-mask lanes. The
    // original read it uninitialized on the first iteration (UB, even though
    // the all-true mask discards the value).
    float32x4_t result = vdupq_n_f32(0.f);
    //Loop over input.
    //NOTE(review): assumes args->N is a multiple of 4 and the arrays are
    //padded accordingly — TODO confirm against the code generator's contract.
    uint64_t index;
    for(index = 0; index < args->N; index += 4) {
        //Inputs
        scale = vld1q_f32(&args->scale[index]);
        x = vld1q_f32(&args->x[index]);
        y = vld1q_f32(&args->y[index]);
        //Begin kernel logic
        {
            //>>> result = scale * x + y
            var061 = vmulq_f32(scale, x);
            var060 = vaddq_f32(var061, y);
            result = vbslq_f32(MASK_TRUE, var060, result);
        }
        //End kernel logic
        //Outputs
        vst1q_f32(&args->result[index], result);
    }
}
// Round each lane of `v` to the nearest integer (ties and halves rounded up
// via add-0.5-then-truncate) and convert to u32.
// NOTE(review): the +0.5/truncate scheme is only a correct half-up rounding
// for non-negative inputs; vcvtq_u32_f32 saturates negative values to 0.
inline uint32x4_t cv_vrndq_u32_f32(float32x4_t v)
{
    // Was a function-local `static`, which drags a thread-safe initialization
    // guard (load + branch) into every call of this hot inline helper — and a
    // mutable static vector register value is shared state for no reason.
    // vdupq_n_f32 is a single dup instruction; just materialize it each call.
    const float32x4_t v_05 = vdupq_n_f32(0.5f);
    return vcvtq_u32_f32(vaddq_f32(v, v_05));
}
// Per-pixel phase (gradient orientation): dst = atan2(src1, src0) for two s16
// source images, scaled by 256/360 (degrees mapped onto the u8 range [0,256))
// and rounded half-up to u8. Processes 16, then 8 pixels per iteration with
// NEON, with a scalar tail; falls back to parameter-silencing stubs when
// CAROTENE_NEON is not defined.
// NOTE(review): FASTATAN2CONST / FASTATAN2VECTOR / FASTATAN2SCALAR are project
// macros not visible here — presumably a fast polynomial atan2 approximation
// whose output is pre-scaled by the factor given to FASTATAN2CONST; confirm
// against their definitions.
void phase(const Size2D &size,
           const s16 * src0Base, ptrdiff_t src0Stride,
           const s16 * src1Base, ptrdiff_t src1Stride,
           u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    // Scale factor 256/360 converts the approximation's degree output to u8.
    FASTATAN2CONST(256.0f / 360.0f)
    // Widest j for which a full 16- (resp. 8-) pixel vector load stays inside
    // the row; guarded subtraction avoids size_t underflow on narrow images.
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    // +0.5 before the truncating float->u32 conversion = round half up.
    float32x4_t v_05 = vdupq_n_f32(0.5f);

    for (size_t i = 0; i < size.height; ++i)
    {
        const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        // Main loop: 16 pixels per iteration (two 8-lane s16 vectors per src).
        for (; j < roiw16; j += 16)
        {
            internal::prefetch(src0 + j);
            internal::prefetch(src1 + j);

            int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8);
            int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);

            // 0
            // First 8 pixels: widen s16 -> s32 -> f32 lane-by-lane, atan2 on
            // each half, then round (+0.5, truncate) and narrow u32 -> u16.
            float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
            float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));
            float32x4_t v_dst32f0;
            // Argument order: (y, x) — angle of the gradient (src1, src0).
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)

            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));
            float32x4_t v_dst32f1;
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)

            uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
                                                vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));

            // 1
            // Second 8 pixels: same pipeline on the high halves of the loads.
            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01)));
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src11)));
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)

            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src01)));
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11)));
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)

            uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
                                                vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));

            // Narrow both u16x8 halves to u8 and store 16 results at once.
            vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst16s0), vmovn_u16(v_dst16s1)));
        }
        // Reduced-width loop: 8 pixels per iteration, same widen/atan2/round
        // pipeline, single u8x8 store.
        for (; j < roiw8; j += 8)
        {
            int16x8_t v_src0 = vld1q_s16(src0 + j);
            int16x8_t v_src1 = vld1q_s16(src1 + j);

            float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src0)));
            float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1)));
            float32x4_t v_dst32f0;
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)

            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src0)));
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1)));
            float32x4_t v_dst32f1;
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)

            uint16x8_t v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
                                            vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));

            vst1_u8(dst + j, vmovn_u16(v_dst));
        }
        // Scalar tail: floor(a + 0.5) mirrors the vector path's half-up round.
        for (; j < size.width; j++)
        {
            f32 x = src0[j], y = src1[j];
            f32 a;
            FASTATAN2SCALAR(y, x, a)
            dst[j] = (u8)(s32)floor(a + 0.5f);
        }
    }
#else
    // NEON unavailable: silence unused-parameter warnings.
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}