/* Exercises the vcvtq_f32_u32 intrinsic, which converts a vector of four
   unsigned 32-bit integers into a vector of four floats.

   NOTE(review): the input vector is never assigned and the converted value
   is never read, so only the intrinsic call itself is meaningful here.
   This looks like a compile-only check -- confirm before relying on any
   runtime result.  */
void test_vcvtQf32_u32 (void)
{
  uint32x4_t source;
  float32x4_t converted;

  converted = vcvtq_f32_u32 (source);
}
void word2float48_neon(const uint8_t *t8, const int pitch, float *p) { const uint16_t *t = (const uint16_t *)t8; for (int i = 0; i < 4; i++) { uint32x4_t u1 = vmovl_u16(vld1_u16(t)); uint32x4_t u2 = vmovl_u16(vld1_u16(t + 4)); uint32x4_t u3 = vmovl_u16(vld1_u16(t + 8)); float32x4_t f1 = vcvtq_f32_u32(u1); float32x4_t f2 = vcvtq_f32_u32(u2); float32x4_t f3 = vcvtq_f32_u32(u3); vst1q_f32(p, f1); vst1q_f32(p + 4, f2); vst1q_f32(p + 8, f3); t += pitch * 2; // it was already halved p += 12; } }
//Kernel function: saxpy void saxpy_vector(KernelArgs* args) { //Setup const float32x4_t MASK_FALSE = vdupq_n_f32(0.f); const float32x4_t MASK_TRUE = vcvtq_f32_u32(vceqq_f32(MASK_FALSE, MASK_FALSE)); //Uniforms //Fuses //Literals //Stack variables float32x4_t scale, x, y, result, var060, var061; //Loop over input uint64_t index; for(index = 0; index < args->N; index += 4) { //Inputs scale = vld1q_f32(&args->scale[index]); x = vld1q_f32(&args->x[index]); y = vld1q_f32(&args->y[index]); //Begin kernel logic { //>>> result = scale * x + y var061 = vmulq_f32(scale, x); var060 = vaddq_f32(var061, y); result = vbslq_f32(vcvtq_u32_f32(MASK_TRUE), var060, result); } //End kernel logic //Outputs vst1q_f32(&args->result[index], result); } }
void byte2float48_neon(const uint8_t *t, const int pitch, float *p) { uint16x8_t m0, m1, m2, m3, m4, m5; uint32x2_t temp1, temp4; m0 = vmovl_u8(vld1_u8(t)); temp1 = vld1_lane_u32((const uint32_t *)(t + 8), temp1, 0); temp1 = vld1_lane_u32((const uint32_t *)(t + pitch * 2), temp1, 1); m1 = vmovl_u8(vreinterpret_u8_u32(temp1)); m2 = vmovl_u8(vld1_u8(t + pitch * 2 + 4)); t += pitch * 4; m3 = vmovl_u8(vld1_u8(t)); temp4 = vld1_lane_u32((const uint32_t *)(t + 8), temp4, 0); temp4 = vld1_lane_u32((const uint32_t *)(t + pitch * 2), temp4, 1); m4 = vmovl_u8(vreinterpret_u8_u32(temp4)); m5 = vmovl_u8(vld1_u8(t + pitch * 2 + 4)); vst1q_f32(p, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m0)))); vst1q_f32(p + 4, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m0)))); vst1q_f32(p + 8, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m1)))); vst1q_f32(p + 12, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m1)))); vst1q_f32(p + 16, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m2)))); vst1q_f32(p + 20, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m2)))); vst1q_f32(p + 24, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m3)))); vst1q_f32(p + 28, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m3)))); vst1q_f32(p + 32, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m4)))); vst1q_f32(p + 36, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m4)))); vst1q_f32(p + 40, vcvtq_f32_u32(vmovl_u16(vget_low_u16(m5)))); vst1q_f32(p + 44, vcvtq_f32_u32(vmovl_u16(vget_high_u16(m5)))); }
void meanStdDev(const Size2D &size, const u16 * srcBase, ptrdiff_t srcStride, f32 * pMean, f32 * pStdDev) { internal::assertSupportedConfiguration(); #ifdef CAROTENE_NEON size_t blockSize0 = 1 << 10, roiw4 = size.width & ~3; f64 fsum = 0.0f, fsqsum = 0.0f; f32 arsum[8]; uint32x4_t v_zero = vdupq_n_u32(0u), v_sum; float32x4_t v_zero_f = vdupq_n_f32(0.0f), v_sqsum; for (size_t i = 0; i < size.height; ++i) { const u16 * src = internal::getRowPtr(srcBase, srcStride, i); size_t j = 0u; while (j < roiw4) { size_t blockSize = std::min(roiw4 - j, blockSize0) + j; v_sum = v_zero; v_sqsum = v_zero_f; for ( ; j + 16 < blockSize ; j += 16) { internal::prefetch(src + j); uint16x8_t v_src0 = vld1q_u16(src + j), v_src1 = vld1q_u16(src + j + 8); // 0 uint32x4_t v_srclo = vmovl_u16(vget_low_u16(v_src0)); uint32x4_t v_srchi = vmovl_u16(vget_high_u16(v_src0)); v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi)); float32x4_t v_srclo_f = vcvtq_f32_u32(v_srclo); float32x4_t v_srchi_f = vcvtq_f32_u32(v_srchi); v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f); v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f); // 1 v_srclo = vmovl_u16(vget_low_u16(v_src1)); v_srchi = vmovl_u16(vget_high_u16(v_src1)); v_sum = vaddq_u32(v_sum, vaddq_u32(v_srclo, v_srchi)); v_srclo_f = vcvtq_f32_u32(v_srclo); v_srchi_f = vcvtq_f32_u32(v_srchi); v_sqsum = vmlaq_f32(v_sqsum, v_srclo_f, v_srclo_f); v_sqsum = vmlaq_f32(v_sqsum, v_srchi_f, v_srchi_f); } for ( ; j < blockSize; j += 4) { uint32x4_t v_src = vmovl_u16(vld1_u16(src + j)); float32x4_t v_src_f = vcvtq_f32_u32(v_src); v_sum = vaddq_u32(v_sum, v_src); v_sqsum = vmlaq_f32(v_sqsum, v_src_f, v_src_f); } vst1q_f32(arsum, vcvtq_f32_u32(v_sum)); vst1q_f32(arsum + 4, v_sqsum); fsum += (f64)arsum[0] + arsum[1] + arsum[2] + arsum[3]; fsqsum += (f64)arsum[4] + arsum[5] + arsum[6] + arsum[7]; } // collect a few last elements in the current row for ( ; j < size.width; ++j) { f32 srcval = src[j]; fsum += srcval; fsqsum += srcval * srcval; } } // calc mean and 
stddev f64 itotal = 1.0 / size.total(); f64 mean = fsum * itotal; f64 stddev = sqrt(std::max(fsqsum * itotal - mean * mean, 0.0)); if (pMean) *pMean = mean; if (pStdDev) *pStdDev = stddev; #else (void)size; (void)srcBase; (void)srcStride; (void)pMean; (void)pStdDev; #endif }