示例#1
4
inline float32x2_t cv_vrsqrt_f32(float32x2_t val)
{
    float32x2_t e = vrsqrte_f32(val);
    e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
    e = vmul_f32(vrsqrts_f32(vmul_f32(e, e), val), e);
    return e;
}
示例#2
0
inline float32x2_t cv_vrecp_f32(float32x2_t val)
{
    float32x2_t reciprocal = vrecpe_f32(val);
    reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
    reciprocal = vmul_f32(vrecps_f32(val, reciprocal), reciprocal);
    return reciprocal;
}
示例#3
0
f64 dotProduct(const Size2D &_size,
               const f32 * src0Base, ptrdiff_t src0Stride,
               const f32 * src1Base, ptrdiff_t src1Stride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    Size2D size(_size);
    if (src0Stride == src1Stride &&
        src0Stride == (ptrdiff_t)(size.width * sizeof(f32)))
    {
        size.width *= size.height;
        size.height = 1;
    }

#define DOT_FLOAT_BLOCKSIZE (1 << 13)
    f64 result = 0.0;
    for (size_t row = 0; row < size.height; ++row)
    {
        const f32 * src0 = internal::getRowPtr(src0Base, src0Stride, row);
        const f32 * src1 = internal::getRowPtr(src1Base, src1Stride, row);

        size_t i = 0;
        while(i + 4 <= size.width)
        {
            size_t lim = std::min(i + DOT_FLOAT_BLOCKSIZE, size.width) - 4;
            float32x4_t v_sum = vdupq_n_f32(0.0f);

            for( ; i <= lim; i += 4 )
            {
                internal::prefetch(src0 + i);
                internal::prefetch(src1 + i);
                v_sum = vmlaq_f32(v_sum, vld1q_f32(src0 + i), vld1q_f32(src1 + i));
            }

            float32x2_t vres = vpadd_f32(vget_low_f32(v_sum),vget_high_f32(v_sum));
            result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);
        }

        if(i + 2 <= size.width)
        {
            float32x2_t vres = vmul_f32(vld1_f32(src0 + i), vld1_f32(src1 + i));
            result += vget_lane_f32(vres, 0) + vget_lane_f32(vres, 1);
            i += 2;
        }

        for (; i < size.width; ++i)
            result += src0[i] * src1[i];
    }
    return result;
#else
    (void)_size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;

    return 0;
#endif
}
示例#4
0
文件: vmulf32.c 项目: 0day-ci/gcc
void test_vmulf32 (void)
{
  float32x2_t out_float32x2_t;
  float32x2_t arg0_float32x2_t;
  float32x2_t arg1_float32x2_t;

  out_float32x2_t = vmul_f32 (arg0_float32x2_t, arg1_float32x2_t);
}
示例#5
0
文件: texgen.c 项目: AluOne/OpenCPN
void dot_loop(const GLfloat *verts, const GLfloat *params, GLfloat *out, GLint count) {
#ifdef __ARM_NEON__
    float32x2_t acc;
    float32x2x3_t vert;
    float32x2x3_t param = vld3_f32((const float32_t *)params);
    for (; count != 0; count -= 1) {
        vert = vld3_f32((const float32_t *)verts);
        acc = vmul_f32(vert.val[0], param.val[0]);
        acc = vmla_f32(acc, vert.val[1], param.val[1]);
        acc = vmla_f32(acc, vert.val[2], param.val[2]);
        vst1_f32((float32_t *)out, acc);

        out += 2;
        verts += 3;
    }
#else
    for (int i = 0; i < count; i++) {
        out[0] = dot(verts, params);
        out += 2;
        verts += 3;
    }
#endif
}