コード例 #1
0
ファイル: vmaxf32.c プロジェクト: AlexMioMio/gcc
void test_vmaxf32 (void)
{
  float32x2_t out_float32x2_t;
  float32x2_t arg0_float32x2_t;
  float32x2_t arg1_float32x2_t;

  out_float32x2_t = vmax_f32 (arg0_float32x2_t, arg1_float32x2_t);
}
コード例 #2
0
ファイル: Math.cpp プロジェクト: janivanecky/Bark
float Math::Max(float a, float b)
{
    __n64 nA;
    nA.n64_f32[0] = a;
    __n64 nB;
    nB.n64_f32[0] = b;
    __n64 result = vmax_f32(nA, nB);
    return result.n64_f32[0];
}
コード例 #3
0
ファイル: arm64_vMaxMin.c プロジェクト: 4ntoine/clang
float32x2_t test_vmax_f32(float32x2_t a1, float32x2_t a2) {
  // CHECK: test_vmax_f32
  return vmax_f32(a1, a2);
  // CHECK llvm.aarch64.neon.fmax.v2f32
}
コード例 #4
0
void computeNetwork0_i16_neon(const float *inputf, const float *weightsf, uint8_t *d) {
    const int16_t *input = (const int16_t *)inputf;
    const int16_t *weights = (const int16_t *)weightsf;

    int32x4_t accum0 = { 0, 0, 0, 0 };
    int32x4_t accum1 = accum0;
    int32x4_t accum2 = accum0;
    int32x4_t accum3 = accum0;

    for (int i = 0; i < 96/2; i += 8) {
        int16x4x2_t d0 = vld2_s16(input + i);

        int16x4x2_t w0 = vld2_s16(weights + i * 4);
        int16x4x2_t w1 = vld2_s16(weights + i * 4 + 8);
        int16x4x2_t w2 = vld2_s16(weights + i * 4 + 16);
        int16x4x2_t w3 = vld2_s16(weights + i * 4 + 24);

        accum0 = vmlal_s16(accum0, d0.val[0], w0.val[0]);
        accum0 = vmlal_s16(accum0, d0.val[1], w0.val[1]);

        accum1 = vmlal_s16(accum1, d0.val[0], w1.val[0]);
        accum1 = vmlal_s16(accum1, d0.val[1], w1.val[1]);

        accum2 = vmlal_s16(accum2, d0.val[0], w2.val[0]);
        accum2 = vmlal_s16(accum2, d0.val[1], w2.val[1]);

        accum3 = vmlal_s16(accum3, d0.val[0], w3.val[0]);
        accum3 = vmlal_s16(accum3, d0.val[1], w3.val[1]);
    }

    int32x2_t sum0 = vpadd_s32(vget_low_s32(accum0), vget_high_s32(accum0));
    int32x2_t sum1 = vpadd_s32(vget_low_s32(accum1), vget_high_s32(accum1));
    int32x2_t sum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
    int32x2_t sum3 = vpadd_s32(vget_low_s32(accum3), vget_high_s32(accum3));
    sum0 = vpadd_s32(sum0, sum1);
    sum1 = vpadd_s32(sum2, sum3);
    int32x4_t sum = vcombine_s32(sum0, sum1);

    float32x4_t m0 = vcvtq_f32_s32(sum);

    m0 = vmulq_f32(m0, vld1q_f32(weightsf + 384/4));
    m0 = vaddq_f32(m0, vld1q_f32(weightsf + 400/4));

    float32x4_t m1, m2, m3, m4, m5, m6, m7;

    m1 = m0;

    m0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m0), sign_bits_f_zero_l));
    m0 = vaddq_f32(m0, ones_f);
    m0 = vmulq_f32(reciprocal(m0), m1);

    m1 = vdupq_lane_f32(vget_low_f32(m0), 0);
    m2 = vdupq_lane_f32(vget_low_f32(m0), 1);
    m3 = vdupq_lane_f32(vget_high_f32(m0), 0);
    m4 = vdupq_lane_f32(vget_high_f32(m0), 1);

    m1 = vmulq_f32(m1, vld1q_f32(weightsf + 416/4));
    m2 = vmulq_f32(m2, vld1q_f32(weightsf + (416+16)/4));
    m3 = vmulq_f32(m3, vld1q_f32(weightsf + (416+32)/4));
    m4 = vmulq_f32(m4, vld1q_f32(weightsf + (416+48)/4));

    m1 = vaddq_f32(m1, m2);
    m3 = vaddq_f32(m3, m4);
    m1 = vaddq_f32(m1, m3);
    m1 = vaddq_f32(m1, vld1q_f32(weightsf + (416+64)/4));

    m7 = m1;
    m1 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m1), sign_bits_f));
    m1 = vaddq_f32(m1, ones_f);
    m7 = vmulq_f32(reciprocal(m1), m7);

    m3 = m0;

    m0 = vdupq_lane_f32(vget_low_f32(m0), 0);
    m1 = vdupq_lane_f32(vget_low_f32(m3), 1);
    m2 = vdupq_lane_f32(vget_high_f32(m3), 0);
    m3 = vdupq_lane_f32(vget_high_f32(m3), 1);

    m0 = vmulq_f32(m0, vld1q_f32(weightsf + 496/4));
    m1 = vmulq_f32(m1, vld1q_f32(weightsf + (496+16)/4));
    m2 = vmulq_f32(m2, vld1q_f32(weightsf + (496+32)/4));
    m3 = vmulq_f32(m3, vld1q_f32(weightsf + (496+48)/4));

    m4 = vdupq_lane_f32(vget_low_f32(m7), 0);
    m5 = vdupq_lane_f32(vget_low_f32(m7), 1);
    m6 = vdupq_lane_f32(vget_high_f32(m7), 0);
    m7 = vdupq_lane_f32(vget_high_f32(m7), 1);

    m4 = vmulq_f32(m4, vld1q_f32(weightsf + (496+64)/4));
    m5 = vmulq_f32(m5, vld1q_f32(weightsf + (496+80)/4));
    m6 = vmulq_f32(m6, vld1q_f32(weightsf + (496+96)/4));
    m7 = vmulq_f32(m7, vld1q_f32(weightsf + (496+112)/4));

    m0 = vaddq_f32(m0, m1);
    m2 = vaddq_f32(m2, m3);
    m4 = vaddq_f32(m4, m5);
    m6 = vaddq_f32(m6, m7);

    m0 = vaddq_f32(m0, m2);
    m4 = vaddq_f32(m4, m6);
    m0 = vaddq_f32(m0, m4);

    m0 = vaddq_f32(m0, vld1q_f32(weightsf + (496+128)/4));

    float32x2_t maximum = vmax_f32(vget_low_f32(m0), vget_high_f32(m0));
    d[0] = (vget_lane_f32(maximum, 1) <= vget_lane_f32(maximum, 0));
}
コード例 #5
0
void computeNetwork0_neon(const float *input, const float *weights, uint8_t *d) {
    float32x4_t m0 = { 0.0f, 0.0f, 0.0f, 0.0f };
    float32x4_t m1 = m0;
    float32x4_t m2 = m0;
    float32x4_t m3 = m0;

    float32x4_t m4, m5, m6, m7;

    for (int i = 0; i < 192/4; i += 4) {
        m4 = vld1q_f32(input + i);
        m5 = m4;
        m6 = m4;
        m7 = m4;

        m4 = vmulq_f32(m4, vld1q_f32(weights + i * 4));
        m5 = vmulq_f32(m5, vld1q_f32(weights + i * 4 + 4));
        m6 = vmulq_f32(m6, vld1q_f32(weights + i * 4 + 8));
        m7 = vmulq_f32(m7, vld1q_f32(weights + i * 4 + 12));

        m0 = vaddq_f32(m0, m4);
        m1 = vaddq_f32(m1, m5);
        m2 = vaddq_f32(m2, m6);
        m3 = vaddq_f32(m3, m7);
    }

    float32x2_t sum0 = vpadd_f32(vget_low_f32(m0), vget_high_f32(m0));
    float32x2_t sum1 = vpadd_f32(vget_low_f32(m1), vget_high_f32(m1));
    float32x2_t sum2 = vpadd_f32(vget_low_f32(m2), vget_high_f32(m2));
    float32x2_t sum3 = vpadd_f32(vget_low_f32(m3), vget_high_f32(m3));
    sum0 = vpadd_f32(sum0, sum1);
    sum1 = vpadd_f32(sum2, sum3);
    m0 = vcombine_f32(sum0, sum1);

    m0 = vaddq_f32(m0, vld1q_f32(weights + 768/4));

    m1 = m0;
    m0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m0), sign_bits_f_zero_l));
    m0 = vaddq_f32(m0, ones_f);
    m0 = vmulq_f32(reciprocal(m0), m1);

    m1 = vdupq_lane_f32(vget_low_f32(m0), 0);
    m2 = vdupq_lane_f32(vget_low_f32(m0), 1);
    m3 = vdupq_lane_f32(vget_high_f32(m0), 0);
    m4 = vdupq_lane_f32(vget_high_f32(m0), 1);

    m1 = vmulq_f32(m1, vld1q_f32(weights + 784/4));
    m2 = vmulq_f32(m2, vld1q_f32(weights + (784+16)/4));
    m3 = vmulq_f32(m3, vld1q_f32(weights + (784+32)/4));
    m4 = vmulq_f32(m4, vld1q_f32(weights + (784+48)/4));

    m1 = vaddq_f32(m1, m2);
    m3 = vaddq_f32(m3, m4);
    m1 = vaddq_f32(m1, m3);
    m1 = vaddq_f32(m1, vld1q_f32(weights + (784+64)/4));

    m7 = m1;
    m1 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m1), sign_bits_f));
    m1 = vaddq_f32(m1, ones_f);
    m7 = vmulq_f32(reciprocal(m1), m7);

    m3 = m0;

    m0 = vdupq_lane_f32(vget_low_f32(m0), 0);
    m1 = vdupq_lane_f32(vget_low_f32(m3), 1);
    m2 = vdupq_lane_f32(vget_high_f32(m3), 0);
    m3 = vdupq_lane_f32(vget_high_f32(m3), 1);

    m0 = vmulq_f32(m0, vld1q_f32(weights + 864/4));
    m1 = vmulq_f32(m1, vld1q_f32(weights + (864+16)/4));
    m2 = vmulq_f32(m2, vld1q_f32(weights + (864+32)/4));
    m3 = vmulq_f32(m3, vld1q_f32(weights + (864+48)/4));

    m4 = vdupq_lane_f32(vget_low_f32(m7), 0);
    m5 = vdupq_lane_f32(vget_low_f32(m7), 1);
    m6 = vdupq_lane_f32(vget_high_f32(m7), 0);
    m7 = vdupq_lane_f32(vget_high_f32(m7), 1);

    m4 = vmulq_f32(m4, vld1q_f32(weights + (864+64)/4));
    m5 = vmulq_f32(m5, vld1q_f32(weights + (864+80)/4));
    m6 = vmulq_f32(m6, vld1q_f32(weights + (864+96)/4));
    m7 = vmulq_f32(m7, vld1q_f32(weights + (864+112)/4));

    m0 = vaddq_f32(m0, m1);
    m2 = vaddq_f32(m2, m3);
    m4 = vaddq_f32(m4, m5);
    m6 = vaddq_f32(m6, m7);

    m0 = vaddq_f32(m0, m2);
    m4 = vaddq_f32(m4, m6);
    m0 = vaddq_f32(m0, m4);

    m0 = vaddq_f32(m0, vld1q_f32(weights + (864+128)/4));

    float32x2_t maximum = vmax_f32(vget_low_f32(m0), vget_high_f32(m0));
    d[0] = (vget_lane_f32(maximum, 1) <= vget_lane_f32(maximum, 0));
}
コード例 #6
0
ファイル: vtransform.hpp プロジェクト: 007Indian/opencv
inline float32x2_t vmax(const float32x2_t & v0, const float32x2_t & v1) { return vmax_f32(v0, v1); }
コード例 #7
0
float32x2_t
test_vmax_f32 (float32x2_t __a, float32x2_t __b)
{
  return vmax_f32(__a, __b);
}