void dotProd_i16_neon(const float *dataf, const float *weightsf, float *vals, const int n, const int len, const float *istd) {
    const int16_t *data = (const int16_t *)dataf;
    const int16_t *weights = (const int16_t *)weightsf;
    weightsf += n * len / 2; // sizeof(float) / sizeof(int16_t)

    for (int i = 0; i < n; i += 4) {
        int32x4_t accum0 = { 0, 0, 0, 0 };
        int32x4_t accum1 = accum0;
        int32x4_t accum2 = accum0;
        int32x4_t accum3 = accum0;

        for (int j = 0; j < len; j += 8) {
            int16x4x2_t d0 = vld2_s16(data + j);

            int16x4x2_t w0 = vld2_s16(weights);
            int16x4x2_t w1 = vld2_s16(weights + 8);
            int16x4x2_t w2 = vld2_s16(weights + 16);
            int16x4x2_t w3 = vld2_s16(weights + 24);

            accum0 = vmlal_s16(accum0, d0.val[0], w0.val[0]);
            accum0 = vmlal_s16(accum0, d0.val[1], w0.val[1]);

            accum1 = vmlal_s16(accum1, d0.val[0], w1.val[0]);
            accum1 = vmlal_s16(accum1, d0.val[1], w1.val[1]);

            accum2 = vmlal_s16(accum2, d0.val[0], w2.val[0]);
            accum2 = vmlal_s16(accum2, d0.val[1], w2.val[1]);

            accum3 = vmlal_s16(accum3, d0.val[0], w3.val[0]);
            accum3 = vmlal_s16(accum3, d0.val[1], w3.val[1]);

            weights += 32;
        }

        int32x2_t sum0 = vpadd_s32(vget_low_s32(accum0), vget_high_s32(accum0));
        int32x2_t sum1 = vpadd_s32(vget_low_s32(accum1), vget_high_s32(accum1));
        int32x2_t sum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
        int32x2_t sum3 = vpadd_s32(vget_low_s32(accum3), vget_high_s32(accum3));
        sum0 = vpadd_s32(sum0, sum1);
        sum1 = vpadd_s32(sum2, sum3);
        int32x4_t sum = vcombine_s32(sum0, sum1);

        float32x4_t val = vcvtq_f32_s32(sum);
        val = vmulq_f32(val, vld1q_f32(weightsf + i*2));
        val = vmulq_n_f32(val, istd[0]);
        val = vaddq_f32(val, vld1q_f32(weightsf + i*2 + 4));
        vst1q_f32(vals + i, val);
    }
}
Example #2
0
void ne10_img_hresize_4channels_linear_neon (const unsigned char** src, int** dst, int count,
        const int* xofs, const short* alpha,
        int swidth, int dwidth, int cn, int xmin, int xmax)
{
    int dx, k;
    int dx0 = 0;

    int16x4x2_t alpha_vec;

    uint8x8_t dS0_vec, dS1_vec;
    int16x8_t qS0_vec, qS1_vec;
    int16x4_t dS0_0123, dS0_4567, dS1_0123, dS1_4567;

    int32x4_t qT0_vec, qT1_vec;

    int16x4_t dCoeff;
    dCoeff = vdup_n_s16 (INTER_RESIZE_COEF_SCALE);

    for (k = 0; k <= count - 2; k++)
    {
        const unsigned char *S0 = src[k], *S1 = src[k + 1];
        int *D0 = dst[k], *D1 = dst[k + 1];

        for (dx = dx0; dx < xmax; dx += 4)
        {
            int sx = xofs[dx];

            alpha_vec = vld2_s16 (&alpha[dx * 2]);

            dS0_vec = vld1_u8 (&S0[sx]);
            dS1_vec = vld1_u8 (&S1[sx]);

            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));
            qS1_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS1_vec));

            dS0_0123 = vget_low_s16 (qS0_vec);
            dS0_4567 = vget_high_s16 (qS0_vec);
            dS1_0123 = vget_low_s16 (qS1_vec);
            dS1_4567 = vget_high_s16 (qS1_vec);

            qT0_vec = vmull_s16 (dS0_0123, alpha_vec.val[0]);
            qT1_vec = vmull_s16 (dS1_0123, alpha_vec.val[0]);
            qT0_vec = vmlal_s16 (qT0_vec, dS0_4567, alpha_vec.val[1]);
            qT1_vec = vmlal_s16 (qT1_vec, dS1_4567, alpha_vec.val[1]);

            vst1q_s32 (&D0[dx], qT0_vec);
            vst1q_s32 (&D1[dx], qT1_vec);
        }

        for (; dx < dwidth; dx += 4)
        {
            int sx = xofs[dx];

            dS0_vec = vld1_u8 (&S0[sx]);
            dS1_vec = vld1_u8 (&S1[sx]);

            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));
            qS1_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS1_vec));

            dS0_0123 = vget_low_s16 (qS0_vec);
            dS1_0123 = vget_low_s16 (qS1_vec);

            qT0_vec = vmull_s16 (dS0_0123, dCoeff);
            qT1_vec = vmull_s16 (dS1_0123, dCoeff);

            vst1q_s32 (&D0[dx], qT0_vec);
            vst1q_s32 (&D1[dx], qT1_vec);
        }
    }

    for (; k < count; k++)
    {
        const unsigned char *S = src[k];
        int *D = dst[k];
        for (dx = 0; dx < xmax; dx += 4)
        {
            int sx = xofs[dx];

            alpha_vec = vld2_s16 (&alpha[dx * 2]);

            dS0_vec = vld1_u8 (&S[sx]);
            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));

            dS0_0123 = vget_low_s16 (qS0_vec);
            dS0_4567 = vget_high_s16 (qS0_vec);

            qT0_vec = vmull_s16 (dS0_0123, alpha_vec.val[0]);
            qT0_vec = vmlal_s16 (qT0_vec, dS0_4567, alpha_vec.val[1]);

            vst1q_s32 (&D[dx], qT0_vec);
        }

        for (; dx < dwidth; dx += 4)
        {
            int sx = xofs[dx];

            dS0_vec = vld1_u8 (&S[sx]);
            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));
            dS0_0123 = vget_low_s16 (qS0_vec);
            qT0_vec = vmull_s16 (dS0_0123, dCoeff);

            vst1q_s32 (&D[dx], qT0_vec);
        }
    }
}
void computeNetwork0new_neon(const float *dataf, const float *weightsf, uint8_t *d) {
    const int16_t *data = (const int16_t *)dataf;
    const int16_t *weights = (const int16_t *)weightsf;

    int32x4_t accum0 = { 0, 0, 0, 0 };
    int32x4_t accum1 = accum0;
    int32x4_t accum2 = accum0;
    int32x4_t accum3 = accum0;

    for (int i = 0; i < 128/2; i += 8) {
        int16x4x2_t d0 = vld2_s16(data + i);

        int16x4x2_t w0 = vld2_s16(weights + i * 4);
        int16x4x2_t w1 = vld2_s16(weights + i * 4 + 8);
        int16x4x2_t w2 = vld2_s16(weights + i * 4 + 16);
        int16x4x2_t w3 = vld2_s16(weights + i * 4 + 24);

        accum0 = vmlal_s16(accum0, d0.val[0], w0.val[0]);
        accum0 = vmlal_s16(accum0, d0.val[1], w0.val[1]);

        accum1 = vmlal_s16(accum1, d0.val[0], w1.val[0]);
        accum1 = vmlal_s16(accum1, d0.val[1], w1.val[1]);

        accum2 = vmlal_s16(accum2, d0.val[0], w2.val[0]);
        accum2 = vmlal_s16(accum2, d0.val[1], w2.val[1]);

        accum3 = vmlal_s16(accum3, d0.val[0], w3.val[0]);
        accum3 = vmlal_s16(accum3, d0.val[1], w3.val[1]);
    }

    int32x2_t sum0 = vpadd_s32(vget_low_s32(accum0), vget_high_s32(accum0));
    int32x2_t sum1 = vpadd_s32(vget_low_s32(accum1), vget_high_s32(accum1));
    int32x2_t sum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
    int32x2_t sum3 = vpadd_s32(vget_low_s32(accum3), vget_high_s32(accum3));
    sum0 = vpadd_s32(sum0, sum1);
    sum1 = vpadd_s32(sum2, sum3);
    int32x4_t sum = vcombine_s32(sum0, sum1);

    float32x4_t m0 = vcvtq_f32_s32(sum);

    m0 = vmulq_f32(m0, vld1q_f32(weightsf + 512/4));
    m0 = vaddq_f32(m0, vld1q_f32(weightsf + 528/4));

    float32x4_t m1, m2, m3, m4;

    m1 = m0;

    m0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m0), sign_bits_f));
    m0 = vaddq_f32(m0, ones_f);
    m0 = vmulq_f32(reciprocal(m0), m1);

    m1 = vdupq_lane_f32(vget_low_f32(m0), 0);
    m2 = vdupq_lane_f32(vget_low_f32(m0), 1);
    m3 = vdupq_lane_f32(vget_high_f32(m0), 0);
    m4 = vdupq_lane_f32(vget_high_f32(m0), 1);

    m1 = vmulq_f32(m1, vld1q_f32(weightsf + 544/4));
    m2 = vmulq_f32(m2, vld1q_f32(weightsf + 560/4));
    m3 = vmulq_f32(m3, vld1q_f32(weightsf + 576/4));
    m4 = vmulq_f32(m4, vld1q_f32(weightsf + 592/4));

    m1 = vaddq_f32(m1, m2);
    m3 = vaddq_f32(m3, m4);
    m1 = vaddq_f32(m1, m3);
    m1 = vaddq_f32(m1, vld1q_f32(weightsf + 608/4));

    uint32x4_t gte = vcgeq_f32(m1, zeroes_f);
    uint16x4_t gte_u16 = vmovn_u32(gte);
    uint8x8_t gte_u8 = vmovn_u16(vcombine_u16(gte_u16, vget_low_u16(vreinterpretq_u16_u32(sign_bits_f))));
    gte_u8 = vshr_n_u8(gte_u8, 7);
    vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(gte_u8), 0);
}
void computeNetwork0_i16_neon(const float *inputf, const float *weightsf, uint8_t *d) {
    const int16_t *input = (const int16_t *)inputf;
    const int16_t *weights = (const int16_t *)weightsf;

    int32x4_t accum0 = { 0, 0, 0, 0 };
    int32x4_t accum1 = accum0;
    int32x4_t accum2 = accum0;
    int32x4_t accum3 = accum0;

    for (int i = 0; i < 96/2; i += 8) {
        int16x4x2_t d0 = vld2_s16(input + i);

        int16x4x2_t w0 = vld2_s16(weights + i * 4);
        int16x4x2_t w1 = vld2_s16(weights + i * 4 + 8);
        int16x4x2_t w2 = vld2_s16(weights + i * 4 + 16);
        int16x4x2_t w3 = vld2_s16(weights + i * 4 + 24);

        accum0 = vmlal_s16(accum0, d0.val[0], w0.val[0]);
        accum0 = vmlal_s16(accum0, d0.val[1], w0.val[1]);

        accum1 = vmlal_s16(accum1, d0.val[0], w1.val[0]);
        accum1 = vmlal_s16(accum1, d0.val[1], w1.val[1]);

        accum2 = vmlal_s16(accum2, d0.val[0], w2.val[0]);
        accum2 = vmlal_s16(accum2, d0.val[1], w2.val[1]);

        accum3 = vmlal_s16(accum3, d0.val[0], w3.val[0]);
        accum3 = vmlal_s16(accum3, d0.val[1], w3.val[1]);
    }

    int32x2_t sum0 = vpadd_s32(vget_low_s32(accum0), vget_high_s32(accum0));
    int32x2_t sum1 = vpadd_s32(vget_low_s32(accum1), vget_high_s32(accum1));
    int32x2_t sum2 = vpadd_s32(vget_low_s32(accum2), vget_high_s32(accum2));
    int32x2_t sum3 = vpadd_s32(vget_low_s32(accum3), vget_high_s32(accum3));
    sum0 = vpadd_s32(sum0, sum1);
    sum1 = vpadd_s32(sum2, sum3);
    int32x4_t sum = vcombine_s32(sum0, sum1);

    float32x4_t m0 = vcvtq_f32_s32(sum);

    m0 = vmulq_f32(m0, vld1q_f32(weightsf + 384/4));
    m0 = vaddq_f32(m0, vld1q_f32(weightsf + 400/4));

    float32x4_t m1, m2, m3, m4, m5, m6, m7;

    m1 = m0;

    m0 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m0), sign_bits_f_zero_l));
    m0 = vaddq_f32(m0, ones_f);
    m0 = vmulq_f32(reciprocal(m0), m1);

    m1 = vdupq_lane_f32(vget_low_f32(m0), 0);
    m2 = vdupq_lane_f32(vget_low_f32(m0), 1);
    m3 = vdupq_lane_f32(vget_high_f32(m0), 0);
    m4 = vdupq_lane_f32(vget_high_f32(m0), 1);

    m1 = vmulq_f32(m1, vld1q_f32(weightsf + 416/4));
    m2 = vmulq_f32(m2, vld1q_f32(weightsf + (416+16)/4));
    m3 = vmulq_f32(m3, vld1q_f32(weightsf + (416+32)/4));
    m4 = vmulq_f32(m4, vld1q_f32(weightsf + (416+48)/4));

    m1 = vaddq_f32(m1, m2);
    m3 = vaddq_f32(m3, m4);
    m1 = vaddq_f32(m1, m3);
    m1 = vaddq_f32(m1, vld1q_f32(weightsf + (416+64)/4));

    m7 = m1;
    m1 = vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(m1), sign_bits_f));
    m1 = vaddq_f32(m1, ones_f);
    m7 = vmulq_f32(reciprocal(m1), m7);

    m3 = m0;

    m0 = vdupq_lane_f32(vget_low_f32(m0), 0);
    m1 = vdupq_lane_f32(vget_low_f32(m3), 1);
    m2 = vdupq_lane_f32(vget_high_f32(m3), 0);
    m3 = vdupq_lane_f32(vget_high_f32(m3), 1);

    m0 = vmulq_f32(m0, vld1q_f32(weightsf + 496/4));
    m1 = vmulq_f32(m1, vld1q_f32(weightsf + (496+16)/4));
    m2 = vmulq_f32(m2, vld1q_f32(weightsf + (496+32)/4));
    m3 = vmulq_f32(m3, vld1q_f32(weightsf + (496+48)/4));

    m4 = vdupq_lane_f32(vget_low_f32(m7), 0);
    m5 = vdupq_lane_f32(vget_low_f32(m7), 1);
    m6 = vdupq_lane_f32(vget_high_f32(m7), 0);
    m7 = vdupq_lane_f32(vget_high_f32(m7), 1);

    m4 = vmulq_f32(m4, vld1q_f32(weightsf + (496+64)/4));
    m5 = vmulq_f32(m5, vld1q_f32(weightsf + (496+80)/4));
    m6 = vmulq_f32(m6, vld1q_f32(weightsf + (496+96)/4));
    m7 = vmulq_f32(m7, vld1q_f32(weightsf + (496+112)/4));

    m0 = vaddq_f32(m0, m1);
    m2 = vaddq_f32(m2, m3);
    m4 = vaddq_f32(m4, m5);
    m6 = vaddq_f32(m6, m7);

    m0 = vaddq_f32(m0, m2);
    m4 = vaddq_f32(m4, m6);
    m0 = vaddq_f32(m0, m4);

    m0 = vaddq_f32(m0, vld1q_f32(weightsf + (496+128)/4));

    float32x2_t maximum = vmax_f32(vget_low_f32(m0), vget_high_f32(m0));
    d[0] = (vget_lane_f32(maximum, 1) <= vget_lane_f32(maximum, 0));
}
Example #5
0
inline   int16x4x2_t vld2(const s16 * ptr) { return vld2_s16(ptr); }
Example #6
0
void test_vld2s16 (void)
{
  int16x4x2_t out_int16x4x2_t;

  out_int16x4x2_t = vld2_s16 (0);
}