/*
 * Loads eight rows (row r at src + pitch * r) and returns the first four
 * byte columns of that block transposed: in the result, val[k] holds byte k
 * of rows 0..7 in lanes 0..7.
 *
 * Every row is fetched with an 8-byte load even though only its low four
 * bytes end up in the result.
 */
static INLINE
uint8x8x4_t read_4x8(unsigned char *src, int pitch) {
    uint8x8_t row[8];
    uint32x2x2_t w04, w15, w26, w37;
    uint16x4x2_t h02, h13;
    uint8x8x2_t cols01, cols23;
    uint8x8x4_t cols;
    int r;

    for (r = 0; r < 8; ++r) {
        row[r] = vld1_u8(src + pitch * r);
    }

    /*
     * Stage 1 (32-bit transpose): pair row r with row r + 4. Only val[0],
     * which carries the low four bytes of both rows, is used afterwards.
     *   w04.val[0]: 00 01 02 03 | 40 41 42 43
     *   w15.val[0]: 10 11 12 13 | 50 51 52 53
     *   w26.val[0]: 20 21 22 23 | 60 61 62 63
     *   w37.val[0]: 30 31 32 33 | 70 71 72 73
     */
    w04 = vtrn_u32(vreinterpret_u32_u8(row[0]), vreinterpret_u32_u8(row[4]));
    w15 = vtrn_u32(vreinterpret_u32_u8(row[1]), vreinterpret_u32_u8(row[5]));
    w26 = vtrn_u32(vreinterpret_u32_u8(row[2]), vreinterpret_u32_u8(row[6]));
    w37 = vtrn_u32(vreinterpret_u32_u8(row[3]), vreinterpret_u32_u8(row[7]));

    /*
     * Stage 2 (16-bit transpose): combine the even-row and odd-row pairs.
     *   h02.val[0]: 00 01 20 21 | 40 41 60 61
     *   h02.val[1]: 02 03 22 23 | 42 43 62 63
     *   h13.val[0]: 10 11 30 31 | 50 51 70 71
     *   h13.val[1]: 12 13 32 33 | 52 53 72 73
     */
    h02 = vtrn_u16(vreinterpret_u16_u32(w04.val[0]),
                   vreinterpret_u16_u32(w26.val[0]));
    h13 = vtrn_u16(vreinterpret_u16_u32(w15.val[0]),
                   vreinterpret_u16_u32(w37.val[0]));

    /*
     * Stage 3 (8-bit transpose): interleave even and odd rows byte by byte.
     *   cols01.val[0]: 00 10 20 30 | 40 50 60 70
     *   cols01.val[1]: 01 11 21 31 | 41 51 61 71
     *   cols23.val[0]: 02 12 22 32 | 42 52 62 72
     *   cols23.val[1]: 03 13 23 33 | 43 53 63 73
     */
    cols01 = vtrn_u8(vreinterpret_u8_u16(h02.val[0]),
                     vreinterpret_u8_u16(h13.val[0]));
    cols23 = vtrn_u8(vreinterpret_u8_u16(h02.val[1]),
                     vreinterpret_u8_u16(h13.val[1]));

    cols.val[0] = cols01.val[0];
    cols.val[1] = cols01.val[1];
    cols.val[2] = cols23.val[0];
    cols.val[3] = cols23.val[1];

    return cols;
}
/*
 * Writes a block that is two bytes wide and eight rows tall.
 *
 * For k = 0..7, the two bytes at dst + k * pitch receive lane k of
 * result.val[0] followed by lane k of result.val[1]:
 *
 *   result.val[0]: 00 01 02 03 | 04 05 06 07
 *   result.val[1]: 10 11 12 13 | 14 15 16 17
 *
 *   row 0: 00 10
 *   row 1: 01 11
 *   ...
 *   row 7: 07 17
 *
 * The pairs are stored with the byte-wise two-element lane store, so dst is
 * never reinterpreted as a uint16_t pointer and no alignment beyond that of
 * unsigned char is assumed for dst or pitch.
 */
static INLINE void write_2x4(unsigned char *dst, int pitch,
                             const uint8x8x2_t result) {
    /* The lane index must be a compile-time constant, hence the unrolling. */
    vst2_lane_u8(dst, result, 0);
    dst += pitch;
    vst2_lane_u8(dst, result, 1);
    dst += pitch;
    vst2_lane_u8(dst, result, 2);
    dst += pitch;
    vst2_lane_u8(dst, result, 3);
    dst += pitch;
    vst2_lane_u8(dst, result, 4);
    dst += pitch;
    vst2_lane_u8(dst, result, 5);
    dst += pitch;
    vst2_lane_u8(dst, result, 6);
    dst += pitch;
    vst2_lane_u8(dst, result, 7);
}
/*
 * Applies mbloop_filter_neon (defined elsewhere) to one group of eight rows
 * around column src.
 *
 * Eight bytes are read from each row starting at src - 4, the 8x8 block is
 * transposed so that each vector holds one column, and the eight columns are
 * handed to the filter together with the blimit/limit/thresh vectors. The
 * filter's six output vectors are written back to columns src - 3 .. src + 2
 * of the same eight rows.
 *
 * blimit, limit and thresh are each read as eight bytes.
 */
void aom_lpf_vertical_8_neon(uint8_t *src, int pitch, const uint8_t *blimit,
                             const uint8_t *limit, const uint8_t *thresh) {
  uint8x8_t row[8];
  uint32x2x2_t w04, w15, w26, w37;
  uint16x4x2_t h02_lo, h13_lo, h02_hi, h13_hi;
  uint8x8x2_t c01, c23, c45, c67;
  uint8x8x4_t left;  /* filter outputs for columns src - 3 .. src */
  uint8x8x2_t right; /* filter outputs for columns src + 1, src + 2 */
  uint8_t *out;
  int r;

  const uint8x8_t vblimit = vld1_u8(blimit);
  const uint8x8_t vlimit = vld1_u8(limit);
  const uint8x8_t vthresh = vld1_u8(thresh);

  for (r = 0; r < 8; ++r) {
    row[r] = vld1_u8(src - 4 + r * pitch);
  }

  /* 8x8 byte transpose: 32-bit, then 16-bit, then 8-bit element swaps. */
  w04 = vtrn_u32(vreinterpret_u32_u8(row[0]), vreinterpret_u32_u8(row[4]));
  w15 = vtrn_u32(vreinterpret_u32_u8(row[1]), vreinterpret_u32_u8(row[5]));
  w26 = vtrn_u32(vreinterpret_u32_u8(row[2]), vreinterpret_u32_u8(row[6]));
  w37 = vtrn_u32(vreinterpret_u32_u8(row[3]), vreinterpret_u32_u8(row[7]));

  h02_lo = vtrn_u16(vreinterpret_u16_u32(w04.val[0]),
                    vreinterpret_u16_u32(w26.val[0]));
  h13_lo = vtrn_u16(vreinterpret_u16_u32(w15.val[0]),
                    vreinterpret_u16_u32(w37.val[0]));
  h02_hi = vtrn_u16(vreinterpret_u16_u32(w04.val[1]),
                    vreinterpret_u16_u32(w26.val[1]));
  h13_hi = vtrn_u16(vreinterpret_u16_u32(w15.val[1]),
                    vreinterpret_u16_u32(w37.val[1]));

  c01 = vtrn_u8(vreinterpret_u8_u16(h02_lo.val[0]),
                vreinterpret_u8_u16(h13_lo.val[0]));
  c23 = vtrn_u8(vreinterpret_u8_u16(h02_lo.val[1]),
                vreinterpret_u8_u16(h13_lo.val[1]));
  c45 = vtrn_u8(vreinterpret_u8_u16(h02_hi.val[0]),
                vreinterpret_u8_u16(h13_hi.val[0]));
  c67 = vtrn_u8(vreinterpret_u8_u16(h02_hi.val[1]),
                vreinterpret_u8_u16(h13_hi.val[1]));

  /* Columns go in by value; the six results land in the store structs. */
  mbloop_filter_neon(vblimit, vlimit, vthresh, c01.val[0], c01.val[1],
                     c23.val[0], c23.val[1], c45.val[0], c45.val[1],
                     c67.val[0], c67.val[1], &left.val[0], &left.val[1],
                     &left.val[2], &left.val[3], &right.val[0],
                     &right.val[1]);

  /* Lane k of the four "left" vectors forms the 4 bytes of row k. */
  out = src - 3;
  vst4_lane_u8(out, left, 0);
  vst4_lane_u8(out + pitch, left, 1);
  vst4_lane_u8(out + 2 * pitch, left, 2);
  vst4_lane_u8(out + 3 * pitch, left, 3);
  vst4_lane_u8(out + 4 * pitch, left, 4);
  vst4_lane_u8(out + 5 * pitch, left, 5);
  vst4_lane_u8(out + 6 * pitch, left, 6);
  vst4_lane_u8(out + 7 * pitch, left, 7);

  /* Lane k of the two "right" vectors forms the next 2 bytes of row k. */
  out = src + 1;
  vst2_lane_u8(out, right, 0);
  vst2_lane_u8(out + pitch, right, 1);
  vst2_lane_u8(out + 2 * pitch, right, 2);
  vst2_lane_u8(out + 3 * pitch, right, 3);
  vst2_lane_u8(out + 4 * pitch, right, 4);
  vst2_lane_u8(out + 5 * pitch, right, 5);
  vst2_lane_u8(out + 6 * pitch, right, 6);
  vst2_lane_u8(out + 7 * pitch, right, 7);
}
Exemple #4
0
// Thin wrapper that returns vtrn_u8(a, b) so the code generated for the
// intrinsic can be inspected. The comment directives inside the body look
// like LLVM FileCheck patterns (NOTE(review): the test's run line is not
// visible here, confirm); they expect one trn1 and one trn2 instruction on
// .8b operands, in that order, so their text and position must not change.
uint8x8x2_t test_vtrn_u8(uint8x8_t a, uint8x8_t b) {
  // CHECK-LABEL: test_vtrn_u8
  return vtrn_u8(a, b);
  // CHECK: trn1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
  // CHECK: trn2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
}
/*
 * Applies vp9_loop_filter_neon (defined elsewhere) across the vertical edge
 * at column src, for `count` consecutive groups of eight rows.
 *
 * For each group, eight bytes are read from every row starting at src - 4,
 * the 8x8 block is transposed so each vector holds one column, the columns
 * are passed to the filter, and the four vectors it returns are written back
 * to columns src - 2 .. src + 1 of the same eight rows.
 *
 *   src     pointer to the first row, at the column of the edge
 *   pitch   distance in bytes between consecutive rows
 *   blimit, limit, thresh
 *           filter parameters; eight bytes are read from each
 *   count   number of 8-row groups to process; 0 returns immediately
 *
 * src is only advanced by the loop header (8 rows per group); all loads and
 * stores go through the scratch pointer s, so every group is read and
 * written at the same columns relative to the edge.
 */
void vp9_lpf_vertical_4_neon(
        unsigned char *src,
        int pitch,
        unsigned char *blimit,
        unsigned char *limit,
        unsigned char *thresh,
        int count) {
    int i, pitch8;
    uint8_t *s;
    uint8x8_t dblimit, dlimit, dthresh;
    uint8x8_t d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8;
    uint32x2x2_t d2tmp0, d2tmp1, d2tmp2, d2tmp3;
    uint16x4x2_t d2tmp4, d2tmp5, d2tmp6, d2tmp7;
    uint8x8x2_t d2tmp8, d2tmp9, d2tmp10, d2tmp11;
    uint8x8x4_t d4Result;

    if (count == 0)  // end_vp9_lf_h_edge
        return;

    dblimit = vld1_u8(blimit);
    dlimit = vld1_u8(limit);
    dthresh = vld1_u8(thresh);

    pitch8 = pitch * 8;
    for (i = 0; i < count; i++, src += pitch8) {
        /* Load 8 rows of 8 bytes, starting 4 columns left of the edge. */
        s = src - 4;

        d3u8 = vld1_u8(s);
        s += pitch;
        d4u8 = vld1_u8(s);
        s += pitch;
        d5u8 = vld1_u8(s);
        s += pitch;
        d6u8 = vld1_u8(s);
        s += pitch;
        d7u8 = vld1_u8(s);
        s += pitch;
        d16u8 = vld1_u8(s);
        s += pitch;
        d17u8 = vld1_u8(s);
        s += pitch;
        d18u8 = vld1_u8(s);

        /* 8x8 byte transpose: 32-bit, then 16-bit, then 8-bit swaps. */
        d2tmp0 = vtrn_u32(vreinterpret_u32_u8(d3u8),
                      vreinterpret_u32_u8(d7u8));
        d2tmp1 = vtrn_u32(vreinterpret_u32_u8(d4u8),
                      vreinterpret_u32_u8(d16u8));
        d2tmp2 = vtrn_u32(vreinterpret_u32_u8(d5u8),
                      vreinterpret_u32_u8(d17u8));
        d2tmp3 = vtrn_u32(vreinterpret_u32_u8(d6u8),
                      vreinterpret_u32_u8(d18u8));

        d2tmp4 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[0]),
                          vreinterpret_u16_u32(d2tmp2.val[0]));
        d2tmp5 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[0]),
                          vreinterpret_u16_u32(d2tmp3.val[0]));
        d2tmp6 = vtrn_u16(vreinterpret_u16_u32(d2tmp0.val[1]),
                          vreinterpret_u16_u32(d2tmp2.val[1]));
        d2tmp7 = vtrn_u16(vreinterpret_u16_u32(d2tmp1.val[1]),
                          vreinterpret_u16_u32(d2tmp3.val[1]));

        d2tmp8 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[0]),
                         vreinterpret_u8_u16(d2tmp5.val[0]));
        d2tmp9 = vtrn_u8(vreinterpret_u8_u16(d2tmp4.val[1]),
                         vreinterpret_u8_u16(d2tmp5.val[1]));
        d2tmp10 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[0]),
                          vreinterpret_u8_u16(d2tmp7.val[0]));
        d2tmp11 = vtrn_u8(vreinterpret_u8_u16(d2tmp6.val[1]),
                          vreinterpret_u8_u16(d2tmp7.val[1]));

        /* Each vector now holds one column (src - 4 .. src + 3). */
        d3u8 = d2tmp8.val[0];
        d4u8 = d2tmp8.val[1];
        d5u8 = d2tmp9.val[0];
        d6u8 = d2tmp9.val[1];
        d7u8 = d2tmp10.val[0];
        d16u8 = d2tmp10.val[1];
        d17u8 = d2tmp11.val[0];
        d18u8 = d2tmp11.val[1];

        /* Inputs are passed by value; the four results overwrite d4..d7. */
        vp9_loop_filter_neon(dblimit, dlimit, dthresh,
                             d3u8, d4u8, d5u8, d6u8, d7u8, d16u8, d17u8, d18u8,
                             &d4u8, &d5u8, &d6u8, &d7u8);

        d4Result.val[0] = d4u8;
        d4Result.val[1] = d5u8;
        d4Result.val[2] = d6u8;
        d4Result.val[3] = d7u8;

        /*
         * Store 4 bytes per row starting 2 columns left of the edge. The
         * scratch pointer is used so that src itself is left untouched for
         * the next group.
         */
        s = src - 2;
        vst4_lane_u8(s, d4Result, 0);
        s += pitch;
        vst4_lane_u8(s, d4Result, 1);
        s += pitch;
        vst4_lane_u8(s, d4Result, 2);
        s += pitch;
        vst4_lane_u8(s, d4Result, 3);
        s += pitch;
        vst4_lane_u8(s, d4Result, 4);
        s += pitch;
        vst4_lane_u8(s, d4Result, 5);
        s += pitch;
        vst4_lane_u8(s, d4Result, 6);
        s += pitch;
        vst4_lane_u8(s, d4Result, 7);
    }
    return;
}
// Horizontal filtering of a w x h block from src into dst using the eight
// 16-bit coefficients read from filter_x.
//
// The block is processed in tiles of 4 rows by 4 columns. For every tile the
// source bytes are transposed so that each 4-lane vector holds one source
// column across the four rows, MULTIPLY_BY_Q0 (defined elsewhere; presumably
// the 8-tap multiply-accumulate, confirm) is evaluated for four adjacent
// output columns, the sums are rounded, shifted right by 7 and saturated to
// 8 bits, transposed back and stored as 4 bytes per row.
//
// Only x_step_q4 == 16 is supported (asserted). filter_y and y_step_q4 are
// ignored.
//
// NOTE(review): the loops step by 4 with no remainder handling visible here,
// so w and h are presumably multiples of 4 -- confirm against callers.
// NOTE(review): source and destination pointers are cast to uint32_t * for
// the 4-byte lane loads/stores; nothing in this function establishes 4-byte
// alignment of those addresses.
void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y,  // unused
                              int y_step_q4,            // unused
                              int w, int h) {
  int width;
  const uint8_t *s, *psrc;
  uint8_t *d, *pdst;
  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
  uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
  uint8x16_t q12u8, q13u8, q14u8, q15u8;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16;
  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
  int16x8_t q0s16;
  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
  int32x4_t q1s32, q2s32, q14s32, q15s32;
  uint16x8x2_t q0x2u16;
  uint8x8x2_t d0x2u8, d1x2u8;
  uint32x2x2_t d0x2u32;
  uint16x4x2_t d0x2u16, d1x2u16;
  uint32x4x2_t q0x2u32;

  assert(x_step_q4 == 16);

  // Silence unused-parameter warnings (x_step_q4 is otherwise only used by
  // the assert above).
  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_y;

  // All eight filter coefficients in one 128-bit register.
  q0s16 = vld1q_s16(filter_x);

  src -= 3;  // adjust for taps
  // Outer loop: one band of 4 rows per iteration.
  for (; h > 0; h -= 4, src += src_stride * 4,
                dst += dst_stride * 4) {  // loop_horiz_v
    // Prime the pipeline with the first 8 bytes of each of the 4 rows.
    s = src;
    d24u8 = vld1_u8(s);
    s += src_stride;
    d25u8 = vld1_u8(s);
    s += src_stride;
    d26u8 = vld1_u8(s);
    s += src_stride;
    d27u8 = vld1_u8(s);

    q12u8 = vcombine_u8(d24u8, d25u8);
    q13u8 = vcombine_u8(d26u8, d27u8);

    // Transpose the 4x8 byte block (16-bit swap followed by 8-bit swap) so
    // that bytes of the same column from the four rows sit next to each other.
    q0x2u16 =
        vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
    d0x2u8 = vtrn_u8(d24u8, d25u8);
    d1x2u8 = vtrn_u8(d26u8, d27u8);

    // Prefetch rows that belong to the next band.
    __builtin_prefetch(src + src_stride * 4);
    __builtin_prefetch(src + src_stride * 5);
    __builtin_prefetch(src + src_stride * 6);

    // Widen the transposed bytes to 16 bits for the multiply stage.
    q8u16 = vmovl_u8(d0x2u8.val[0]);
    q9u16 = vmovl_u8(d0x2u8.val[1]);
    q10u16 = vmovl_u8(d1x2u8.val[0]);
    q11u16 = vmovl_u8(d1x2u8.val[1]);

    // Regroup halves so q8u16/q9u16 each hold two adjacent columns.
    d16u16 = vget_low_u16(q8u16);
    d17u16 = vget_high_u16(q8u16);
    d18u16 = vget_low_u16(q9u16);
    d19u16 = vget_high_u16(q9u16);
    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
    q9u16 = vcombine_u16(d17u16, d19u16);

    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
    // Inner loop: 4 output columns per iteration. psrc starts at src + 7,
    // i.e. just past the first seven bytes of the block primed above.
    for (width = w, psrc = src + 7, pdst = dst; width > 0;
         width -= 4, psrc += 4, pdst += 4) {  // loop_horiz
      // Fetch the next 4 source bytes of each row (duplicated to both lanes).
      // Rows 2 and 3 go to d31/d30 in that order on purpose; the vcombine
      // below undoes the swap.
      s = psrc;
      d28u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d29u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d31u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d30u32 = vld1_dup_u32((const uint32_t *)s);

      __builtin_prefetch(psrc + 64);

      // Transpose the freshly loaded 4x4 bytes.
      d0x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
      d1x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30

      __builtin_prefetch(psrc + 64 + src_stride);

      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
      q0x2u32 =
          vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));

      // Only val[0] of the 32-bit transpose is needed; widen it to 16 bits.
      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
      q12u16 = vmovl_u8(d28u8);
      q13u16 = vmovl_u8(d29u8);

      __builtin_prefetch(psrc + 64 + src_stride * 2);

      // Split the carried (q8/q9/q11) and new (q12/q13) data into 4-lane
      // signed vectors for the multiply stage.
      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

      // Four invocations, each with the 8-vector argument window advanced by
      // one vector relative to the previous call.
      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
                             d23s16, d24s16, q0s16);
      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
                             d24s16, d26s16, q0s16);
      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
                              d26s16, d27s16, q0s16);
      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
                              d27s16, d25s16, q0s16);

      // NOTE(review): offset is 60 here while the other prefetches use 64.
      __builtin_prefetch(psrc + 60 + src_stride * 3);

      // Rounding shift right by 7 with unsigned saturation to 16 bits...
      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
      d4u16 = vqrshrun_n_s32(q14s32, 7);
      d5u16 = vqrshrun_n_s32(q15s32, 7);

      q1u16 = vcombine_u16(d2u16, d3u16);
      q2u16 = vcombine_u16(d4u16, d5u16);

      // ...then saturating narrow to 8 bits.
      d2u8 = vqmovn_u16(q1u16);
      d3u8 = vqmovn_u16(q2u16);

      // Transpose the results back so each 32-bit lane is 4 pixels of a row.
      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
                         vreinterpret_u32_u16(d0x2u16.val[1]));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
                       vreinterpret_u8_u32(d0x2u32.val[1]));

      d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
      d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);

      // Store 4 bytes to each of the 4 destination rows.
      d = pdst;
      vst1_lane_u32((uint32_t *)d, d2u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d2u32, 1);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 1);

      // Slide the window: the newest data becomes the carried state that the
      // next iteration reads from q8u16, q9u16, q11u16, d20s16 and d23s16.
      q8u16 = q9u16;
      d20s16 = d23s16;
      q11u16 = q12u16;
      q9u16 = q13u16;
      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    }
  }
  return;
}