示例#1
0
void test_vtrnQu32 (void)
{
  uint32x4x2_t out_uint32x4x2_t;
  uint32x4_t arg0_uint32x4_t;
  uint32x4_t arg1_uint32x4_t;

  out_uint32x4x2_t = vtrnq_u32 (arg0_uint32x4_t, arg1_uint32x4_t);
}
示例#2
0
uint32x4x2_t test_vtrnq_u32(uint32x4_t a, uint32x4_t b) {
  // CHECK-LABEL: test_vtrnq_u32
  return vtrnq_u32(a, b);
  // CHECK: trn1 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
  // CHECK: trn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
}
示例#3
0
void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y,  // unused
                              int y_step_q4,            // unused
                              int w, int h) {
  int width;
  const uint8_t *s, *psrc;
  uint8_t *d, *pdst;
  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
  uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
  uint8x16_t q12u8, q13u8, q14u8, q15u8;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16;
  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
  int16x8_t q0s16;
  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
  int32x4_t q1s32, q2s32, q14s32, q15s32;
  uint16x8x2_t q0x2u16;
  uint8x8x2_t d0x2u8, d1x2u8;
  uint32x2x2_t d0x2u32;
  uint16x4x2_t d0x2u16, d1x2u16;
  uint32x4x2_t q0x2u32;

  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_y;

  q0s16 = vld1q_s16(filter_x);

  src -= 3;  // adjust for taps
  for (; h > 0; h -= 4, src += src_stride * 4,
                dst += dst_stride * 4) {  // loop_horiz_v
    s = src;
    d24u8 = vld1_u8(s);
    s += src_stride;
    d25u8 = vld1_u8(s);
    s += src_stride;
    d26u8 = vld1_u8(s);
    s += src_stride;
    d27u8 = vld1_u8(s);

    q12u8 = vcombine_u8(d24u8, d25u8);
    q13u8 = vcombine_u8(d26u8, d27u8);

    q0x2u16 =
        vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
    d0x2u8 = vtrn_u8(d24u8, d25u8);
    d1x2u8 = vtrn_u8(d26u8, d27u8);

    __builtin_prefetch(src + src_stride * 4);
    __builtin_prefetch(src + src_stride * 5);
    __builtin_prefetch(src + src_stride * 6);

    q8u16 = vmovl_u8(d0x2u8.val[0]);
    q9u16 = vmovl_u8(d0x2u8.val[1]);
    q10u16 = vmovl_u8(d1x2u8.val[0]);
    q11u16 = vmovl_u8(d1x2u8.val[1]);

    d16u16 = vget_low_u16(q8u16);
    d17u16 = vget_high_u16(q8u16);
    d18u16 = vget_low_u16(q9u16);
    d19u16 = vget_high_u16(q9u16);
    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
    q9u16 = vcombine_u16(d17u16, d19u16);

    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
    for (width = w, psrc = src + 7, pdst = dst; width > 0;
         width -= 4, psrc += 4, pdst += 4) {  // loop_horiz
      s = psrc;
      d28u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d29u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d31u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d30u32 = vld1_dup_u32((const uint32_t *)s);

      __builtin_prefetch(psrc + 64);

      d0x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
      d1x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30

      __builtin_prefetch(psrc + 64 + src_stride);

      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
      q0x2u32 =
          vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));

      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
      q12u16 = vmovl_u8(d28u8);
      q13u16 = vmovl_u8(d29u8);

      __builtin_prefetch(psrc + 64 + src_stride * 2);

      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
                             d23s16, d24s16, q0s16);
      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
                             d24s16, d26s16, q0s16);
      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
                              d26s16, d27s16, q0s16);
      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
                              d27s16, d25s16, q0s16);

      __builtin_prefetch(psrc + 60 + src_stride * 3);

      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
      d4u16 = vqrshrun_n_s32(q14s32, 7);
      d5u16 = vqrshrun_n_s32(q15s32, 7);

      q1u16 = vcombine_u16(d2u16, d3u16);
      q2u16 = vcombine_u16(d4u16, d5u16);

      d2u8 = vqmovn_u16(q1u16);
      d3u8 = vqmovn_u16(q2u16);

      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
                         vreinterpret_u32_u16(d0x2u16.val[1]));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
                       vreinterpret_u8_u32(d0x2u32.val[1]));

      d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
      d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);

      d = pdst;
      vst1_lane_u32((uint32_t *)d, d2u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d2u32, 1);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 1);

      q8u16 = q9u16;
      d20s16 = d23s16;
      q11u16 = q12u16;
      q9u16 = q13u16;
      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    }
  }
  return;
}
示例#4
0
void vp8_mbloop_filter_vertical_edge_uv_neon(
        unsigned char *u,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh,
        unsigned char *v) {
    unsigned char *us, *ud;
    unsigned char *vs, *vd;
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;
    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;

    qblimit = vdupq_n_u8(blimit);
    qlimit = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);

    us = u - 4;
    vs = v - 4;
    d6 = vld1_u8(us);
    us += pitch;
    d7 = vld1_u8(vs);
    vs += pitch;
    d8 = vld1_u8(us);
    us += pitch;
    d9 = vld1_u8(vs);
    vs += pitch;
    d10 = vld1_u8(us);
    us += pitch;
    d11 = vld1_u8(vs);
    vs += pitch;
    d12 = vld1_u8(us);
    us += pitch;
    d13 = vld1_u8(vs);
    vs += pitch;
    d14 = vld1_u8(us);
    us += pitch;
    d15 = vld1_u8(vs);
    vs += pitch;
    d16 = vld1_u8(us);
    us += pitch;
    d17 = vld1_u8(vs);
    vs += pitch;
    d18 = vld1_u8(us);
    us += pitch;
    d19 = vld1_u8(vs);
    vs += pitch;
    d20 = vld1_u8(us);
    d21 = vld1_u8(vs);

    q3 = vcombine_u8(d6, d7);
    q4 = vcombine_u8(d8, d9);
    q5 = vcombine_u8(d10, d11);
    q6 = vcombine_u8(d12, d13);
    q7 = vcombine_u8(d14, d15);
    q8 = vcombine_u8(d16, d17);
    q9 = vcombine_u8(d18, d19);
    q10 = vcombine_u8(d20, d21);

    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                       vreinterpretq_u16_u32(q2tmp2.val[0]));
    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                       vreinterpretq_u16_u32(q2tmp3.val[0]));
    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                       vreinterpretq_u16_u32(q2tmp2.val[1]));
    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                       vreinterpretq_u16_u32(q2tmp3.val[1]));

    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                       vreinterpretq_u8_u16(q2tmp5.val[0]));
    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                       vreinterpretq_u8_u16(q2tmp5.val[1]));
    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                       vreinterpretq_u8_u16(q2tmp7.val[0]));
    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                       vreinterpretq_u8_u16(q2tmp7.val[1]));

    q3 = q2tmp8.val[0];
    q4 = q2tmp8.val[1];
    q5 = q2tmp9.val[0];
    q6 = q2tmp9.val[1];
    q7 = q2tmp10.val[0];
    q8 = q2tmp10.val[1];
    q9 = q2tmp11.val[0];
    q10 = q2tmp11.val[1];

    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                         q5, q6, q7, q8, q9, q10,
                         &q4, &q5, &q6, &q7, &q8, &q9);

    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                       vreinterpretq_u16_u32(q2tmp2.val[0]));
    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                       vreinterpretq_u16_u32(q2tmp3.val[0]));
    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                       vreinterpretq_u16_u32(q2tmp2.val[1]));
    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                       vreinterpretq_u16_u32(q2tmp3.val[1]));

    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                       vreinterpretq_u8_u16(q2tmp5.val[0]));
    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                       vreinterpretq_u8_u16(q2tmp5.val[1]));
    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                       vreinterpretq_u8_u16(q2tmp7.val[0]));
    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                       vreinterpretq_u8_u16(q2tmp7.val[1]));

    q3 = q2tmp8.val[0];
    q4 = q2tmp8.val[1];
    q5 = q2tmp9.val[0];
    q6 = q2tmp9.val[1];
    q7 = q2tmp10.val[0];
    q8 = q2tmp10.val[1];
    q9 = q2tmp11.val[0];
    q10 = q2tmp11.val[1];

    ud = u - 4;
    vst1_u8(ud, vget_low_u8(q3));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q4));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q5));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q6));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q7));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q8));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q9));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q10));

    vd = v - 4;
    vst1_u8(vd, vget_high_u8(q3));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q4));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q5));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q6));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q7));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q8));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q9));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q10));
    return;
}