Пример #1
0
static WEBP_INLINE uint32_t Average3(const uint32_t* const a,
                                     const uint32_t* const b,
                                     const uint32_t* const c) {
  const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a));
  const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b));
  const uint8x8_t c0 = vreinterpret_u8_u64(vcreate_u64(*c));
  const uint8x8_t avg1 = vhadd_u8(a0, c0);
  const uint8x8_t avg2 = vhadd_u8(avg1, b0);
  return vget_lane_u32(vreinterpret_u32_u8(avg2), 0);
}
Пример #2
0
static WEBP_INLINE uint32_t Average4(const uint32_t* const a,
                                     const uint32_t* const b,
                                     const uint32_t* const c,
                                     const uint32_t* const d) {
  const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a));
  const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b));
  const uint8x8_t c0 = vreinterpret_u8_u64(vcreate_u64(*c));
  const uint8x8_t d0 = vreinterpret_u8_u64(vcreate_u64(*d));
  const uint8x8_t avg1 = vhadd_u8(a0, b0);
  const uint8x8_t avg2 = vhadd_u8(c0, d0);
  const uint8x8_t avg3 = vhadd_u8(avg1, avg2);
  return vget_lane_u32(vreinterpret_u32_u8(avg3), 0);
}
Пример #3
0
static WEBP_INLINE uint32_t Average3(const uint32_t* const a,
                                     const uint32_t* const b,
                                     const uint32_t* const c) {
    const uint64x1_t a0 = { *a }, b0 = { *b }, c0 = { *c };
    const uint8x8_t a1 = vreinterpret_u8_u64(a0);
    const uint8x8_t b1 = vreinterpret_u8_u64(b0);
    const uint8x8_t c1 = vreinterpret_u8_u64(c0);
    const uint8x8_t avg1 = vhadd_u8(a1, c1);
    const uint8x8_t avg2 = vhadd_u8(avg1, b1);
    uint32_t ret;
    vst1_lane_u32(&ret, vreinterpret_u32_u8(avg2), 0);
    return ret;
}
Пример #4
0
static WEBP_INLINE uint32_t Average2(const uint32_t* const a,
                                     const uint32_t* const b) {
  const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a));
  const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b));
  const uint8x8_t avg = vhadd_u8(a0, b0);
  return vget_lane_u32(vreinterpret_u32_u8(avg), 0);
}
Пример #5
0
void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint8x8_t XABCD_u8 = vld1_u8(above - 1);
  const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
  const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
  const uint32x2_t zero = vdup_n_u32(0);
  const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0);
  const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL);
  const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8));
  const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
  const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
  const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
  const uint8_t D = vget_lane_u8(XABCD_u8, 4);
  const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
  const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
  const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
  const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
}
Пример #6
0
static WEBP_INLINE uint32_t Average2(const uint32_t* const a,
                                     const uint32_t* const b) {
    const uint64x1_t a0 = { *a }, b0 = { *b };
    const uint8x8_t a1 = vreinterpret_u8_u64(a0);
    const uint8x8_t b1 = vreinterpret_u8_u64(b0);
    const uint8x8_t avg = vhadd_u8(a1, b1);
    uint32_t ret;
    vst1_lane_u32(&ret, vreinterpret_u32_u8(avg), 0);
    return ret;
}
Пример #7
0
static WEBP_INLINE uint32_t ClampedAddSubtractHalf(const uint32_t* const c0,
                                                   const uint32_t* const c1,
                                                   const uint32_t* const c2) {
  const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0));
  const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1));
  const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2));
  const uint8x8_t avg = vhadd_u8(p0, p1);                  // Average(c0,c1)
  const uint8x8_t ab = vshr_n_u8(vqsub_u8(avg, p2), 1);    // (a-b)>>1 saturated
  const uint8x8_t ba = vshr_n_u8(vqsub_u8(p2, avg), 1);    // (b-a)>>1 saturated
  const uint8x8_t out = vqsub_u8(vqadd_u8(avg, ab), ba);
  return vget_lane_u32(vreinterpret_u32_u8(out), 0);
}
Пример #8
0
void
png_read_filter_row_avg4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   png_debug(1, "in png_read_filter_row_avg4_neon");

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint32x2x4_t vtmp;
      uint8x8x4_t *vrpt, *vppt;
      uint8x8x4_t vrp, vpp;
      uint32x2x4_t *temp_pointer;

      vtmp = vld4_u32(png_ptr(uint32_t,rp));
      vrpt = png_ptr(uint8x8x4_t,&vtmp);
      vrp = *vrpt;
      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
      vppt = png_ptr(uint8x8x4_t,&vtmp);
      vpp = *vppt;

      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
      vdest.val[1] = vhadd_u8(vdest.val[0], vpp.val[1]);
      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
      vdest.val[2] = vhadd_u8(vdest.val[1], vpp.val[2]);
      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
      vdest.val[3] = vhadd_u8(vdest.val[2], vpp.val[3]);
      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);

      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }
}
Пример #9
0
static WEBP_INLINE uint32_t ClampedAddSubtractHalf(const uint32_t* const c0,
        const uint32_t* const c1,
        const uint32_t* const c2) {
    const uint64x1_t C0 = { *c0, 0 }, C1 = { *c1, 0 }, C2 = { *c2, 0 };
    const uint8x8_t p0 = vreinterpret_u8_u64(C0);
    const uint8x8_t p1 = vreinterpret_u8_u64(C1);
    const uint8x8_t p2 = vreinterpret_u8_u64(C2);
    const uint8x8_t avg = vhadd_u8(p0, p1);                  // Average(c0,c1)
    const uint8x8_t ab = vshr_n_u8(vqsub_u8(avg, p2), 1);    // (a-b)>>1 saturated
    const uint8x8_t ba = vshr_n_u8(vqsub_u8(p2, avg), 1);    // (b-a)>>1 saturated
    const uint8x8_t out = vqsub_u8(vqadd_u8(avg, ab), ba);
    uint32_t ret;
    vst1_lane_u32(&ret, vreinterpret_u32_u8(out), 0);
    return ret;
}
Пример #10
0
void vpx_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 };
  static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 };
  const uint8x8_t sh_12345677 = vld1_u8(shuffle1);
  const uint8x8_t sh_23456777 = vld1_u8(shuffle2);
  const uint8x8_t A0 = vld1_u8(above);  // top row
  const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677);
  const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777);
  const uint8x8_t avg1 = vhadd_u8(A0, A2);
  uint8x8_t row = vrhadd_u8(avg1, A1);
  int i;
  (void)left;
  for (i = 0; i < 7; ++i) {
    vst1_u8(dst + i * stride, row);
    row = vtbl1_u8(row, sh_12345677);
  }
  vst1_u8(dst + i * stride, row);
}
Пример #11
0
void vpx_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above));  // top row
  const uint64x1_t A1 = vshr_n_u64(A0, 8);
  const uint64x1_t A2 = vshr_n_u64(A0, 16);
  const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
  const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
  const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
  const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00);
  const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
  const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
  const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
  const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
  const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
  const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
  (void)left;
  vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0);
  dst[3 * stride + 3] = above[7];
}
Пример #12
0
void
png_read_filter_row_avg3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp;
   uint8x8x2_t *vrpt;
   uint8x8x2_t vrp;
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   vtmp = vld1q_u8(rp);
   vrpt = png_ptr(uint8x8x2_t,&vtmp);
   vrp = *vrpt;

   png_debug(1, "in png_read_filter_row_avg3_neon");

   for (; rp < rp_stop; pp += 12)
   {
      uint8x8_t vtmp1, vtmp2, vtmp3;

      uint8x8x2_t *vppt;
      uint8x8x2_t vpp;

      uint32x2_t *temp_pointer;

      vtmp = vld1q_u8(pp);
      vppt = png_ptr(uint8x8x2_t,&vtmp);
      vpp = *vppt;

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vdest.val[0] = vhadd_u8(vdest.val[3], vpp.val[0]);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);

      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
      vtmp3 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vdest.val[1] = vhadd_u8(vdest.val[0], vtmp2);
      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);

      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 6);
      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);

      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t,&vtmp);
      vrp = *vrpt;

      vdest.val[2] = vhadd_u8(vdest.val[1], vtmp2);
      vdest.val[2] = vadd_u8(vdest.val[2], vtmp3);

      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);

      vdest.val[3] = vhadd_u8(vdest.val[2], vtmp2);
      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);

      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }
}