// Process a block which is a mutiple of 16 wide and any height.
static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
                                       uint8_t *output_ptr,
                                       unsigned int src_pixels_per_line,
                                       int pixel_step,
                                       unsigned int output_height,
                                       unsigned int output_width,
                                       const uint8_t *filter) {
  const uint8x8_t f0 = vdup_n_u8(filter[0]);
  const uint8x8_t f1 = vdup_n_u8(filter[1]);
  unsigned int i, j;
  for (i = 0; i < output_height; ++i) {
    for (j = 0; j < output_width; j += 16) {
      const uint8x16_t src_0 = vld1q_u8(&src_ptr[j]);
      const uint8x16_t src_1 = vld1q_u8(&src_ptr[j + pixel_step]);
      const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
      const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
      const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
      const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
      const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
      const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
      vst1q_u8(output_ptr + j, vcombine_u8(out_lo, out_hi));
    }
    src_ptr += src_pixels_per_line;
    output_ptr += output_width;
  }
}
Example #2
0
int main(void)
{
    uint8_t v1_init[8] = {1, 1, 1, 1, 1, 1, 1, 1};
    uint8_t v2_init[8] = {2, 2, 2, 2, 2, 2, 2, 2};
    uint8x8_t v1 = vld1_u8 (v1_init);
    uint8x8_t v2 = vld1_u8 (v2_init);
    uint8x8x2_t vd1, vd2;
    union {uint8x8_t v; uint8_t buf[8];} d1, d2, d3, d4;
    int i;
    uint8_t odd, even;

    vd1 = vzip_u8(v1, vdup_n_u8(0));
    vd2 = vzip_u8(v2, vdup_n_u8(0));

    vst1_u8(d1.buf, vd1.val[0]);
    vst1_u8(d2.buf, vd1.val[1]);
    vst1_u8(d3.buf, vd2.val[0]);
    vst1_u8(d4.buf, vd2.val[1]);

#ifdef __ARMEL__
    odd = 1;
    even = 0;
#else
    odd = 0;
    even = 1;
#endif

    for (i = 0; i < 8; i++)
      if ((i % 2 == even && d4.buf[i] != 2)
          || (i % 2 == odd && d4.buf[i] != 0))
         abort ();

    return 0;
}
void SkBlitLCD16Row_neon(SkPMColor dst[], const uint16_t src[],
                                   SkColor color, int width, SkPMColor) {
    int colA = SkColorGetA(color);
    int colR = SkColorGetR(color);
    int colG = SkColorGetG(color);
    int colB = SkColorGetB(color);

    colA = SkAlpha255To256(colA);

    uint8x8_t vcolR, vcolG, vcolB;
    uint16x8_t vcolA;

    if (width >= 8) {
        vcolA = vdupq_n_u16(colA);
        vcolR = vdup_n_u8(colR);
        vcolG = vdup_n_u8(colG);
        vcolB = vdup_n_u8(colB);
    }

    while (width >= 8) {
        uint8x8x4_t vdst;
        uint16x8_t vmask;
        uint16x8_t vmaskR, vmaskG, vmaskB;

        vdst = vld4_u8((uint8_t*)dst);
        vmask = vld1q_u16(src);

        // Get all the color masks on 5 bits
        vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
        vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                             SK_B16_BITS + SK_R16_BITS + 1);
        vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);

        // Upscale to 0..32
        vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
        vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
        vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);

        vmaskR = vshrq_n_u16(vmaskR * vcolA, 8);
        vmaskG = vshrq_n_u16(vmaskG * vcolA, 8);
        vmaskB = vshrq_n_u16(vmaskB * vcolA, 8);

        vdst.val[NEON_A] = vdup_n_u8(0xFF);
        vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
        vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
        vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);

        vst4_u8((uint8_t*)dst, vdst);

        dst += 8;
        src += 8;
        width -= 8;
    }

    for (int i = 0; i < width; i++) {
        dst[i] = SkBlendLCD16(colA, colR, colG, colB, dst[i], src[i]);
    }
}
void
ComputesRGBLuminanceMask_NEON(const uint8_t *aSourceData,
                              int32_t aSourceStride,
                              uint8_t *aDestData,
                              int32_t aDestStride,
                              const IntSize &aSize,
                              float aOpacity)
{
  int32_t redFactor = 55 * aOpacity; // 255 * 0.2125 * opacity
  int32_t greenFactor = 183 * aOpacity; // 255 * 0.7154 * opacity
  int32_t blueFactor = 18 * aOpacity; // 255 * 0.0721
  const uint8_t *sourcePixel = aSourceData;
  int32_t sourceOffset = aSourceStride - 4 * aSize.width;
  uint8_t *destPixel = aDestData;
  int32_t destOffset = aDestStride - aSize.width;

  sourcePixel = aSourceData;
  int32_t remainderWidth = aSize.width % 8;
  int32_t roundedWidth = aSize.width - remainderWidth;
  uint16x8_t temp;
  uint8x8_t gray;
  uint8x8_t redVector = vdup_n_u8(redFactor);
  uint8x8_t greenVector = vdup_n_u8(greenFactor);
  uint8x8_t blueVector = vdup_n_u8(blueFactor);
  uint8x8_t fullBitVector = vdup_n_u8(255);
  uint8x8_t oneVector = vdup_n_u8(1);
  for (int32_t y = 0; y < aSize.height; y++) {
    // Calculate luminance by neon with 8 pixels per loop
    for (int32_t x = 0; x < roundedWidth; x += 8) {
      uint8x8x4_t argb  = vld4_u8(sourcePixel);
      temp = vmull_u8(argb.val[GFX_ARGB32_OFFSET_R], redVector); // temp = red * redFactor
      temp = vmlal_u8(temp, argb.val[GFX_ARGB32_OFFSET_G], greenVector); // temp += green * greenFactor
      temp = vmlal_u8(temp, argb.val[GFX_ARGB32_OFFSET_B], blueVector); // temp += blue * blueFactor
      gray = vshrn_n_u16(temp, 8); // gray = temp >> 8

      // Check alpha value
      uint8x8_t alphaVector = vtst_u8(argb.val[GFX_ARGB32_OFFSET_A], fullBitVector);
      gray = vmul_u8(gray, vand_u8(alphaVector, oneVector));

      // Put the result to the 8 pixels
      vst1_u8(destPixel, gray);
      sourcePixel += 8 * 4;
      destPixel += 8;
    }

    // Calculate the rest pixels of the line by cpu
    for (int32_t x = 0; x < remainderWidth; x++) {
      if (sourcePixel[GFX_ARGB32_OFFSET_A] > 0) {
        *destPixel = (redFactor * sourcePixel[GFX_ARGB32_OFFSET_R]+
                      greenFactor * sourcePixel[GFX_ARGB32_OFFSET_G] +
                      blueFactor * sourcePixel[GFX_ARGB32_OFFSET_B]) >> 8;
      } else {
        *destPixel = 0;
      }
      sourcePixel += 4;
      destPixel++;
    }
void
png_read_filter_row_sub4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   png_debug(1, "in png_read_filter_row_sub4_neon");

   for (; rp < rp_stop; rp += 16)
   {
      uint32x2x4_t vtmp = vld4_u32(png_ptr(uint32_t,rp));
      uint8x8x4_t *vrpt = png_ptr(uint8x8x4_t,&vtmp);
      uint8x8x4_t vrp = *vrpt;
      uint32x2x4_t *temp_pointer;

      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[0], vrp.val[1]);
      vdest.val[2] = vadd_u8(vdest.val[1], vrp.val[2]);
      vdest.val[3] = vadd_u8(vdest.val[2], vrp.val[3]);
      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }

   PNG_UNUSED(prev_row)
}
Example #6
0
void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint64x1_t d1u64 = vdup_n_u64(0);
  (void)above;

  d1u64 = vld1_u64((const uint64_t *)left);

  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
  vst1_u8(dst, d0u8);
}
void test_vdup_nu8 (void)
{
  uint8x8_t out_uint8x8_t;
  uint8_t arg0_uint8_t;

  out_uint8x8_t = vdup_n_u8 (arg0_uint8_t);
}
/**
 * @brief Grayscale 변환을 수행하는 thread 함수. NEON 명령어를 사용한다.
 *
 * @param arg 계산해야할 정보가 담겨있는 _thread_data 형식의 구조체
 *
 * @return NULL
 */
static void *thread_calc(void *arg)
{
    struct _thread_data *data = (struct _thread_data *)arg;

    uint8x8_t rfac = vdup_n_u8 (76);
    uint8x8_t gfac = vdup_n_u8 (151);
    uint8x8_t bfac = vdup_n_u8 (29);
    int n = data->size / 8;
    int m = data->size % 8;

    int iTemp;
    unsigned char szTemp[8];

    unsigned int *data_in  = data->data_in;
    unsigned int *data_out = data->data_out;

    // 한 루프당 8픽셀씩 변환 (32 bytes)
    while (n--) {
        uint16x8_t  temp;
        uint8x8x4_t rgb  = vld4_u8 ((unsigned char *)data_in);
        uint8x8_t result;

        temp = vmull_u8(rgb.val[0],       bfac);
        temp = vmlal_u8(temp, rgb.val[1], gfac);
        temp = vmlal_u8(temp, rgb.val[2], rfac);

        result = vshrn_n_u16 (temp, 8);
        vst1_u8(szTemp, result);

        iTemp = szTemp[0]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++;
        iTemp = szTemp[1]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++;
        iTemp = szTemp[2]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++;
        iTemp = szTemp[3]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++;
        iTemp = szTemp[4]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++;
        iTemp = szTemp[5]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++;
        iTemp = szTemp[6]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++;
        iTemp = szTemp[7]; *data_out = iTemp | (iTemp << 8) | (iTemp << 16) | (*data_in & 0xff000000); data_in++; data_out++;
    }

    if (m) {
        argb8888_to_gray(data_in, data_out, m);
    }

    return NULL;
}
Example #9
0
void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  int i;
  uint8x8_t d0u8 = vdup_n_u8(0);
  (void)left;

  d0u8 = vld1_u8(above);
  for (i = 0; i < 8; i++, dst += stride) vst1_u8(dst, d0u8);
}
void
png_read_filter_row_paeth4_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;
   png_const_bytep pp = prev_row;

   uint8x8_t vlast = vdup_n_u8(0);
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   png_debug(1, "in png_read_filter_row_paeth4_neon");

   for (; rp < rp_stop; rp += 16, pp += 16)
   {
      uint32x2x4_t vtmp;
      uint8x8x4_t *vrpt, *vppt;
      uint8x8x4_t vrp, vpp;
      uint32x2x4_t *temp_pointer;

      vtmp = vld4_u32(png_ptr(uint32_t,rp));
      vrpt = png_ptr(uint8x8x4_t,&vtmp);
      vrp = *vrpt;
      vtmp = vld4_u32(png_ptrc(uint32_t,pp));
      vppt = png_ptr(uint8x8x4_t,&vtmp);
      vpp = *vppt;

      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);
      vdest.val[1] = paeth(vdest.val[0], vpp.val[1], vpp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[1], vrp.val[1]);
      vdest.val[2] = paeth(vdest.val[1], vpp.val[2], vpp.val[1]);
      vdest.val[2] = vadd_u8(vdest.val[2], vrp.val[2]);
      vdest.val[3] = paeth(vdest.val[2], vpp.val[3], vpp.val[2]);
      vdest.val[3] = vadd_u8(vdest.val[3], vrp.val[3]);

      vlast = vpp.val[3];

      vst4_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2x4_t,&vdest), 0);
   }
}
Example #11
0
// Process a block exactly 8 wide and any height.
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                      uint8_t *output_ptr,
                                      unsigned int src_pixels_per_line,
                                      int pixel_step,
                                      unsigned int output_height,
                                      const uint8_t *filter) {
  const uint8x8_t f0 = vdup_n_u8(filter[0]);
  const uint8x8_t f1 = vdup_n_u8(filter[1]);
  unsigned int i;
  for (i = 0; i < output_height; ++i) {
    const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
    const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
    const uint16x8_t a = vmull_u8(src_0, f0);
    const uint16x8_t b = vmlal_u8(a, src_1, f1);
    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
    vst1_u8(output_ptr, out);
    src_ptr += src_pixels_per_line;
    output_ptr += 8;
  }
}
Example #12
0
// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x16_t A0 = vld1q_u8(above);  // top row
    const uint8x16_t A1 = vld1q_u8(above + 16);
    const uint16x8_t p0 = vpaddlq_u8(A0);  // cascading summation of the top
    const uint16x8_t p1 = vpaddlq_u8(A1);
    const uint16x8_t p2 = vaddq_u16(p0, p1);
    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
    const uint16x4_t p4 = vpadd_u16(p3, p3);
    const uint16x4_t p5 = vpadd_u16(p4, p4);
    sum_top = vcombine_u16(p5, p5);
  }

  if (do_left) {
    const uint8x16_t L0 = vld1q_u8(left);  // left row
    const uint8x16_t L1 = vld1q_u8(left + 16);
    const uint16x8_t p0 = vpaddlq_u8(L0);  // cascading summation of the left
    const uint16x8_t p1 = vpaddlq_u8(L1);
    const uint16x8_t p2 = vaddq_u16(p0, p1);
    const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2));
    const uint16x4_t p4 = vpadd_u16(p3, p3);
    const uint16x4_t p5 = vpadd_u16(p4, p4);
    sum_left = vcombine_u16(p5, p5);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 6);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 5);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 5);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 32; ++i) {
      vst1q_u8(dst + i * stride, dc);
      vst1q_u8(dst + i * stride + 16, dc);
    }
  }
}
Example #13
0
void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  int j, k;
  uint8x8_t d2u8 = vdup_n_u8(0);
  uint8x16_t q0u8 = vdupq_n_u8(0);
  uint8x16_t q1u8 = vdupq_n_u8(0);
  (void)above;

  for (k = 0; k < 2; k++, left += 16) {
    q1u8 = vld1q_u8(left);
    d2u8 = vget_low_u8(q1u8);
    for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
      q0u8 = vdupq_lane_u8(d2u8, 0);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 1);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 2);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 3);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 4);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 5);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 6);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
      q0u8 = vdupq_lane_u8(d2u8, 7);
      vst1q_u8(dst, q0u8);
      vst1q_u8(dst + 16, q0u8);
      dst += stride;
    }
  }
}
void
png_read_filter_row_sub3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp = vld1q_u8(rp);
   uint8x8x2_t *vrpt = png_ptr(uint8x8x2_t, &vtmp);
   uint8x8x2_t vrp = *vrpt;

   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   png_debug(1, "in png_read_filter_row_sub3_neon");

   for (; rp < rp_stop;)
   {
      uint8x8_t vtmp1, vtmp2;
      uint32x2_t *temp_pointer;

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vdest.val[0] = vadd_u8(vdest.val[3], vrp.val[0]);
      vtmp2 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vdest.val[1] = vadd_u8(vdest.val[0], vtmp1);

      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
      vdest.val[2] = vadd_u8(vdest.val[1], vtmp2);
      vdest.val[3] = vadd_u8(vdest.val[2], vtmp1);

      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t, &vtmp);
      vrp = *vrpt;

      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }

   PNG_UNUSED(prev_row)
}
Example #15
0
// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *above, const uint8_t *left,
                            int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x16_t A = vld1q_u8(above);  // top row
    const uint16x8_t p0 = vpaddlq_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_top = vcombine_u16(p3, p3);
  }

  if (do_left) {
    const uint8x16_t L = vld1q_u8(left);  // left row
    const uint16x8_t p0 = vpaddlq_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    const uint16x4_t p3 = vpadd_u16(p2, p2);
    sum_left = vcombine_u16(p3, p3);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 5);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 4);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 4);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 16; ++i) {
      vst1q_u8(dst + i * stride, dc);
    }
  }
}
Example #16
0
void vp9_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  int i;
  uint16x8_t q1u16, q3u16;
  int16x8_t q1s16;
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint32x2_t d2u32 = vdup_n_u32(0);

  d0u8 = vld1_dup_u8(above - 1);
  d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
  q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
  for (i = 0; i < 4; i++, dst += stride) {
    q1u16 = vdupq_n_u16((uint16_t)left[i]);
    q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
                      vreinterpretq_s16_u16(q3u16));
    d0u8 = vqmovun_s16(q1s16);
    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  }
}
Example #17
0
// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
                          const uint8_t *above, const uint8_t *left,
                          int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x8_t A = vld1_u8(above);  // top row
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_top = vcombine_u16(p2, p2);
  }

  if (do_left) {
    const uint8x8_t L = vld1_u8(left);  // left border
    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_left = vcombine_u16(p2, p2);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 4);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 3);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 3);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 8; ++i) {
      vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc));
    }
  }
}
Example #18
0
void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint32x2_t d1u32 = vdup_n_u32(0);
  (void)above;

  d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);

  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
}
static INLINE uint8x8x4_t read_4x8(unsigned char *src, int pitch) {
  uint8x8x4_t x;
  x.val[0] = x.val[1] = x.val[2] = x.val[3] = vdup_n_u8(0);
  x = vld4_lane_u8(src, x, 0);
  src += pitch;
  x = vld4_lane_u8(src, x, 1);
  src += pitch;
  x = vld4_lane_u8(src, x, 2);
  src += pitch;
  x = vld4_lane_u8(src, x, 3);
  src += pitch;
  x = vld4_lane_u8(src, x, 4);
  src += pitch;
  x = vld4_lane_u8(src, x, 5);
  src += pitch;
  x = vld4_lane_u8(src, x, 6);
  src += pitch;
  x = vld4_lane_u8(src, x, 7);
  return x;
}
unsigned int vp8_sub_pixel_variance16x16_neon_func(
        const unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        const unsigned char *dst_ptr,
        int dst_pixels_per_line,
        unsigned int *sse) {
    int i;
    DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528);
    unsigned char *tmpp;
    unsigned char *tmpp2;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
    uint8x8_t d19u8, d20u8, d21u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64, d2s64, d3s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
    uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
    uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
    uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    tmpp2 = tmp + 272;
    tmpp = tmp;
    if (xoffset == 0) {  // secondpass_bfilter16x16_only
        d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);

        q11u8 = vld1q_u8(src_ptr);
        src_ptr += src_pixels_per_line;
        for (i = 4; i > 0; i--) {
            q12u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;
            q13u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;
            q14u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;
            q15u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;

            __builtin_prefetch(src_ptr);
            __builtin_prefetch(src_ptr + src_pixels_per_line);
            __builtin_prefetch(src_ptr + src_pixels_per_line * 2);

            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);

            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);

            d2u8 = vqrshrn_n_u16(q1u16, 7);
            d3u8 = vqrshrn_n_u16(q2u16, 7);
            d4u8 = vqrshrn_n_u16(q3u16, 7);
            d5u8 = vqrshrn_n_u16(q4u16, 7);
            d6u8 = vqrshrn_n_u16(q5u16, 7);
            d7u8 = vqrshrn_n_u16(q6u16, 7);
            d8u8 = vqrshrn_n_u16(q7u16, 7);
            d9u8 = vqrshrn_n_u16(q8u16, 7);

            q1u8 = vcombine_u8(d2u8, d3u8);
            q2u8 = vcombine_u8(d4u8, d5u8);
            q3u8 = vcombine_u8(d6u8, d7u8);
            q4u8 = vcombine_u8(d8u8, d9u8);

            q11u8 = q15u8;

            vst1q_u8((uint8_t *)tmpp2, q1u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q2u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q3u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q4u8);
            tmpp2 += 16;
        }
    } else if (yoffset == 0) {  // firstpass_bfilter16x16_only
        d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);

        for (i = 4; i > 0 ; i--) {
            d2u8 = vld1_u8(src_ptr);
            d3u8 = vld1_u8(src_ptr + 8);
            d4u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d5u8 = vld1_u8(src_ptr);
            d6u8 = vld1_u8(src_ptr + 8);
            d7u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d8u8 = vld1_u8(src_ptr);
            d9u8 = vld1_u8(src_ptr + 8);
            d10u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d11u8 = vld1_u8(src_ptr);
            d12u8 = vld1_u8(src_ptr + 8);
            d13u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;

            __builtin_prefetch(src_ptr);
            __builtin_prefetch(src_ptr + src_pixels_per_line);
            __builtin_prefetch(src_ptr + src_pixels_per_line * 2);

            q7u16  = vmull_u8(d2u8, d0u8);
            q8u16  = vmull_u8(d3u8, d0u8);
            q9u16  = vmull_u8(d5u8, d0u8);
            q10u16 = vmull_u8(d6u8, d0u8);
            q11u16 = vmull_u8(d8u8, d0u8);
            q12u16 = vmull_u8(d9u8, d0u8);
            q13u16 = vmull_u8(d11u8, d0u8);
            q14u16 = vmull_u8(d12u8, d0u8);

            d2u8  = vext_u8(d2u8, d3u8, 1);
            d5u8  = vext_u8(d5u8, d6u8, 1);
            d8u8  = vext_u8(d8u8, d9u8, 1);
            d11u8 = vext_u8(d11u8, d12u8, 1);

            q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
            q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);

            d3u8  = vext_u8(d3u8, d4u8, 1);
            d6u8  = vext_u8(d6u8, d7u8, 1);
            d9u8  = vext_u8(d9u8, d10u8, 1);
            d12u8 = vext_u8(d12u8, d13u8, 1);

            q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);

            d14u8 = vqrshrn_n_u16(q7u16, 7);
            d15u8 = vqrshrn_n_u16(q8u16, 7);
            d16u8 = vqrshrn_n_u16(q9u16, 7);
            d17u8 = vqrshrn_n_u16(q10u16, 7);
            d18u8 = vqrshrn_n_u16(q11u16, 7);
            d19u8 = vqrshrn_n_u16(q12u16, 7);
            d20u8 = vqrshrn_n_u16(q13u16, 7);
            d21u8 = vqrshrn_n_u16(q14u16, 7);

            q7u8  = vcombine_u8(d14u8, d15u8);
            q8u8  = vcombine_u8(d16u8, d17u8);
            q9u8  = vcombine_u8(d18u8, d19u8);
            q10u8 = vcombine_u8(d20u8, d21u8);

            vst1q_u8((uint8_t *)tmpp2, q7u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q8u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q9u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q10u8);
            tmpp2 += 16;
        }
    } else {
        d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);

        d2u8 = vld1_u8(src_ptr);
        d3u8 = vld1_u8(src_ptr + 8);
        d4u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;
        d5u8 = vld1_u8(src_ptr);
        d6u8 = vld1_u8(src_ptr + 8);
        d7u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;
        d8u8 = vld1_u8(src_ptr);
        d9u8 = vld1_u8(src_ptr + 8);
        d10u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;
        d11u8 = vld1_u8(src_ptr);
        d12u8 = vld1_u8(src_ptr + 8);
        d13u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;

        // First Pass: output_height lines x output_width columns (17x16)
        for (i = 3; i > 0; i--) {
            q7u16  = vmull_u8(d2u8, d0u8);
            q8u16  = vmull_u8(d3u8, d0u8);
            q9u16  = vmull_u8(d5u8, d0u8);
            q10u16 = vmull_u8(d6u8, d0u8);
            q11u16 = vmull_u8(d8u8, d0u8);
            q12u16 = vmull_u8(d9u8, d0u8);
            q13u16 = vmull_u8(d11u8, d0u8);
            q14u16 = vmull_u8(d12u8, d0u8);

            d2u8  = vext_u8(d2u8, d3u8, 1);
            d5u8  = vext_u8(d5u8, d6u8, 1);
            d8u8  = vext_u8(d8u8, d9u8, 1);
            d11u8 = vext_u8(d11u8, d12u8, 1);

            q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
            q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);

            d3u8  = vext_u8(d3u8, d4u8, 1);
            d6u8  = vext_u8(d6u8, d7u8, 1);
            d9u8  = vext_u8(d9u8, d10u8, 1);
            d12u8 = vext_u8(d12u8, d13u8, 1);

            q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);

            d14u8 = vqrshrn_n_u16(q7u16, 7);
            d15u8 = vqrshrn_n_u16(q8u16, 7);
            d16u8 = vqrshrn_n_u16(q9u16, 7);
            d17u8 = vqrshrn_n_u16(q10u16, 7);
            d18u8 = vqrshrn_n_u16(q11u16, 7);
            d19u8 = vqrshrn_n_u16(q12u16, 7);
            d20u8 = vqrshrn_n_u16(q13u16, 7);
            d21u8 = vqrshrn_n_u16(q14u16, 7);

            d2u8 = vld1_u8(src_ptr);
            d3u8 = vld1_u8(src_ptr + 8);
            d4u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d5u8 = vld1_u8(src_ptr);
            d6u8 = vld1_u8(src_ptr + 8);
            d7u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d8u8 = vld1_u8(src_ptr);
            d9u8 = vld1_u8(src_ptr + 8);
            d10u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d11u8 = vld1_u8(src_ptr);
            d12u8 = vld1_u8(src_ptr + 8);
            d13u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;

            q7u8 = vcombine_u8(d14u8, d15u8);
            q8u8 = vcombine_u8(d16u8, d17u8);
            q9u8 = vcombine_u8(d18u8, d19u8);
            q10u8 = vcombine_u8(d20u8, d21u8);

            vst1q_u8((uint8_t *)tmpp, q7u8);
            tmpp += 16;
            vst1q_u8((uint8_t *)tmpp, q8u8);
            tmpp += 16;
            vst1q_u8((uint8_t *)tmpp, q9u8);
            tmpp += 16;
            vst1q_u8((uint8_t *)tmpp, q10u8);
            tmpp += 16;
        }

        // First-pass filtering for rest 5 lines
        d14u8 = vld1_u8(src_ptr);
        d15u8 = vld1_u8(src_ptr + 8);
        d16u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;

        q9u16  = vmull_u8(d2u8, d0u8);
        q10u16 = vmull_u8(d3u8, d0u8);
        q11u16 = vmull_u8(d5u8, d0u8);
        q12u16 = vmull_u8(d6u8, d0u8);
        q13u16 = vmull_u8(d8u8, d0u8);
        q14u16 = vmull_u8(d9u8, d0u8);

        d2u8  = vext_u8(d2u8, d3u8, 1);
        d5u8  = vext_u8(d5u8, d6u8, 1);
        d8u8  = vext_u8(d8u8, d9u8, 1);

        q9u16  = vmlal_u8(q9u16, d2u8, d1u8);
        q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
        q13u16 = vmlal_u8(q13u16, d8u8, d1u8);

        d3u8  = vext_u8(d3u8, d4u8, 1);
        d6u8  = vext_u8(d6u8, d7u8, 1);
        d9u8  = vext_u8(d9u8, d10u8, 1);

        q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
        q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
        q14u16 = vmlal_u8(q14u16, d9u8, d1u8);

        q1u16 = vmull_u8(d11u8, d0u8);
        q2u16 = vmull_u8(d12u8, d0u8);
        q3u16 = vmull_u8(d14u8, d0u8);
        q4u16 = vmull_u8(d15u8, d0u8);

        d11u8 = vext_u8(d11u8, d12u8, 1);
        d14u8 = vext_u8(d14u8, d15u8, 1);

        q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
        q3u16 = vmlal_u8(q3u16, d14u8, d1u8);

        d12u8 = vext_u8(d12u8, d13u8, 1);
        d15u8 = vext_u8(d15u8, d16u8, 1);

        q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
        q4u16 = vmlal_u8(q4u16, d15u8, d1u8);

        d10u8 = vqrshrn_n_u16(q9u16, 7);
        d11u8 = vqrshrn_n_u16(q10u16, 7);
        d12u8 = vqrshrn_n_u16(q11u16, 7);
        d13u8 = vqrshrn_n_u16(q12u16, 7);
        d14u8 = vqrshrn_n_u16(q13u16, 7);
        d15u8 = vqrshrn_n_u16(q14u16, 7);
        d16u8 = vqrshrn_n_u16(q1u16, 7);
        d17u8 = vqrshrn_n_u16(q2u16, 7);
        d18u8 = vqrshrn_n_u16(q3u16, 7);
        d19u8 = vqrshrn_n_u16(q4u16, 7);

        q5u8 = vcombine_u8(d10u8, d11u8);
        q6u8 = vcombine_u8(d12u8, d13u8);
        q7u8 = vcombine_u8(d14u8, d15u8);
        q8u8 = vcombine_u8(d16u8, d17u8);
        q9u8 = vcombine_u8(d18u8, d19u8);

        vst1q_u8((uint8_t *)tmpp, q5u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q6u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q7u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q8u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q9u8);

        // secondpass_filter
        d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);

        tmpp = tmp;
        tmpp2 = tmpp + 272;
        q11u8 = vld1q_u8(tmpp);
        tmpp += 16;
        for (i = 4; i > 0; i--) {
            q12u8 = vld1q_u8(tmpp);
            tmpp += 16;
            q13u8 = vld1q_u8(tmpp);
            tmpp += 16;
            q14u8 = vld1q_u8(tmpp);
            tmpp += 16;
            q15u8 = vld1q_u8(tmpp);
            tmpp += 16;

            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);

            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);

            d2u8 = vqrshrn_n_u16(q1u16, 7);
            d3u8 = vqrshrn_n_u16(q2u16, 7);
            d4u8 = vqrshrn_n_u16(q3u16, 7);
            d5u8 = vqrshrn_n_u16(q4u16, 7);
            d6u8 = vqrshrn_n_u16(q5u16, 7);
            d7u8 = vqrshrn_n_u16(q6u16, 7);
            d8u8 = vqrshrn_n_u16(q7u16, 7);
            d9u8 = vqrshrn_n_u16(q8u16, 7);

            q1u8 = vcombine_u8(d2u8, d3u8);
            q2u8 = vcombine_u8(d4u8, d5u8);
            q3u8 = vcombine_u8(d6u8, d7u8);
            q4u8 = vcombine_u8(d8u8, d9u8);

            q11u8 = q15u8;

            vst1q_u8((uint8_t *)tmpp2, q1u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q2u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q3u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q4u8);
            tmpp2 += 16;
        }
    }

    // sub_pixel_variance16x16_neon
    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    tmpp = tmp + 272;
    for (i = 0; i < 8; i++) {  // sub_pixel_variance16x16_neon_loop
        q0u8 = vld1q_u8(tmpp);
        tmpp += 16;
        q1u8 = vld1q_u8(tmpp);
        tmpp += 16;
        q2u8 = vld1q_u8(dst_ptr);
        dst_ptr += dst_pixels_per_line;
        q3u8 = vld1q_u8(dst_ptr);
        dst_ptr += dst_pixels_per_line;

        d0u8 = vget_low_u8(q0u8);
        d1u8 = vget_high_u8(q0u8);
        d2u8 = vget_low_u8(q1u8);
        d3u8 = vget_high_u8(q1u8);

        q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8));
        q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8));
        q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8));
        q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8));

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vget_low_s64(q0s64);
    d1s64 = vget_high_s64(q0s64);
    d2s64 = vget_low_s64(q1s64);
    d3s64 = vget_high_s64(q1s64);
    d0s64 = vadd_s64(d0s64, d1s64);
    d1s64 = vadd_s64(d2s64, d3s64);

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}
void SkBlitLCD16OpaqueRow_neon(SkPMColor dst[], const uint16_t src[],
                                        SkColor color, int width,
                                        SkPMColor opaqueDst) {
    int colR = SkColorGetR(color);
    int colG = SkColorGetG(color);
    int colB = SkColorGetB(color);

    uint8x8_t vcolR, vcolG, vcolB;
    uint8x8_t vopqDstA, vopqDstR, vopqDstG, vopqDstB;

    if (width >= 8) {
        vcolR = vdup_n_u8(colR);
        vcolG = vdup_n_u8(colG);
        vcolB = vdup_n_u8(colB);
        vopqDstA = vdup_n_u8(SkGetPackedA32(opaqueDst));
        vopqDstR = vdup_n_u8(SkGetPackedR32(opaqueDst));
        vopqDstG = vdup_n_u8(SkGetPackedG32(opaqueDst));
        vopqDstB = vdup_n_u8(SkGetPackedB32(opaqueDst));
    }

    while (width >= 8) {
        uint8x8x4_t vdst;
        uint16x8_t vmask;
        uint16x8_t vmaskR, vmaskG, vmaskB;
        uint8x8_t vsel_trans, vsel_opq;

        vdst = vld4_u8((uint8_t*)dst);
        vmask = vld1q_u16(src);

        // Prepare compare masks
        vsel_trans = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0)));
        vsel_opq = vmovn_u16(vceqq_u16(vmask, vdupq_n_u16(0xFFFF)));

        // Get all the color masks on 5 bits
        vmaskR = vshrq_n_u16(vmask, SK_R16_SHIFT);
        vmaskG = vshrq_n_u16(vshlq_n_u16(vmask, SK_R16_BITS),
                             SK_B16_BITS + SK_R16_BITS + 1);
        vmaskB = vmask & vdupq_n_u16(SK_B16_MASK);

        // Upscale to 0..32
        vmaskR = vmaskR + vshrq_n_u16(vmaskR, 4);
        vmaskG = vmaskG + vshrq_n_u16(vmaskG, 4);
        vmaskB = vmaskB + vshrq_n_u16(vmaskB, 4);

        vdst.val[NEON_A] = vbsl_u8(vsel_trans, vdst.val[NEON_A], vdup_n_u8(0xFF));
        vdst.val[NEON_A] = vbsl_u8(vsel_opq, vopqDstA, vdst.val[NEON_A]);

        vdst.val[NEON_R] = SkBlend32_neon8(vcolR, vdst.val[NEON_R], vmaskR);
        vdst.val[NEON_G] = SkBlend32_neon8(vcolG, vdst.val[NEON_G], vmaskG);
        vdst.val[NEON_B] = SkBlend32_neon8(vcolB, vdst.val[NEON_B], vmaskB);

        vdst.val[NEON_R] = vbsl_u8(vsel_opq, vopqDstR, vdst.val[NEON_R]);
        vdst.val[NEON_G] = vbsl_u8(vsel_opq, vopqDstG, vdst.val[NEON_G]);
        vdst.val[NEON_B] = vbsl_u8(vsel_opq, vopqDstB, vdst.val[NEON_B]);

        vst4_u8((uint8_t*)dst, vdst);

        dst += 8;
        src += 8;
        width -= 8;
    }

    // Leftovers
    for (int i = 0; i < width; i++) {
        dst[i] = SkBlendLCD16Opaque(colR, colG, colB, dst[i], src[i],
                                    opaqueDst);
    }
}
Example #22
0
uint8x8_t test_vdup_n_u8(uint8_t v1) {
  // CHECK: test_vdup_n_u8
  return vdup_n_u8(v1);
  // CHECK: dup {{v[0-9]+}}.8b, {{w[0-9]+}}
}
static INLINE void mbloop_filter_neon(uint8x8_t dblimit,   // mblimit
                                      uint8x8_t dlimit,    // limit
                                      uint8x8_t dthresh,   // thresh
                                      uint8x8_t d3u8,      // p2
                                      uint8x8_t d4u8,      // p2
                                      uint8x8_t d5u8,      // p1
                                      uint8x8_t d6u8,      // p0
                                      uint8x8_t d7u8,      // q0
                                      uint8x8_t d16u8,     // q1
                                      uint8x8_t d17u8,     // q2
                                      uint8x8_t d18u8,     // q3
                                      uint8x8_t *d0ru8,    // p1
                                      uint8x8_t *d1ru8,    // p1
                                      uint8x8_t *d2ru8,    // p0
                                      uint8x8_t *d3ru8,    // q0
                                      uint8x8_t *d4ru8,    // q1
                                      uint8x8_t *d5ru8) {  // q1
  uint32_t flat;
  uint8x8_t d0u8, d1u8, d2u8, d19u8, d20u8, d21u8, d22u8, d23u8, d24u8;
  uint8x8_t d25u8, d26u8, d27u8, d28u8, d29u8, d30u8, d31u8;
  int16x8_t q15s16;
  uint16x8_t q10u16, q14u16;
  int8x8_t d21s8, d24s8, d25s8, d26s8, d28s8, d29s8, d30s8;

  d19u8 = vabd_u8(d3u8, d4u8);
  d20u8 = vabd_u8(d4u8, d5u8);
  d21u8 = vabd_u8(d5u8, d6u8);
  d22u8 = vabd_u8(d16u8, d7u8);
  d23u8 = vabd_u8(d17u8, d16u8);
  d24u8 = vabd_u8(d18u8, d17u8);

  d19u8 = vmax_u8(d19u8, d20u8);
  d20u8 = vmax_u8(d21u8, d22u8);

  d25u8 = vabd_u8(d6u8, d4u8);

  d23u8 = vmax_u8(d23u8, d24u8);

  d26u8 = vabd_u8(d7u8, d17u8);

  d19u8 = vmax_u8(d19u8, d20u8);

  d24u8 = vabd_u8(d6u8, d7u8);
  d27u8 = vabd_u8(d3u8, d6u8);
  d28u8 = vabd_u8(d18u8, d7u8);

  d19u8 = vmax_u8(d19u8, d23u8);

  d23u8 = vabd_u8(d5u8, d16u8);
  d24u8 = vqadd_u8(d24u8, d24u8);

  d19u8 = vcge_u8(dlimit, d19u8);

  d25u8 = vmax_u8(d25u8, d26u8);
  d26u8 = vmax_u8(d27u8, d28u8);

  d23u8 = vshr_n_u8(d23u8, 1);

  d25u8 = vmax_u8(d25u8, d26u8);

  d24u8 = vqadd_u8(d24u8, d23u8);

  d20u8 = vmax_u8(d20u8, d25u8);

  d23u8 = vdup_n_u8(1);
  d24u8 = vcge_u8(dblimit, d24u8);

  d21u8 = vcgt_u8(d21u8, dthresh);

  d20u8 = vcge_u8(d23u8, d20u8);

  d19u8 = vand_u8(d19u8, d24u8);

  d23u8 = vcgt_u8(d22u8, dthresh);

  d20u8 = vand_u8(d20u8, d19u8);

  d22u8 = vdup_n_u8(0x80);

  d23u8 = vorr_u8(d21u8, d23u8);

  q10u16 = vcombine_u16(vreinterpret_u16_u8(d20u8), vreinterpret_u16_u8(d21u8));

  d30u8 = vshrn_n_u16(q10u16, 4);
  flat = vget_lane_u32(vreinterpret_u32_u8(d30u8), 0);

  if (flat == 0xffffffff) {  // Check for all 1's, power_branch_only
    d27u8 = vdup_n_u8(3);
    d21u8 = vdup_n_u8(2);
    q14u16 = vaddl_u8(d6u8, d7u8);
    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
    q14u16 = vmlal_u8(q14u16, d4u8, d21u8);
    q14u16 = vaddw_u8(q14u16, d5u8);
    *d0ru8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d4u8);
    q14u16 = vaddw_u8(q14u16, d5u8);
    q14u16 = vaddw_u8(q14u16, d16u8);
    *d1ru8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d5u8);
    q14u16 = vaddw_u8(q14u16, d6u8);
    q14u16 = vaddw_u8(q14u16, d17u8);
    *d2ru8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d6u8);
    q14u16 = vaddw_u8(q14u16, d7u8);
    q14u16 = vaddw_u8(q14u16, d18u8);
    *d3ru8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d4u8);
    q14u16 = vsubw_u8(q14u16, d7u8);
    q14u16 = vaddw_u8(q14u16, d16u8);
    q14u16 = vaddw_u8(q14u16, d18u8);
    *d4ru8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d5u8);
    q14u16 = vsubw_u8(q14u16, d16u8);
    q14u16 = vaddw_u8(q14u16, d17u8);
    q14u16 = vaddw_u8(q14u16, d18u8);
    *d5ru8 = vqrshrn_n_u16(q14u16, 3);
  } else {
    d21u8 = veor_u8(d7u8, d22u8);
    d24u8 = veor_u8(d6u8, d22u8);
    d25u8 = veor_u8(d5u8, d22u8);
    d26u8 = veor_u8(d16u8, d22u8);

    d27u8 = vdup_n_u8(3);

    d28s8 = vsub_s8(vreinterpret_s8_u8(d21u8), vreinterpret_s8_u8(d24u8));
    d29s8 = vqsub_s8(vreinterpret_s8_u8(d25u8), vreinterpret_s8_u8(d26u8));

    q15s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d27u8));

    d29s8 = vand_s8(d29s8, vreinterpret_s8_u8(d23u8));

    q15s16 = vaddw_s8(q15s16, d29s8);

    d29u8 = vdup_n_u8(4);

    d28s8 = vqmovn_s16(q15s16);

    d28s8 = vand_s8(d28s8, vreinterpret_s8_u8(d19u8));

    d30s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d27u8));
    d29s8 = vqadd_s8(d28s8, vreinterpret_s8_u8(d29u8));
    d30s8 = vshr_n_s8(d30s8, 3);
    d29s8 = vshr_n_s8(d29s8, 3);

    d24s8 = vqadd_s8(vreinterpret_s8_u8(d24u8), d30s8);
    d21s8 = vqsub_s8(vreinterpret_s8_u8(d21u8), d29s8);

    d29s8 = vrshr_n_s8(d29s8, 1);
    d29s8 = vbic_s8(d29s8, vreinterpret_s8_u8(d23u8));

    d25s8 = vqadd_s8(vreinterpret_s8_u8(d25u8), d29s8);
    d26s8 = vqsub_s8(vreinterpret_s8_u8(d26u8), d29s8);

    if (flat == 0) {  // filter_branch_only
      *d0ru8 = d4u8;
      *d1ru8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
      *d2ru8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
      *d3ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
      *d4ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);
      *d5ru8 = d17u8;
      return;
    }

    d21u8 = veor_u8(vreinterpret_u8_s8(d21s8), d22u8);
    d24u8 = veor_u8(vreinterpret_u8_s8(d24s8), d22u8);
    d25u8 = veor_u8(vreinterpret_u8_s8(d25s8), d22u8);
    d26u8 = veor_u8(vreinterpret_u8_s8(d26s8), d22u8);

    d23u8 = vdup_n_u8(2);
    q14u16 = vaddl_u8(d6u8, d7u8);
    q14u16 = vmlal_u8(q14u16, d3u8, d27u8);
    q14u16 = vmlal_u8(q14u16, d4u8, d23u8);

    d0u8 = vbsl_u8(d20u8, dblimit, d4u8);

    q14u16 = vaddw_u8(q14u16, d5u8);

    d1u8 = vbsl_u8(d20u8, dlimit, d25u8);

    d30u8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d4u8);
    q14u16 = vaddw_u8(q14u16, d5u8);
    q14u16 = vaddw_u8(q14u16, d16u8);

    d2u8 = vbsl_u8(d20u8, dthresh, d24u8);

    d31u8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d5u8);
    q14u16 = vaddw_u8(q14u16, d6u8);
    q14u16 = vaddw_u8(q14u16, d17u8);

    *d0ru8 = vbsl_u8(d20u8, d30u8, d0u8);

    d23u8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d3u8);
    q14u16 = vsubw_u8(q14u16, d6u8);
    q14u16 = vaddw_u8(q14u16, d7u8);

    *d1ru8 = vbsl_u8(d20u8, d31u8, d1u8);

    q14u16 = vaddw_u8(q14u16, d18u8);

    *d2ru8 = vbsl_u8(d20u8, d23u8, d2u8);

    d22u8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d4u8);
    q14u16 = vsubw_u8(q14u16, d7u8);
    q14u16 = vaddw_u8(q14u16, d16u8);

    d3u8 = vbsl_u8(d20u8, d3u8, d21u8);

    q14u16 = vaddw_u8(q14u16, d18u8);

    d4u8 = vbsl_u8(d20u8, d4u8, d26u8);

    d6u8 = vqrshrn_n_u16(q14u16, 3);

    q14u16 = vsubw_u8(q14u16, d5u8);
    q14u16 = vsubw_u8(q14u16, d16u8);
    q14u16 = vaddw_u8(q14u16, d17u8);
    q14u16 = vaddw_u8(q14u16, d18u8);

    d5u8 = vbsl_u8(d20u8, d5u8, d17u8);

    d7u8 = vqrshrn_n_u16(q14u16, 3);

    *d3ru8 = vbsl_u8(d20u8, d22u8, d3u8);
    *d4ru8 = vbsl_u8(d20u8, d6u8, d4u8);
    *d5ru8 = vbsl_u8(d20u8, d7u8, d5u8);
  }
  return;
}
void
png_read_filter_row_paeth3_neon(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_bytep rp = row;
   png_const_bytep pp = prev_row;
   png_bytep rp_stop = row + row_info->rowbytes;

   uint8x16_t vtmp;
   uint8x8x2_t *vrpt;
   uint8x8x2_t vrp;
   uint8x8_t vlast = vdup_n_u8(0);
   uint8x8x4_t vdest;
   vdest.val[3] = vdup_n_u8(0);

   vtmp = vld1q_u8(rp);
   vrpt = png_ptr(uint8x8x2_t,&vtmp);
   vrp = *vrpt;

   png_debug(1, "in png_read_filter_row_paeth3_neon");

   for (; rp < rp_stop; pp += 12)
   {
      uint8x8x2_t *vppt;
      uint8x8x2_t vpp;
      uint8x8_t vtmp1, vtmp2, vtmp3;
      uint32x2_t *temp_pointer;

      vtmp = vld1q_u8(pp);
      vppt = png_ptr(uint8x8x2_t,&vtmp);
      vpp = *vppt;

      vdest.val[0] = paeth(vdest.val[3], vpp.val[0], vlast);
      vdest.val[0] = vadd_u8(vdest.val[0], vrp.val[0]);

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 3);
      vtmp2 = vext_u8(vpp.val[0], vpp.val[1], 3);
      vdest.val[1] = paeth(vdest.val[0], vtmp2, vpp.val[0]);
      vdest.val[1] = vadd_u8(vdest.val[1], vtmp1);

      vtmp1 = vext_u8(vrp.val[0], vrp.val[1], 6);
      vtmp3 = vext_u8(vpp.val[0], vpp.val[1], 6);
      vdest.val[2] = paeth(vdest.val[1], vtmp3, vtmp2);
      vdest.val[2] = vadd_u8(vdest.val[2], vtmp1);

      vtmp1 = vext_u8(vrp.val[1], vrp.val[1], 1);
      vtmp2 = vext_u8(vpp.val[1], vpp.val[1], 1);

      vtmp = vld1q_u8(rp + 12);
      vrpt = png_ptr(uint8x8x2_t,&vtmp);
      vrp = *vrpt;

      vdest.val[3] = paeth(vdest.val[2], vtmp2, vtmp3);
      vdest.val[3] = vadd_u8(vdest.val[3], vtmp1);

      vlast = vtmp2;

      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[0]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[1]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[2]), 0);
      rp += 3;
      vst1_lane_u32(png_ptr(uint32_t,rp), png_ldr(uint32x2_t,&vdest.val[3]), 0);
      rp += 3;
   }
}
Example #25
0
static INLINE void vp9_loop_filter_neon(
        uint8x8_t dblimit,    // flimit
        uint8x8_t dlimit,     // limit
        uint8x8_t dthresh,    // thresh
        uint8x8_t d3u8,       // p3
        uint8x8_t d4u8,       // p2
        uint8x8_t d5u8,       // p1
        uint8x8_t d6u8,       // p0
        uint8x8_t d7u8,       // q0
        uint8x8_t d16u8,      // q1
        uint8x8_t d17u8,      // q2
        uint8x8_t d18u8,      // q3
        uint8x8_t *d4ru8,     // p1
        uint8x8_t *d5ru8,     // p0
        uint8x8_t *d6ru8,     // q0
        uint8x8_t *d7ru8) {   // q1
    uint8x8_t d19u8, d20u8, d21u8, d22u8, d23u8, d27u8, d28u8;
    int16x8_t q12s16;
    int8x8_t d19s8, d20s8, d21s8, d26s8, d27s8, d28s8;

    d19u8 = vabd_u8(d3u8, d4u8);
    d20u8 = vabd_u8(d4u8, d5u8);
    d21u8 = vabd_u8(d5u8, d6u8);
    d22u8 = vabd_u8(d16u8, d7u8);
    d3u8  = vabd_u8(d17u8, d16u8);
    d4u8  = vabd_u8(d18u8, d17u8);

    d19u8 = vmax_u8(d19u8, d20u8);
    d20u8 = vmax_u8(d21u8, d22u8);
    d3u8  = vmax_u8(d3u8,  d4u8);
    d23u8 = vmax_u8(d19u8, d20u8);

    d17u8 = vabd_u8(d6u8, d7u8);

    d21u8 = vcgt_u8(d21u8, dthresh);
    d22u8 = vcgt_u8(d22u8, dthresh);
    d23u8 = vmax_u8(d23u8, d3u8);

    d28u8 = vabd_u8(d5u8, d16u8);
    d17u8 = vqadd_u8(d17u8, d17u8);

    d23u8 = vcge_u8(dlimit, d23u8);

    d18u8 = vdup_n_u8(0x80);
    d5u8  = veor_u8(d5u8,  d18u8);
    d6u8  = veor_u8(d6u8,  d18u8);
    d7u8  = veor_u8(d7u8,  d18u8);
    d16u8 = veor_u8(d16u8, d18u8);

    d28u8 = vshr_n_u8(d28u8, 1);
    d17u8 = vqadd_u8(d17u8, d28u8);

    d19u8 = vdup_n_u8(3);

    d28s8 = vsub_s8(vreinterpret_s8_u8(d7u8),
                    vreinterpret_s8_u8(d6u8));

    d17u8 = vcge_u8(dblimit, d17u8);

    d27s8 = vqsub_s8(vreinterpret_s8_u8(d5u8),
                     vreinterpret_s8_u8(d16u8));

    d22u8 = vorr_u8(d21u8, d22u8);

    q12s16 = vmull_s8(d28s8, vreinterpret_s8_u8(d19u8));

    d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d22u8);
    d23u8 = vand_u8(d23u8, d17u8);

    q12s16 = vaddw_s8(q12s16, vreinterpret_s8_u8(d27u8));

    d17u8 = vdup_n_u8(4);

    d27s8 = vqmovn_s16(q12s16);
    d27u8 = vand_u8(vreinterpret_u8_s8(d27s8), d23u8);
    d27s8 = vreinterpret_s8_u8(d27u8);

    d28s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d19u8));
    d27s8 = vqadd_s8(d27s8, vreinterpret_s8_u8(d17u8));
    d28s8 = vshr_n_s8(d28s8, 3);
    d27s8 = vshr_n_s8(d27s8, 3);

    d19s8 = vqadd_s8(vreinterpret_s8_u8(d6u8), d28s8);
    d26s8 = vqsub_s8(vreinterpret_s8_u8(d7u8), d27s8);

    d27s8 = vrshr_n_s8(d27s8, 1);
    d27s8 = vbic_s8(d27s8, vreinterpret_s8_u8(d22u8));

    d21s8 = vqadd_s8(vreinterpret_s8_u8(d5u8), d27s8);
    d20s8 = vqsub_s8(vreinterpret_s8_u8(d16u8), d27s8);

    *d4ru8 = veor_u8(vreinterpret_u8_s8(d21s8), d18u8);
    *d5ru8 = veor_u8(vreinterpret_u8_s8(d19s8), d18u8);
    *d6ru8 = veor_u8(vreinterpret_u8_s8(d26s8), d18u8);
    *d7ru8 = veor_u8(vreinterpret_u8_s8(d20s8), d18u8);
    return;
}
Example #26
0
bool decode_yuv_neon(unsigned char* out, unsigned char const* y, unsigned char const* uv, int width, int height, unsigned char fill_alpha=0xff)
{
    // pre-condition : width, height must be even
    if (0!=(width&1) || width<2 || 0!=(height&1) || height<2 || !out || !y || !uv)
        return false;

    // in & out pointers
    unsigned char* dst = out;

    // constants
    int const stride = width*trait::bytes_per_pixel;
    int const itHeight = height>>1;
    int const itWidth = width>>3;

    uint8x8_t const Yshift = vdup_n_u8(16);
    int16x8_t const half = vdupq_n_u16(128);
    int32x4_t const rounding = vdupq_n_s32(128);

    // tmp variable
    uint16x8_t t;

    // pixel block to temporary store 8 pixels
    typename trait::PixelBlock pblock = trait::init_pixelblock(fill_alpha);    

    for (int j=0; j<itHeight; ++j, y+=width, dst+=stride) {
        for (int i=0; i<itWidth; ++i, y+=8, uv+=8, dst+=(8*trait::bytes_per_pixel)) {
            t = vmovl_u8(vqsub_u8(vld1_u8(y), Yshift));
            int32x4_t const Y00 = vmulq_n_u32(vmovl_u16(vget_low_u16(t)), 298);
            int32x4_t const Y01 = vmulq_n_u32(vmovl_u16(vget_high_u16(t)), 298);

            t = vmovl_u8(vqsub_u8(vld1_u8(y+width), Yshift));
            int32x4_t const Y10 = vmulq_n_u32(vmovl_u16(vget_low_u16(t)), 298);
            int32x4_t const Y11 = vmulq_n_u32(vmovl_u16(vget_high_u16(t)), 298);

            // trait::loadvu pack 4 sets of uv into a uint8x8_t, layout : { v0,u0, v1,u1, v2,u2, v3,u3 }
            t = vsubq_s16((int16x8_t)vmovl_u8(trait::loadvu(uv)), half);

            // UV.val[0] : v0, v1, v2, v3
            // UV.val[1] : u0, u1, u2, u3
            int16x4x2_t const UV = vuzp_s16(vget_low_s16(t), vget_high_s16(t));

            // tR : 128+409V
            // tG : 128-100U-208V
            // tB : 128+516U
            int32x4_t const tR = vmlal_n_s16(rounding, UV.val[0], 409);
            int32x4_t const tG = vmlal_n_s16(vmlal_n_s16(rounding, UV.val[0], -208), UV.val[1], -100);
            int32x4_t const tB = vmlal_n_s16(rounding, UV.val[1], 516);

            int32x4x2_t const R = vzipq_s32(tR, tR); // [tR0, tR0, tR1, tR1] [ tR2, tR2, tR3, tR3]
            int32x4x2_t const G = vzipq_s32(tG, tG); // [tG0, tG0, tG1, tG1] [ tG2, tG2, tG3, tG3]
            int32x4x2_t const B = vzipq_s32(tB, tB); // [tB0, tB0, tB1, tB1] [ tB2, tB2, tB3, tB3]

            // upper 8 pixels
            trait::store_pixel_block(dst, pblock,
                    vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y00)), vqmovun_s32(vaddq_s32(R.val[1], Y01))), 8),
                    vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y00)), vqmovun_s32(vaddq_s32(G.val[1], Y01))), 8),
                    vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y00)), vqmovun_s32(vaddq_s32(B.val[1], Y01))), 8));

            // lower 8 pixels
            trait::store_pixel_block(dst+stride, pblock,
                    vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y10)), vqmovun_s32(vaddq_s32(R.val[1], Y11))), 8),
                    vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y10)), vqmovun_s32(vaddq_s32(G.val[1], Y11))), 8),
                    vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y10)), vqmovun_s32(vaddq_s32(B.val[1], Y11))), 8));
        }
    }
    return true;
}
Example #27
0
 static PixelBlock const init_pixelblock(unsigned char fill_alpha) {
     PixelBlock block;
     block.val[3] = vdup_n_u8(fill_alpha); // alpha channel in the last
     return block;
 }
Example #28
0
inline   uint8x8_t vdup_n(const u8  & val) { return  vdup_n_u8(val); }