C++ (Cpp) vdup_lane_u8示例

示例#1

0

显示文件

文件： vp9_reconintra_neon.c 项目： MekliCZ/positron

void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint64x1_t d1u64 = vdup_n_u64(0);
  (void)above;

  d1u64 = vld1_u64((const uint64_t *)left);

  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
  vst1_u8(dst, d0u8);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
  vst1_u8(dst, d0u8);
}

示例#2

0

显示文件

文件： vdup_laneu8.c 项目： 0day-ci/gcc

void test_vdup_laneu8 (void)
{
  uint8x8_t out_uint8x8_t;
  uint8x8_t arg0_uint8x8_t;

  out_uint8x8_t = vdup_lane_u8 (arg0_uint8x8_t, 1);
}

示例#3

0

显示文件

文件： vp9_reconintra_neon.c 项目： MekliCZ/positron

void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  uint8x8_t d0u8 = vdup_n_u8(0);
  uint32x2_t d1u32 = vdup_n_u32(0);
  (void)above;

  d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);

  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
  dst += stride;
  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
}

示例#4

0

显示文件

文件： vp9_reconintra_neon.c 项目： MekliCZ/positron

// 'do_above' and 'do_left' facilitate branch removal when inlined.
static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride,
                          const uint8_t *above, const uint8_t *left,
                          int do_above, int do_left) {
  uint16x8_t sum_top;
  uint16x8_t sum_left;
  uint8x8_t dc0;

  if (do_above) {
    const uint8x8_t A = vld1_u8(above);  // top row
    const uint16x4_t p0 = vpaddl_u8(A);  // cascading summation of the top
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_top = vcombine_u16(p2, p2);
  }

  if (do_left) {
    const uint8x8_t L = vld1_u8(left);  // left border
    const uint16x4_t p0 = vpaddl_u8(L);  // cascading summation of the left
    const uint16x4_t p1 = vpadd_u16(p0, p0);
    const uint16x4_t p2 = vpadd_u16(p1, p1);
    sum_left = vcombine_u16(p2, p2);
  }

  if (do_above && do_left) {
    const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
    dc0 = vrshrn_n_u16(sum, 4);
  } else if (do_above) {
    dc0 = vrshrn_n_u16(sum_top, 3);
  } else if (do_left) {
    dc0 = vrshrn_n_u16(sum_left, 3);
  } else {
    dc0 = vdup_n_u8(0x80);
  }

  {
    const uint8x8_t dc = vdup_lane_u8(dc0, 0);
    int i;
    for (i = 0; i < 8; ++i) {
      vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc));
    }
  }
}

示例#5

0

显示文件

文件： aarch64-neon-copy.c 项目： Xmister/clang-onex

uint8x8_t test_vdup_lane_u8(uint8x8_t v1) {
  // CHECK: test_vdup_lane_u8
  return vdup_lane_u8(v1, 5);
  // CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5]
}

示例#6

0

显示文件

文件： neon.c 项目： yaojingguo/gcc-intrinsics-samplecode

void  yuv422rgb_neon_int(const unsigned char * sourcep, int source_byte_count,
			 unsigned char * destp)
{
  const unsigned char *source_endp;
  const unsigned char *vector_endp;
  int remainder;
  const int16x8_t u_coeff = {0, -22, 113, 0, 0, -22, 113, 0};
  const int16x8_t v_coeff = {90, -46, 0,  0, 90, -46, 0,  0};
  const uint8x8_t zeroalpha = {0x0, 0x0, 0x0, 0xFF, 0x0, 0x0, 0x0, 0xFF};
  const int16x8_t uvbias = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; 
  int16x8_t mp0_rgba;  /* macropixel 0's resulting RGBA RGBA pixels  */
  int16x8_t mp1_rgba; /* macropixel 1's resulting RGBA RGBA pixels  */
  uint8x8_t rawpixels; /* source pixels as {[YUYV]0 [YUYV]1}   */
  uint8x8_t rgba0, rgba1; /* rgba values as bytes  */
  uint8x16_t bothrgba;
  uint8_t * destinationp; /* pointer into output buffer destp  */
  int16x8_t widerpixels; /*  rawpixels promoted to shorts per component */
  const uint8x8_t yselect = {0xff, 0xff, 0xff, 0xff,
			     0x00, 0x00, 0x00, 0x00};
  
  
  /* we're working with things in 4-byte macropixels  */
  remainder = source_byte_count % 4;

  source_endp = sourcep + source_byte_count;
  vector_endp = source_endp - remainder;
  destinationp = (uint8_t *)destp;

  while (sourcep < vector_endp)
    {
     /* pull YUYV from 2 four byte macropixels starting at sourcep. */
      /* we'll increment sourcep as we go to save the array dereference */
      /* and separate increment instruction at the end of the loop  */

      /* load rawpixels with {[YUYV]0 [YUYV]1 } with byte components */
      rawpixels = vld1_u8(sourcep);
      sourcep += sizeof(rawpixels);

      widerpixels = vreinterpretq_s16_u16(vmovl_u8(rawpixels));
 


      
      /* ---------- process macropixel 0 --------------- */
      /* take macropixel zero ([YUYV]0) from rawpixels and   */
      /* compute the two RGBA pixels that come from it. store  */
      /* those two pixels in mp0_rgba  */
      {
	int16x8_t wider_yalpha;
	int16x8_t u_vec, v_vec, uv_vec;
	uint8x8_t narrow_yalpha;
	uint8x8_t y0_vec, y1_vec;
	int16x4_t yuyv;

	/* narrow_yalpha is drawn from [YUYV]0 and formed into */
	/* {Y0, Y0, Y0, alpha, Y1, Y1, Y1, alpha}   */
	/* this would have been a nice place for vtbx1_u8, but i  */
	/* can't get it to work. so i'll have to use vbsl_u8 instead.  */

	y0_vec = vdup_lane_u8(rawpixels, MP0_Y0);
	y1_vec = vdup_lane_u8(rawpixels, MP0_Y1);
	narrow_yalpha = vbsl_u8(yselect, y0_vec, y1_vec);

	/* store ALPHA in elements 3 and 7 (after the RGB components)  */
	narrow_yalpha =  vset_lane_u8(ALPHA, narrow_yalpha, 3);
	narrow_yalpha =  vset_lane_u8(ALPHA, narrow_yalpha, 7);

	/* use vmovl_u8 to go from being unsigned 8-bit to  */
	/* unsigned 16-bit, the use vreinterpretq_s16_u16 to  */
	/* change interpretation from unsigned 16-bit to signed  */
	/* 16-bit.   */
	wider_yalpha = vreinterpretq_s16_u16(vmovl_u8(narrow_yalpha));

	yuyv = vget_low_s16(widerpixels);
	
	/* form a vector of the U component from MP0  */
	u_vec = vdupq_lane_s16(yuyv, MP0_U);
	
	/* subtract uvbias from u_vec */
	u_vec = vsubq_s16(u_vec, uvbias);

	/* form a vector of the V component from MP0  */
	v_vec = vdupq_lane_s16(yuyv, MP0_V);
	
	/* subtract uvbias from v_vec */
	v_vec = vsubq_s16(v_vec, uvbias);

		
	/* Multiply eight 16-bit values in u_vec by eight 16-bit */
	/* values in u_coeff and store the results in u_vec.  */


	u_vec = vmulq_s16(u_vec, u_coeff);

	/* likewise multiply eight 16-bit values in v_vec by   */
	/* v_coeff and store the results in  v_vec */
	
	v_vec = vmulq_s16(v_vec, v_coeff);

	/* form uv_vec as the sum of u_vec & v_vec, then shift 6 places   */
	/* (dividing by 64)  */
	uv_vec = vaddq_s16(u_vec, v_vec);
	  
	uv_vec = vshrq_n_s16(uv_vec, 6);

	/* now mp0_rgba = y_vec + u_vec + v_vec  */
	mp0_rgba = vaddq_s16(wider_yalpha, uv_vec);

      }

      /* ---------- process macropixel 1 --------------- */
      /* take macropixel one ([YUYV]1) from rawpixels and   */
      /* compute the two RGBA pixels that come from it. store  */
      /* those two pixels in mp1_rgba  */      
      {
	int16x8_t wider_yalpha;
	int16x8_t u_vec, v_vec, uv_vec;
	uint8x8_t narrow_yalpha;
	uint8x8_t y0_vec, y1_vec;
	int16x4_t yuyv;

	/* narrow_yalpha is drawn from [YUYV]1 and formed into */
	/* {Y0, Y0, Y0, alpha, Y1, Y1, Y1, alpha}   */
	/* this would have been a nice place for vtbx1_u8, but i  */
	/* can't get it to work. so i'll have to use vbsl_u8 instead.  */

	y0_vec = vdup_lane_u8(rawpixels, MP1_Y0);
	y1_vec = vdup_lane_u8(rawpixels, MP1_Y1);
	narrow_yalpha = vbsl_u8(yselect, y0_vec, y1_vec);
	  
	narrow_yalpha =  vset_lane_u8(ALPHA, narrow_yalpha, 3);
	narrow_yalpha =  vset_lane_u8(ALPHA, narrow_yalpha, 7);

	/* use vmovl_u8 to go from being unsigned 8-bit to  */
	/* unsigned 16-bit, the use vreinterpretq_s16_u16 to  */


	wider_yalpha = vreinterpretq_s16_u16(vmovl_u8(narrow_yalpha));

	yuyv = vget_high_s16(widerpixels);
	u_vec = vdupq_lane_s16(yuyv, 1);
	u_vec = vsubq_s16(u_vec, uvbias);
	
	v_vec = vdupq_lane_s16(yuyv, 3);
	v_vec = vsubq_s16(v_vec, uvbias);

		
	/* Multiply eight 16-bit values in u_vec by eight 16-bit */
	/* values in u_coeff and store the results in u_vec.  */


	u_vec = vmulq_s16(u_vec, u_coeff);

	/* likewise multiply eight 16-bit values in v_vec by   */
	/* v_coeff and store the results in  v_vec */
	
	v_vec = vmulq_s16(v_vec, v_coeff);
     
	/* form uv_vec as the sum of u_vec & v_vec, then shift 6 places   */
	/* (dividing by 64)  */
	uv_vec  = vaddq_s16(u_vec, v_vec);
	uv_vec = vshrq_n_s16(uv_vec, 6);


	/* now mp1_rgba = y_vec + u_vec + v_vec  */
	mp1_rgba = vaddq_s16(wider_yalpha, uv_vec);
      }
      

      /* turn mp0_rgba from a vector of shorts to a vector of  */
      /* unsigned unsigned chars. this will saturate: clipping  */
      /* the values between 0 and 255.   */
      
      rgba0 = vqmovun_s16(mp0_rgba);
      rgba1 = vqmovun_s16(mp1_rgba);

      /* make it faster to copy these back out of vector registers into  */
      /* memory by combining rgba0 and rgba1 into the larger bothrgba.   */
      /* then store that back into memory at destinationp.               */

      bothrgba = vcombine_u8(rgba0, rgba1);
      
      vst1q_u8(destinationp, bothrgba);
      destinationp += 16;
      
      
    }
}