void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { uint8x8_t d0u8 = vdup_n_u8(0); uint64x1_t d1u64 = vdup_n_u64(0); (void)above; d1u64 = vld1_u64((const uint64_t *)left); d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); vst1_u8(dst, d0u8); dst += stride; d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); vst1_u8(dst, d0u8); dst += stride; d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); vst1_u8(dst, d0u8); dst += stride; d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); vst1_u8(dst, d0u8); dst += stride; d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); vst1_u8(dst, d0u8); dst += stride; d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); vst1_u8(dst, d0u8); dst += stride; d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); vst1_u8(dst, d0u8); dst += stride; d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); vst1_u8(dst, d0u8); }
void test_vdup_laneu8 (void) { uint8x8_t out_uint8x8_t; uint8x8_t arg0_uint8x8_t; out_uint8x8_t = vdup_lane_u8 (arg0_uint8x8_t, 1); }
void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) { uint8x8_t d0u8 = vdup_n_u8(0); uint32x2_t d1u32 = vdup_n_u32(0); (void)above; d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); dst += stride; d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); dst += stride; d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); dst += stride; d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); }
// 'do_above' and 'do_left' facilitate branch removal when inlined. static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int do_above, int do_left) { uint16x8_t sum_top; uint16x8_t sum_left; uint8x8_t dc0; if (do_above) { const uint8x8_t A = vld1_u8(above); // top row const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top const uint16x4_t p1 = vpadd_u16(p0, p0); const uint16x4_t p2 = vpadd_u16(p1, p1); sum_top = vcombine_u16(p2, p2); } if (do_left) { const uint8x8_t L = vld1_u8(left); // left border const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left const uint16x4_t p1 = vpadd_u16(p0, p0); const uint16x4_t p2 = vpadd_u16(p1, p1); sum_left = vcombine_u16(p2, p2); } if (do_above && do_left) { const uint16x8_t sum = vaddq_u16(sum_left, sum_top); dc0 = vrshrn_n_u16(sum, 4); } else if (do_above) { dc0 = vrshrn_n_u16(sum_top, 3); } else if (do_left) { dc0 = vrshrn_n_u16(sum_left, 3); } else { dc0 = vdup_n_u8(0x80); } { const uint8x8_t dc = vdup_lane_u8(dc0, 0); int i; for (i = 0; i < 8; ++i) { vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc)); } } }
uint8x8_t test_vdup_lane_u8(uint8x8_t v1) { // CHECK: test_vdup_lane_u8 return vdup_lane_u8(v1, 5); // CHECK: dup {{v[0-9]+}}.8b, {{v[0-9]+}}.b[5] }
void yuv422rgb_neon_int(const unsigned char * sourcep, int source_byte_count, unsigned char * destp) { const unsigned char *source_endp; const unsigned char *vector_endp; int remainder; const int16x8_t u_coeff = {0, -22, 113, 0, 0, -22, 113, 0}; const int16x8_t v_coeff = {90, -46, 0, 0, 90, -46, 0, 0}; const uint8x8_t zeroalpha = {0x0, 0x0, 0x0, 0xFF, 0x0, 0x0, 0x0, 0xFF}; const int16x8_t uvbias = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80}; int16x8_t mp0_rgba; /* macropixel 0's resulting RGBA RGBA pixels */ int16x8_t mp1_rgba; /* macropixel 1's resulting RGBA RGBA pixels */ uint8x8_t rawpixels; /* source pixels as {[YUYV]0 [YUYV]1} */ uint8x8_t rgba0, rgba1; /* rgba values as bytes */ uint8x16_t bothrgba; uint8_t * destinationp; /* pointer into output buffer destp */ int16x8_t widerpixels; /* rawpixels promoted to shorts per component */ const uint8x8_t yselect = {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}; /* we're working with things in 4-byte macropixels */ remainder = source_byte_count % 4; source_endp = sourcep + source_byte_count; vector_endp = source_endp - remainder; destinationp = (uint8_t *)destp; while (sourcep < vector_endp) { /* pull YUYV from 2 four byte macropixels starting at sourcep. */ /* we'll increment sourcep as we go to save the array dereference */ /* and separate increment instruction at the end of the loop */ /* load rawpixels with {[YUYV]0 [YUYV]1 } with byte components */ rawpixels = vld1_u8(sourcep); sourcep += sizeof(rawpixels); widerpixels = vreinterpretq_s16_u16(vmovl_u8(rawpixels)); /* ---------- process macropixel 0 --------------- */ /* take macropixel zero ([YUYV]0) from rawpixels and */ /* compute the two RGBA pixels that come from it. store */ /* those two pixels in mp0_rgba */ { int16x8_t wider_yalpha; int16x8_t u_vec, v_vec, uv_vec; uint8x8_t narrow_yalpha; uint8x8_t y0_vec, y1_vec; int16x4_t yuyv; /* narrow_yalpha is drawn from [YUYV]0 and formed into */ /* {Y0, Y0, Y0, alpha, Y1, Y1, Y1, alpha} */ /* this would have been a nice place for vtbx1_u8, but i */ /* can't get it to work. so i'll have to use vbsl_u8 instead. */ y0_vec = vdup_lane_u8(rawpixels, MP0_Y0); y1_vec = vdup_lane_u8(rawpixels, MP0_Y1); narrow_yalpha = vbsl_u8(yselect, y0_vec, y1_vec); /* store ALPHA in elements 3 and 7 (after the RGB components) */ narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 3); narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 7); /* use vmovl_u8 to go from being unsigned 8-bit to */ /* unsigned 16-bit, the use vreinterpretq_s16_u16 to */ /* change interpretation from unsigned 16-bit to signed */ /* 16-bit. */ wider_yalpha = vreinterpretq_s16_u16(vmovl_u8(narrow_yalpha)); yuyv = vget_low_s16(widerpixels); /* form a vector of the U component from MP0 */ u_vec = vdupq_lane_s16(yuyv, MP0_U); /* subtract uvbias from u_vec */ u_vec = vsubq_s16(u_vec, uvbias); /* form a vector of the V component from MP0 */ v_vec = vdupq_lane_s16(yuyv, MP0_V); /* subtract uvbias from v_vec */ v_vec = vsubq_s16(v_vec, uvbias); /* Multiply eight 16-bit values in u_vec by eight 16-bit */ /* values in u_coeff and store the results in u_vec. */ u_vec = vmulq_s16(u_vec, u_coeff); /* likewise multiply eight 16-bit values in v_vec by */ /* v_coeff and store the results in v_vec */ v_vec = vmulq_s16(v_vec, v_coeff); /* form uv_vec as the sum of u_vec & v_vec, then shift 6 places */ /* (dividing by 64) */ uv_vec = vaddq_s16(u_vec, v_vec); uv_vec = vshrq_n_s16(uv_vec, 6); /* now mp0_rgba = y_vec + u_vec + v_vec */ mp0_rgba = vaddq_s16(wider_yalpha, uv_vec); } /* ---------- process macropixel 1 --------------- */ /* take macropixel one ([YUYV]1) from rawpixels and */ /* compute the two RGBA pixels that come from it. store */ /* those two pixels in mp1_rgba */ { int16x8_t wider_yalpha; int16x8_t u_vec, v_vec, uv_vec; uint8x8_t narrow_yalpha; uint8x8_t y0_vec, y1_vec; int16x4_t yuyv; /* narrow_yalpha is drawn from [YUYV]1 and formed into */ /* {Y0, Y0, Y0, alpha, Y1, Y1, Y1, alpha} */ /* this would have been a nice place for vtbx1_u8, but i */ /* can't get it to work. so i'll have to use vbsl_u8 instead. */ y0_vec = vdup_lane_u8(rawpixels, MP1_Y0); y1_vec = vdup_lane_u8(rawpixels, MP1_Y1); narrow_yalpha = vbsl_u8(yselect, y0_vec, y1_vec); narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 3); narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 7); /* use vmovl_u8 to go from being unsigned 8-bit to */ /* unsigned 16-bit, the use vreinterpretq_s16_u16 to */ wider_yalpha = vreinterpretq_s16_u16(vmovl_u8(narrow_yalpha)); yuyv = vget_high_s16(widerpixels); u_vec = vdupq_lane_s16(yuyv, 1); u_vec = vsubq_s16(u_vec, uvbias); v_vec = vdupq_lane_s16(yuyv, 3); v_vec = vsubq_s16(v_vec, uvbias); /* Multiply eight 16-bit values in u_vec by eight 16-bit */ /* values in u_coeff and store the results in u_vec. */ u_vec = vmulq_s16(u_vec, u_coeff); /* likewise multiply eight 16-bit values in v_vec by */ /* v_coeff and store the results in v_vec */ v_vec = vmulq_s16(v_vec, v_coeff); /* form uv_vec as the sum of u_vec & v_vec, then shift 6 places */ /* (dividing by 64) */ uv_vec = vaddq_s16(u_vec, v_vec); uv_vec = vshrq_n_s16(uv_vec, 6); /* now mp1_rgba = y_vec + u_vec + v_vec */ mp1_rgba = vaddq_s16(wider_yalpha, uv_vec); } /* turn mp0_rgba from a vector of shorts to a vector of */ /* unsigned unsigned chars. this will saturate: clipping */ /* the values between 0 and 255. */ rgba0 = vqmovun_s16(mp0_rgba); rgba1 = vqmovun_s16(mp1_rgba); /* make it faster to copy these back out of vector registers into */ /* memory by combining rgba0 and rgba1 into the larger bothrgba. */ /* then store that back into memory at destinationp. */ bothrgba = vcombine_u8(rgba0, rgba1); vst1q_u8(destinationp, bothrgba); destinationp += 16; } }