/*
 * 4x4 "d135" intra predictor, NEON implementation.
 *
 * Builds an 8-byte edge vector from the left column (reversed) followed by
 * the top-left and top pixels, runs a 3-tap smoothing filter over it, and
 * writes four overlapping 4-byte windows of the result as the output rows.
 *
 * dst    - output block; 4 bytes are written at dst + r * stride, r = 0..3.
 * stride - distance in bytes between successive output rows.
 * above  - row above the block; 8 bytes are loaded starting at above[-1],
 *          of which above[-1] .. above[3] (X A B C D) contribute.
 * left   - column left of the block; left[0] .. left[3] (I J K L) are read.
 *
 * NOTE(review): left and every dst row are accessed through uint32_t
 * pointer casts; callers presumably pass suitably aligned addresses -
 * confirm.
 */
void vpx_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  /* Lanes 0..4 hold X A B C D. */
  const uint8x8_t top = vld1_u8(above - 1);
  /* Lanes 0..3 hold I J K L; the upper four lanes are zero. */
  const uint32x2_t left_col =
      vld1_lane_u32((const uint32_t *)left, vdup_n_u32(0), 0);

  /* L K J I 0 0 0 0 */
  const uint64x1_t left_rev =
      vreinterpret_u64_u8(vrev32_u8(vreinterpret_u8_u32(left_col)));
  /* 0 0 0 0 X A B C */
  const uint64x1_t top_hi = vshl_n_u64(vreinterpret_u64_u8(top), 32);
  /* L K J I X A B C */
  const uint64x1_t edge = vorr_u64(left_rev, top_hi);

  /* The three filter taps are the edge vector shifted by 0, 1 and 2 bytes. */
  const uint8x8_t tap_lo = vreinterpret_u8_u64(edge);
  /* K J I X A B C 0 */
  const uint8x8_t tap_mid = vreinterpret_u8_u64(vshr_n_u64(edge, 8));
  /* J I X A B C D 0: D was shifted out of the edge vector, so it is
   * fetched from the top row and patched back into lane 6. */
  const uint8x8_t tap_hi =
      vset_lane_u8(vget_lane_u8(top, 4),
                   vreinterpret_u8_u64(vshr_n_u64(edge, 16)), 6);

  /* Halving add of the outer taps, then rounding halving add with the
   * centre tap. */
  const uint8x8_t filtered = vrhadd_u8(vhadd_u8(tap_hi, tap_lo), tap_mid);
  const uint64x1_t filtered_u64 = vreinterpret_u64_u8(filtered);

  /* Row r is the 4-byte window that starts at lane (3 - r). */
  vst1_lane_u32((uint32_t *)(dst + 0 * stride),
                vreinterpret_u32_u64(vshr_n_u64(filtered_u64, 24)), 0);
  vst1_lane_u32((uint32_t *)(dst + 1 * stride),
                vreinterpret_u32_u64(vshr_n_u64(filtered_u64, 16)), 0);
  vst1_lane_u32((uint32_t *)(dst + 2 * stride),
                vreinterpret_u32_u64(vshr_n_u64(filtered_u64, 8)), 0);
  vst1_lane_u32((uint32_t *)(dst + 3 * stride),
                vreinterpret_u32_u8(filtered), 0);
}
/*
 * Exercises the vset_lane_u8 intrinsic with lane index 1.
 *
 * NOTE(review): both operands are read without being initialised and the
 * result is never used, so this looks like a compile-only fixture - confirm
 * before ever executing it.
 */
void test_vset_laneu8 (void)
{
  uint8_t scalar_in;
  uint8x8_t vector_in;
  uint8x8_t result;

  result = vset_lane_u8 (scalar_in, vector_in, 1);
}
/*
 * Returns v2 with lane 6 replaced by v1.
 *
 * The line comments inside the body appear to be FileCheck directives for
 * the generated assembly. Each one must stay on a line of its own: a line
 * comment runs to the end of the line, so sharing a line with code would
 * comment out the return statement and the closing brace.
 */
uint8x8_t test_vset_lane_u8(uint8_t v1, uint8x8_t v2) {
  // CHECK: test_vset_lane_u8
  return vset_lane_u8(v1, v2, 6);
  // CHECK: ins {{v[0-9]+}}.b[6], {{w[0-9]+}}
}
/*
 * Turn one YUYV macropixel into two RGBA pixels, returned as eight signed
 * 16-bit lanes laid out as {R0, G0, B0, A0, R1, G1, B1, A1}.
 *
 * y0_vec, y1_vec - the macropixel's first / second luma sample, broadcast
 *                  to all eight byte lanes.
 * u_vec, v_vec   - the macropixel's U / V sample, broadcast to all eight
 *                  16-bit lanes (still carrying the 0x80 bias).
 *
 * The result is not clamped; the caller narrows it with saturation.
 */
static inline int16x8_t yuv422rgb_neon_macropixel(uint8x8_t y0_vec,
                                                  uint8x8_t y1_vec,
                                                  int16x8_t u_vec,
                                                  int16x8_t v_vec)
{
    /* Per-lane chroma weights in 1/64 units; the alpha lanes get 0. */
    const int16x8_t u_coeff = {0, -22, 113, 0, 0, -22, 113, 0};
    const int16x8_t v_coeff = {90, -46, 0, 0, 90, -46, 0, 0};
    const int16x8_t uvbias = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80};
    /* Lanes 0..3 come from the first luma sample, lanes 4..7 from the second. */
    const uint8x8_t yselect = {0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00};
    uint8x8_t narrow_yalpha;
    int16x8_t wider_yalpha;
    int16x8_t uv_vec;

    /* {Y0, Y0, Y0, alpha, Y1, Y1, Y1, alpha} as bytes */
    narrow_yalpha = vbsl_u8(yselect, y0_vec, y1_vec);
    narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 3);
    narrow_yalpha = vset_lane_u8(ALPHA, narrow_yalpha, 7);

    /* widen to 16 bits and reinterpret as signed so chroma can be added */
    wider_yalpha = vreinterpretq_s16_u16(vmovl_u8(narrow_yalpha));

    /* remove the bias from U and V, then weight them per output lane */
    u_vec = vmulq_s16(vsubq_s16(u_vec, uvbias), u_coeff);
    v_vec = vmulq_s16(vsubq_s16(v_vec, uvbias), v_coeff);

    /* sum the chroma contributions and scale back down by 64 */
    uv_vec = vshrq_n_s16(vaddq_s16(u_vec, v_vec), 6);

    return vaddq_s16(wider_yalpha, uv_vec);
}

/*
 * Convert packed YUYV (4:2:2) data to RGBA using NEON intrinsics.
 *
 * sourcep           - input bytes, read in 4-byte YUYV macropixels.
 * source_byte_count - number of input bytes; values <= 0 convert nothing.
 * destp             - output buffer; 8 bytes (two RGBA pixels) are written
 *                     for every complete macropixel in the input.
 *
 * Macropixels are converted two at a time (8 bytes in, 16 bytes out). If an
 * odd macropixel is left over it is converted on its own through a padded
 * local copy, so the function never loads past sourcep + source_byte_count
 * and never stores more than 8 bytes per complete macropixel. The previous
 * version rounded the loop bound to 4 bytes while consuming 8 per iteration,
 * which over-read the source by 4 bytes and over-wrote the destination by
 * 8 bytes whenever the count was 4 modulo 8.
 *
 * Any 1..3 bytes after the last complete macropixel are ignored, as before.
 */
void yuv422rgb_neon_int(const unsigned char * sourcep, int source_byte_count, unsigned char * destp)
{
    enum {
        YUV422RGB_MACROPIXEL_BYTES = 4,   /* one Y U Y V group             */
        YUV422RGB_PAIR_BYTES = 8,         /* source bytes per vector pass  */
        YUV422RGB_RGBA_PAIR_BYTES = 16    /* output bytes per vector pass  */
    };
    const unsigned char *vector_endp;
    uint8_t *destinationp = (uint8_t *)destp;
    int tail_bytes;

    if (source_byte_count <= 0) {
        return;
    }

    /* only whole pairs of macropixels go through the vector loop */
    tail_bytes = source_byte_count % YUV422RGB_PAIR_BYTES;
    vector_endp = sourcep + (source_byte_count - tail_bytes);

    while (sourcep < vector_endp) {
        /* {[YUYV]0 [YUYV]1} as bytes, and the same promoted to shorts */
        const uint8x8_t rawpixels = vld1_u8(sourcep);
        const int16x8_t widerpixels = vreinterpretq_s16_u16(vmovl_u8(rawpixels));
        const int16x4_t yuyv0 = vget_low_s16(widerpixels);
        const int16x4_t yuyv1 = vget_high_s16(widerpixels);

        const int16x8_t mp0_rgba = yuv422rgb_neon_macropixel(
            vdup_lane_u8(rawpixels, MP0_Y0),
            vdup_lane_u8(rawpixels, MP0_Y1),
            vdupq_lane_s16(yuyv0, MP0_U),
            vdupq_lane_s16(yuyv0, MP0_V));
        const int16x8_t mp1_rgba = yuv422rgb_neon_macropixel(
            vdup_lane_u8(rawpixels, MP1_Y0),
            vdup_lane_u8(rawpixels, MP1_Y1),
            vdupq_lane_s16(yuyv1, 1),
            vdupq_lane_s16(yuyv1, 3));

        /* saturating narrow clamps every channel to 0..255; both halves
         * are combined so the four pixels go out in a single store */
        vst1q_u8(destinationp,
                 vcombine_u8(vqmovun_s16(mp0_rgba), vqmovun_s16(mp1_rgba)));

        sourcep += YUV422RGB_PAIR_BYTES;
        destinationp += YUV422RGB_RGBA_PAIR_BYTES;
    }

    if (tail_bytes >= YUV422RGB_MACROPIXEL_BYTES) {
        /* One complete macropixel remains. Copy it into a zero-padded
         * buffer so the 8-byte vector load stays inside memory we own. */
        uint8_t padded[YUV422RGB_PAIR_BYTES] = {0};
        uint8x8_t rawpixels;
        int16x4_t yuyv0;
        int16x8_t mp0_rgba;
        int i;

        for (i = 0; i < YUV422RGB_MACROPIXEL_BYTES; i++) {
            padded[i] = sourcep[i];
        }

        rawpixels = vld1_u8(padded);
        yuyv0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(rawpixels)));

        /* NOTE(review): assumes MP0_Y0 / MP0_Y1 index the first four byte
         * lanes, as the "macropixel 0" naming suggests - confirm against
         * their definitions. */
        mp0_rgba = yuv422rgb_neon_macropixel(
            vdup_lane_u8(rawpixels, MP0_Y0),
            vdup_lane_u8(rawpixels, MP0_Y1),
            vdupq_lane_s16(yuyv0, MP0_U),
            vdupq_lane_s16(yuyv0, MP0_V));

        /* two RGBA pixels only: an 8-byte store */
        vst1_u8(destinationp, vqmovun_s16(mp0_rgba));
    }
}
/*
 * Returns b with lane 7 replaced by a.
 *
 * The line comments inside the body appear to be FileCheck directives for
 * the generated assembly. Each one must stay on a line of its own: a line
 * comment runs to the end of the line, so sharing a line with code would
 * comment out the return statement and the closing brace.
 */
uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) {
  // CHECK-LABEL: test_vset_lane_u8:
  // CHECK-NEXT: ins.b v0[7], w0
  // CHECK-NEXT: ret
  return vset_lane_u8(a, b, 7);
}
/*
 * Returns b with lane 7 replaced by a.
 *
 * The three line comments that follow appear to be FileCheck directives for
 * the emitted IR. They must each sit on their own line, separate from the
 * function: a line comment runs to the end of the line, so placing the
 * definition on the same line would comment the whole function out.
 */
// CHECK-LABEL: define <8 x i8> @test_vset_lane_u8(i8 %a, <8 x i8> %b) #0 {
// CHECK: [[VSET_LANE:%.*]] = insertelement <8 x i8> %b, i8 %a, i32 7
// CHECK: ret <8 x i8> [[VSET_LANE]]
uint8x8_t test_vset_lane_u8(uint8_t a, uint8x8_t b) {
  return vset_lane_u8(a, b, 7);
}